PR feedback: fail faster on pending pods and detect scheduling failures
parent 45e7ed0fcb
commit 4b182a065a
@@ -4857,9 +4857,50 @@ class KubernetesTaskRunner {
             }
             if (phase === 'Pending') {
                 consecutivePendingCount++;
+                // Check for scheduling failures in events (faster than waiting for conditions)
+                try {
+                    const events = await kubeClient.listNamespacedEvent(namespace);
+                    const podEvents = events.body.items.filter((x) => x.involvedObject?.name === podName);
+                    const failedSchedulingEvents = podEvents.filter((x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated');
+                    if (failedSchedulingEvents.length > 0) {
+                        const schedulingMessage = failedSchedulingEvents
+                            .map((x) => `${x.reason}: ${x.message || ''}`)
+                            .join('; ');
+                        message = `Pod ${podName} cannot be scheduled:\n${schedulingMessage}`;
+                        cloud_runner_logger_1.default.logWarning(message);
+                        waitComplete = false;
+                        return true; // Exit wait loop to throw error
+                    }
+                }
+                catch {
+                    // Ignore event fetch errors
+                }
+                // For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
+                const isTest = process.env['cloudRunnerTests'] === 'true';
+                const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
+                if (consecutivePendingCount >= maxPendingChecks) {
+                    message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
+                    // Get events for context
+                    try {
+                        const events = await kubeClient.listNamespacedEvent(namespace);
+                        const podEvents = events.body.items
+                            .filter((x) => x.involvedObject?.name === podName)
+                            .slice(-5)
+                            .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+                        if (podEvents.length > 0) {
+                            message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+                        }
+                    }
+                    catch {
+                        // Ignore event fetch errors
+                    }
+                    cloud_runner_logger_1.default.logWarning(message);
+                    waitComplete = false;
+                    return true; // Exit wait loop to throw error
+                }
                 // Log diagnostic info every 4 checks (1 minute) if still pending
                 if (consecutivePendingCount % 4 === 0) {
-                    const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+                    const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
                     const conditionMessages = conditions
                         .map((c) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
                         .join('; ');

@@ -4888,7 +4929,7 @@ class KubernetesTaskRunner {
                     return true;
                 return false;
             }, {
-                timeout: 2000000,
+                timeout: process.env['cloudRunnerTests'] === 'true' ? 300000 : 2000000,
                 intervalBetweenAttempts: 15000, // 15 seconds
             });
         }
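The `{ timeout, intervalBetweenAttempts }` options changed above have the shape of `async-wait-until`-style polling: the predicate is retried every 15 s until it returns true or the overall timeout elapses, so the test-mode timeout caps the wait at 5 minutes (about 20 polls) instead of ~33 minutes (about 133 polls). A minimal sketch of that effect, assuming the `async-wait-until` package and a hypothetical `podHasLeftPending` predicate (neither is confirmed by this diff):

import { waitUntil, TimeoutError } from 'async-wait-until';

// Hypothetical stand-in for the pod-status check performed inside the wait loop above.
const podHasLeftPending = async (): Promise<boolean> => {
  // ...query the Kubernetes API here; return true once phase !== 'Pending'
  return false;
};

async function waitForPod(): Promise<void> {
  const isTest = process.env['cloudRunnerTests'] === 'true';
  try {
    await waitUntil(() => podHasLeftPending(), {
      // 5 minutes under cloudRunnerTests, ~33 minutes otherwise
      timeout: isTest ? 300000 : 2000000,
      intervalBetweenAttempts: 15000, // 15 seconds between polls
    });
  } catch (error) {
    if (error instanceof TimeoutError) {
      throw new Error('Pod never left Pending before the timeout');
    }
    throw error;
  }
}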
File diff suppressed because one or more lines are too long
@@ -481,9 +481,55 @@ class KubernetesTaskRunner {

          if (phase === 'Pending') {
            consecutivePendingCount++;
+
+            // Check for scheduling failures in events (faster than waiting for conditions)
+            try {
+              const events = await kubeClient.listNamespacedEvent(namespace);
+              const podEvents = events.body.items.filter((x) => x.involvedObject?.name === podName);
+              const failedSchedulingEvents = podEvents.filter(
+                (x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated',
+              );
+
+              if (failedSchedulingEvents.length > 0) {
+                const schedulingMessage = failedSchedulingEvents
+                  .map((x) => `${x.reason}: ${x.message || ''}`)
+                  .join('; ');
+                message = `Pod ${podName} cannot be scheduled:\n${schedulingMessage}`;
+                CloudRunnerLogger.logWarning(message);
+                waitComplete = false;
+                return true; // Exit wait loop to throw error
+              }
+            } catch {
+              // Ignore event fetch errors
+            }
+
+            // For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
+            const isTest = process.env['cloudRunnerTests'] === 'true';
+            const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
+
+            if (consecutivePendingCount >= maxPendingChecks) {
+              message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
+              // Get events for context
+              try {
+                const events = await kubeClient.listNamespacedEvent(namespace);
+                const podEvents = events.body.items
+                  .filter((x) => x.involvedObject?.name === podName)
+                  .slice(-5)
+                  .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+                if (podEvents.length > 0) {
+                  message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+                }
+              } catch {
+                // Ignore event fetch errors
+              }
+              CloudRunnerLogger.logWarning(message);
+              waitComplete = false;
+              return true; // Exit wait loop to throw error
+            }
+
            // Log diagnostic info every 4 checks (1 minute) if still pending
            if (consecutivePendingCount % 4 === 0) {
-              const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+              const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
              const conditionMessages = conditions
                .map((c: any) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
                .join('; ');

@@ -517,7 +563,7 @@ class KubernetesTaskRunner {
            return false;
          },
          {
-            timeout: 2000000, // ~33 minutes
+            timeout: process.env['cloudRunnerTests'] === 'true' ? 300000 : 2000000, // 5 minutes for tests, ~33 minutes for production
            intervalBetweenAttempts: 15000, // 15 seconds
          },
        );
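Taken together, the source change above polls the pod phase, surfaces FailedScheduling/SchedulingGated events as soon as they appear, and gives up after a bounded number of Pending checks instead of waiting out the full timeout. A standalone sketch of that pattern, assuming the pre-1.0 promise-style @kubernetes/client-node API that the diff's `listNamespacedEvent(namespace)` / `.body.items` calls suggest; the helper name `assertPodIsSchedulable` and the client wiring are illustrative, not the project's actual code:

import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';

const kc = new KubeConfig();
kc.loadFromDefault();
const kubeClient = kc.makeApiClient(CoreV1Api);

async function assertPodIsSchedulable(podName: string, namespace: string): Promise<void> {
  const isTest = process.env['cloudRunnerTests'] === 'true';
  const maxPendingChecks = isTest ? 8 : 80; // 2 minutes vs 20 minutes at a 15 s poll interval

  for (let check = 1; check <= maxPendingChecks; check++) {
    const pod = (await kubeClient.readNamespacedPod(podName, namespace)).body;
    if (pod.status?.phase !== 'Pending') {
      return; // scheduled (or already terminal) – nothing to fail fast on
    }

    // Surface scheduler rejections immediately instead of waiting out the full timeout.
    const events = await kubeClient.listNamespacedEvent(namespace);
    const schedulingFailures = events.body.items.filter(
      (x) =>
        x.involvedObject?.name === podName &&
        (x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated'),
    );
    if (schedulingFailures.length > 0) {
      const detail = schedulingFailures.map((x) => `${x.reason}: ${x.message || ''}`).join('; ');
      throw new Error(`Pod ${podName} cannot be scheduled:\n${detail}`);
    }

    await new Promise((resolve) => setTimeout(resolve, 15000)); // 15 s between checks
  }

  // Mirrors the maxPendingChecks bail-out in the diff: a pod still Pending after
  // every allowed check is treated as a scheduling problem rather than waited on.
  throw new Error(`Pod ${podName} stuck in Pending after ${maxPendingChecks} checks`);
}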