pr feedback

pull/767/head
Frostebite 2025-12-27 16:04:59 +00:00
parent 71f48ceff4
commit a61fe5b771
3 changed files with 67 additions and 23 deletions

dist/index.js vendored

@@ -4734,23 +4734,24 @@ class KubernetesTaskRunner {
lastPhase = phase;
consecutivePendingCount = 0;
}
// Check for failure conditions that mean the pod will never start
const failureReasons = [
// Check for failure conditions that mean the pod will never start (permanent failures)
// Note: We don't treat "Failed" phase as a permanent failure because the pod might have
// completed its work before being killed (OOM), and we should still try to get logs
const permanentFailureReasons = [
'Unschedulable',
'Failed',
'ImagePullBackOff',
'ErrImagePull',
'CrashLoopBackOff',
'CreateContainerError',
'CreateContainerConfigError',
];
const hasFailureCondition = conditions.some((condition) => failureReasons.some((reason) => condition.reason?.includes(reason)));
const hasFailureContainerStatus = containerStatuses.some((containerStatus) => failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
if (hasFailureCondition || hasFailureContainerStatus) {
const hasPermanentFailureCondition = conditions.some((condition) => permanentFailureReasons.some((reason) => condition.reason?.includes(reason)));
const hasPermanentFailureContainerStatus = containerStatuses.some((containerStatus) => permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
// Only treat permanent failures as errors - pods that completed (Failed/Succeeded) should continue
if (hasPermanentFailureCondition || hasPermanentFailureContainerStatus) {
// Get detailed failure information
const failureCondition = conditions.find((condition) => failureReasons.some((reason) => condition.reason?.includes(reason)));
const failureContainer = containerStatuses.find((containerStatus) => failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
message = `Pod ${podName} failed to start:\nPhase: ${phase}\n`;
const failureCondition = conditions.find((condition) => permanentFailureReasons.some((reason) => condition.reason?.includes(reason)));
const failureContainer = containerStatuses.find((containerStatus) => permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
message = `Pod ${podName} failed to start (permanent failure):\nPhase: ${phase}\n`;
if (failureCondition) {
message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
}
@@ -4778,7 +4779,13 @@ class KubernetesTaskRunner {
waitComplete = false; // Mark as not complete so we throw an error
return true; // Return true to exit wait loop
}
// Pod is complete if it's not Pending or Unknown - it might be Running, Succeeded, or Failed
// For Failed/Succeeded pods, we still want to try to get logs, so we mark as complete
waitComplete = phase !== 'Pending' && phase !== 'Unknown';
// If pod completed (Succeeded/Failed), log it but don't throw - we'll try to get logs
if (waitComplete && phase !== 'Running') {
cloud_runner_logger_1.default.log(`Pod ${podName} completed with phase: ${phase}. Will attempt to retrieve logs.`);
}
if (phase === 'Pending') {
consecutivePendingCount++;
// Log diagnostic info every 4 checks (1 minute) if still pending
@@ -4846,7 +4853,21 @@ class KubernetesTaskRunner {
}
throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
}
// Only throw if we detected a permanent failure condition
// If the pod completed (Failed/Succeeded), we should still try to get logs
if (!waitComplete) {
// Check the final phase to see if it's a permanent failure or just completed
try {
const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
const finalPhase = finalStatus?.body.status?.phase || 'Unknown';
if (finalPhase === 'Failed' || finalPhase === 'Succeeded') {
cloud_runner_logger_1.default.logWarning(`Pod ${podName} completed with phase ${finalPhase} before reaching Running state. Will attempt to retrieve logs.`);
return true; // Allow workflow to continue and try to get logs
}
}
catch {
// If we can't check status, fall through to throw error
}
cloud_runner_logger_1.default.logWarning(`Pod ${podName} did not reach running state: ${message}`);
throw new Error(`Pod ${podName} did not start successfully: ${message}`);
}
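
For reference, the classification the compiled hunk above applies can be read as a small standalone predicate. This is a sketch only, assuming the simplified condition and container-status shapes shown below; the helper name isPermanentStartupFailure is illustrative and not part of the commit, while the reason list mirrors permanentFailureReasons from the diff.

// Sketch only: the helper name and narrowed types are illustrative, not identifiers from this commit.
type PodCondition = { reason?: string; message?: string };
type ContainerStatus = { state?: { waiting?: { reason?: string; message?: string } } };

const PERMANENT_FAILURE_REASONS = [
  'Unschedulable',
  'ImagePullBackOff',
  'ErrImagePull',
  'CrashLoopBackOff',
  'CreateContainerError',
  'CreateContainerConfigError',
];

// True only for states a pod cannot recover from; a Failed or Succeeded phase alone
// does not qualify, so log retrieval can still be attempted for pods killed after doing work (e.g. OOM).
function isPermanentStartupFailure(conditions: PodCondition[], containerStatuses: ContainerStatus[]): boolean {
  const matchesReason = (reason?: string) =>
    PERMANENT_FAILURE_REASONS.some((permanent) => reason?.includes(permanent) ?? false);

  return (
    conditions.some((condition) => matchesReason(condition.reason)) ||
    containerStatuses.some((status) => matchesReason(status.state?.waiting?.reason))
  );
}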

dist/index.js.map vendored

File diff suppressed because one or more lines are too long


@@ -393,35 +393,36 @@ class KubernetesTaskRunner {
consecutivePendingCount = 0;
}
// Check for failure conditions that mean the pod will never start
const failureReasons = [
// Check for failure conditions that mean the pod will never start (permanent failures)
// Note: We don't treat "Failed" phase as a permanent failure because the pod might have
// completed its work before being killed (OOM), and we should still try to get logs
const permanentFailureReasons = [
'Unschedulable',
'Failed',
'ImagePullBackOff',
'ErrImagePull',
'CrashLoopBackOff',
'CreateContainerError',
'CreateContainerConfigError',
];
const hasFailureCondition = conditions.some((condition: any) =>
failureReasons.some((reason) => condition.reason?.includes(reason)),
const hasPermanentFailureCondition = conditions.some((condition: any) =>
permanentFailureReasons.some((reason) => condition.reason?.includes(reason)),
);
const hasFailureContainerStatus = containerStatuses.some((containerStatus: any) =>
failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
const hasPermanentFailureContainerStatus = containerStatuses.some((containerStatus: any) =>
permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
);
if (hasFailureCondition || hasFailureContainerStatus) {
// Only treat permanent failures as errors - pods that completed (Failed/Succeeded) should continue
if (hasPermanentFailureCondition || hasPermanentFailureContainerStatus) {
// Get detailed failure information
const failureCondition = conditions.find((condition: any) =>
failureReasons.some((reason) => condition.reason?.includes(reason)),
permanentFailureReasons.some((reason) => condition.reason?.includes(reason)),
);
const failureContainer = containerStatuses.find((containerStatus: any) =>
failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
);
message = `Pod ${podName} failed to start:\nPhase: ${phase}\n`;
message = `Pod ${podName} failed to start (permanent failure):\nPhase: ${phase}\n`;
if (failureCondition) {
message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
}
@@ -451,7 +452,14 @@ class KubernetesTaskRunner {
return true; // Return true to exit wait loop
}
// Pod is complete if it's not Pending or Unknown - it might be Running, Succeeded, or Failed
// For Failed/Succeeded pods, we still want to try to get logs, so we mark as complete
waitComplete = phase !== 'Pending' && phase !== 'Unknown';
// If pod completed (Succeeded/Failed), log it but don't throw - we'll try to get logs
if (waitComplete && phase !== 'Running') {
CloudRunnerLogger.log(`Pod ${podName} completed with phase: ${phase}. Will attempt to retrieve logs.`);
}
if (phase === 'Pending') {
consecutivePendingCount++;
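
The phase handling in the hunk above reduces to a three-way decision. A minimal sketch for illustration; decideNextStep and its return labels are hypothetical names, not identifiers from the commit.

// Sketch only: decideNextStep and its return values are illustrative, not part of the commit.
type PodPhase = 'Pending' | 'Running' | 'Succeeded' | 'Failed' | 'Unknown';

function decideNextStep(phase: PodPhase): 'keep-waiting' | 'stream-logs' | 'fetch-logs-then-finish' {
  if (phase === 'Pending' || phase === 'Unknown') {
    return 'keep-waiting'; // waitComplete stays false and the pending diagnostics keep accumulating
  }
  if (phase === 'Running') {
    return 'stream-logs'; // waitComplete becomes true and the normal log-follow path runs
  }
  // Succeeded or Failed before Running: mark the wait complete and still try to retrieve logs
  return 'fetch-logs-then-finish';
}
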
@@ -527,7 +535,22 @@ class KubernetesTaskRunner {
throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
}
// Only throw if we detected a permanent failure condition
// If the pod completed (Failed/Succeeded), we should still try to get logs
if (!waitComplete) {
// Check the final phase to see if it's a permanent failure or just completed
try {
const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
const finalPhase = finalStatus?.body.status?.phase || 'Unknown';
if (finalPhase === 'Failed' || finalPhase === 'Succeeded') {
CloudRunnerLogger.logWarning(
`Pod ${podName} completed with phase ${finalPhase} before reaching Running state. Will attempt to retrieve logs.`,
);
return true; // Allow workflow to continue and try to get logs
}
} catch {
// If we can't check status, fall through to throw error
}
CloudRunnerLogger.logWarning(`Pod ${podName} did not reach running state: ${message}`);
throw new Error(`Pod ${podName} did not start successfully: ${message}`);
}
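
One way to exercise the new fall-through branch in the hunk above is with a stubbed client whose readNamespacedPodStatus reports a terminal phase. The sketch below is illustrative only: the stub and the demo function are not part of the commit, and they assume just the { body: { status: { phase } } } shape the diff reads.

// Sketch only: a hypothetical stub mirroring the response shape used in the diff, not the real client.
const stubKubeClient = {
  async readNamespacedPodStatus(_podName: string, _namespace: string) {
    return { body: { status: { phase: 'Failed' } } };
  },
};

async function demoTerminalPhaseFallThrough(): Promise<void> {
  const finalStatus = await stubKubeClient.readNamespacedPodStatus('build-pod', 'default');
  const finalPhase = finalStatus?.body.status?.phase || 'Unknown';

  if (finalPhase === 'Failed' || finalPhase === 'Succeeded') {
    // New behaviour: terminal phases continue so the workflow can still retrieve logs.
    console.log(`Pod completed with phase ${finalPhase}; continuing to log retrieval.`);
  } else {
    // Anything else still falls through to the existing error path.
    throw new Error(`Pod did not start successfully (phase: ${finalPhase})`);
  }
}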