PR feedback

parent c216e3bb41
commit f783857278
@@ -4020,9 +4020,33 @@ class KubernetesPods {
        // If pod was killed and we have PreStopHook failure but no container status yet, wait a bit
        // The container might have succeeded but status hasn't been updated yet
        if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
-            cloud_runner_logger_1.default.log(`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`);
-            // Still throw error for now, but with more context
-            // The task runner will retry and get the actual container status
+            cloud_runner_logger_1.default.log(`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. Waiting for container status...`);
+            // Wait a bit for container status to become available (up to 30 seconds)
+            for (let i = 0; i < 6; i++) {
+                await new Promise((resolve) => setTimeout(resolve, 5000));
+                try {
+                    const updatedPod = (await kubeClient.listNamespacedPod(namespace)).body.items.find((x) => podName === x.metadata?.name);
+                    if (updatedPod?.status?.containerStatuses && updatedPod.status.containerStatuses.length > 0) {
+                        const updatedContainerStatus = updatedPod.status.containerStatuses[0];
+                        if (updatedContainerStatus.state?.terminated) {
+                            const updatedExitCode = updatedContainerStatus.state.terminated.exitCode;
+                            if (updatedExitCode === 0) {
+                                cloud_runner_logger_1.default.logWarning(`Pod ${podName} container succeeded (exit code 0) after waiting. PreStopHook failure is non-fatal.`);
+                                return false; // Pod is not running, but container succeeded
+                            }
+                            else {
+                                cloud_runner_logger_1.default.log(`Pod ${podName} container failed with exit code ${updatedExitCode} after waiting.`);
+                                errorDetails.push(`Container terminated after wait: exit code ${updatedExitCode}`);
+                                break;
+                            }
+                        }
+                    }
+                }
+                catch (waitError) {
+                    cloud_runner_logger_1.default.log(`Error while waiting for container status: ${waitError}`);
+                }
+            }
+            cloud_runner_logger_1.default.log(`Container status still not available after waiting. Assuming failure due to PreStopHook issues.`);
        }
        const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
        cloud_runner_logger_1.default.log(errorMessage);
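The hunk above (and the matching TypeScript source hunk further down) adds a fixed-interval polling step: up to 6 attempts, 5 seconds apart, re-reading the pod until a terminated container status appears. As a rough sketch of that pattern in isolation, the helper below extracts the loop; the helper name and the promise-style `{ body }` responses (pre-1.x `@kubernetes/client-node`) are assumptions, not part of this commit.

import { CoreV1Api } from '@kubernetes/client-node';

// Hypothetical helper: poll up to `attempts` times, `intervalMs` apart, for the first
// container of `podName` to reach a terminated state, and return its exit code.
// Returns undefined if no terminated status shows up before the attempts run out.
async function waitForContainerExitCode(
  kubeClient: CoreV1Api,
  namespace: string,
  podName: string,
  attempts = 6,
  intervalMs = 5000,
): Promise<number | undefined> {
  for (let i = 0; i < attempts; i++) {
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
    try {
      // Same lookup as the diff: list the namespace and match on metadata.name.
      const pod = (await kubeClient.listNamespacedPod(namespace)).body.items.find(
        (x) => x.metadata?.name === podName,
      );
      const terminated = pod?.status?.containerStatuses?.[0]?.state?.terminated;
      if (terminated) {
        return terminated.exitCode;
      }
    } catch {
      // Transient API errors are ignored; the next attempt retries the lookup.
    }
  }
  return undefined;
}

A caller can then treat exit code 0 as the non-fatal PreStopHook case and anything else, including undefined, as a real failure, which mirrors the branches added above.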
File diff suppressed because one or more lines are too long
@@ -87,10 +87,42 @@ class KubernetesPods {
      // The container might have succeeded but status hasn't been updated yet
      if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
        CloudRunnerLogger.log(
-          `Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`,
+          `Pod ${podName} was killed with PreStopHook failure, but container status not yet available. Waiting for container status...`,
        );
+        // Wait a bit for container status to become available (up to 30 seconds)
+        for (let i = 0; i < 6; i++) {
+          await new Promise((resolve) => setTimeout(resolve, 5000));
+          try {
+            const updatedPod = (
+              await kubeClient.listNamespacedPod(namespace)
+            ).body.items.find((x) => podName === x.metadata?.name);
+            if (updatedPod?.status?.containerStatuses && updatedPod.status.containerStatuses.length > 0) {
+              const updatedContainerStatus = updatedPod.status.containerStatuses[0];
+              if (updatedContainerStatus.state?.terminated) {
+                const updatedExitCode = updatedContainerStatus.state.terminated.exitCode;
+                if (updatedExitCode === 0) {
+                  CloudRunnerLogger.logWarning(
+                    `Pod ${podName} container succeeded (exit code 0) after waiting. PreStopHook failure is non-fatal.`,
+                  );
+                  return false; // Pod is not running, but container succeeded
+                } else {
+                  CloudRunnerLogger.log(
+                    `Pod ${podName} container failed with exit code ${updatedExitCode} after waiting.`,
+                  );
+                  errorDetails.push(
+                    `Container terminated after wait: exit code ${updatedExitCode}`,
+                  );
+                  break;
+                }
+              }
+            }
+          } catch (waitError) {
+            CloudRunnerLogger.log(`Error while waiting for container status: ${waitError}`);
+          }
+        }
+        CloudRunnerLogger.log(
+          `Container status still not available after waiting. Assuming failure due to PreStopHook issues.`,
+        );
        // Still throw error for now, but with more context
        // The task runner will retry and get the actual container status
      }

      const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
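A side note on the lookup itself: the loop lists every pod in the namespace and filters by name. Under the same pre-1.x promise-API assumption, the client also offers a direct read, sketched below with an illustrative helper name; unlike list-and-filter, a direct read rejects with a 404 once the pod object is gone, so a caller has to map that to "no status available" rather than treating it as a hard error.

import { CoreV1Api, V1Pod } from '@kubernetes/client-node';

// Hypothetical variant: read the pod directly instead of listing the whole namespace.
// Returns undefined when the pod no longer exists (HTTP 404).
async function tryReadPod(kubeClient: CoreV1Api, namespace: string, podName: string): Promise<V1Pod | undefined> {
  try {
    return (await kubeClient.readNamespacedPod(podName, namespace)).body;
  } catch (error: any) {
    const statusCode = error?.statusCode ?? error?.response?.statusCode;
    if (statusCode === 404) {
      return undefined;
    }
    throw error;
  }
}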
@@ -95,9 +95,7 @@ export class Caching {

      // If disk usage is high (>90%), proactively clean up old cache files
      if (diskUsagePercent > 90) {
-        CloudRunnerLogger.log(
-          `Disk usage is ${diskUsagePercent}% - cleaning up old cache files before tar operation`,
-        );
+        CloudRunnerLogger.log(`Disk usage is ${diskUsagePercent}% - cleaning up old cache files before tar operation`);
        try {
          const cacheParent = path.dirname(cacheFolder);
          if (await fileExists(cacheParent)) {
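The `diskUsagePercent` value tested in this hunk is computed earlier in the method and is not shown here. For orientation only, a minimal sketch of one way such a figure can be derived from `df` output follows; the helper name and the use of `df -P` are assumptions rather than what the file actually does.

import { execSync } from 'child_process';

// Hypothetical helper: return the Use% of the filesystem containing `dir` as a number (0-100).
// `df -P` forces POSIX single-line records, so the last line looks like:
//   /dev/sda1  41152812  30412340  8614520  78% /data
function getDiskUsagePercent(dir: string): number {
  const output = execSync(`df -P ${dir}`, { encoding: 'utf8' });
  const lastLine = output.trim().split('\n').pop() ?? '';
  const columns = lastLine.split(/\s+/);
  // The capacity column ("78%") is second to last, just before the mount point.
  const percentColumn = columns[columns.length - 2] ?? '0%';
  return Number.parseInt(percentColumn.replace('%', ''), 10) || 0;
}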
@@ -106,9 +104,7 @@ export class Caching {
              `find ${cacheParent} -name "*.tar*" -type f -mmin +360 -delete 2>/dev/null || true`,
            );
            // Also try to remove old cache directories
-            await CloudRunnerSystem.Run(
-              `find ${cacheParent} -type d -empty -delete 2>/dev/null || true`,
-            );
+            await CloudRunnerSystem.Run(`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`);
            CloudRunnerLogger.log(`Cleanup completed. Checking disk space again...`);
            const diskCheckAfter = await CloudRunnerSystem.Run(`df . 2>/dev/null || df /data 2>/dev/null || true`);
            CloudRunnerLogger.log(`Disk space after cleanup: ${diskCheckAfter}`);
@@ -143,9 +139,7 @@ export class Caching {
              `find ${cacheParent} -name "*.tar*" -type f -mmin +60 -delete 2>/dev/null || true`,
            );
            // Remove empty cache directories
-            await CloudRunnerSystem.Run(
-              `find ${cacheParent} -type d -empty -delete 2>/dev/null || true`,
-            );
+            await CloudRunnerSystem.Run(`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`);
            // Also try to clean up the entire cache folder if it's getting too large
            const cacheRoot = path.resolve(cacheParent, '..');
            if (await fileExists(cacheRoot)) {
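The two cleanup hunks above run the same `find … -mmin +N -delete` pattern with different age thresholds: 360 minutes for the proactive pass before the tar operation and 60 minutes in the more aggressive path. A small sketch of that pattern as one parameterised helper is below; the helper name is hypothetical, and `child_process` stands in for `CloudRunnerSystem.Run` so the snippet stays self-contained.

import { exec } from 'child_process';
import { promisify } from 'util';

const run = promisify(exec);

// Hypothetical helper mirroring the cleanup commands in the hunks above:
// delete cache archives older than `maxAgeMinutes` under `cacheParent`,
// then prune any directories left empty. The trailing `|| true` keeps the
// command from failing when nothing matches or a delete is not permitted.
async function cleanOldCacheArtifacts(cacheParent: string, maxAgeMinutes: number): Promise<void> {
  await run(`find ${cacheParent} -name "*.tar*" -type f -mmin +${maxAgeMinutes} -delete 2>/dev/null || true`);
  await run(`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`);
}

// Usage matching the two thresholds seen above:
//   await cleanOldCacheArtifacts(cacheParent, 360); // gentle pass before taring
//   await cleanOldCacheArtifacts(cacheParent, 60);  // stricter pass under disk pressure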
@@ -165,7 +159,9 @@ export class Caching {
            retrySucceeded = true;
          } catch (retryError: any) {
            throw new Error(
-              `Failed to create cache archive after cleanup. Original error: ${errorMessage}. Retry error: ${retryError?.message || retryError}`,
+              `Failed to create cache archive after cleanup. Original error: ${errorMessage}. Retry error: ${
+                retryError?.message || retryError
+              }`,
            );
          }
          // If retry succeeded, don't throw the original error - let execution continue after catch block
@@ -181,7 +177,9 @@ export class Caching {
        } catch (cleanupError: any) {
          CloudRunnerLogger.log(`Cleanup attempt failed: ${cleanupError}`);
          throw new Error(
-            `Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Cleanup failed: ${cleanupError?.message || cleanupError}`,
+            `Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Cleanup failed: ${
+              cleanupError?.message || cleanupError
+            }`,
          );
        }
      } else {
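The last two hunks only reflow the thrown error messages to satisfy the formatter, but they sit inside a recovery flow worth spelling out: the archive step fails, disk space is freed, the step is retried once, and only if the retry (or the cleanup itself) also fails is an error rethrown that carries both messages. A compressed sketch of that control flow, with illustrative names rather than the file's actual ones:

// Illustrative shape of the catch block around the cache tar step.
async function createCacheArchiveWithRecovery(
  createArchive: () => Promise<void>,
  cleanUpDisk: () => Promise<void>,
): Promise<void> {
  try {
    await createArchive();
  } catch (error: any) {
    const errorMessage = error?.message || String(error);
    try {
      await cleanUpDisk();
      // One retry after cleanup; if it succeeds, the original error is swallowed.
      await createArchive();
    } catch (retryError: any) {
      throw new Error(
        `Failed to create cache archive after cleanup. Original error: ${errorMessage}. Retry error: ${
          retryError?.message || retryError
        }`,
      );
    }
  }
}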