PR feedback

cloud-runner-develop
Frostebite 2025-12-06 00:06:22 +00:00
parent c216e3bb41
commit f783857278
4 changed files with 72 additions and 18 deletions

30
dist/index.js vendored
View File

@@ -4020,9 +4020,33 @@ class KubernetesPods {
// If pod was killed and we have PreStopHook failure but no container status yet, wait a bit // If pod was killed and we have PreStopHook failure but no container status yet, wait a bit
// The container might have succeeded but status hasn't been updated yet // The container might have succeeded but status hasn't been updated yet
if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) { if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
cloud_runner_logger_1.default.log(`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`); cloud_runner_logger_1.default.log(`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. Waiting for container status...`);
// Still throw error for now, but with more context // Wait a bit for container status to become available (up to 30 seconds)
// The task runner will retry and get the actual container status for (let i = 0; i < 6; i++) {
await new Promise((resolve) => setTimeout(resolve, 5000));
try {
const updatedPod = (await kubeClient.listNamespacedPod(namespace)).body.items.find((x) => podName === x.metadata?.name);
if (updatedPod?.status?.containerStatuses && updatedPod.status.containerStatuses.length > 0) {
const updatedContainerStatus = updatedPod.status.containerStatuses[0];
if (updatedContainerStatus.state?.terminated) {
const updatedExitCode = updatedContainerStatus.state.terminated.exitCode;
if (updatedExitCode === 0) {
cloud_runner_logger_1.default.logWarning(`Pod ${podName} container succeeded (exit code 0) after waiting. PreStopHook failure is non-fatal.`);
return false; // Pod is not running, but container succeeded
}
else {
cloud_runner_logger_1.default.log(`Pod ${podName} container failed with exit code ${updatedExitCode} after waiting.`);
errorDetails.push(`Container terminated after wait: exit code ${updatedExitCode}`);
break;
}
}
}
}
catch (waitError) {
cloud_runner_logger_1.default.log(`Error while waiting for container status: ${waitError}`);
}
}
cloud_runner_logger_1.default.log(`Container status still not available after waiting. Assuming failure due to PreStopHook issues.`);
} }
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`; const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
cloud_runner_logger_1.default.log(errorMessage); cloud_runner_logger_1.default.log(errorMessage);

2
dist/index.js.map vendored

File diff suppressed because one or more lines are too long

View File

@@ -87,10 +87,42 @@ class KubernetesPods {
// The container might have succeeded but status hasn't been updated yet // The container might have succeeded but status hasn't been updated yet
if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) { if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
CloudRunnerLogger.log( CloudRunnerLogger.log(
`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`, `Pod ${podName} was killed with PreStopHook failure, but container status not yet available. Waiting for container status...`,
);
// Wait a bit for container status to become available (up to 30 seconds)
for (let i = 0; i < 6; i++) {
await new Promise((resolve) => setTimeout(resolve, 5000));
try {
const updatedPod = (
await kubeClient.listNamespacedPod(namespace)
).body.items.find((x) => podName === x.metadata?.name);
if (updatedPod?.status?.containerStatuses && updatedPod.status.containerStatuses.length > 0) {
const updatedContainerStatus = updatedPod.status.containerStatuses[0];
if (updatedContainerStatus.state?.terminated) {
const updatedExitCode = updatedContainerStatus.state.terminated.exitCode;
if (updatedExitCode === 0) {
CloudRunnerLogger.logWarning(
`Pod ${podName} container succeeded (exit code 0) after waiting. PreStopHook failure is non-fatal.`,
);
return false; // Pod is not running, but container succeeded
} else {
CloudRunnerLogger.log(
`Pod ${podName} container failed with exit code ${updatedExitCode} after waiting.`,
);
errorDetails.push(
`Container terminated after wait: exit code ${updatedExitCode}`,
);
break;
}
}
}
} catch (waitError) {
CloudRunnerLogger.log(`Error while waiting for container status: ${waitError}`);
}
}
CloudRunnerLogger.log(
`Container status still not available after waiting. Assuming failure due to PreStopHook issues.`,
); );
// Still throw error for now, but with more context
// The task runner will retry and get the actual container status
} }
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`; const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;

View File

@@ -95,9 +95,7 @@ export class Caching {
// If disk usage is high (>90%), proactively clean up old cache files // If disk usage is high (>90%), proactively clean up old cache files
if (diskUsagePercent > 90) { if (diskUsagePercent > 90) {
CloudRunnerLogger.log( CloudRunnerLogger.log(`Disk usage is ${diskUsagePercent}% - cleaning up old cache files before tar operation`);
`Disk usage is ${diskUsagePercent}% - cleaning up old cache files before tar operation`,
);
try { try {
const cacheParent = path.dirname(cacheFolder); const cacheParent = path.dirname(cacheFolder);
if (await fileExists(cacheParent)) { if (await fileExists(cacheParent)) {
@@ -106,9 +104,7 @@ export class Caching {
`find ${cacheParent} -name "*.tar*" -type f -mmin +360 -delete 2>/dev/null || true`, `find ${cacheParent} -name "*.tar*" -type f -mmin +360 -delete 2>/dev/null || true`,
); );
// Also try to remove old cache directories // Also try to remove old cache directories
await CloudRunnerSystem.Run( await CloudRunnerSystem.Run(`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`);
`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`,
);
CloudRunnerLogger.log(`Cleanup completed. Checking disk space again...`); CloudRunnerLogger.log(`Cleanup completed. Checking disk space again...`);
const diskCheckAfter = await CloudRunnerSystem.Run(`df . 2>/dev/null || df /data 2>/dev/null || true`); const diskCheckAfter = await CloudRunnerSystem.Run(`df . 2>/dev/null || df /data 2>/dev/null || true`);
CloudRunnerLogger.log(`Disk space after cleanup: ${diskCheckAfter}`); CloudRunnerLogger.log(`Disk space after cleanup: ${diskCheckAfter}`);
@@ -143,9 +139,7 @@ export class Caching {
`find ${cacheParent} -name "*.tar*" -type f -mmin +60 -delete 2>/dev/null || true`, `find ${cacheParent} -name "*.tar*" -type f -mmin +60 -delete 2>/dev/null || true`,
); );
// Remove empty cache directories // Remove empty cache directories
await CloudRunnerSystem.Run( await CloudRunnerSystem.Run(`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`);
`find ${cacheParent} -type d -empty -delete 2>/dev/null || true`,
);
// Also try to clean up the entire cache folder if it's getting too large // Also try to clean up the entire cache folder if it's getting too large
const cacheRoot = path.resolve(cacheParent, '..'); const cacheRoot = path.resolve(cacheParent, '..');
if (await fileExists(cacheRoot)) { if (await fileExists(cacheRoot)) {
@@ -165,7 +159,9 @@ export class Caching {
retrySucceeded = true; retrySucceeded = true;
} catch (retryError: any) { } catch (retryError: any) {
throw new Error( throw new Error(
`Failed to create cache archive after cleanup. Original error: ${errorMessage}. Retry error: ${retryError?.message || retryError}`, `Failed to create cache archive after cleanup. Original error: ${errorMessage}. Retry error: ${
retryError?.message || retryError
}`,
); );
} }
// If retry succeeded, don't throw the original error - let execution continue after catch block // If retry succeeded, don't throw the original error - let execution continue after catch block
@@ -181,7 +177,9 @@ export class Caching {
} catch (cleanupError: any) { } catch (cleanupError: any) {
CloudRunnerLogger.log(`Cleanup attempt failed: ${cleanupError}`); CloudRunnerLogger.log(`Cleanup attempt failed: ${cleanupError}`);
throw new Error( throw new Error(
`Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Cleanup failed: ${cleanupError?.message || cleanupError}`, `Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Cleanup failed: ${
cleanupError?.message || cleanupError
}`,
); );
} }
} else { } else {