PR feedback
parent
956b2e4324
commit
bea818fb9c
|
|
@ -4002,6 +4002,9 @@ class KubernetesPods {
|
||||||
}
|
}
|
||||||
// Check if only PreStopHook failed but container succeeded
|
// Check if only PreStopHook failed but container succeeded
|
||||||
const hasPreStopHookFailure = events.some((e) => e.reason === 'FailedPreStopHook');
|
const hasPreStopHookFailure = events.some((e) => e.reason === 'FailedPreStopHook');
|
||||||
|
const wasKilled = events.some((e) => e.reason === 'Killing');
|
||||||
|
// If container succeeded (exit code 0), PreStopHook failure is non-critical
|
||||||
|
// Also check if pod was killed but container might have succeeded
|
||||||
if (containerSucceeded && containerExitCode === 0) {
|
if (containerSucceeded && containerExitCode === 0) {
|
||||||
// Container succeeded - PreStopHook failure is non-critical
|
// Container succeeded - PreStopHook failure is non-critical
|
||||||
if (hasPreStopHookFailure) {
|
if (hasPreStopHookFailure) {
|
||||||
|
|
@ -4014,6 +4017,13 @@ class KubernetesPods {
|
||||||
// Don't throw error - container succeeded, PreStopHook failure is non-critical
|
// Don't throw error - container succeeded, PreStopHook failure is non-critical
|
||||||
return false; // Pod is not running, but we don't treat it as a failure
|
return false; // Pod is not running, but we don't treat it as a failure
|
||||||
}
|
}
|
||||||
|
// If pod was killed and we have PreStopHook failure but no container status yet, log extra context (no waiting happens here)
|
||||||
|
// The container might have succeeded but status hasn't been updated yet
|
||||||
|
if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
|
||||||
|
cloud_runner_logger_1.default.log(`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`);
|
||||||
|
// Still throw error for now, but with more context
|
||||||
|
// The task runner will retry and get the actual container status
|
||||||
|
}
|
||||||
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
|
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
|
||||||
cloud_runner_logger_1.default.log(errorMessage);
|
cloud_runner_logger_1.default.log(errorMessage);
|
||||||
throw new Error(errorMessage);
|
throw new Error(errorMessage);
|
||||||
|
|
@ -5202,7 +5212,44 @@ class Caching {
|
||||||
process.chdir(`${startPath}`);
|
process.chdir(`${startPath}`);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// Check disk space before creating tar archive
|
||||||
|
try {
|
||||||
|
const diskCheckOutput = await cloud_runner_system_1.CloudRunnerSystem.Run(`df -h . 2>/dev/null || df -h /data 2>/dev/null || true`);
|
||||||
|
cloud_runner_logger_1.default.log(`Disk space before tar: ${diskCheckOutput}`);
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
// Ignore disk check errors
|
||||||
|
}
|
||||||
|
// Clean up any existing incomplete tar files
|
||||||
|
try {
|
||||||
|
await cloud_runner_system_1.CloudRunnerSystem.Run(`rm -f ${cacheArtifactName}.tar${compressionSuffix} 2>/dev/null || true`);
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
// Ignore cleanup errors
|
||||||
|
}
|
||||||
|
try {
|
||||||
await cloud_runner_system_1.CloudRunnerSystem.Run(`tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${node_path_1.default.basename(sourceFolder)}"`);
|
await cloud_runner_system_1.CloudRunnerSystem.Run(`tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${node_path_1.default.basename(sourceFolder)}"`);
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
// Check if error is due to disk space
|
||||||
|
const errorMessage = error?.message || error?.toString() || '';
|
||||||
|
if (errorMessage.includes('No space left') || errorMessage.includes('Wrote only')) {
|
||||||
|
cloud_runner_logger_1.default.log(`Disk space error detected. Attempting cleanup...`);
|
||||||
|
// Try to clean up old cache files
|
||||||
|
try {
|
||||||
|
const cacheParent = node_path_1.default.dirname(cacheFolder);
|
||||||
|
if (await fileExists(cacheParent)) {
|
||||||
|
// Remove stale cache archives by age (find -mtime +1: last modified at least two days ago); newer archives are left in place
|
||||||
|
await cloud_runner_system_1.CloudRunnerSystem.Run(`find ${cacheParent} -name "*.tar*" -type f -mtime +1 -delete 2>/dev/null || true`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (cleanupError) {
|
||||||
|
cloud_runner_logger_1.default.log(`Cleanup attempt failed: ${cleanupError}`);
|
||||||
|
}
|
||||||
|
throw new Error(`Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Please free up disk space and retry.`);
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
await cloud_runner_system_1.CloudRunnerSystem.Run(`du ${cacheArtifactName}.tar${compressionSuffix}`);
|
await cloud_runner_system_1.CloudRunnerSystem.Run(`du ${cacheArtifactName}.tar${compressionSuffix}`);
|
||||||
(0, node_console_1.assert)(await fileExists(`${cacheArtifactName}.tar${compressionSuffix}`), 'cache archive exists');
|
(0, node_console_1.assert)(await fileExists(`${cacheArtifactName}.tar${compressionSuffix}`), 'cache archive exists');
|
||||||
(0, node_console_1.assert)(await fileExists(node_path_1.default.basename(sourceFolder)), 'source folder exists');
|
(0, node_console_1.assert)(await fileExists(node_path_1.default.basename(sourceFolder)), 'source folder exists');
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -63,7 +63,10 @@ class KubernetesPods {
|
||||||
|
|
||||||
// Check if only PreStopHook failed but container succeeded
|
// Check if only PreStopHook failed but container succeeded
|
||||||
const hasPreStopHookFailure = events.some((e) => e.reason === 'FailedPreStopHook');
|
const hasPreStopHookFailure = events.some((e) => e.reason === 'FailedPreStopHook');
|
||||||
|
const wasKilled = events.some((e) => e.reason === 'Killing');
|
||||||
|
|
||||||
|
// If container succeeded (exit code 0), PreStopHook failure is non-critical
|
||||||
|
// Also check if pod was killed but container might have succeeded
|
||||||
if (containerSucceeded && containerExitCode === 0) {
|
if (containerSucceeded && containerExitCode === 0) {
|
||||||
// Container succeeded - PreStopHook failure is non-critical
|
// Container succeeded - PreStopHook failure is non-critical
|
||||||
if (hasPreStopHookFailure) {
|
if (hasPreStopHookFailure) {
|
||||||
|
|
@ -80,6 +83,16 @@ class KubernetesPods {
|
||||||
return false; // Pod is not running, but we don't treat it as a failure
|
return false; // Pod is not running, but we don't treat it as a failure
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If pod was killed and we have PreStopHook failure but no container status yet, log extra context (no waiting happens here)
|
||||||
|
// The container might have succeeded but status hasn't been updated yet
|
||||||
|
if (wasKilled && hasPreStopHookFailure && containerExitCode === undefined) {
|
||||||
|
CloudRunnerLogger.log(
|
||||||
|
`Pod ${podName} was killed with PreStopHook failure, but container status not yet available. This may be non-fatal if container succeeded.`,
|
||||||
|
);
|
||||||
|
// Still throw error for now, but with more context
|
||||||
|
// The task runner will retry and get the actual container status
|
||||||
|
}
|
||||||
|
|
||||||
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
|
const errorMessage = `K8s pod failed\n${errorDetails.join('\n')}`;
|
||||||
CloudRunnerLogger.log(errorMessage);
|
CloudRunnerLogger.log(errorMessage);
|
||||||
throw new Error(errorMessage);
|
throw new Error(errorMessage);
|
||||||
|
|
|
||||||
|
|
@ -79,9 +79,48 @@ export class Caching {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check disk space before creating tar archive
|
||||||
|
try {
|
||||||
|
const diskCheckOutput = await CloudRunnerSystem.Run(`df -h . 2>/dev/null || df -h /data 2>/dev/null || true`);
|
||||||
|
CloudRunnerLogger.log(`Disk space before tar: ${diskCheckOutput}`);
|
||||||
|
} catch (error) {
|
||||||
|
// Ignore disk check errors
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up any existing incomplete tar files
|
||||||
|
try {
|
||||||
|
await CloudRunnerSystem.Run(`rm -f ${cacheArtifactName}.tar${compressionSuffix} 2>/dev/null || true`);
|
||||||
|
} catch (error) {
|
||||||
|
// Ignore cleanup errors
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
await CloudRunnerSystem.Run(
|
await CloudRunnerSystem.Run(
|
||||||
`tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${path.basename(sourceFolder)}"`,
|
`tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${path.basename(sourceFolder)}"`,
|
||||||
);
|
);
|
||||||
|
} catch (error: any) {
|
||||||
|
// Check if error is due to disk space
|
||||||
|
const errorMessage = error?.message || error?.toString() || '';
|
||||||
|
if (errorMessage.includes('No space left') || errorMessage.includes('Wrote only')) {
|
||||||
|
CloudRunnerLogger.log(`Disk space error detected. Attempting cleanup...`);
|
||||||
|
// Try to clean up old cache files
|
||||||
|
try {
|
||||||
|
const cacheParent = path.dirname(cacheFolder);
|
||||||
|
if (await fileExists(cacheParent)) {
|
||||||
|
// Remove stale cache archives by age (find -mtime +1: last modified at least two days ago); newer archives are left in place
|
||||||
|
await CloudRunnerSystem.Run(
|
||||||
|
`find ${cacheParent} -name "*.tar*" -type f -mtime +1 -delete 2>/dev/null || true`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} catch (cleanupError) {
|
||||||
|
CloudRunnerLogger.log(`Cleanup attempt failed: ${cleanupError}`);
|
||||||
|
}
|
||||||
|
throw new Error(
|
||||||
|
`Failed to create cache archive due to insufficient disk space. Error: ${errorMessage}. Please free up disk space and retry.`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
await CloudRunnerSystem.Run(`du ${cacheArtifactName}.tar${compressionSuffix}`);
|
await CloudRunnerSystem.Run(`du ${cacheArtifactName}.tar${compressionSuffix}`);
|
||||||
assert(await fileExists(`${cacheArtifactName}.tar${compressionSuffix}`), 'cache archive exists');
|
assert(await fileExists(`${cacheArtifactName}.tar${compressionSuffix}`), 'cache archive exists');
|
||||||
assert(await fileExists(path.basename(sourceFolder)), 'source folder exists');
|
assert(await fileExists(path.basename(sourceFolder)), 'source folder exists');
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue