pr feedback

parent d12244db60
commit 7f133d8cc7

@@ -4594,6 +4594,47 @@ class KubernetesTaskRunner {
                 break;
             }
         }
+        // After kubectl logs loop ends, read log file as fallback to capture any messages
+        // written after kubectl stopped reading (e.g., "Collected Logs" from post-build)
+        // This ensures all log messages are included in BuildResults for test assertions
+        try {
+            const isPodStillRunning = await kubernetes_pods_1.default.IsPodRunning(podName, namespace, kubeClient);
+            if (!isPodStillRunning) {
+                cloud_runner_logger_1.default.log('Pod is terminated, reading log file as fallback to capture post-build messages...');
+                try {
+                    // Try to read the log file from the terminated pod
+                    // Use kubectl exec with --previous flag or try to access via PVC
+                    const logFileContent = await cloud_runner_system_1.CloudRunnerSystem.Run(`kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /home/job-log.txt 2>/dev/null || kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`, true, true);
+                    if (logFileContent && logFileContent.trim()) {
+                        cloud_runner_logger_1.default.log(`Read log file from pod as fallback (${logFileContent.length} chars) to capture missing messages`);
+                        // Get the lines we already have in output to avoid duplicates
+                        const existingLines = new Set(output.split('\n').map((line) => line.trim()));
+                        // Process the log file content line by line and add missing lines
+                        for (const line of logFileContent.split(`\n`)) {
+                            const trimmedLine = line.trim();
+                            const lowerLine = trimmedLine.toLowerCase();
+                            // Skip empty lines, kubectl errors, and lines we already have
+                            if (trimmedLine &&
+                                !lowerLine.includes('unable to retrieve container logs') &&
+                                !existingLines.has(trimmedLine)) {
+                                // Add missing line to output
+                                output += `${line}\n`;
+                                // Process through FollowLogStreamService to ensure proper handling
+                                ({ shouldReadLogs, shouldCleanup, output } = follow_log_stream_service_1.FollowLogStreamService.handleIteration(line, shouldReadLogs, shouldCleanup, output));
+                            }
+                        }
+                    }
+                }
+                catch (logFileError) {
+                    cloud_runner_logger_1.default.logWarning(`Could not read log file from pod as fallback: ${logFileError?.message || logFileError}`);
+                    // Continue with existing output - this is a best-effort fallback
+                }
+            }
+        }
+        catch (fallbackError) {
+            cloud_runner_logger_1.default.logWarning(`Error checking pod status for log file fallback: ${fallbackError?.message || fallbackError}`);
+            // Continue with existing output - this is a best-effort fallback
+        }
         // Filter out kubectl error messages from the final output
         // These errors can be added via stderr even when kubectl fails
         // We filter them out so they don't pollute the BuildResults
@@ -5452,9 +5493,28 @@ class Caching {
                 cloud_runner_logger_1.default.log(`Cleanup completed. Checking disk space again...`);
                 const diskCheckAfter = await cloud_runner_system_1.CloudRunnerSystem.Run(`df . 2>/dev/null || df /data 2>/dev/null || true`);
                 cloud_runner_logger_1.default.log(`Disk space after cleanup: ${diskCheckAfter}`);
+                // Check disk usage again after cleanup
+                let diskUsageAfterCleanup = 0;
+                try {
+                    const usageMatchAfter = diskCheckAfter.match(/(\d+)%/);
+                    if (usageMatchAfter) {
+                        diskUsageAfterCleanup = Number.parseInt(usageMatchAfter[1], 10);
+                    }
+                }
+                catch {
+                    // Ignore parsing errors
+                }
+                // If disk is still at 100% after cleanup, skip tar operation to prevent hang
+                if (diskUsageAfterCleanup >= 100) {
+                    throw new Error(`Cannot create cache archive: disk is still at ${diskUsageAfterCleanup}% after cleanup. Tar operation would hang. Please free up disk space manually.`);
+                }
             }
         }
         catch (cleanupError) {
+            // If cleanupError is our disk space error, rethrow it
+            if (cleanupError instanceof Error && cleanupError.message.includes('Cannot create cache archive')) {
+                throw cleanupError;
+            }
             cloud_runner_logger_1.default.log(`Proactive cleanup failed: ${cleanupError}`);
         }
     }
@@ -5466,12 +5526,31 @@ class Caching {
             // Ignore cleanup errors
         }
         try {
-            await cloud_runner_system_1.CloudRunnerSystem.Run(`tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${node_path_1.default.basename(sourceFolder)}"`);
+            // Add timeout to tar command to prevent hanging when disk is full
+            // Use timeout command with 10 minute limit (600 seconds) if available
+            // Check if timeout command exists, otherwise use regular tar
+            const tarCommand = `tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${node_path_1.default.basename(sourceFolder)}"`;
+            let tarCommandToRun = tarCommand;
+            try {
+                // Check if timeout command is available
+                await cloud_runner_system_1.CloudRunnerSystem.Run(`which timeout > /dev/null 2>&1`, true, true);
+                // Use timeout if available (600 seconds = 10 minutes)
+                tarCommandToRun = `timeout 600 ${tarCommand}`;
+            }
+            catch {
+                // timeout command not available, use regular tar
+                // Note: This could still hang if disk is full, but the disk space check above should prevent this
+                tarCommandToRun = tarCommand;
+            }
+            await cloud_runner_system_1.CloudRunnerSystem.Run(tarCommandToRun);
         }
         catch (error) {
-            // Check if error is due to disk space
+            // Check if error is due to disk space or timeout
             const errorMessage = error?.message || error?.toString() || '';
-            if (errorMessage.includes('No space left') || errorMessage.includes('Wrote only')) {
+            if (errorMessage.includes('No space left') ||
+                errorMessage.includes('Wrote only') ||
+                errorMessage.includes('timeout') ||
+                errorMessage.includes('Terminated')) {
                 cloud_runner_logger_1.default.log(`Disk space error detected. Attempting aggressive cleanup...`);
                 // Try to clean up old cache files more aggressively
                 try {

File diff suppressed because one or more lines are too long
@@ -76,7 +76,8 @@ class KubernetesTaskRunner {

       // Filter out kubectl error messages from the error output
       const errorMessage = error?.message || error?.toString() || '';
-      const isKubectlLogsError = errorMessage.includes('unable to retrieve container logs for containerd://') ||
+      const isKubectlLogsError =
+        errorMessage.includes('unable to retrieve container logs for containerd://') ||
         errorMessage.toLowerCase().includes('unable to retrieve container logs');

       if (isKubectlLogsError) {
@@ -208,6 +209,64 @@ class KubernetesTaskRunner {
       }
     }

+    // After kubectl logs loop ends, read log file as fallback to capture any messages
+    // written after kubectl stopped reading (e.g., "Collected Logs" from post-build)
+    // This ensures all log messages are included in BuildResults for test assertions
+    try {
+      const isPodStillRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
+      if (!isPodStillRunning) {
+        CloudRunnerLogger.log('Pod is terminated, reading log file as fallback to capture post-build messages...');
+        try {
+          // Try to read the log file from the terminated pod
+          // Use kubectl exec with --previous flag or try to access via PVC
+          const logFileContent = await CloudRunnerSystem.Run(
+            `kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /home/job-log.txt 2>/dev/null || kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
+            true,
+            true,
+          );
+
+          if (logFileContent && logFileContent.trim()) {
+            CloudRunnerLogger.log(
+              `Read log file from pod as fallback (${logFileContent.length} chars) to capture missing messages`,
+            );
+            // Get the lines we already have in output to avoid duplicates
+            const existingLines = new Set(output.split('\n').map((line) => line.trim()));
+            // Process the log file content line by line and add missing lines
+            for (const line of logFileContent.split(`\n`)) {
+              const trimmedLine = line.trim();
+              const lowerLine = trimmedLine.toLowerCase();
+              // Skip empty lines, kubectl errors, and lines we already have
+              if (
+                trimmedLine &&
+                !lowerLine.includes('unable to retrieve container logs') &&
+                !existingLines.has(trimmedLine)
+              ) {
+                // Add missing line to output
+                output += `${line}\n`;
+                // Process through FollowLogStreamService to ensure proper handling
+                ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
+                  line,
+                  shouldReadLogs,
+                  shouldCleanup,
+                  output,
+                ));
+              }
+            }
+          }
+        } catch (logFileError: any) {
+          CloudRunnerLogger.logWarning(
+            `Could not read log file from pod as fallback: ${logFileError?.message || logFileError}`,
+          );
+          // Continue with existing output - this is a best-effort fallback
+        }
+      }
+    } catch (fallbackError: any) {
+      CloudRunnerLogger.logWarning(
+        `Error checking pod status for log file fallback: ${fallbackError?.message || fallbackError}`,
+      );
+      // Continue with existing output - this is a best-effort fallback
+    }
+
     // Filter out kubectl error messages from the final output
     // These errors can be added via stderr even when kubectl fails
     // We filter them out so they don't pollute the BuildResults
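
Note on the fallback block above: duplicate suppression works by trimming each re-read line and checking it against a Set of lines that were already streamed, so only genuinely missing lines (e.g. "Collected Logs") are appended and re-processed. A minimal standalone sketch of that merge step; the helper name and sample strings are illustrative, not part of the repository:

// Sketch: merge lines from a re-read log file into already-streamed output,
// skipping blanks, kubectl error noise, and lines that were already captured.
function mergeMissingLines(streamedOutput: string, logFileContent: string): string {
  const existingLines = new Set(streamedOutput.split('\n').map((line) => line.trim()));
  let output = streamedOutput;
  for (const line of logFileContent.split('\n')) {
    const trimmedLine = line.trim();
    const lowerLine = trimmedLine.toLowerCase();
    if (
      trimmedLine &&
      !lowerLine.includes('unable to retrieve container logs') &&
      !existingLines.has(trimmedLine)
    ) {
      output += `${line}\n`;
    }
  }
  return output;
}

// Example: only 'Collected Logs' is appended; the duplicate line and the kubectl error are skipped.
const merged = mergeMissingLines(
  'Build succeeded\n',
  'Build succeeded\nunable to retrieve container logs for containerd://abc\nCollected Logs\n',
);
// merged === 'Build succeeded\nCollected Logs\n'
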
@@ -110,8 +110,30 @@ export class Caching {
        CloudRunnerLogger.log(`Cleanup completed. Checking disk space again...`);
        const diskCheckAfter = await CloudRunnerSystem.Run(`df . 2>/dev/null || df /data 2>/dev/null || true`);
        CloudRunnerLogger.log(`Disk space after cleanup: ${diskCheckAfter}`);
+
+        // Check disk usage again after cleanup
+        let diskUsageAfterCleanup = 0;
+        try {
+          const usageMatchAfter = diskCheckAfter.match(/(\d+)%/);
+          if (usageMatchAfter) {
+            diskUsageAfterCleanup = Number.parseInt(usageMatchAfter[1], 10);
+          }
+        } catch {
+          // Ignore parsing errors
+        }
+
+        // If disk is still at 100% after cleanup, skip tar operation to prevent hang
+        if (diskUsageAfterCleanup >= 100) {
+          throw new Error(
+            `Cannot create cache archive: disk is still at ${diskUsageAfterCleanup}% after cleanup. Tar operation would hang. Please free up disk space manually.`,
+          );
+        }
       }
     } catch (cleanupError) {
+      // If cleanupError is our disk space error, rethrow it
+      if (cleanupError instanceof Error && cleanupError.message.includes('Cannot create cache archive')) {
+        throw cleanupError;
+      }
       CloudRunnerLogger.log(`Proactive cleanup failed: ${cleanupError}`);
     }
   }
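
For reference, the (\d+)% match above picks up the first percentage in the df output, which on a typical GNU coreutils report is the Use% column of the first data row (the header has no digit directly before a %). A small sketch against assumed sample output, illustrative only:

// Sketch: parse the Use% value from df-style output (the sample text below is assumed).
const sampleDfOutput = [
  'Filesystem     1K-blocks      Used Available Use% Mounted on',
  '/dev/sda1       51474912  51474912         0 100% /data',
].join('\n');

let diskUsage = 0;
const usageMatch = sampleDfOutput.match(/(\d+)%/);
if (usageMatch) {
  diskUsage = Number.parseInt(usageMatch[1], 10);
}
// diskUsage === 100, which would trigger the 'Cannot create cache archive' error above
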
@@ -124,13 +146,32 @@ export class Caching {
    }

    try {
-      await CloudRunnerSystem.Run(
-        `tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${path.basename(sourceFolder)}"`,
-      );
+      // Add timeout to tar command to prevent hanging when disk is full
+      // Use timeout command with 10 minute limit (600 seconds) if available
+      // Check if timeout command exists, otherwise use regular tar
+      const tarCommand = `tar -cf ${cacheArtifactName}.tar${compressionSuffix} "${path.basename(sourceFolder)}"`;
+      let tarCommandToRun = tarCommand;
+      try {
+        // Check if timeout command is available
+        await CloudRunnerSystem.Run(`which timeout > /dev/null 2>&1`, true, true);
+        // Use timeout if available (600 seconds = 10 minutes)
+        tarCommandToRun = `timeout 600 ${tarCommand}`;
+      } catch {
+        // timeout command not available, use regular tar
+        // Note: This could still hang if disk is full, but the disk space check above should prevent this
+        tarCommandToRun = tarCommand;
+      }
+
+      await CloudRunnerSystem.Run(tarCommandToRun);
    } catch (error: any) {
-      // Check if error is due to disk space
+      // Check if error is due to disk space or timeout
      const errorMessage = error?.message || error?.toString() || '';
-      if (errorMessage.includes('No space left') || errorMessage.includes('Wrote only')) {
+      if (
+        errorMessage.includes('No space left') ||
+        errorMessage.includes('Wrote only') ||
+        errorMessage.includes('timeout') ||
+        errorMessage.includes('Terminated')
+      ) {
        CloudRunnerLogger.log(`Disk space error detected. Attempting aggressive cleanup...`);

        // Try to clean up old cache files more aggressively
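
A note on the timeout wrapper above: GNU coreutils timeout sends SIGTERM once the limit is hit and exits with status 124, and the killed tar commonly surfaces as a 'Terminated' message, which is why the catch branch also matches 'timeout' and 'Terminated' in the error text. A standalone sketch of the same probe-and-wrap pattern, using an illustrative helper and plain child_process rather than the repository's CloudRunnerSystem wrapper:

// Sketch: prefix a shell command with `timeout <seconds>` when the binary is available.
import { execSync } from 'node:child_process';

function withTimeout(command: string, seconds: number): string {
  try {
    // Probe for the timeout binary; execSync throws if the probe exits non-zero.
    execSync('which timeout > /dev/null 2>&1');
    return `timeout ${seconds} ${command}`;
  } catch {
    // timeout is not installed; fall back to the bare command.
    return command;
  }
}

// Example: yields 'timeout 600 tar -cf cache.tar .' on systems that ship coreutils timeout.
console.log(withTimeout('tar -cf cache.tar .', 600));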