import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';
import CloudRunnerLogger from '../../services/core/cloud-runner-logger';
import { waitUntil } from 'async-wait-until';
import { CloudRunnerSystem } from '../../services/core/cloud-runner-system';
import CloudRunner from '../../cloud-runner';
import KubernetesPods from './kubernetes-pods';
import { FollowLogStreamService } from '../../services/core/follow-log-stream-service';

class KubernetesTaskRunner {
  static readonly maxRetry: number = 3;
  static lastReceivedMessage: string = ``;

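  /**
   * Streams logs from the job pod until the end-of-transmission marker is received,
   * falling back to reading the in-pod log file when kubectl cannot retrieve container
   * logs, and returns the collected, filtered output.
   */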
  static async runTask(
    kubeConfig: KubeConfig,
    kubeClient: CoreV1Api,
    jobName: string,
    podName: string,
    containerName: string,
    namespace: string,
  ) {
    let output = '';
    let shouldReadLogs = true;
    let shouldCleanup = true;
    let retriesAfterFinish = 0;
    let kubectlLogsFailedCount = 0;
    const maxKubectlLogsFailures = 3;
    // eslint-disable-next-line no-constant-condition
    while (true) {
      await new Promise((resolve) => setTimeout(resolve, 3000));
      CloudRunnerLogger.log(
        `Streaming logs from pod: ${podName} container: ${containerName} namespace: ${namespace} ${CloudRunner.buildParameters.kubeVolumeSize}/${CloudRunner.buildParameters.containerCpu}/${CloudRunner.buildParameters.containerMemory}`,
      );
      const isRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);

      const callback = (outputChunk: string) => {
        // Filter out kubectl error messages about being unable to retrieve container logs
        // These errors pollute the output and don't contain useful information
        const lowerChunk = outputChunk.toLowerCase();
        if (lowerChunk.includes('unable to retrieve container logs')) {
          CloudRunnerLogger.log(`Filtered kubectl error: ${outputChunk.trim()}`);
          return;
        }

        output += outputChunk;

        // split output chunk and handle per line
        for (const chunk of outputChunk.split(`\n`)) {
          // Skip empty chunks and kubectl error messages (case-insensitive)
          const lowerLine = chunk.toLowerCase();
          if (chunk.trim() && !lowerLine.includes('unable to retrieve container logs')) {
            ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
              chunk,
              shouldReadLogs,
              shouldCleanup,
              output,
            ));
          }
        }
      };
      try {
        // Always specify container name explicitly to avoid containerd:// errors
        // Use -f for running pods, --previous for terminated pods
        await CloudRunnerSystem.Run(
          `kubectl logs ${podName} -c ${containerName} -n ${namespace}${isRunning ? ' -f' : ' --previous'}`,
          false,
          true,
          callback,
        );
        // Reset failure count on success
        kubectlLogsFailedCount = 0;
      } catch (error: any) {
        kubectlLogsFailedCount++;
        await new Promise((resolve) => setTimeout(resolve, 3000));
        const continueStreaming = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
        CloudRunnerLogger.log(`K8s logging error ${error} ${continueStreaming}`);

        // Filter out kubectl error messages from the error output
        const errorMessage = error?.message || error?.toString() || '';
        const isKubectlLogsError =
          errorMessage.includes('unable to retrieve container logs for containerd://') ||
          errorMessage.toLowerCase().includes('unable to retrieve container logs');

        if (isKubectlLogsError) {
          CloudRunnerLogger.log(
            `Kubectl unable to retrieve logs, attempt ${kubectlLogsFailedCount}/${maxKubectlLogsFailures}`,
          );

          // If kubectl logs has failed multiple times, try reading the log file directly from the pod
          // This works even if the pod is terminated, as long as it hasn't been deleted
          if (kubectlLogsFailedCount >= maxKubectlLogsFailures && !isRunning && !continueStreaming) {
            CloudRunnerLogger.log(`Attempting to read log file directly from pod as fallback...`);
            try {
              // Try to read the log file from the pod
              // Use kubectl exec for running pods, or try to access via PVC if pod is terminated
              let logFileContent = '';

              if (isRunning) {
                // Pod is still running, try exec
                logFileContent = await CloudRunnerSystem.Run(
                  `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
                  true,
                  true,
                );
              } else {
                // Pod is terminated, try to create a temporary pod to read from the PVC
                // First, check if we can still access the pod's filesystem
                CloudRunnerLogger.log(`Pod is terminated, attempting to read log file via temporary pod...`);
                // For terminated pods, we might not be able to exec, so we'll skip this fallback
                // and rely on the log file being written to the PVC (if mounted)
                CloudRunnerLogger.logWarning(`Cannot read log file from terminated pod via exec`);
              }

              if (logFileContent && logFileContent.trim()) {
                CloudRunnerLogger.log(`Successfully read log file from pod (${logFileContent.length} chars)`);
                // Process the log file content line by line
                for (const line of logFileContent.split(`\n`)) {
                  const lowerLine = line.toLowerCase();
                  if (line.trim() && !lowerLine.includes('unable to retrieve container logs')) {
                    ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
                      line,
                      shouldReadLogs,
                      shouldCleanup,
                      output,
                    ));
                  }
                }

                // Check if we got the end of transmission marker
                if (FollowLogStreamService.DidReceiveEndOfTransmission) {
                  CloudRunnerLogger.log('end of log stream (from log file)');
                  break;
                }
              } else {
                CloudRunnerLogger.logWarning(`Log file read returned empty content, continuing with available logs`);
                // If we can't read the log file, break out of the loop to return whatever logs we have
                // This prevents infinite retries when kubectl logs consistently fails
                break;
              }
            } catch (execError: any) {
              CloudRunnerLogger.logWarning(`Failed to read log file from pod: ${execError}`);
              // If we've exhausted all options, break to return whatever logs we have
              break;
            }
          }
        }

        // If pod is not running and we tried --previous but it failed, try without --previous
        if (!isRunning && !continueStreaming && error?.message?.includes('previous terminated container')) {
          CloudRunnerLogger.log(`Previous container not found, trying current container logs...`);
          try {
            await CloudRunnerSystem.Run(
              `kubectl logs ${podName} -c ${containerName} -n ${namespace}`,
              false,
              true,
              callback,
            );

            // If we successfully got logs, check for end of transmission
            if (FollowLogStreamService.DidReceiveEndOfTransmission) {
              CloudRunnerLogger.log('end of log stream');
              break;
            }

            // If we got logs but no end marker, continue trying (might be more logs)
            if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
              retriesAfterFinish++;
              continue;
            }

            // If we've exhausted retries, break
            break;
          } catch (fallbackError: any) {
            CloudRunnerLogger.log(`Fallback log fetch also failed: ${fallbackError}`);

            // If both fail, continue retrying if we haven't exhausted retries
            if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
              retriesAfterFinish++;
              continue;
            }

            // Only break if we've exhausted all retries
            CloudRunnerLogger.logWarning(
              `Could not fetch any container logs after ${KubernetesTaskRunner.maxRetry} retries`,
            );
            break;
          }
        }

        if (continueStreaming) {
          continue;
        }
        if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
          retriesAfterFinish++;
          continue;
        }

        // If we've exhausted retries and it's not a previous container issue, throw
        if (!error?.message?.includes('previous terminated container')) {
          throw error;
        }

        // For previous container errors, we've already tried fallback, so just break
        CloudRunnerLogger.logWarning(
          `Could not fetch previous container logs after retries, but continuing with available logs`,
        );
        break;
      }
      if (FollowLogStreamService.DidReceiveEndOfTransmission) {
        CloudRunnerLogger.log('end of log stream');
        break;
      }
    }

|
    // After kubectl logs loop ends, read log file as fallback to capture any messages
    // written after kubectl stopped reading (e.g., "Collected Logs" from post-build)
    // This ensures all log messages are included in BuildResults for test assertions
    // If output is empty, we need to be more aggressive about getting logs
    const needsFallback = output.trim().length === 0;
    const missingCollectedLogs = !output.includes('Collected Logs');

    if (needsFallback) {
      CloudRunnerLogger.log('Output is empty, attempting aggressive log collection fallback...');
      // Give the pod a moment to finish writing logs before we try to read them
      await new Promise((resolve) => setTimeout(resolve, 5000));
    }

    // Always try fallback if output is empty, if pod is terminated, or if "Collected Logs" is missing
    // The "Collected Logs" check ensures we try to get post-build messages even if we have some output
    try {
      const isPodStillRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
      const shouldTryFallback = !isPodStillRunning || needsFallback || missingCollectedLogs;

      if (shouldTryFallback) {
        const reason = needsFallback
          ? 'output is empty'
          : missingCollectedLogs
          ? 'Collected Logs missing from output'
          : 'pod is terminated';
        CloudRunnerLogger.log(
          `Pod is ${isPodStillRunning ? 'running' : 'terminated'} and ${reason}, reading log file as fallback...`,
        );
        try {
          // Try to read the log file from the pod
          // For killed pods (OOM), kubectl exec might not work, so we try multiple approaches
          // First try --previous flag for terminated containers, then try without it
          let logFileContent = '';

          // Try multiple approaches to get the log file
          // Order matters: try terminated container first, then current, then PVC, then kubectl logs as last resort
          // For K8s, the PVC is mounted at /data, so try reading from there too
          const attempts = [
            // For terminated pods, try --previous first
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /home/job-log.txt 2>/dev/null || echo ""`,
            // Try current container
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
            // Try reading from PVC (/data) in case log was copied there
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /data/job-log.txt 2>/dev/null || echo ""`,
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /data/job-log.txt 2>/dev/null || echo ""`,
            // Try kubectl logs as fallback (might capture stdout even if exec fails)
            `kubectl logs ${podName} -c ${containerName} -n ${namespace} --previous 2>/dev/null || echo ""`,
            `kubectl logs ${podName} -c ${containerName} -n ${namespace} 2>/dev/null || echo ""`,
          ];

          for (const attempt of attempts) {
            // If we already have content with "Collected Logs", no need to try more
            if (logFileContent && logFileContent.trim() && logFileContent.includes('Collected Logs')) {
              CloudRunnerLogger.log('Found "Collected Logs" in fallback content, stopping attempts.');
              break;
            }
            try {
              CloudRunnerLogger.log(`Trying fallback method: ${attempt.substring(0, 80)}...`);
              const result = await CloudRunnerSystem.Run(attempt, true, true);
              if (result && result.trim()) {
                // Prefer content that has "Collected Logs" over content that doesn't
                if (!logFileContent || !logFileContent.includes('Collected Logs')) {
                  logFileContent = result;
                  CloudRunnerLogger.log(
                    `Successfully read logs using fallback method (${logFileContent.length} chars): ${attempt.substring(
                      0,
                      50,
                    )}...`,
                  );
                  // If this content has "Collected Logs", we're done
                  if (logFileContent.includes('Collected Logs')) {
                    CloudRunnerLogger.log('Fallback method successfully captured "Collected Logs".');
                    break;
                  }
                } else {
                  CloudRunnerLogger.log(`Skipping this result - already have content with "Collected Logs".`);
                }
              } else {
                CloudRunnerLogger.log(`Fallback method returned empty result: ${attempt.substring(0, 50)}...`);
              }
            } catch (attemptError: any) {
              CloudRunnerLogger.log(
                `Fallback method failed: ${attempt.substring(0, 50)}... Error: ${
                  attemptError?.message || attemptError
                }`,
              );
              // Continue to next attempt
            }
          }

|
          if (!logFileContent || !logFileContent.trim()) {
            CloudRunnerLogger.logWarning(
              'Could not read log file from pod after all fallback attempts (may be OOM-killed or pod not accessible).',
            );
          }

          if (logFileContent && logFileContent.trim()) {
            CloudRunnerLogger.log(
              `Read log file from pod as fallback (${logFileContent.length} chars) to capture missing messages`,
            );
            // Get the lines we already have in output to avoid duplicates
            const existingLines = new Set(output.split('\n').map((line) => line.trim()));
            // Process the log file content line by line and add missing lines
            for (const line of logFileContent.split(`\n`)) {
              const trimmedLine = line.trim();
              const lowerLine = trimmedLine.toLowerCase();
              // Skip empty lines, kubectl errors, and lines we already have
              if (
                trimmedLine &&
                !lowerLine.includes('unable to retrieve container logs') &&
                !existingLines.has(trimmedLine)
              ) {
                // Process through FollowLogStreamService - it will append to output
                // Don't add to output manually since handleIteration does it
                ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
                  trimmedLine,
                  shouldReadLogs,
                  shouldCleanup,
                  output,
                ));
              }
            }
          }
        } catch (logFileError: any) {
          CloudRunnerLogger.logWarning(
            `Could not read log file from pod as fallback: ${logFileError?.message || logFileError}`,
          );
          // Continue with existing output - this is a best-effort fallback
        }
      }

      // If output is still empty or missing "Collected Logs" after fallback attempts, add a warning message
      // This ensures BuildResults is not completely empty, which would cause test failures
      if ((needsFallback && output.trim().length === 0) || (!output.includes('Collected Logs') && shouldTryFallback)) {
        CloudRunnerLogger.logWarning(
          'Could not retrieve "Collected Logs" from pod after all attempts. Pod may have been killed before logs were written.',
        );
        // Add a minimal message so BuildResults is not completely empty
        // This helps with debugging and prevents test failures due to empty results
        if (output.trim().length === 0) {
          output = 'Pod logs unavailable - pod may have been terminated before logs could be collected.\n';
        } else if (!output.includes('Collected Logs')) {
          // We have some output but missing "Collected Logs" - append the fallback message
          output +=
            '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
        }
      }
    } catch (fallbackError: any) {
      CloudRunnerLogger.logWarning(
        `Error checking pod status for log file fallback: ${fallbackError?.message || fallbackError}`,
      );
      // If output is empty and we hit an error, still add a message so BuildResults isn't empty
      if (needsFallback && output.trim().length === 0) {
        output = `Error retrieving logs: ${fallbackError?.message || fallbackError}\n`;
      }
      // Continue with existing output - this is a best-effort fallback
    }

    // Filter out kubectl error messages from the final output
    // These errors can be added via stderr even when kubectl fails
    // We filter them out so they don't pollute the BuildResults
    const lines = output.split('\n');
    const filteredLines = lines.filter((line) => !line.toLowerCase().includes('unable to retrieve container logs'));
    const filteredOutput = filteredLines.join('\n');

    // Log if we filtered out significant content
    const originalLineCount = lines.length;
    const filteredLineCount = filteredLines.length;
    if (originalLineCount > filteredLineCount) {
      CloudRunnerLogger.log(
        `Filtered out ${originalLineCount - filteredLineCount} kubectl error message(s) from output`,
      );
    }

    return filteredOutput;
  }

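  /**
   * Polls the pod status until it leaves the Pending/Unknown phase, surfacing scheduling
   * and image-pull failures early. Resolves once the pod is Running (or has already
   * completed), and throws when the pod can never start.
   */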
  static async watchUntilPodRunning(kubeClient: CoreV1Api, podName: string, namespace: string) {
    let waitComplete: boolean = false;
    let message = ``;
    let lastPhase = '';
    let consecutivePendingCount = 0;
    CloudRunnerLogger.log(`Watching ${podName} ${namespace}`);

    try {
      await waitUntil(
        async () => {
          const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
          const phase = status?.body.status?.phase || 'Unknown';
          const conditions = status?.body.status?.conditions || [];
          const containerStatuses = status?.body.status?.containerStatuses || [];

          // Log phase changes
          if (phase !== lastPhase) {
            CloudRunnerLogger.log(`Pod ${podName} phase changed: ${lastPhase} -> ${phase}`);
            lastPhase = phase;
            consecutivePendingCount = 0;
          }

          // Check for failure conditions that mean the pod will never start (permanent failures)
          // Note: We don't treat "Failed" phase as a permanent failure because the pod might have
          // completed its work before being killed (OOM), and we should still try to get logs
          const permanentFailureReasons = [
            'Unschedulable',
            'ImagePullBackOff',
            'ErrImagePull',
            'CreateContainerError',
            'CreateContainerConfigError',
          ];

          const hasPermanentFailureCondition = conditions.some((condition: any) =>
            permanentFailureReasons.some((reason) => condition.reason?.includes(reason)),
          );

          const hasPermanentFailureContainerStatus = containerStatuses.some((containerStatus: any) =>
            permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
          );

          // Only treat permanent failures as errors - pods that completed (Failed/Succeeded) should continue
          if (hasPermanentFailureCondition || hasPermanentFailureContainerStatus) {
            // Get detailed failure information
            const failureCondition = conditions.find((condition: any) =>
              permanentFailureReasons.some((reason) => condition.reason?.includes(reason)),
            );
            const failureContainer = containerStatuses.find((containerStatus: any) =>
              permanentFailureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
            );

            message = `Pod ${podName} failed to start (permanent failure):\nPhase: ${phase}\n`;
            if (failureCondition) {
              message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
            }
            if (failureContainer) {
              message += `Container Reason: ${failureContainer.state?.waiting?.reason}\nContainer Message: ${failureContainer.state?.waiting?.message}\n`;
            }

            // Log pod events for additional context
            try {
              const events = await kubeClient.listNamespacedEvent(namespace);
              const podEvents = events.body.items
                .filter((x) => x.involvedObject?.name === podName)
                .map((x) => ({
                  message: x.message || ``,
                  reason: x.reason || ``,
                  type: x.type || ``,
                }));
              if (podEvents.length > 0) {
                message += `\nRecent Events:\n${JSON.stringify(podEvents.slice(-5), undefined, 2)}`;
              }
            } catch (eventError) {
              // Ignore event fetch errors
            }

            CloudRunnerLogger.logWarning(message);
            // For permanent failures, mark as incomplete and store the error message
            // We'll throw an error after the wait loop exits
            waitComplete = false;
            return true; // Return true to exit wait loop
          }

          // Pod is complete if it's not Pending or Unknown - it might be Running, Succeeded, or Failed
          // For Failed/Succeeded pods, we still want to try to get logs, so we mark as complete
          waitComplete = phase !== 'Pending' && phase !== 'Unknown';

          // If pod completed (Succeeded/Failed), log it but don't throw - we'll try to get logs
          if (waitComplete && phase !== 'Running') {
            CloudRunnerLogger.log(`Pod ${podName} completed with phase: ${phase}. Will attempt to retrieve logs.`);
          }

          if (phase === 'Pending') {
            consecutivePendingCount++;

            // Check for scheduling failures in events (faster than waiting for conditions)
            try {
              const events = await kubeClient.listNamespacedEvent(namespace);
              const podEvents = events.body.items.filter((x) => x.involvedObject?.name === podName);
              const failedSchedulingEvents = podEvents.filter(
                (x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated',
              );

              if (failedSchedulingEvents.length > 0) {
                const schedulingMessage = failedSchedulingEvents
                  .map((x) => `${x.reason}: ${x.message || ''}`)
                  .join('; ');
                message = `Pod ${podName} cannot be scheduled:\n${schedulingMessage}`;
                CloudRunnerLogger.logWarning(message);
                waitComplete = false;
                return true; // Exit wait loop to throw error
              }

              // Check if pod is actively pulling an image - if so, allow more time
              const isPullingImage = podEvents.some(
                (x) => x.reason === 'Pulling' || x.reason === 'Pulled' || x.message?.includes('Pulling image'),
              );
              const hasImagePullError = podEvents.some(
                (x) => x.reason === 'Failed' && (x.message?.includes('pull') || x.message?.includes('image')),
              );

              if (hasImagePullError) {
                message = `Pod ${podName} failed to pull image. Check image availability and credentials.`;
                CloudRunnerLogger.logWarning(message);
                waitComplete = false;
                return true; // Exit wait loop to throw error
              }

              // If actively pulling image, reset pending count to allow more time
              // Large images (like Unity 3.9GB) can take 3-5 minutes to pull
              if (isPullingImage && consecutivePendingCount > 4) {
                CloudRunnerLogger.log(
                  `Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`,
                );
                // Don't increment consecutivePendingCount if we're actively pulling
                consecutivePendingCount = Math.max(4, consecutivePendingCount - 1);
              }
            } catch {
              // Ignore event fetch errors
            }

            // For tests, allow more time if image is being pulled (large images need 5+ minutes)
            // Otherwise fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
            const isTest = process.env['cloudRunnerTests'] === 'true';
            const isPullingImage =
              containerStatuses.some(
                (cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
              ) || conditions.some((c: any) => c.reason?.includes('Pulling'));

            // Allow up to 20 minutes for image pulls in tests (80 checks), 2 minutes otherwise
            const maxPendingChecks = isTest && isPullingImage ? 80 : isTest ? 8 : 80;

            if (consecutivePendingCount >= maxPendingChecks) {
              message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
              // Get events for context
              try {
                const events = await kubeClient.listNamespacedEvent(namespace);
                const podEvents = events.body.items
                  .filter((x) => x.involvedObject?.name === podName)
                  .slice(-10)
                  .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
                if (podEvents.length > 0) {
                  message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
                }

                // Get pod details to check for scheduling issues
                try {
                  const podStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
                  const podSpec = podStatus.body.spec;
                  const podStatusDetails = podStatus.body.status;

                  // Check container resource requests
                  if (podSpec?.containers?.[0]?.resources?.requests) {
                    const requests = podSpec.containers[0].resources.requests;
                    message += `\n\nContainer Resource Requests:\n CPU: ${requests.cpu || 'not set'}\n Memory: ${
                      requests.memory || 'not set'
                    }\n Ephemeral Storage: ${requests['ephemeral-storage'] || 'not set'}`;
                  }

                  // Check node selector and tolerations
                  if (podSpec?.nodeSelector && Object.keys(podSpec.nodeSelector).length > 0) {
                    message += `\n\nNode Selector: ${JSON.stringify(podSpec.nodeSelector)}`;
                  }
                  if (podSpec?.tolerations && podSpec.tolerations.length > 0) {
                    message += `\n\nTolerations: ${JSON.stringify(podSpec.tolerations)}`;
                  }

                  // Check pod conditions for scheduling issues
                  if (podStatusDetails?.conditions) {
                    const unschedulable = podStatusDetails.conditions.find(
                      (c: any) => c.type === 'PodScheduled' && c.status === 'False',
                    );
                    if (unschedulable) {
                      message += `\n\nScheduling Issue: ${unschedulable.reason || 'Unknown'} - ${
                        unschedulable.message || 'No message'
                      }`;
                    }
                  }
                } catch (podStatusError) {
                  // Ignore pod status fetch errors
                }
              } catch {
                // Ignore event fetch errors
              }
              CloudRunnerLogger.logWarning(message);
              waitComplete = false;
              return true; // Exit wait loop to throw error
            }

            // Log diagnostic info every 4 checks (1 minute) if still pending
            if (consecutivePendingCount % 4 === 0) {
              const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
              const conditionMessages = conditions
                .map((c: any) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
                .join('; ');
              CloudRunnerLogger.log(`${pendingMessage}. Conditions: ${conditionMessages || 'None'}`);

              // Log events periodically to help diagnose
              if (consecutivePendingCount % 8 === 0) {
                try {
                  const events = await kubeClient.listNamespacedEvent(namespace);
                  const podEvents = events.body.items
                    .filter((x) => x.involvedObject?.name === podName)
                    .slice(-3)
                    .map((x) => `${x.type}: ${x.reason} - ${x.message}`)
                    .join('; ');
                  if (podEvents) {
                    CloudRunnerLogger.log(`Recent pod events: ${podEvents}`);
                  }
                } catch {
                  // Ignore event fetch errors
                }
              }
            }
          }

          message = `Phase:${phase} \n Reason:${conditions[0]?.reason || ''} \n Message:${
            conditions[0]?.message || ''
          }`;

          if (waitComplete || phase !== 'Pending') return true;

          return false;
        },
        {
          timeout: process.env['cloudRunnerTests'] === 'true' ? 300000 : 2000000, // 5 minutes for tests, ~33 minutes for production
          intervalBetweenAttempts: 15000, // 15 seconds
        },
      );
    } catch (waitError: any) {
      // If waitUntil times out or throws, get final pod status
      try {
        const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
        const phase = finalStatus?.body.status?.phase || 'Unknown';
        const conditions = finalStatus?.body.status?.conditions || [];
        message = `Pod ${podName} timed out waiting to start.\nFinal Phase: ${phase}\n`;
        message += conditions.map((c: any) => `${c.type}: ${c.reason} - ${c.message}`).join('\n');

        // Get events for context
        try {
          const events = await kubeClient.listNamespacedEvent(namespace);
          const podEvents = events.body.items
            .filter((x) => x.involvedObject?.name === podName)
            .slice(-5)
            .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
          if (podEvents.length > 0) {
            message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
          }
        } catch {
          // Ignore event fetch errors
        }

        CloudRunnerLogger.logWarning(message);
      } catch (statusError) {
        message = `Pod ${podName} timed out and could not retrieve final status: ${waitError?.message || waitError}`;
        CloudRunnerLogger.logWarning(message);
      }

      throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
    }

    // Only throw if we detected a permanent failure condition
    // If the pod completed (Failed/Succeeded), we should still try to get logs
    if (!waitComplete) {
      // Check the final phase to see if it's a permanent failure or just completed
      try {
        const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
        const finalPhase = finalStatus?.body.status?.phase || 'Unknown';
        if (finalPhase === 'Failed' || finalPhase === 'Succeeded') {
          CloudRunnerLogger.logWarning(
            `Pod ${podName} completed with phase ${finalPhase} before reaching Running state. Will attempt to retrieve logs.`,
          );
          return true; // Allow workflow to continue and try to get logs
        }
      } catch {
        // If we can't check status, fall through to throw error
      }
      CloudRunnerLogger.logWarning(`Pod ${podName} did not reach running state: ${message}`);
      throw new Error(`Pod ${podName} did not start successfully: ${message}`);
    }

    return waitComplete;
  }
}

export default KubernetesTaskRunner;
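
// Example usage (illustrative sketch only; the job, pod, container, and namespace names
// below are hypothetical placeholders, not values defined by this module):
//
//   const kubeConfig = new KubeConfig();
//   kubeConfig.loadFromDefault();
//   const kubeClient = kubeConfig.makeApiClient(CoreV1Api);
//   await KubernetesTaskRunner.watchUntilPodRunning(kubeClient, 'example-pod', 'default');
//   const logs = await KubernetesTaskRunner.runTask(
//     kubeConfig,
//     kubeClient,
//     'example-job',
//     'example-pod',
//     'main',
//     'default',
//   );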