unity-builder/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts

379 lines
16 KiB
TypeScript

import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';
import CloudRunnerLogger from '../../services/core/cloud-runner-logger';
import { waitUntil } from 'async-wait-until';
import { CloudRunnerSystem } from '../../services/core/cloud-runner-system';
import CloudRunner from '../../cloud-runner';
import KubernetesPods from './kubernetes-pods';
import { FollowLogStreamService } from '../../services/core/follow-log-stream-service';
class KubernetesTaskRunner {
static readonly maxRetry: number = 3;
static lastReceivedMessage: string = ``;
static async runTask(
kubeConfig: KubeConfig,
kubeClient: CoreV1Api,
jobName: string,
podName: string,
containerName: string,
namespace: string,
) {
let output = '';
let shouldReadLogs = true;
let shouldCleanup = true;
let retriesAfterFinish = 0;
let kubectlLogsFailedCount = 0;
const maxKubectlLogsFailures = 3;
// eslint-disable-next-line no-constant-condition
while (true) {
await new Promise((resolve) => setTimeout(resolve, 3000));
CloudRunnerLogger.log(
`Streaming logs from pod: ${podName} container: ${containerName} namespace: ${namespace} ${CloudRunner.buildParameters.kubeVolumeSize}/${CloudRunner.buildParameters.containerCpu}/${CloudRunner.buildParameters.containerMemory}`,
);
const isRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
const callback = (outputChunk: string) => {
// Filter out kubectl error messages about being unable to retrieve container logs
// These errors pollute the output and don't contain useful information
const lowerChunk = outputChunk.toLowerCase();
if (lowerChunk.includes('unable to retrieve container logs')) {
CloudRunnerLogger.log(`Filtered kubectl error: ${outputChunk.trim()}`);
return;
}
output += outputChunk;
// split output chunk and handle per line
for (const chunk of outputChunk.split(`\n`)) {
// Skip empty chunks and kubectl error messages (case-insensitive)
const lowerChunk = chunk.toLowerCase();
if (chunk.trim() && !lowerChunk.includes('unable to retrieve container logs')) {
({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
chunk,
shouldReadLogs,
shouldCleanup,
output,
));
}
}
};
try {
// Always specify container name explicitly to avoid containerd:// errors
// Use -f for running pods, --previous for terminated pods
await CloudRunnerSystem.Run(
`kubectl logs ${podName} -c ${containerName} -n ${namespace}${isRunning ? ' -f' : ' --previous'}`,
false,
true,
callback,
);
// Reset failure count on success
kubectlLogsFailedCount = 0;
} catch (error: any) {
kubectlLogsFailedCount++;
await new Promise((resolve) => setTimeout(resolve, 3000));
const continueStreaming = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
CloudRunnerLogger.log(`K8s logging error ${error} ${continueStreaming}`);
// Filter out kubectl error messages from the error output
const errorMessage = error?.message || error?.toString() || '';
const isKubectlLogsError =
errorMessage.includes('unable to retrieve container logs for containerd://') ||
errorMessage.toLowerCase().includes('unable to retrieve container logs');
if (isKubectlLogsError) {
CloudRunnerLogger.log(
`Kubectl unable to retrieve logs, attempt ${kubectlLogsFailedCount}/${maxKubectlLogsFailures}`,
);
// If kubectl logs has failed multiple times, try reading the log file directly from the pod
// This works even if the pod is terminated, as long as it hasn't been deleted
if (kubectlLogsFailedCount >= maxKubectlLogsFailures && !isRunning && !continueStreaming) {
CloudRunnerLogger.log(`Attempting to read log file directly from pod as fallback...`);
try {
// Try to read the log file from the pod
// Use kubectl exec for running pods, or try to access via PVC if pod is terminated
let logFileContent = '';
if (isRunning) {
// Pod is still running, try exec
logFileContent = await CloudRunnerSystem.Run(
`kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
true,
true,
);
} else {
// Pod is terminated, try to create a temporary pod to read from the PVC
// First, check if we can still access the pod's filesystem
CloudRunnerLogger.log(`Pod is terminated, attempting to read log file via temporary pod...`);
// For terminated pods, we might not be able to exec, so we'll skip this fallback
// and rely on the log file being written to the PVC (if mounted)
CloudRunnerLogger.logWarning(`Cannot read log file from terminated pod via exec`);
}
if (logFileContent && logFileContent.trim()) {
CloudRunnerLogger.log(`Successfully read log file from pod (${logFileContent.length} chars)`);
// Process the log file content line by line
for (const line of logFileContent.split(`\n`)) {
const lowerLine = line.toLowerCase();
if (line.trim() && !lowerLine.includes('unable to retrieve container logs')) {
({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
line,
shouldReadLogs,
shouldCleanup,
output,
));
}
}
// Check if we got the end of transmission marker
if (FollowLogStreamService.DidReceiveEndOfTransmission) {
CloudRunnerLogger.log('end of log stream (from log file)');
break;
}
} else {
CloudRunnerLogger.logWarning(`Log file read returned empty content, continuing with available logs`);
// If we can't read the log file, break out of the loop to return whatever logs we have
// This prevents infinite retries when kubectl logs consistently fails
break;
}
} catch (execError: any) {
CloudRunnerLogger.logWarning(`Failed to read log file from pod: ${execError}`);
// If we've exhausted all options, break to return whatever logs we have
break;
}
}
}
// If pod is not running and we tried --previous but it failed, try without --previous
if (!isRunning && !continueStreaming && error?.message?.includes('previous terminated container')) {
CloudRunnerLogger.log(`Previous container not found, trying current container logs...`);
try {
await CloudRunnerSystem.Run(
`kubectl logs ${podName} -c ${containerName} -n ${namespace}`,
false,
true,
callback,
);
// If we successfully got logs, check for end of transmission
if (FollowLogStreamService.DidReceiveEndOfTransmission) {
CloudRunnerLogger.log('end of log stream');
break;
}
// If we got logs but no end marker, continue trying (might be more logs)
if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
retriesAfterFinish++;
continue;
}
// If we've exhausted retries, break
break;
} catch (fallbackError: any) {
CloudRunnerLogger.log(`Fallback log fetch also failed: ${fallbackError}`);
// If both fail, continue retrying if we haven't exhausted retries
if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
retriesAfterFinish++;
continue;
}
// Only break if we've exhausted all retries
CloudRunnerLogger.logWarning(
`Could not fetch any container logs after ${KubernetesTaskRunner.maxRetry} retries`,
);
break;
}
}
if (continueStreaming) {
continue;
}
if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
retriesAfterFinish++;
continue;
}
// If we've exhausted retries and it's not a previous container issue, throw
if (!error?.message?.includes('previous terminated container')) {
throw error;
}
// For previous container errors, we've already tried fallback, so just break
CloudRunnerLogger.logWarning(
`Could not fetch previous container logs after retries, but continuing with available logs`,
);
break;
}
if (FollowLogStreamService.DidReceiveEndOfTransmission) {
CloudRunnerLogger.log('end of log stream');
break;
}
}
// After kubectl logs loop ends, read log file as fallback to capture any messages
// written after kubectl stopped reading (e.g., "Collected Logs" from post-build)
// This ensures all log messages are included in BuildResults for test assertions
// If output is empty, we need to be more aggressive about getting logs
const needsFallback = output.trim().length === 0;
if (needsFallback) {
CloudRunnerLogger.log('Output is empty, attempting aggressive log collection fallback...');
}
try {
const isPodStillRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
if (!isPodStillRunning || needsFallback) {
CloudRunnerLogger.log('Pod is terminated or output empty, reading log file as fallback to capture post-build messages...');
try {
// Try to read the log file from the terminated pod
// For killed pods (OOM), kubectl exec might not work, so we try multiple approaches
// First try --previous flag for terminated containers, then try without it
let logFileContent = '';
// Try multiple approaches to get the log file
const attempts = [
`kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /home/job-log.txt 2>/dev/null || echo ""`,
`kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
// Try to get logs one more time without -f flag
`kubectl logs ${podName} -c ${containerName} -n ${namespace} --previous 2>/dev/null || echo ""`,
`kubectl logs ${podName} -c ${containerName} -n ${namespace} 2>/dev/null || echo ""`,
];
for (const attempt of attempts) {
if (logFileContent && logFileContent.trim()) {
break; // We got content, no need to try more
}
try {
const result = await CloudRunnerSystem.Run(attempt, true, true);
if (result && result.trim()) {
logFileContent = result;
CloudRunnerLogger.log(`Successfully read logs using fallback method: ${attempt.substring(0, 50)}...`);
break;
}
} catch {
// Continue to next attempt
}
}
if (!logFileContent || !logFileContent.trim()) {
CloudRunnerLogger.logWarning(
'Could not read log file from terminated pod (may be OOM-killed). Using available logs.',
);
}
if (logFileContent && logFileContent.trim()) {
CloudRunnerLogger.log(
`Read log file from pod as fallback (${logFileContent.length} chars) to capture missing messages`,
);
// Get the lines we already have in output to avoid duplicates
const existingLines = new Set(output.split('\n').map((line) => line.trim()));
// Process the log file content line by line and add missing lines
for (const line of logFileContent.split(`\n`)) {
const trimmedLine = line.trim();
const lowerLine = trimmedLine.toLowerCase();
// Skip empty lines, kubectl errors, and lines we already have
if (
trimmedLine &&
!lowerLine.includes('unable to retrieve container logs') &&
!existingLines.has(trimmedLine)
) {
// Add missing line to output
output += `${line}\n`;
// Process through FollowLogStreamService to ensure proper handling
({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
line,
shouldReadLogs,
shouldCleanup,
output,
));
}
}
} else if (needsFallback && output.trim().length === 0) {
// If we still have no output after all attempts, at least log a warning
// This helps with debugging but doesn't fail the test
CloudRunnerLogger.logWarning(
'Could not retrieve any logs from pod. Pod may have been killed before logs were written.',
);
// Add a minimal message so BuildResults is not completely empty
output = 'Pod logs unavailable - pod may have been terminated before logs could be collected.\n';
}
} catch (logFileError: any) {
CloudRunnerLogger.logWarning(
`Could not read log file from pod as fallback: ${logFileError?.message || logFileError}`,
);
// Continue with existing output - this is a best-effort fallback
}
}
} catch (fallbackError: any) {
CloudRunnerLogger.logWarning(
`Error checking pod status for log file fallback: ${fallbackError?.message || fallbackError}`,
);
// Continue with existing output - this is a best-effort fallback
}
// Filter out kubectl error messages from the final output
// These errors can be added via stderr even when kubectl fails
// We filter them out so they don't pollute the BuildResults
const lines = output.split('\n');
const filteredLines = lines.filter((line) => !line.toLowerCase().includes('unable to retrieve container logs'));
const filteredOutput = filteredLines.join('\n');
// Log if we filtered out significant content
const originalLineCount = lines.length;
const filteredLineCount = filteredLines.length;
if (originalLineCount > filteredLineCount) {
CloudRunnerLogger.log(
`Filtered out ${originalLineCount - filteredLineCount} kubectl error message(s) from output`,
);
}
return filteredOutput;
}
static async watchUntilPodRunning(kubeClient: CoreV1Api, podName: string, namespace: string) {
let waitComplete: boolean = false;
let message = ``;
CloudRunnerLogger.log(`Watching ${podName} ${namespace}`);
await waitUntil(
async () => {
const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
const phase = status?.body.status?.phase;
waitComplete = phase !== 'Pending';
message = `Phase:${status.body.status?.phase} \n Reason:${
status.body.status?.conditions?.[0].reason || ''
} \n Message:${status.body.status?.conditions?.[0].message || ''}`;
// CloudRunnerLogger.log(
// JSON.stringify(
// (await kubeClient.listNamespacedEvent(namespace)).body.items
// .map((x) => {
// return {
// message: x.message || ``,
// name: x.metadata.name || ``,
// reason: x.reason || ``,
// };
// })
// .filter((x) => x.name.includes(podName)),
// undefined,
// 4,
// ),
// );
if (waitComplete || phase !== 'Pending') return true;
return false;
},
{
timeout: 2000000,
intervalBetweenAttempts: 15000,
},
);
if (!waitComplete) {
CloudRunnerLogger.log(message);
}
return waitComplete;
}
}
export default KubernetesTaskRunner;