unity-builder/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts

import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';
import CloudRunnerLogger from '../../services/core/cloud-runner-logger';
import { waitUntil } from 'async-wait-until';
import { CloudRunnerSystem } from '../../services/core/cloud-runner-system';
import CloudRunner from '../../cloud-runner';
import KubernetesPods from './kubernetes-pods';
import { FollowLogStreamService } from '../../services/core/follow-log-stream-service';

class KubernetesTaskRunner {
  static readonly maxRetry: number = 3;
  static lastReceivedMessage: string = ``;

  static async runTask(
    kubeConfig: KubeConfig,
    kubeClient: CoreV1Api,
    jobName: string,
    podName: string,
    containerName: string,
    namespace: string,
  ) {
    let output = '';
    let shouldReadLogs = true;
    let shouldCleanup = true;
    let retriesAfterFinish = 0;
    let kubectlLogsFailedCount = 0;
    const maxKubectlLogsFailures = 3;
    // eslint-disable-next-line no-constant-condition
    while (true) {
      await new Promise((resolve) => setTimeout(resolve, 3000));
      CloudRunnerLogger.log(
        `Streaming logs from pod: ${podName} container: ${containerName} namespace: ${namespace} ${CloudRunner.buildParameters.kubeVolumeSize}/${CloudRunner.buildParameters.containerCpu}/${CloudRunner.buildParameters.containerMemory}`,
      );
      const isRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);

      const callback = (outputChunk: string) => {
        // Filter out kubectl error messages about being unable to retrieve container logs
        // These errors pollute the output and don't contain useful information
        const lowerChunk = outputChunk.toLowerCase();
        if (lowerChunk.includes('unable to retrieve container logs')) {
          CloudRunnerLogger.log(`Filtered kubectl error: ${outputChunk.trim()}`);
          return;
        }

        output += outputChunk;

        // split output chunk and handle per line
        for (const chunk of outputChunk.split(`\n`)) {
          // Skip empty chunks and kubectl error messages (case-insensitive)
          const lowerChunk = chunk.toLowerCase();
          if (chunk.trim() && !lowerChunk.includes('unable to retrieve container logs')) {
            ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
              chunk,
              shouldReadLogs,
              shouldCleanup,
              output,
            ));
          }
        }
      };
      try {
        // Always specify container name explicitly to avoid containerd:// errors
        // Use -f for running pods, --previous for terminated pods
        await CloudRunnerSystem.Run(
          `kubectl logs ${podName} -c ${containerName} -n ${namespace}${isRunning ? ' -f' : ' --previous'}`,
          false,
          true,
          callback,
        );
        // Reset failure count on success
        kubectlLogsFailedCount = 0;
      } catch (error: any) {
        kubectlLogsFailedCount++;
        await new Promise((resolve) => setTimeout(resolve, 3000));
        const continueStreaming = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
        CloudRunnerLogger.log(`K8s logging error ${error} ${continueStreaming}`);

        // Filter out kubectl error messages from the error output
        const errorMessage = error?.message || error?.toString() || '';
        const isKubectlLogsError =
          errorMessage.includes('unable to retrieve container logs for containerd://') ||
          errorMessage.toLowerCase().includes('unable to retrieve container logs');

        if (isKubectlLogsError) {
          CloudRunnerLogger.log(
            `Kubectl unable to retrieve logs, attempt ${kubectlLogsFailedCount}/${maxKubectlLogsFailures}`,
          );

          // If kubectl logs has failed multiple times, try reading the log file directly from the pod
          // This works even if the pod is terminated, as long as it hasn't been deleted
          if (kubectlLogsFailedCount >= maxKubectlLogsFailures && !isRunning && !continueStreaming) {
            CloudRunnerLogger.log(`Attempting to read log file directly from pod as fallback...`);
            try {
              // Try to read the log file from the pod
              // Use kubectl exec for running pods, or try to access via PVC if pod is terminated
              let logFileContent = '';

              if (isRunning) {
                // Pod is still running, try exec
                logFileContent = await CloudRunnerSystem.Run(
                  `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
                  true,
                  true,
                );
              } else {
                // Pod is terminated, try to create a temporary pod to read from the PVC
                // First, check if we can still access the pod's filesystem
                CloudRunnerLogger.log(`Pod is terminated, attempting to read log file via temporary pod...`);
                // For terminated pods, we might not be able to exec, so we'll skip this fallback
                // and rely on the log file being written to the PVC (if mounted)
                CloudRunnerLogger.logWarning(`Cannot read log file from terminated pod via exec`);
              }

              if (logFileContent && logFileContent.trim()) {
                CloudRunnerLogger.log(`Successfully read log file from pod (${logFileContent.length} chars)`);
                // Process the log file content line by line
                for (const line of logFileContent.split(`\n`)) {
                  const lowerLine = line.toLowerCase();
                  if (line.trim() && !lowerLine.includes('unable to retrieve container logs')) {
                    ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
                      line,
                      shouldReadLogs,
                      shouldCleanup,
                      output,
                    ));
                  }
                }

                // Check if we got the end of transmission marker
                if (FollowLogStreamService.DidReceiveEndOfTransmission) {
                  CloudRunnerLogger.log('end of log stream (from log file)');
                  break;
                }
              } else {
                CloudRunnerLogger.logWarning(`Log file read returned empty content, continuing with available logs`);
                // If we can't read the log file, break out of the loop to return whatever logs we have
                // This prevents infinite retries when kubectl logs consistently fails
                break;
              }
            } catch (execError: any) {
              CloudRunnerLogger.logWarning(`Failed to read log file from pod: ${execError}`);
              // If we've exhausted all options, break to return whatever logs we have
              break;
            }
          }
        }

        // If pod is not running and we tried --previous but it failed, try without --previous
        if (!isRunning && !continueStreaming && error?.message?.includes('previous terminated container')) {
          CloudRunnerLogger.log(`Previous container not found, trying current container logs...`);
          try {
            await CloudRunnerSystem.Run(
              `kubectl logs ${podName} -c ${containerName} -n ${namespace}`,
              false,
              true,
              callback,
            );

            // If we successfully got logs, check for end of transmission
            if (FollowLogStreamService.DidReceiveEndOfTransmission) {
              CloudRunnerLogger.log('end of log stream');
              break;
            }

            // If we got logs but no end marker, continue trying (might be more logs)
            if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
              retriesAfterFinish++;
              continue;
            }

            // If we've exhausted retries, break
            break;
          } catch (fallbackError: any) {
            CloudRunnerLogger.log(`Fallback log fetch also failed: ${fallbackError}`);

            // If both fail, continue retrying if we haven't exhausted retries
            if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
              retriesAfterFinish++;
              continue;
            }

            // Only break if we've exhausted all retries
            CloudRunnerLogger.logWarning(
              `Could not fetch any container logs after ${KubernetesTaskRunner.maxRetry} retries`,
            );
            break;
          }
        }

        if (continueStreaming) {
          continue;
        }
        if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
          retriesAfterFinish++;
          continue;
        }

        // If we've exhausted retries and it's not a previous container issue, throw
        if (!error?.message?.includes('previous terminated container')) {
          throw error;
        }

        // For previous container errors, we've already tried fallback, so just break
        CloudRunnerLogger.logWarning(
          `Could not fetch previous container logs after retries, but continuing with available logs`,
        );
        break;
      }
      if (FollowLogStreamService.DidReceiveEndOfTransmission) {
        CloudRunnerLogger.log('end of log stream');
        break;
      }
    }

    // After kubectl logs loop ends, read log file as fallback to capture any messages
    // written after kubectl stopped reading (e.g., "Collected Logs" from post-build)
    // This ensures all log messages are included in BuildResults for test assertions
    // If output is empty, we need to be more aggressive about getting logs
    const needsFallback = output.trim().length === 0;
    const missingCollectedLogs = !output.includes('Collected Logs');
    
    if (needsFallback) {
      CloudRunnerLogger.log('Output is empty, attempting aggressive log collection fallback...');
      // Give the pod a moment to finish writing logs before we try to read them
      await new Promise((resolve) => setTimeout(resolve, 5000));
    }

    // Always try fallback if output is empty, if pod is terminated, or if "Collected Logs" is missing
    // The "Collected Logs" check ensures we try to get post-build messages even if we have some output
    try {
      const isPodStillRunning = await KubernetesPods.IsPodRunning(podName, namespace, kubeClient);
      const shouldTryFallback = !isPodStillRunning || needsFallback || missingCollectedLogs;

      if (shouldTryFallback) {
        const reason = needsFallback
          ? 'output is empty'
          : missingCollectedLogs
            ? 'Collected Logs missing from output'
            : 'pod is terminated';
        CloudRunnerLogger.log(
          `Pod is ${isPodStillRunning ? 'running' : 'terminated'} and ${reason}, reading log file as fallback...`,
        );
        try {
          // Try to read the log file from the pod
          // For killed pods (OOM), kubectl exec might not work, so we try multiple approaches
          // First try --previous flag for terminated containers, then try without it
          let logFileContent = '';

          // Try multiple approaches to get the log file
          // Order matters: try terminated container first, then current, then PVC, then kubectl logs as last resort
          // For K8s, the PVC is mounted at /data, so try reading from there too
          const attempts = [
            // For terminated pods, try --previous first
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /home/job-log.txt 2>/dev/null || echo ""`,
            // Try current container
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /home/job-log.txt 2>/dev/null || echo ""`,
            // Try reading from PVC (/data) in case log was copied there
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} --previous -- cat /data/job-log.txt 2>/dev/null || echo ""`,
            `kubectl exec ${podName} -c ${containerName} -n ${namespace} -- cat /data/job-log.txt 2>/dev/null || echo ""`,
            // Try kubectl logs as fallback (might capture stdout even if exec fails)
            `kubectl logs ${podName} -c ${containerName} -n ${namespace} --previous 2>/dev/null || echo ""`,
            `kubectl logs ${podName} -c ${containerName} -n ${namespace} 2>/dev/null || echo ""`,
          ];

          for (const attempt of attempts) {
            if (logFileContent && logFileContent.trim()) {
              break; // We got content, no need to try more
            }
            try {
              CloudRunnerLogger.log(`Trying fallback method: ${attempt.substring(0, 80)}...`);
              const result = await CloudRunnerSystem.Run(attempt, true, true);
              if (result && result.trim()) {
                logFileContent = result;
                CloudRunnerLogger.log(
                  `Successfully read logs using fallback method (${logFileContent.length} chars): ${attempt.substring(
                    0,
                    50,
                  )}...`,
                );
                break;
              } else {
                CloudRunnerLogger.log(`Fallback method returned empty result: ${attempt.substring(0, 50)}...`);
              }
            } catch (attemptError: any) {
              CloudRunnerLogger.log(
                `Fallback method failed: ${attempt.substring(0, 50)}... Error: ${
                  attemptError?.message || attemptError
                }`,
              );
              // Continue to next attempt
            }
          }

          if (!logFileContent || !logFileContent.trim()) {
            CloudRunnerLogger.logWarning(
              'Could not read log file from pod after all fallback attempts (may be OOM-killed or pod not accessible).',
            );
          }

          if (logFileContent && logFileContent.trim()) {
            CloudRunnerLogger.log(
              `Read log file from pod as fallback (${logFileContent.length} chars) to capture missing messages`,
            );
            // Get the lines we already have in output to avoid duplicates
            const existingLines = new Set(output.split('\n').map((line) => line.trim()));
            // Process the log file content line by line and add missing lines
            for (const line of logFileContent.split(`\n`)) {
              const trimmedLine = line.trim();
              const lowerLine = trimmedLine.toLowerCase();
              // Skip empty lines, kubectl errors, and lines we already have
              if (
                trimmedLine &&
                !lowerLine.includes('unable to retrieve container logs') &&
                !existingLines.has(trimmedLine)
              ) {
                // Process through FollowLogStreamService - it will append to output
                // Don't add to output manually since handleIteration does it
                ({ shouldReadLogs, shouldCleanup, output } = FollowLogStreamService.handleIteration(
                  trimmedLine,
                  shouldReadLogs,
                  shouldCleanup,
                  output,
                ));
              }
            }
          }
        } catch (logFileError: any) {
          CloudRunnerLogger.logWarning(
            `Could not read log file from pod as fallback: ${logFileError?.message || logFileError}`,
          );
          // Continue with existing output - this is a best-effort fallback
        }
      }

      // If output is still empty after fallback attempts, add a warning message
      // This ensures BuildResults is not completely empty, which would cause test failures
      if (needsFallback && output.trim().length === 0) {
        CloudRunnerLogger.logWarning(
          'Could not retrieve any logs from pod after all attempts. Pod may have been killed before logs were written.',
        );
        // Add a minimal message so BuildResults is not completely empty
        // This helps with debugging and prevents test failures due to empty results
        output = 'Pod logs unavailable - pod may have been terminated before logs could be collected.\n';
      }
    } catch (fallbackError: any) {
      CloudRunnerLogger.logWarning(
        `Error checking pod status for log file fallback: ${fallbackError?.message || fallbackError}`,
      );
      // If output is empty and we hit an error, still add a message so BuildResults isn't empty
      if (needsFallback && output.trim().length === 0) {
        output = `Error retrieving logs: ${fallbackError?.message || fallbackError}\n`;
      }
      // Continue with existing output - this is a best-effort fallback
    }

    // Filter out kubectl error messages from the final output
    // These errors can be added via stderr even when kubectl fails
    // We filter them out so they don't pollute the BuildResults
    const lines = output.split('\n');
    const filteredLines = lines.filter((line) => !line.toLowerCase().includes('unable to retrieve container logs'));
    const filteredOutput = filteredLines.join('\n');

    // Log if we filtered out significant content
    const originalLineCount = lines.length;
    const filteredLineCount = filteredLines.length;
    if (originalLineCount > filteredLineCount) {
      CloudRunnerLogger.log(
        `Filtered out ${originalLineCount - filteredLineCount} kubectl error message(s) from output`,
      );
    }

    return filteredOutput;
  }

  static async watchUntilPodRunning(kubeClient: CoreV1Api, podName: string, namespace: string) {
    let waitComplete: boolean = false;
    let message = ``;
    let lastPhase = '';
    let consecutivePendingCount = 0;
    CloudRunnerLogger.log(`Watching ${podName} ${namespace}`);

    try {
      await waitUntil(
        async () => {
          const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
          const phase = status?.body.status?.phase || 'Unknown';
          const conditions = status?.body.status?.conditions || [];
          const containerStatuses = status?.body.status?.containerStatuses || [];

          // Log phase changes
          if (phase !== lastPhase) {
            CloudRunnerLogger.log(`Pod ${podName} phase changed: ${lastPhase} -> ${phase}`);
            lastPhase = phase;
            consecutivePendingCount = 0;
          }

          // Check for failure conditions that mean the pod will never start
          const failureReasons = [
            'Unschedulable',
            'Failed',
            'ImagePullBackOff',
            'ErrImagePull',
            'CrashLoopBackOff',
            'CreateContainerError',
            'CreateContainerConfigError',
          ];

          const hasFailureCondition = conditions.some((condition: any) =>
            failureReasons.some((reason) => condition.reason?.includes(reason)),
          );

          const hasFailureContainerStatus = containerStatuses.some((containerStatus: any) =>
            failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
          );

          if (hasFailureCondition || hasFailureContainerStatus) {
            // Get detailed failure information
            const failureCondition = conditions.find((condition: any) =>
              failureReasons.some((reason) => condition.reason?.includes(reason)),
            );
            const failureContainer = containerStatuses.find((containerStatus: any) =>
              failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
            );

            message = `Pod ${podName} failed to start:\nPhase: ${phase}\n`;
            if (failureCondition) {
              message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
            }
            if (failureContainer) {
              message += `Container Reason: ${failureContainer.state?.waiting?.reason}\nContainer Message: ${failureContainer.state?.waiting?.message}\n`;
            }

            // Log pod events for additional context
            try {
              const events = await kubeClient.listNamespacedEvent(namespace);
              const podEvents = events.body.items
                .filter((x) => x.involvedObject?.name === podName)
                .map((x) => ({
                  message: x.message || ``,
                  reason: x.reason || ``,
                  type: x.type || ``,
                }));
              if (podEvents.length > 0) {
                message += `\nRecent Events:\n${JSON.stringify(podEvents.slice(-5), undefined, 2)}`;
              }
            } catch (eventError) {
              // Ignore event fetch errors
            }

            CloudRunnerLogger.logWarning(message);
            waitComplete = false; // Mark as not complete so we throw an error
            return true; // Return true to exit wait loop
          }

          waitComplete = phase !== 'Pending' && phase !== 'Unknown';

          if (phase === 'Pending') {
            consecutivePendingCount++;
            // Log diagnostic info every 4 checks (1 minute) if still pending
            if (consecutivePendingCount % 4 === 0) {
              const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
              const conditionMessages = conditions
                .map((c: any) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
                .join('; ');
              CloudRunnerLogger.log(`${pendingMessage}. Conditions: ${conditionMessages || 'None'}`);

              // Log events periodically to help diagnose
              if (consecutivePendingCount % 8 === 0) {
                try {
                  const events = await kubeClient.listNamespacedEvent(namespace);
                  const podEvents = events.body.items
                    .filter((x) => x.involvedObject?.name === podName)
                    .slice(-3)
                    .map((x) => `${x.type}: ${x.reason} - ${x.message}`)
                    .join('; ');
                  if (podEvents) {
                    CloudRunnerLogger.log(`Recent pod events: ${podEvents}`);
                  }
                } catch {
                  // Ignore event fetch errors
                }
              }
            }
          }

          message = `Phase:${phase} \n Reason:${conditions[0]?.reason || ''} \n Message:${
            conditions[0]?.message || ''
          }`;

          if (waitComplete || phase !== 'Pending') return true;

          return false;
        },
        {
          timeout: 2000000, // ~33 minutes
          intervalBetweenAttempts: 15000, // 15 seconds
        },
      );
    } catch (waitError: any) {
      // If waitUntil times out or throws, get final pod status
      try {
        const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
        const phase = finalStatus?.body.status?.phase || 'Unknown';
        const conditions = finalStatus?.body.status?.conditions || [];
        message = `Pod ${podName} timed out waiting to start.\nFinal Phase: ${phase}\n`;
        message += conditions.map((c: any) => `${c.type}: ${c.reason} - ${c.message}`).join('\n');

        // Get events for context
        try {
          const events = await kubeClient.listNamespacedEvent(namespace);
          const podEvents = events.body.items
            .filter((x) => x.involvedObject?.name === podName)
            .slice(-5)
            .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
          if (podEvents.length > 0) {
            message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
          }
        } catch {
          // Ignore event fetch errors
        }

        CloudRunnerLogger.logWarning(message);
      } catch (statusError) {
        message = `Pod ${podName} timed out and could not retrieve final status: ${waitError?.message || waitError}`;
        CloudRunnerLogger.logWarning(message);
      }

      throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
    }

    if (!waitComplete) {
      CloudRunnerLogger.logWarning(`Pod ${podName} did not reach running state: ${message}`);
      throw new Error(`Pod ${podName} did not start successfully: ${message}`);
    }

    return waitComplete;
  }
}

export default KubernetesTaskRunner;