pr feedback - fail faster on pending pods and detect scheduling failures
parent 45e7ed0fcb
commit 4b182a065a
@@ -4857,9 +4857,50 @@ class KubernetesTaskRunner {
     }
     if (phase === 'Pending') {
         consecutivePendingCount++;
+        // Check for scheduling failures in events (faster than waiting for conditions)
+        try {
+            const events = await kubeClient.listNamespacedEvent(namespace);
+            const podEvents = events.body.items.filter((x) => x.involvedObject?.name === podName);
+            const failedSchedulingEvents = podEvents.filter((x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated');
+            if (failedSchedulingEvents.length > 0) {
+                const schedulingMessage = failedSchedulingEvents
+                    .map((x) => `${x.reason}: ${x.message || ''}`)
+                    .join('; ');
+                message = `Pod ${podName} cannot be scheduled:\n${schedulingMessage}`;
+                cloud_runner_logger_1.default.logWarning(message);
+                waitComplete = false;
+                return true; // Exit wait loop to throw error
+            }
+        }
+        catch {
+            // Ignore event fetch errors
+        }
+        // For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
+        const isTest = process.env['cloudRunnerTests'] === 'true';
+        const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
+        if (consecutivePendingCount >= maxPendingChecks) {
+            message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
+            // Get events for context
+            try {
+                const events = await kubeClient.listNamespacedEvent(namespace);
+                const podEvents = events.body.items
+                    .filter((x) => x.involvedObject?.name === podName)
+                    .slice(-5)
+                    .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+                if (podEvents.length > 0) {
+                    message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+                }
+            }
+            catch {
+                // Ignore event fetch errors
+            }
+            cloud_runner_logger_1.default.logWarning(message);
+            waitComplete = false;
+            return true; // Exit wait loop to throw error
+        }
+        // Log diagnostic info every 4 checks (1 minute) if still pending
         if (consecutivePendingCount % 4 === 0) {
-            const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+            const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
            const conditionMessages = conditions
                .map((c) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
                .join('; ');
@@ -4888,7 +4929,7 @@ class KubernetesTaskRunner {
         return true;
         return false;
     }, {
-        timeout: 2000000,
+        timeout: process.env['cloudRunnerTests'] === 'true' ? 300000 : 2000000,
         intervalBetweenAttempts: 15000, // 15 seconds
     });
 }
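For reference, the `timeout`/`intervalBetweenAttempts` option names in the hunk above match the async-wait-until package; treating that as an assumption, the polling setup reduces to the sketch below. `checkPodOnce` and `waitForPod` are hypothetical stand-ins, not the runner's own API.

// Minimal TypeScript sketch of the polling configuration, assuming `waitUntil`
// comes from the `async-wait-until` package (an assumption based on the option names).
import { waitUntil } from 'async-wait-until';

// Hypothetical stand-in for one pod-status poll; returning true ends the wait loop.
async function checkPodOnce(): Promise<boolean> {
  return true;
}

async function waitForPod(): Promise<void> {
  const isTest = process.env['cloudRunnerTests'] === 'true';

  // At a 15 s interval, the Pending cap of 8 checks (tests) / 80 checks (production)
  // corresponds to roughly 2 / 20 minutes, well inside the outer timeout below.
  await waitUntil(() => checkPodOnce(), {
    timeout: isTest ? 300000 : 2000000, // 5 minutes for tests, ~33 minutes otherwise
    intervalBetweenAttempts: 15000, // 15 seconds between attempts
  });
}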
File diff suppressed because one or more lines are too long
@@ -481,9 +481,55 @@ class KubernetesTaskRunner {

         if (phase === 'Pending') {
           consecutivePendingCount++;
+
+          // Check for scheduling failures in events (faster than waiting for conditions)
+          try {
+            const events = await kubeClient.listNamespacedEvent(namespace);
+            const podEvents = events.body.items.filter((x) => x.involvedObject?.name === podName);
+            const failedSchedulingEvents = podEvents.filter(
+              (x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated',
+            );
+
+            if (failedSchedulingEvents.length > 0) {
+              const schedulingMessage = failedSchedulingEvents
+                .map((x) => `${x.reason}: ${x.message || ''}`)
+                .join('; ');
+              message = `Pod ${podName} cannot be scheduled:\n${schedulingMessage}`;
+              CloudRunnerLogger.logWarning(message);
+              waitComplete = false;
+              return true; // Exit wait loop to throw error
+            }
+          } catch {
+            // Ignore event fetch errors
+          }
+
+          // For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
+          const isTest = process.env['cloudRunnerTests'] === 'true';
+          const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
+
+          if (consecutivePendingCount >= maxPendingChecks) {
+            message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
+            // Get events for context
+            try {
+              const events = await kubeClient.listNamespacedEvent(namespace);
+              const podEvents = events.body.items
+                .filter((x) => x.involvedObject?.name === podName)
+                .slice(-5)
+                .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+              if (podEvents.length > 0) {
+                message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+              }
+            } catch {
+              // Ignore event fetch errors
+            }
+            CloudRunnerLogger.logWarning(message);
+            waitComplete = false;
+            return true; // Exit wait loop to throw error
+          }
+
           // Log diagnostic info every 4 checks (1 minute) if still pending
           if (consecutivePendingCount % 4 === 0) {
-            const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+            const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
            const conditionMessages = conditions
              .map((c: any) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
              .join('; ');
@@ -517,7 +563,7 @@ class KubernetesTaskRunner {
         return false;
       },
       {
-        timeout: 2000000, // ~33 minutes
+        timeout: process.env['cloudRunnerTests'] === 'true' ? 300000 : 2000000, // 5 minutes for tests, ~33 minutes for production
         intervalBetweenAttempts: 15000, // 15 seconds
       },
     );
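The scheduling-failure detection added above reduces to the pattern below: list the namespace's events, keep those tied to the pod, and look for FailedScheduling or SchedulingGated reasons. This is a standalone sketch assuming a pre-1.0 @kubernetes/client-node client, where CoreV1Api.listNamespacedEvent(namespace) resolves to { body: { items } } as in the diff; `findSchedulingFailure` and its parameters are illustrative, not the runner's API.

import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';

// Returns a combined failure message if the pod has FailedScheduling/SchedulingGated
// events, or undefined when nothing suspicious is found.
async function findSchedulingFailure(namespace: string, podName: string): Promise<string | undefined> {
  const kc = new KubeConfig();
  kc.loadFromDefault();
  const core = kc.makeApiClient(CoreV1Api);

  const events = await core.listNamespacedEvent(namespace);
  const reasons = events.body.items
    .filter((x) => x.involvedObject?.name === podName)
    .filter((x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated')
    .map((x) => `${x.reason}: ${x.message || ''}`);

  return reasons.length > 0 ? reasons.join('; ') : undefined;
}

In the runner itself, a non-empty result sets waitComplete = false and returns true from the wait predicate, so the loop exits immediately and the message is surfaced as an error instead of waiting out the full timeout.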