/**
 * Polls the phase of a PersistentVolumeClaim until it is no longer `Pending`.
 *
 * The phase is read through `getPVCPhase` once up front and then repeatedly by
 * `waitUntil` (timeout 750000, interval 15000 — presumably milliseconds; confirm
 * against the `waitUntil` implementation imported by this file). Any phase other
 * than `Pending` ends the wait; this method does not distinguish between the
 * remaining phases.
 *
 * On failure, a trimmed view of the claim status (phase, conditions, message) is
 * written to the error log on a best-effort basis and the original error is
 * rethrown unchanged.
 *
 * @param kubeClient Kubernetes CoreV1 API client used to read the claim.
 * @param name Name of the PersistentVolumeClaim.
 * @param namespace Namespace that contains the claim.
 * @throws Error when the last observed phase is still `Pending` after the wait;
 *   also rethrows anything raised by `getPVCPhase` or `waitUntil`.
 */
public static async watchUntilPVCNotPending(kubeClient: k8s.CoreV1Api, name: string, namespace: string) {
  try {
    CloudRunnerLogger.log(`watch Until PVC Not Pending ${name} ${namespace}`);

    // Track the most recently observed phase so the result of the wait can be
    // reported without issuing another API request afterwards.
    let lastPhase = await this.getPVCPhase(kubeClient, name, namespace);
    CloudRunnerLogger.log(`Initial PVC phase: ${lastPhase}`);

    // Resolve as soon as the claim leaves the Pending phase.
    // NOTE(review): Kubernetes also defines a `Lost` claim phase, which would end
    // the wait here without an error — confirm that is intended.
    await waitUntil(
      async () => {
        lastPhase = await this.getPVCPhase(kubeClient, name, namespace);

        return lastPhase !== 'Pending';
      },
      {
        timeout: 750000,
        intervalBetweenAttempts: 15000,
      },
    );

    CloudRunnerLogger.log(`PVC phase after wait: ${lastPhase}`);

    // Defensive guard in case the wait returns without the predicate having succeeded.
    if (lastPhase === 'Pending') {
      throw new Error(`PVC ${name} is still Pending after timeout`);
    }
  } catch (error: unknown) {
    core.error('Failed to watch PVC');

    // String() copes with thrown values that are not objects (null, undefined, primitives).
    core.error(String(error));

    // Best-effort diagnostics: a failure to read the claim must never mask the original error.
    try {
      const pvcBody = (await kubeClient.readNamespacedPersistentVolumeClaim(name, namespace)).body;
      core.error(
        `PVC Body: ${JSON.stringify(
          {
            phase: pvcBody.status?.phase,
            conditions: pvcBody.status?.conditions,
            message: pvcBody.status?.message,
          },
          undefined,
          4,
        )}`,
      );
    } catch (readError: unknown) {
      // Report why the diagnostics are missing instead of dropping the failure silently.
      core.error(`Failed to read PVC ${name} for diagnostics: ${String(readError)}`);
    }

    throw error;
  }
}
a/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts b/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts index a3b6c2e6..181583ac 100644 --- a/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts +++ b/src/model/cloud-runner/providers/k8s/kubernetes-task-runner.ts @@ -577,6 +577,11 @@ class KubernetesTaskRunner { // Check pod conditions for scheduling issues if (podStatusDetails?.conditions) { + const allConditions = podStatusDetails.conditions.map( + (c: any) => `${c.type}: ${c.status}${c.reason ? ` (${c.reason})` : ''}${c.message ? ` - ${c.message}` : ''}`, + ); + message += `\n\nPod Conditions:\n${allConditions.join('\n')}`; + const unschedulable = podStatusDetails.conditions.find( (c: any) => c.type === 'PodScheduled' && c.status === 'False', ); @@ -585,6 +590,36 @@ class KubernetesTaskRunner { unschedulable.message || 'No message' }`; } + + // Check if pod is assigned to a node + if (podStatusDetails.hostIP) { + message += `\n\nPod assigned to node: ${podStatusDetails.hostIP}`; + } else { + message += `\n\nPod not yet assigned to a node (scheduling pending)`; + } + } + + // Check node resources if pod is assigned + if (podStatusDetails.hostIP) { + try { + const nodes = await kubeClient.listNode(); + const assignedNode = nodes.body.items.find((n: any) => + n.status.addresses?.some((a: any) => a.address === podStatusDetails.hostIP) + ); + if (assignedNode) { + const allocatable = assignedNode.status.allocatable || {}; + const capacity = assignedNode.status.capacity || {}; + message += `\n\nNode Resources (${assignedNode.metadata.name}):\n Allocatable CPU: ${allocatable.cpu || 'unknown'}\n Allocatable Memory: ${allocatable.memory || 'unknown'}\n Allocatable Ephemeral Storage: ${allocatable['ephemeral-storage'] || 'unknown'}`; + + // Check for taints that might prevent scheduling + if (assignedNode.spec.taints && assignedNode.spec.taints.length > 0) { + const taints = assignedNode.spec.taints.map((t: any) => 
`${t.key}=${t.value}:${t.effect}`).join(', '); + message += `\n Node Taints: ${taints}`; + } + } + } catch (nodeError) { + // Ignore node check errors + } } } catch (podStatusError) { // Ignore pod status fetch errors