pr feedback
parent b98b1c7104
commit eee8b4cbd1
@@ -4708,36 +4708,136 @@ class KubernetesTaskRunner {
     static async watchUntilPodRunning(kubeClient, podName, namespace) {
         let waitComplete = false;
         let message = ``;
+        let lastPhase = '';
+        let consecutivePendingCount = 0;
         cloud_runner_logger_1.default.log(`Watching ${podName} ${namespace}`);
-        await (0, async_wait_until_1.waitUntil)(async () => {
-            const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
-            const phase = status?.body.status?.phase;
-            waitComplete = phase !== 'Pending';
-            message = `Phase:${status.body.status?.phase} \n Reason:${status.body.status?.conditions?.[0].reason || ''} \n Message:${status.body.status?.conditions?.[0].message || ''}`;
-            // CloudRunnerLogger.log(
-            //   JSON.stringify(
-            //     (await kubeClient.listNamespacedEvent(namespace)).body.items
-            //       .map((x) => {
-            //         return {
-            //           message: x.message || ``,
-            //           name: x.metadata.name || ``,
-            //           reason: x.reason || ``,
-            //         };
-            //       })
-            //       .filter((x) => x.name.includes(podName)),
-            //     undefined,
-            //     4,
-            //   ),
-            // );
-            if (waitComplete || phase !== 'Pending')
-                return true;
-            return false;
-        }, {
-            timeout: 2000000,
-            intervalBetweenAttempts: 15000,
-        });
+        try {
+            await (0, async_wait_until_1.waitUntil)(async () => {
+                const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
+                const phase = status?.body.status?.phase || 'Unknown';
+                const conditions = status?.body.status?.conditions || [];
+                const containerStatuses = status?.body.status?.containerStatuses || [];
+                // Log phase changes
+                if (phase !== lastPhase) {
+                    cloud_runner_logger_1.default.log(`Pod ${podName} phase changed: ${lastPhase} -> ${phase}`);
+                    lastPhase = phase;
+                    consecutivePendingCount = 0;
+                }
+                // Check for failure conditions that mean the pod will never start
+                const failureReasons = [
+                    'Unschedulable',
+                    'Failed',
+                    'ImagePullBackOff',
+                    'ErrImagePull',
+                    'CrashLoopBackOff',
+                    'CreateContainerError',
+                    'CreateContainerConfigError',
+                ];
+                const hasFailureCondition = conditions.some((condition) => failureReasons.some((reason) => condition.reason?.includes(reason)));
+                const hasFailureContainerStatus = containerStatuses.some((containerStatus) => failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
+                if (hasFailureCondition || hasFailureContainerStatus) {
+                    // Get detailed failure information
+                    const failureCondition = conditions.find((condition) => failureReasons.some((reason) => condition.reason?.includes(reason)));
+                    const failureContainer = containerStatuses.find((containerStatus) => failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)));
+                    message = `Pod ${podName} failed to start:\nPhase: ${phase}\n`;
+                    if (failureCondition) {
+                        message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
+                    }
+                    if (failureContainer) {
+                        message += `Container Reason: ${failureContainer.state?.waiting?.reason}\nContainer Message: ${failureContainer.state?.waiting?.message}\n`;
+                    }
+                    // Log pod events for additional context
+                    try {
+                        const events = await kubeClient.listNamespacedEvent(namespace);
+                        const podEvents = events.body.items
+                            .filter((x) => x.involvedObject?.name === podName)
+                            .map((x) => ({
+                            message: x.message || ``,
+                            reason: x.reason || ``,
+                            type: x.type || ``,
+                        }));
+                        if (podEvents.length > 0) {
+                            message += `\nRecent Events:\n${JSON.stringify(podEvents.slice(-5), undefined, 2)}`;
+                        }
+                    }
+                    catch (eventError) {
+                        // Ignore event fetch errors
+                    }
+                    cloud_runner_logger_1.default.logWarning(message);
+                    waitComplete = false; // Mark as not complete so we throw an error
+                    return true; // Return true to exit wait loop
+                }
+                waitComplete = phase !== 'Pending' && phase !== 'Unknown';
+                if (phase === 'Pending') {
+                    consecutivePendingCount++;
+                    // Log diagnostic info every 4 checks (1 minute) if still pending
+                    if (consecutivePendingCount % 4 === 0) {
+                        const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+                        const conditionMessages = conditions
+                            .map((c) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
+                            .join('; ');
+                        cloud_runner_logger_1.default.log(`${pendingMessage}. Conditions: ${conditionMessages || 'None'}`);
+                        // Log events periodically to help diagnose
+                        if (consecutivePendingCount % 8 === 0) {
+                            try {
+                                const events = await kubeClient.listNamespacedEvent(namespace);
+                                const podEvents = events.body.items
+                                    .filter((x) => x.involvedObject?.name === podName)
+                                    .slice(-3)
+                                    .map((x) => `${x.type}: ${x.reason} - ${x.message}`)
+                                    .join('; ');
+                                if (podEvents) {
+                                    cloud_runner_logger_1.default.log(`Recent pod events: ${podEvents}`);
+                                }
+                            }
+                            catch {
+                                // Ignore event fetch errors
+                            }
+                        }
+                    }
+                }
+                message = `Phase:${phase} \n Reason:${conditions[0]?.reason || ''} \n Message:${conditions[0]?.message || ''}`;
+                if (waitComplete || phase !== 'Pending')
+                    return true;
+                return false;
+            }, {
+                timeout: 2000000,
+                intervalBetweenAttempts: 15000, // 15 seconds
+            });
+        }
+        catch (waitError) {
+            // If waitUntil times out or throws, get final pod status
+            try {
+                const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
+                const phase = finalStatus?.body.status?.phase || 'Unknown';
+                const conditions = finalStatus?.body.status?.conditions || [];
+                message = `Pod ${podName} timed out waiting to start.\nFinal Phase: ${phase}\n`;
+                message += conditions.map((c) => `${c.type}: ${c.reason} - ${c.message}`).join('\n');
+                // Get events for context
+                try {
+                    const events = await kubeClient.listNamespacedEvent(namespace);
+                    const podEvents = events.body.items
+                        .filter((x) => x.involvedObject?.name === podName)
+                        .slice(-5)
+                        .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+                    if (podEvents.length > 0) {
+                        message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+                    }
+                }
+                catch {
+                    // Ignore event fetch errors
+                }
+                cloud_runner_logger_1.default.logWarning(message);
+            }
+            catch (statusError) {
+                message = `Pod ${podName} timed out and could not retrieve final status: ${waitError?.message || waitError}`;
+                cloud_runner_logger_1.default.logWarning(message);
+            }
+            throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
+        }
         if (!waitComplete) {
-            cloud_runner_logger_1.default.log(message);
+            cloud_runner_logger_1.default.logWarning(`Pod ${podName} did not reach running state: ${message}`);
+            throw new Error(`Pod ${podName} did not start successfully: ${message}`);
         }
         return waitComplete;
     }
File diff suppressed because one or more lines are too long
@@ -364,42 +364,162 @@ class KubernetesTaskRunner {
   static async watchUntilPodRunning(kubeClient: CoreV1Api, podName: string, namespace: string) {
     let waitComplete: boolean = false;
     let message = ``;
+    let lastPhase = '';
+    let consecutivePendingCount = 0;
     CloudRunnerLogger.log(`Watching ${podName} ${namespace}`);
-    await waitUntil(
-      async () => {
-        const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
-        const phase = status?.body.status?.phase;
-        waitComplete = phase !== 'Pending';
-        message = `Phase:${status.body.status?.phase} \n Reason:${
-          status.body.status?.conditions?.[0].reason || ''
-        } \n Message:${status.body.status?.conditions?.[0].message || ''}`;
-
-        // CloudRunnerLogger.log(
-        //   JSON.stringify(
-        //     (await kubeClient.listNamespacedEvent(namespace)).body.items
-        //       .map((x) => {
-        //         return {
-        //           message: x.message || ``,
-        //           name: x.metadata.name || ``,
-        //           reason: x.reason || ``,
-        //         };
-        //       })
-        //       .filter((x) => x.name.includes(podName)),
-        //     undefined,
-        //     4,
-        //   ),
-        // );
-        if (waitComplete || phase !== 'Pending') return true;
-
-        return false;
-      },
-      {
-        timeout: 2000000,
-        intervalBetweenAttempts: 15000,
-      },
-    );
+    try {
+      await waitUntil(
+        async () => {
+          const status = await kubeClient.readNamespacedPodStatus(podName, namespace);
+          const phase = status?.body.status?.phase || 'Unknown';
+          const conditions = status?.body.status?.conditions || [];
+          const containerStatuses = status?.body.status?.containerStatuses || [];
+
+          // Log phase changes
+          if (phase !== lastPhase) {
+            CloudRunnerLogger.log(`Pod ${podName} phase changed: ${lastPhase} -> ${phase}`);
+            lastPhase = phase;
+            consecutivePendingCount = 0;
+          }
+
+          // Check for failure conditions that mean the pod will never start
+          const failureReasons = [
+            'Unschedulable',
+            'Failed',
+            'ImagePullBackOff',
+            'ErrImagePull',
+            'CrashLoopBackOff',
+            'CreateContainerError',
+            'CreateContainerConfigError',
+          ];
+
+          const hasFailureCondition = conditions.some((condition: any) =>
+            failureReasons.some((reason) => condition.reason?.includes(reason)),
+          );
+
+          const hasFailureContainerStatus = containerStatuses.some((containerStatus: any) =>
+            failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
+          );
+
+          if (hasFailureCondition || hasFailureContainerStatus) {
+            // Get detailed failure information
+            const failureCondition = conditions.find((condition: any) =>
+              failureReasons.some((reason) => condition.reason?.includes(reason)),
+            );
+            const failureContainer = containerStatuses.find((containerStatus: any) =>
+              failureReasons.some((reason) => containerStatus.state?.waiting?.reason?.includes(reason)),
+            );
+
+            message = `Pod ${podName} failed to start:\nPhase: ${phase}\n`;
+            if (failureCondition) {
+              message += `Condition Reason: ${failureCondition.reason}\nCondition Message: ${failureCondition.message}\n`;
+            }
+            if (failureContainer) {
+              message += `Container Reason: ${failureContainer.state?.waiting?.reason}\nContainer Message: ${failureContainer.state?.waiting?.message}\n`;
+            }
+
+            // Log pod events for additional context
+            try {
+              const events = await kubeClient.listNamespacedEvent(namespace);
+              const podEvents = events.body.items
+                .filter((x) => x.involvedObject?.name === podName)
+                .map((x) => ({
+                  message: x.message || ``,
+                  reason: x.reason || ``,
+                  type: x.type || ``,
+                }));
+              if (podEvents.length > 0) {
+                message += `\nRecent Events:\n${JSON.stringify(podEvents.slice(-5), undefined, 2)}`;
+              }
+            } catch (eventError) {
+              // Ignore event fetch errors
+            }
+
+            CloudRunnerLogger.logWarning(message);
+            waitComplete = false; // Mark as not complete so we throw an error
+            return true; // Return true to exit wait loop
+          }
+
+          waitComplete = phase !== 'Pending' && phase !== 'Unknown';
+
+          if (phase === 'Pending') {
+            consecutivePendingCount++;
+            // Log diagnostic info every 4 checks (1 minute) if still pending
+            if (consecutivePendingCount % 4 === 0) {
+              const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}). Phase: ${phase}`;
+              const conditionMessages = conditions
+                .map((c: any) => `${c.type}: ${c.reason || 'N/A'} - ${c.message || 'N/A'}`)
+                .join('; ');
+              CloudRunnerLogger.log(`${pendingMessage}. Conditions: ${conditionMessages || 'None'}`);
+
+              // Log events periodically to help diagnose
+              if (consecutivePendingCount % 8 === 0) {
+                try {
+                  const events = await kubeClient.listNamespacedEvent(namespace);
+                  const podEvents = events.body.items
+                    .filter((x) => x.involvedObject?.name === podName)
+                    .slice(-3)
+                    .map((x) => `${x.type}: ${x.reason} - ${x.message}`)
+                    .join('; ');
+                  if (podEvents) {
+                    CloudRunnerLogger.log(`Recent pod events: ${podEvents}`);
+                  }
+                } catch {
+                  // Ignore event fetch errors
+                }
+              }
+            }
+          }
+
+          message = `Phase:${phase} \n Reason:${conditions[0]?.reason || ''} \n Message:${
+            conditions[0]?.message || ''
+          }`;
+
+          if (waitComplete || phase !== 'Pending') return true;
+
+          return false;
+        },
+        {
+          timeout: 2000000, // ~33 minutes
+          intervalBetweenAttempts: 15000, // 15 seconds
+        },
+      );
+    } catch (waitError: any) {
+      // If waitUntil times out or throws, get final pod status
+      try {
+        const finalStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
+        const phase = finalStatus?.body.status?.phase || 'Unknown';
+        const conditions = finalStatus?.body.status?.conditions || [];
+        message = `Pod ${podName} timed out waiting to start.\nFinal Phase: ${phase}\n`;
+        message += conditions.map((c: any) => `${c.type}: ${c.reason} - ${c.message}`).join('\n');
+
+        // Get events for context
+        try {
+          const events = await kubeClient.listNamespacedEvent(namespace);
+          const podEvents = events.body.items
+            .filter((x) => x.involvedObject?.name === podName)
+            .slice(-5)
+            .map((x) => `${x.type}: ${x.reason} - ${x.message}`);
+          if (podEvents.length > 0) {
+            message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
+          }
+        } catch {
+          // Ignore event fetch errors
+        }
+
+        CloudRunnerLogger.logWarning(message);
+      } catch (statusError) {
+        message = `Pod ${podName} timed out and could not retrieve final status: ${waitError?.message || waitError}`;
+        CloudRunnerLogger.logWarning(message);
+      }
+
+      throw new Error(`Pod ${podName} failed to start within timeout. ${message}`);
+    }
+
     if (!waitComplete) {
-      CloudRunnerLogger.log(message);
+      CloudRunnerLogger.logWarning(`Pod ${podName} did not reach running state: ${message}`);
+      throw new Error(`Pod ${podName} did not start successfully: ${message}`);
     }
 
     return waitComplete;
   }
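Usage note (not part of the commit): the sketch below shows one way the patched method could be driven. The import path, named export, pod name, and namespace are assumptions; the `KubeConfig`/`CoreV1Api` wiring comes from `@kubernetes/client-node`, which the method already relies on for `readNamespacedPodStatus` and `listNamespacedEvent`.

// Hypothetical usage sketch; import path, export form, pod name and namespace are assumptions.
import { CoreV1Api, KubeConfig } from '@kubernetes/client-node';
import { KubernetesTaskRunner } from './kubernetes-task-runner';

async function waitForBuildPod(): Promise<void> {
  const kubeConfig = new KubeConfig();
  kubeConfig.loadFromDefault(); // ~/.kube/config locally, service account when in-cluster

  const kubeClient = kubeConfig.makeApiClient(CoreV1Api);

  // Resolves once the pod leaves Pending; throws if a terminal failure reason
  // (Unschedulable, ImagePullBackOff, ...) is detected or the ~33 minute timeout expires.
  await KubernetesTaskRunner.watchUntilPodRunning(kubeClient, 'unity-builder-pod', 'default');
}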