pr feedback - increase timeout for image pulls in tests and detect active image pulls to allow more time
parent
6953319f7d
commit
256b0e97c2
|
|
@ -4887,13 +4887,32 @@ class KubernetesTaskRunner {
|
||||||
waitComplete = false;
|
waitComplete = false;
|
||||||
return true; // Exit wait loop to throw error
|
return true; // Exit wait loop to throw error
|
||||||
}
|
}
|
||||||
|
// Check if pod is actively pulling an image - if so, allow more time
|
||||||
|
const isPullingImage = podEvents.some((x) => x.reason === 'Pulling' || x.reason === 'Pulled' || x.message?.includes('Pulling image'));
|
||||||
|
const hasImagePullError = podEvents.some((x) => x.reason === 'Failed' && (x.message?.includes('pull') || x.message?.includes('image')));
|
||||||
|
if (hasImagePullError) {
|
||||||
|
message = `Pod ${podName} failed to pull image. Check image availability and credentials.`;
|
||||||
|
cloud_runner_logger_1.default.logWarning(message);
|
||||||
|
waitComplete = false;
|
||||||
|
return true; // Exit wait loop to throw error
|
||||||
|
}
|
||||||
|
// If actively pulling image, reset pending count to allow more time
|
||||||
|
// Large images (like Unity 3.9GB) can take 3-5 minutes to pull
|
||||||
|
if (isPullingImage && consecutivePendingCount > 4) {
|
||||||
|
cloud_runner_logger_1.default.log(`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`);
|
||||||
|
// Don't increment consecutivePendingCount if we're actively pulling
|
||||||
|
consecutivePendingCount = Math.max(4, consecutivePendingCount - 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch {
|
catch {
|
||||||
// Ignore event fetch errors
|
// Ignore event fetch errors
|
||||||
}
|
}
|
||||||
// For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
|
// For tests, allow more time if image is being pulled (large images need 5+ minutes)
|
||||||
|
// Otherwise fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
|
||||||
const isTest = process.env['cloudRunnerTests'] === 'true';
|
const isTest = process.env['cloudRunnerTests'] === 'true';
|
||||||
const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
|
const isPullingImage = containerStatuses.some((cs) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull') || conditions.some((c) => c.reason?.includes('Pulling'));
|
||||||
|
// Allow up to 20 minutes for image pulls in tests (80 checks), 2 minutes otherwise
|
||||||
|
const maxPendingChecks = isTest && isPullingImage ? 80 : isTest ? 8 : 80;
|
||||||
if (consecutivePendingCount >= maxPendingChecks) {
|
if (consecutivePendingCount >= maxPendingChecks) {
|
||||||
message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
|
message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
|
||||||
// Get events for context
|
// Get events for context
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -500,13 +500,42 @@ class KubernetesTaskRunner {
|
||||||
waitComplete = false;
|
waitComplete = false;
|
||||||
return true; // Exit wait loop to throw error
|
return true; // Exit wait loop to throw error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if pod is actively pulling an image - if so, allow more time
|
||||||
|
const isPullingImage = podEvents.some(
|
||||||
|
(x) => x.reason === 'Pulling' || x.reason === 'Pulled' || x.message?.includes('Pulling image'),
|
||||||
|
);
|
||||||
|
const hasImagePullError = podEvents.some(
|
||||||
|
(x) => x.reason === 'Failed' && (x.message?.includes('pull') || x.message?.includes('image')),
|
||||||
|
);
|
||||||
|
|
||||||
|
if (hasImagePullError) {
|
||||||
|
message = `Pod ${podName} failed to pull image. Check image availability and credentials.`;
|
||||||
|
CloudRunnerLogger.logWarning(message);
|
||||||
|
waitComplete = false;
|
||||||
|
return true; // Exit wait loop to throw error
|
||||||
|
}
|
||||||
|
|
||||||
|
// If actively pulling image, reset pending count to allow more time
|
||||||
|
// Large images (like Unity 3.9GB) can take 3-5 minutes to pull
|
||||||
|
if (isPullingImage && consecutivePendingCount > 4) {
|
||||||
|
CloudRunnerLogger.log(`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`);
|
||||||
|
// Don't increment consecutivePendingCount if we're actively pulling
|
||||||
|
consecutivePendingCount = Math.max(4, consecutivePendingCount - 1);
|
||||||
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// Ignore event fetch errors
|
// Ignore event fetch errors
|
||||||
}
|
}
|
||||||
|
|
||||||
// For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
|
// For tests, allow more time if image is being pulled (large images need 5+ minutes)
|
||||||
|
// Otherwise fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
|
||||||
const isTest = process.env['cloudRunnerTests'] === 'true';
|
const isTest = process.env['cloudRunnerTests'] === 'true';
|
||||||
const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
|
const isPullingImage = containerStatuses.some(
|
||||||
|
(cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
|
||||||
|
) || conditions.some((c: any) => c.reason?.includes('Pulling'));
|
||||||
|
|
||||||
|
// Allow up to 20 minutes for image pulls in tests (80 checks), 2 minutes otherwise
|
||||||
|
const maxPendingChecks = isTest && isPullingImage ? 80 : isTest ? 8 : 80;
|
||||||
|
|
||||||
if (consecutivePendingCount >= maxPendingChecks) {
|
if (consecutivePendingCount >= maxPendingChecks) {
|
||||||
message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
|
message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue