pr feedback
parent
0497076eba
commit
5f552f2bc2
|
|
@ -3762,17 +3762,17 @@ class Kubernetes {
|
|||
const imageTag = image.split(':')[1] || 'latest';
|
||||
// More targeted cleanup: remove stopped containers and non-Unity images
|
||||
// IMPORTANT: Preserve Unity images to avoid re-pulling the 3.9GB image
|
||||
// Strategy: Only remove containers, don't prune images (which might remove Unity image)
|
||||
const cleanupCommands = [
|
||||
// Remove all stopped containers (this frees runtime space but keeps images)
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
|
||||
// Remove non-Unity images only (preserve unityci/editor images)
|
||||
// List all images, filter out Unity images, then remove the rest
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
|
||||
// Clean up unused layers/snapshots (prune should preserve referenced images)
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
|
||||
// Remove specific non-Unity images by name (safer than filtering)
|
||||
// Only remove known system images, preserve everything else including Unity
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
|
||||
// DO NOT use --prune as it might remove the Unity image if no containers are using it
|
||||
// Only clean up if we have very little space left
|
||||
];
|
||||
for (const cmd of cleanupCommands) {
|
||||
try {
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -91,10 +91,14 @@ class CloudRunner {
|
|||
let provider = CloudRunner.buildParameters.providerStrategy;
|
||||
if (provider === 'aws' && isLocalStack && !forceAwsProvider) {
|
||||
CloudRunnerLogger.log('LocalStack endpoints detected; routing provider to local-docker for this run');
|
||||
CloudRunnerLogger.log('Note: Set AWS_FORCE_PROVIDER=aws to force AWS provider with LocalStack for AWS functionality tests');
|
||||
CloudRunnerLogger.log(
|
||||
'Note: Set AWS_FORCE_PROVIDER=aws to force AWS provider with LocalStack for AWS functionality tests',
|
||||
);
|
||||
provider = 'local-docker';
|
||||
} else if (provider === 'aws' && isLocalStack && forceAwsProvider) {
|
||||
CloudRunnerLogger.log('LocalStack endpoints detected but AWS_FORCE_PROVIDER is set; using AWS provider to validate AWS functionality');
|
||||
CloudRunnerLogger.log(
|
||||
'LocalStack endpoints detected but AWS_FORCE_PROVIDER is set; using AWS provider to validate AWS functionality',
|
||||
);
|
||||
}
|
||||
|
||||
switch (provider) {
|
||||
|
|
@ -107,7 +111,9 @@ class CloudRunner {
|
|||
if (isLocalStack && forceAwsProvider) {
|
||||
CloudRunnerLogger.log('✓ AWS provider initialized with LocalStack - AWS functionality will be validated');
|
||||
} else if (isLocalStack && !forceAwsProvider) {
|
||||
CloudRunnerLogger.log('⚠ WARNING: AWS provider was requested but LocalStack detected without AWS_FORCE_PROVIDER');
|
||||
CloudRunnerLogger.log(
|
||||
'⚠ WARNING: AWS provider was requested but LocalStack detected without AWS_FORCE_PROVIDER',
|
||||
);
|
||||
CloudRunnerLogger.log('⚠ This may cause AWS functionality tests to fail validation');
|
||||
}
|
||||
break;
|
||||
|
|
@ -134,7 +140,7 @@ class CloudRunner {
|
|||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// Final validation: Ensure provider matches expectations
|
||||
const finalProviderName = CloudRunner.Provider.constructor.name;
|
||||
if (CloudRunner.buildParameters.providerStrategy === 'aws' && finalProviderName !== 'AWSBuildEnvironment') {
|
||||
|
|
|
|||
|
|
@ -155,33 +155,33 @@ class Kubernetes implements ProviderInterface {
|
|||
this.jobName = `unity-builder-job-${this.buildGuid}`;
|
||||
this.containerName = `main`;
|
||||
await KubernetesSecret.createSecret(secrets, this.secretName, this.namespace, this.kubeClient);
|
||||
|
||||
|
||||
// For tests, clean up old images before creating job to free space for image pull
|
||||
// IMPORTANT: Preserve the Unity image to avoid re-pulling it
|
||||
if (process.env['cloudRunnerTests'] === 'true') {
|
||||
try {
|
||||
CloudRunnerLogger.log('Cleaning up old images in k3d node before pulling new image...');
|
||||
const { CloudRunnerSystem } = await import('../../services/core/cloud-runner-system');
|
||||
|
||||
|
||||
// Extract image name without tag for matching
|
||||
const imageName = image.split(':')[0];
|
||||
const imageTag = image.split(':')[1] || 'latest';
|
||||
|
||||
|
||||
// More targeted cleanup: remove stopped containers and non-Unity images
|
||||
// IMPORTANT: Preserve Unity images to avoid re-pulling the 3.9GB image
|
||||
// Strategy: Only remove containers, don't prune images (which might remove Unity image)
|
||||
const cleanupCommands = [
|
||||
// Remove all stopped containers (this frees runtime space but keeps images)
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
|
||||
// Remove non-Unity images only (preserve unityci/editor images)
|
||||
// List all images, filter out Unity images, then remove the rest
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
|
||||
// Clean up unused layers/snapshots (prune should preserve referenced images)
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
|
||||
// Remove specific non-Unity images by name (safer than filtering)
|
||||
// Only remove known system images, preserve everything else including Unity
|
||||
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
|
||||
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
|
||||
// DO NOT use --prune as it might remove the Unity image if no containers are using it
|
||||
// Only clean up if we have very little space left
|
||||
];
|
||||
|
||||
|
||||
for (const cmd of cleanupCommands) {
|
||||
try {
|
||||
await CloudRunnerSystem.Run(cmd, true, true);
|
||||
|
|
@ -190,7 +190,7 @@ class Kubernetes implements ProviderInterface {
|
|||
CloudRunnerLogger.log(`Cleanup command failed (non-fatal): ${cmdError}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Verify Unity image is still cached
|
||||
try {
|
||||
const unityImageCheck = await CloudRunnerSystem.Run(
|
||||
|
|
@ -202,7 +202,7 @@ class Kubernetes implements ProviderInterface {
|
|||
} catch {
|
||||
// Ignore check failures
|
||||
}
|
||||
|
||||
|
||||
// Check disk space after cleanup
|
||||
try {
|
||||
const diskCheck = await CloudRunnerSystem.Run(
|
||||
|
|
@ -219,7 +219,7 @@ class Kubernetes implements ProviderInterface {
|
|||
// Continue anyway - image might already be cached
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
let output = '';
|
||||
try {
|
||||
CloudRunnerLogger.log('Job does not exist');
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ class KubernetesJobSpecFactory {
|
|||
// Hook containers typically use utility images like aws-cli, rclone, etc.
|
||||
const lightweightImages = ['amazon/aws-cli', 'rclone/rclone', 'steamcmd/steamcmd', 'ubuntu'];
|
||||
const isLightweightContainer = lightweightImages.some((lightImage) => image.includes(lightImage));
|
||||
|
||||
|
||||
if (isLightweightContainer && process.env['cloudRunnerTests'] === 'true') {
|
||||
// For test environments, use minimal resources for hook containers
|
||||
return {
|
||||
|
|
@ -103,7 +103,7 @@ class KubernetesJobSpecFactory {
|
|||
cpu: '100m', // 0.1 CPU
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
// For main build containers, use the configured resources
|
||||
const memoryMB = Number.parseInt(buildParameters.containerMemory);
|
||||
const cpuMB = Number.parseInt(buildParameters.containerCpu);
|
||||
|
|
|
|||
|
|
@ -148,15 +148,17 @@ class KubernetesPods {
|
|||
}
|
||||
|
||||
// Check if pod was evicted due to disk pressure - this is an infrastructure issue
|
||||
const wasEvicted = errorDetails.some((detail) =>
|
||||
detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
|
||||
const wasEvicted = errorDetails.some(
|
||||
(detail) => detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
|
||||
);
|
||||
if (wasEvicted) {
|
||||
const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
|
||||
CloudRunnerLogger.logWarning(evictionMessage);
|
||||
CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
|
||||
throw new Error(
|
||||
`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`,
|
||||
`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join(
|
||||
'\n',
|
||||
)}`,
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -500,7 +500,7 @@ class KubernetesTaskRunner {
|
|||
waitComplete = false;
|
||||
return true; // Exit wait loop to throw error
|
||||
}
|
||||
|
||||
|
||||
// Check if pod is actively pulling an image - if so, allow more time
|
||||
const isPullingImage = podEvents.some(
|
||||
(x) => x.reason === 'Pulling' || x.reason === 'Pulled' || x.message?.includes('Pulling image'),
|
||||
|
|
@ -508,18 +508,20 @@ class KubernetesTaskRunner {
|
|||
const hasImagePullError = podEvents.some(
|
||||
(x) => x.reason === 'Failed' && (x.message?.includes('pull') || x.message?.includes('image')),
|
||||
);
|
||||
|
||||
|
||||
if (hasImagePullError) {
|
||||
message = `Pod ${podName} failed to pull image. Check image availability and credentials.`;
|
||||
CloudRunnerLogger.logWarning(message);
|
||||
waitComplete = false;
|
||||
return true; // Exit wait loop to throw error
|
||||
}
|
||||
|
||||
|
||||
// If actively pulling image, reset pending count to allow more time
|
||||
// Large images (like Unity 3.9GB) can take 3-5 minutes to pull
|
||||
if (isPullingImage && consecutivePendingCount > 4) {
|
||||
CloudRunnerLogger.log(`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`);
|
||||
CloudRunnerLogger.log(
|
||||
`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`,
|
||||
);
|
||||
// Don't increment consecutivePendingCount if we're actively pulling
|
||||
consecutivePendingCount = Math.max(4, consecutivePendingCount - 1);
|
||||
}
|
||||
|
|
@ -530,10 +532,11 @@ class KubernetesTaskRunner {
|
|||
// For tests, allow more time if image is being pulled (large images need 5+ minutes)
|
||||
// Otherwise fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
|
||||
const isTest = process.env['cloudRunnerTests'] === 'true';
|
||||
const isPullingImage = containerStatuses.some(
|
||||
(cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
|
||||
) || conditions.some((c: any) => c.reason?.includes('Pulling'));
|
||||
|
||||
const isPullingImage =
|
||||
containerStatuses.some(
|
||||
(cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
|
||||
) || conditions.some((c: any) => c.reason?.includes('Pulling'));
|
||||
|
||||
// In tests: allow up to 20 minutes (80 checks) while an image is being pulled, 2 minutes (8 checks) otherwise; non-test runs always allow 80 checks
|
||||
const maxPendingChecks = isTest && isPullingImage ? 80 : isTest ? 8 : 80;
|
||||
|
||||
|
|
@ -549,19 +552,21 @@ class KubernetesTaskRunner {
|
|||
if (podEvents.length > 0) {
|
||||
message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
|
||||
}
|
||||
|
||||
|
||||
// Get pod details to check for scheduling issues
|
||||
try {
|
||||
const podStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
|
||||
const podSpec = podStatus.body.spec;
|
||||
const podStatusDetails = podStatus.body.status;
|
||||
|
||||
|
||||
// Check container resource requests
|
||||
if (podSpec?.containers?.[0]?.resources?.requests) {
|
||||
const requests = podSpec.containers[0].resources.requests;
|
||||
message += `\n\nContainer Resource Requests:\n CPU: ${requests.cpu || 'not set'}\n Memory: ${requests.memory || 'not set'}\n Ephemeral Storage: ${requests['ephemeral-storage'] || 'not set'}`;
|
||||
message += `\n\nContainer Resource Requests:\n CPU: ${requests.cpu || 'not set'}\n Memory: ${
|
||||
requests.memory || 'not set'
|
||||
}\n Ephemeral Storage: ${requests['ephemeral-storage'] || 'not set'}`;
|
||||
}
|
||||
|
||||
|
||||
// Check node selector and tolerations
|
||||
if (podSpec?.nodeSelector && Object.keys(podSpec.nodeSelector).length > 0) {
|
||||
message += `\n\nNode Selector: ${JSON.stringify(podSpec.nodeSelector)}`;
|
||||
|
|
@ -569,12 +574,16 @@ class KubernetesTaskRunner {
|
|||
if (podSpec?.tolerations && podSpec.tolerations.length > 0) {
|
||||
message += `\n\nTolerations: ${JSON.stringify(podSpec.tolerations)}`;
|
||||
}
|
||||
|
||||
|
||||
// Check pod conditions for scheduling issues
|
||||
if (podStatusDetails?.conditions) {
|
||||
const unschedulable = podStatusDetails.conditions.find((c: any) => c.type === 'PodScheduled' && c.status === 'False');
|
||||
const unschedulable = podStatusDetails.conditions.find(
|
||||
(c: any) => c.type === 'PodScheduled' && c.status === 'False',
|
||||
);
|
||||
if (unschedulable) {
|
||||
message += `\n\nScheduling Issue: ${unschedulable.reason || 'Unknown'} - ${unschedulable.message || 'No message'}`;
|
||||
message += `\n\nScheduling Issue: ${unschedulable.reason || 'Unknown'} - ${
|
||||
unschedulable.message || 'No message'
|
||||
}`;
|
||||
}
|
||||
}
|
||||
} catch (podStatusError) {
|
||||
|
|
|
|||
|
|
@ -194,7 +194,9 @@ export class ContainerHookService {
|
|||
ENDPOINT_ARGS=""
|
||||
if [ -n "$AWS_S3_ENDPOINT" ]; then ENDPOINT_ARGS="--endpoint-url $AWS_S3_ENDPOINT"; fi
|
||||
aws $ENDPOINT_ARGS s3 ls ${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/ 2>/dev/null || true
|
||||
aws $ENDPOINT_ARGS s3 ls ${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/$CACHE_KEY/ 2>/dev/null || true
|
||||
aws $ENDPOINT_ARGS s3 ls ${
|
||||
CloudRunner.buildParameters.awsStackName
|
||||
}/cloud-runner-cache/$CACHE_KEY/ 2>/dev/null || true
|
||||
BUCKET1="${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/$CACHE_KEY/Library/"
|
||||
OBJECT1=""
|
||||
LS_OUTPUT1="$(aws $ENDPOINT_ARGS s3 ls $BUCKET1 2>/dev/null || echo '')"
|
||||
|
|
|
|||
|
|
@ -60,15 +60,10 @@ describe('Cloud Runner Retain Workspace', () => {
|
|||
true,
|
||||
true,
|
||||
);
|
||||
// Remove non-Unity images only (preserve unityci/editor images)
|
||||
// Only remove specific known system images, preserve Unity and everything else
|
||||
// DO NOT use --prune as it might remove Unity image
|
||||
await CloudRunnerSystem.Run(
|
||||
`docker exec ${NODE} sh -c "crictl images --format 'table {{.ID}}\\t{{.Repository}}' 2>/dev/null | grep -vE 'unityci/editor|unity|IMAGE' | awk '{print \\$1}' | xargs -r crictl rmi 2>/dev/null || true" || true`,
|
||||
true,
|
||||
true,
|
||||
);
|
||||
// Clean up unused layers
|
||||
await CloudRunnerSystem.Run(
|
||||
`docker exec ${NODE} sh -c "crictl rmi --prune 2>/dev/null || true" || true`,
|
||||
`docker exec ${NODE} sh -c "crictl images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep -vE 'unityci/editor|unity' | grep -E 'rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:' | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true`,
|
||||
true,
|
||||
true,
|
||||
);
|
||||
|
|
|
|||
|
|
@ -55,13 +55,15 @@ describe('Cloud Runner Kubernetes', () => {
|
|||
|
||||
// Check if pod was evicted due to resource constraints - this is a test infrastructure failure
|
||||
// Evictions indicate the cluster doesn't have enough resources, which is a test environment issue
|
||||
if (results.includes('The node was low on resource: ephemeral-storage') ||
|
||||
results.includes('TerminationByKubelet') ||
|
||||
results.includes('Evicted')) {
|
||||
if (
|
||||
results.includes('The node was low on resource: ephemeral-storage') ||
|
||||
results.includes('TerminationByKubelet') ||
|
||||
results.includes('Evicted')
|
||||
) {
|
||||
throw new Error(
|
||||
`Test failed: Pod was evicted due to resource constraints (ephemeral-storage). ` +
|
||||
`This indicates the test environment doesn't have enough disk space. ` +
|
||||
`Results: ${results.substring(0, 500)}`
|
||||
`This indicates the test environment doesn't have enough disk space. ` +
|
||||
`Results: ${results.substring(0, 500)}`,
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue