pr feedback

cloud-runner-develop
Frostebite 2026-01-17 05:48:22 +00:00
parent 0497076eba
commit 5f552f2bc2
10 changed files with 76 additions and 60 deletions

14
dist/index.js vendored
View File

@ -3762,17 +3762,17 @@ class Kubernetes {
const imageTag = image.split(':')[1] || 'latest';
// More targeted cleanup: remove stopped containers and non-Unity images
// IMPORTANT: Preserve Unity images to avoid re-pulling the 3.9GB image
// Strategy: Only remove containers, don't prune images (which might remove Unity image)
const cleanupCommands = [
// Remove all stopped containers (this frees runtime space but keeps images)
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
// Remove non-Unity images only (preserve unityci/editor images)
// List all images, filter out Unity images, then remove the rest
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
// Clean up unused layers/snapshots (prune should preserve referenced images)
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
// Remove specific non-Unity images by name (safer than filtering)
// Only remove known system images, preserve everything else including Unity
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
// DO NOT use --prune as it might remove the Unity image if no containers are using it
// Only clean up if we have very little space left
];
for (const cmd of cleanupCommands) {
try {

2
dist/index.js.map vendored

File diff suppressed because one or more lines are too long

View File

@ -91,10 +91,14 @@ class CloudRunner {
let provider = CloudRunner.buildParameters.providerStrategy;
if (provider === 'aws' && isLocalStack && !forceAwsProvider) {
CloudRunnerLogger.log('LocalStack endpoints detected; routing provider to local-docker for this run');
CloudRunnerLogger.log('Note: Set AWS_FORCE_PROVIDER=aws to force AWS provider with LocalStack for AWS functionality tests');
CloudRunnerLogger.log(
'Note: Set AWS_FORCE_PROVIDER=aws to force AWS provider with LocalStack for AWS functionality tests',
);
provider = 'local-docker';
} else if (provider === 'aws' && isLocalStack && forceAwsProvider) {
CloudRunnerLogger.log('LocalStack endpoints detected but AWS_FORCE_PROVIDER is set; using AWS provider to validate AWS functionality');
CloudRunnerLogger.log(
'LocalStack endpoints detected but AWS_FORCE_PROVIDER is set; using AWS provider to validate AWS functionality',
);
}
switch (provider) {
@ -107,7 +111,9 @@ class CloudRunner {
if (isLocalStack && forceAwsProvider) {
CloudRunnerLogger.log('✓ AWS provider initialized with LocalStack - AWS functionality will be validated');
} else if (isLocalStack && !forceAwsProvider) {
CloudRunnerLogger.log('⚠ WARNING: AWS provider was requested but LocalStack detected without AWS_FORCE_PROVIDER');
CloudRunnerLogger.log(
'⚠ WARNING: AWS provider was requested but LocalStack detected without AWS_FORCE_PROVIDER',
);
CloudRunnerLogger.log('⚠ This may cause AWS functionality tests to fail validation');
}
break;
@ -134,7 +140,7 @@ class CloudRunner {
}
break;
}
// Final validation: Ensure provider matches expectations
const finalProviderName = CloudRunner.Provider.constructor.name;
if (CloudRunner.buildParameters.providerStrategy === 'aws' && finalProviderName !== 'AWSBuildEnvironment') {

View File

@ -155,33 +155,33 @@ class Kubernetes implements ProviderInterface {
this.jobName = `unity-builder-job-${this.buildGuid}`;
this.containerName = `main`;
await KubernetesSecret.createSecret(secrets, this.secretName, this.namespace, this.kubeClient);
// For tests, clean up old images before creating job to free space for image pull
// IMPORTANT: Preserve the Unity image to avoid re-pulling it
if (process.env['cloudRunnerTests'] === 'true') {
try {
CloudRunnerLogger.log('Cleaning up old images in k3d node before pulling new image...');
const { CloudRunnerSystem } = await import('../../services/core/cloud-runner-system');
// Extract image name without tag for matching
const imageName = image.split(':')[0];
const imageTag = image.split(':')[1] || 'latest';
// More targeted cleanup: remove stopped containers and non-Unity images
// IMPORTANT: Preserve Unity images to avoid re-pulling the 3.9GB image
// Strategy: Only remove containers, don't prune images (which might remove Unity image)
const cleanupCommands = [
// Remove all stopped containers (this frees runtime space but keeps images)
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl rm --all 2>/dev/null || true" || true',
// Remove non-Unity images only (preserve unityci/editor images)
// List all images, filter out Unity images, then remove the rest
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"table {{.ID}}\\t{{.Repository}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity|IMAGE\\" | awk \\"{print \\$1}\\" | xargs -r crictl rmi 2>/dev/null || true" || true',
// Clean up unused layers/snapshots (prune should preserve referenced images)
'docker exec k3d-unity-builder-agent-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl rmi --prune 2>/dev/null || true" || true',
// Remove specific non-Unity images by name (safer than filtering)
// Only remove known system images, preserve everything else including Unity
'docker exec k3d-unity-builder-agent-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
'docker exec k3d-unity-builder-server-0 sh -c "crictl images --format \\"{{.Repository}}:{{.Tag}}\\" 2>/dev/null | grep -vE \\"unityci/editor|unity\\" | grep -E \\"rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:\\" | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true',
// DO NOT use --prune as it might remove the Unity image if no containers are using it
// Only clean up if we have very little space left
];
for (const cmd of cleanupCommands) {
try {
await CloudRunnerSystem.Run(cmd, true, true);
@ -190,7 +190,7 @@ class Kubernetes implements ProviderInterface {
CloudRunnerLogger.log(`Cleanup command failed (non-fatal): ${cmdError}`);
}
}
// Verify Unity image is still cached
try {
const unityImageCheck = await CloudRunnerSystem.Run(
@ -202,7 +202,7 @@ class Kubernetes implements ProviderInterface {
} catch {
// Ignore check failures
}
// Check disk space after cleanup
try {
const diskCheck = await CloudRunnerSystem.Run(
@ -219,7 +219,7 @@ class Kubernetes implements ProviderInterface {
// Continue anyway - image might already be cached
}
}
let output = '';
try {
CloudRunnerLogger.log('Job does not exist');

View File

@ -95,7 +95,7 @@ class KubernetesJobSpecFactory {
// Hook containers typically use utility images like aws-cli, rclone, etc.
const lightweightImages = ['amazon/aws-cli', 'rclone/rclone', 'steamcmd/steamcmd', 'ubuntu'];
const isLightweightContainer = lightweightImages.some((lightImage) => image.includes(lightImage));
if (isLightweightContainer && process.env['cloudRunnerTests'] === 'true') {
// For test environments, use minimal resources for hook containers
return {
@ -103,7 +103,7 @@ class KubernetesJobSpecFactory {
cpu: '100m', // 0.1 CPU
};
}
// For main build containers, use the configured resources
const memoryMB = Number.parseInt(buildParameters.containerMemory);
const cpuMB = Number.parseInt(buildParameters.containerCpu);

View File

@ -148,15 +148,17 @@ class KubernetesPods {
}
// Check if pod was evicted due to disk pressure - this is an infrastructure issue
const wasEvicted = errorDetails.some((detail) =>
detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
const wasEvicted = errorDetails.some(
(detail) => detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
);
if (wasEvicted) {
const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
CloudRunnerLogger.logWarning(evictionMessage);
CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
throw new Error(
`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`,
`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join(
'\n',
)}`,
);
}

View File

@ -500,7 +500,7 @@ class KubernetesTaskRunner {
waitComplete = false;
return true; // Exit wait loop to throw error
}
// Check if pod is actively pulling an image - if so, allow more time
const isPullingImage = podEvents.some(
(x) => x.reason === 'Pulling' || x.reason === 'Pulled' || x.message?.includes('Pulling image'),
@ -508,18 +508,20 @@ class KubernetesTaskRunner {
const hasImagePullError = podEvents.some(
(x) => x.reason === 'Failed' && (x.message?.includes('pull') || x.message?.includes('image')),
);
if (hasImagePullError) {
message = `Pod ${podName} failed to pull image. Check image availability and credentials.`;
CloudRunnerLogger.logWarning(message);
waitComplete = false;
return true; // Exit wait loop to throw error
}
// If actively pulling image, step the pending count back by one (never below 4) to allow more time
// Large images (like Unity 3.9GB) can take 3-5 minutes to pull
if (isPullingImage && consecutivePendingCount > 4) {
CloudRunnerLogger.log(`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`);
CloudRunnerLogger.log(
`Pod ${podName} is pulling image (check ${consecutivePendingCount}). This may take several minutes for large images.`,
);
// Don't increment consecutivePendingCount if we're actively pulling
consecutivePendingCount = Math.max(4, consecutivePendingCount - 1);
}
@ -530,10 +532,11 @@ class KubernetesTaskRunner {
// For tests, allow more time if image is being pulled (large images need 5+ minutes)
// Otherwise fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
const isTest = process.env['cloudRunnerTests'] === 'true';
const isPullingImage = containerStatuses.some(
(cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
) || conditions.some((c: any) => c.reason?.includes('Pulling'));
const isPullingImage =
containerStatuses.some(
(cs: any) => cs.state?.waiting?.reason === 'ImagePull' || cs.state?.waiting?.reason === 'ErrImagePull',
) || conditions.some((c: any) => c.reason?.includes('Pulling'));
// Pending budget: 80 checks (20 minutes) for test runs that are pulling an image and for non-test runs; 8 checks (2 minutes) for test runs that are not pulling
const maxPendingChecks = isTest && isPullingImage ? 80 : isTest ? 8 : 80;
@ -549,19 +552,21 @@ class KubernetesTaskRunner {
if (podEvents.length > 0) {
message += `\n\nRecent Events:\n${podEvents.join('\n')}`;
}
// Get pod details to check for scheduling issues
try {
const podStatus = await kubeClient.readNamespacedPodStatus(podName, namespace);
const podSpec = podStatus.body.spec;
const podStatusDetails = podStatus.body.status;
// Check container resource requests
if (podSpec?.containers?.[0]?.resources?.requests) {
const requests = podSpec.containers[0].resources.requests;
message += `\n\nContainer Resource Requests:\n CPU: ${requests.cpu || 'not set'}\n Memory: ${requests.memory || 'not set'}\n Ephemeral Storage: ${requests['ephemeral-storage'] || 'not set'}`;
message += `\n\nContainer Resource Requests:\n CPU: ${requests.cpu || 'not set'}\n Memory: ${
requests.memory || 'not set'
}\n Ephemeral Storage: ${requests['ephemeral-storage'] || 'not set'}`;
}
// Check node selector and tolerations
if (podSpec?.nodeSelector && Object.keys(podSpec.nodeSelector).length > 0) {
message += `\n\nNode Selector: ${JSON.stringify(podSpec.nodeSelector)}`;
@ -569,12 +574,16 @@ class KubernetesTaskRunner {
if (podSpec?.tolerations && podSpec.tolerations.length > 0) {
message += `\n\nTolerations: ${JSON.stringify(podSpec.tolerations)}`;
}
// Check pod conditions for scheduling issues
if (podStatusDetails?.conditions) {
const unschedulable = podStatusDetails.conditions.find((c: any) => c.type === 'PodScheduled' && c.status === 'False');
const unschedulable = podStatusDetails.conditions.find(
(c: any) => c.type === 'PodScheduled' && c.status === 'False',
);
if (unschedulable) {
message += `\n\nScheduling Issue: ${unschedulable.reason || 'Unknown'} - ${unschedulable.message || 'No message'}`;
message += `\n\nScheduling Issue: ${unschedulable.reason || 'Unknown'} - ${
unschedulable.message || 'No message'
}`;
}
}
} catch (podStatusError) {

View File

@ -194,7 +194,9 @@ export class ContainerHookService {
ENDPOINT_ARGS=""
if [ -n "$AWS_S3_ENDPOINT" ]; then ENDPOINT_ARGS="--endpoint-url $AWS_S3_ENDPOINT"; fi
aws $ENDPOINT_ARGS s3 ls ${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/ 2>/dev/null || true
aws $ENDPOINT_ARGS s3 ls ${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/$CACHE_KEY/ 2>/dev/null || true
aws $ENDPOINT_ARGS s3 ls ${
CloudRunner.buildParameters.awsStackName
}/cloud-runner-cache/$CACHE_KEY/ 2>/dev/null || true
BUCKET1="${CloudRunner.buildParameters.awsStackName}/cloud-runner-cache/$CACHE_KEY/Library/"
OBJECT1=""
LS_OUTPUT1="$(aws $ENDPOINT_ARGS s3 ls $BUCKET1 2>/dev/null || echo '')"

View File

@ -60,15 +60,10 @@ describe('Cloud Runner Retain Workspace', () => {
true,
true,
);
// Remove non-Unity images only (preserve unityci/editor images)
// Only remove specific known system images, preserve Unity and everything else
// DO NOT use --prune as it might remove Unity image
await CloudRunnerSystem.Run(
`docker exec ${NODE} sh -c "crictl images --format 'table {{.ID}}\\t{{.Repository}}' 2>/dev/null | grep -vE 'unityci/editor|unity|IMAGE' | awk '{print \\$1}' | xargs -r crictl rmi 2>/dev/null || true" || true`,
true,
true,
);
// Clean up unused layers
await CloudRunnerSystem.Run(
`docker exec ${NODE} sh -c "crictl rmi --prune 2>/dev/null || true" || true`,
`docker exec ${NODE} sh -c "crictl images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep -vE 'unityci/editor|unity' | grep -E 'rancher/|curlimages/|amazon/aws-cli|rclone/rclone|steamcmd/steamcmd|ubuntu:|alpine:' | xargs -r -I {} crictl rmi {} 2>/dev/null || true" || true`,
true,
true,
);

View File

@ -55,13 +55,15 @@ describe('Cloud Runner Kubernetes', () => {
// Check if pod was evicted due to resource constraints - this is a test infrastructure failure
// Evictions indicate the cluster doesn't have enough resources, which is a test environment issue
if (results.includes('The node was low on resource: ephemeral-storage') ||
results.includes('TerminationByKubelet') ||
results.includes('Evicted')) {
if (
results.includes('The node was low on resource: ephemeral-storage') ||
results.includes('TerminationByKubelet') ||
results.includes('Evicted')
) {
throw new Error(
`Test failed: Pod was evicted due to resource constraints (ephemeral-storage). ` +
`This indicates the test environment doesn't have enough disk space. ` +
`Results: ${results.substring(0, 500)}`
`This indicates the test environment doesn't have enough disk space. ` +
`Results: ${results.substring(0, 500)}`,
);
}