pr feedback

cloud-runner-develop
Frostebite 2026-01-03 15:36:15 +00:00
parent 9dc0888c46
commit 4f59e1729d
9 changed files with 306 additions and 205 deletions

View File

@ -1 +1,2 @@
cloud runner build workflow starting
cloud runner build workflow starting

View File

@ -29,7 +29,7 @@ jobs:
name: Cloud Runner Tests (K8s)
runs-on: ubuntu-latest
env:
K3D_NODE_CONTAINERS: "k3d-unity-builder-agent-0"
K3D_NODE_CONTAINERS: 'k3d-unity-builder-agent-0'
steps:
- uses: actions/checkout@v4
with:
@ -38,7 +38,7 @@ jobs:
- name: Set up kubectl
uses: azure/setup-kubectl@v4
with:
version: 'v1.34.1'
version: 'v1.31.0'
- name: Install k3d
run: |
curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
@ -62,12 +62,14 @@ jobs:
- name: Create k3s cluster (k3d)
timeout-minutes: 5
run: |
# Clean up any existing cluster and free disk space before creating new one
# Only delete if exists - don't aggressively clean up (may cause issues)
k3d cluster delete unity-builder || true
docker system prune -af --volumes || true
# Create cluster - host.k3d.internal will allow pods to access host services
# No port mapping needed - LocalStack is on host, accessible via host.k3d.internal:4566
k3d cluster create unity-builder --agents 1 --wait
# Create cluster with explicit eviction thresholds to prevent premature evictions
# host.k3d.internal will allow pods to access host services (LocalStack)
k3d cluster create unity-builder \
--agents 1 \
--wait \
--k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<5%,memory.available<100Mi@agent:*'
kubectl config current-context | cat
- name: Verify cluster readiness and LocalStack connectivity
timeout-minutes: 2
@ -111,40 +113,14 @@ jobs:
}
cleanup_k3d_nodes
docker system prune -af --volumes || true
# Wait for disk pressure taints to clear (with timeout)
# Check for disk pressure taints (informational only - k3s will manage)
echo "Checking for disk pressure taints on nodes..."
for i in {1..30}; do
if kubectl describe nodes | grep -q "node.kubernetes.io/disk-pressure"; then
echo "Disk pressure detected, waiting for it to clear... ($i/30)"
cleanup_k3d_nodes
docker system prune -af --volumes || true
sleep 2
else
echo "No disk pressure taints found"
break
fi
done
kubectl describe nodes | grep -i taint || echo "No taints found"
- name: Pre-pull Unity image into k3d node
timeout-minutes: 5
run: |
echo "Pre-pulling Unity image into k3d node to avoid evictions during tests..."
# Clean up old images first to make space
K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
done
# Pre-pull the Unity image that will be used in tests
# This ensures it's cached and doesn't need to be pulled during test execution
UNITY_IMAGE="unityci/editor:ubuntu-2021.3.45f1-base-3"
echo "Pulling ${UNITY_IMAGE} into k3d node..."
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "crictl pull ${UNITY_IMAGE} 2>&1 || echo 'Image pull failed or already exists'" || true
done
echo "Image pre-pull completed. Checking disk space..."
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "df -h / | tail -1" || true
done
if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
echo "WARNING: Disk pressure taint detected. k3s will manage this automatically."
kubectl describe nodes | grep -i taint || true
else
echo "No disk pressure taints found"
fi
- uses: actions/setup-node@v4
with:
node-version: 20
@ -202,155 +178,9 @@ jobs:
# Clean up disk space - aggressive cleanup to prevent evictions
rm -rf ./cloud-runner-cache/* || true
docker system prune -af --volumes || true
# Clean up disk space on k3d node to prevent ephemeral-storage evictions and disk pressure
echo "Cleaning up disk space on k3d node..."
# Use containerd/crictl commands (docker not available in k3d nodes)
K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
cleanup_k3d_nodes() {
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "
crictl rmi --prune 2>/dev/null || true
crictl rmp --all 2>/dev/null || true
crictl images -q | xargs -r crictl rmi 2>/dev/null || true
find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
find /var/log -type f -name '*.log' -delete 2>/dev/null || true
find /tmp -type f -delete 2>/dev/null || true
find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true
df -h /
" || true
done
}
cleanup_k3d_nodes
# Clean up containerd snapshots and images more aggressively
cleanup_k3d_nodes
# Wait for disk pressure taints to clear before proceeding
echo "Checking for disk pressure taints..."
for i in {1..20}; do
if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
echo "Disk pressure detected, cleaning up and waiting... ($i/20)"
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "
crictl rmi --prune 2>/dev/null || true
crictl rmp --all 2>/dev/null || true
crictl images -q | xargs -r crictl rmi 2>/dev/null || true
find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
" || true
done
docker system prune -af --volumes || true
sleep 3
else
echo "No disk pressure taints found, proceeding with test"
break
fi
done
- name: Ensure disk pressure cleared before test
timeout-minutes: 3
run: |
echo "Ensuring disk pressure is cleared before test..."
rm -rf ./cloud-runner-cache/* || true
docker system prune -af --volumes || true
K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "docker system prune -af --volumes 2>/dev/null || true" || true
done
# Wait for disk pressure taints to clear (with aggressive cleanup)
# Limit to 10 attempts to avoid timeout - if cleanup doesn't work, just remove the taint
PREVIOUS_DISK_USAGE=100
for i in {1..10}; do
HAS_DISK_PRESSURE=$(kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure" && echo "true" || echo "false")
if [ "$HAS_DISK_PRESSURE" = "true" ]; then
echo "Disk pressure detected, cleaning up aggressively... ($i/10)"
# Check actual disk usage on the node
PRIMARY_NODE=$(echo "$K3D_NODE_CONTAINERS" | awk '{print $1}')
DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
echo "Current disk usage on k3d node: ${DISK_USAGE}%"
# Use k3s/containerd commands instead of docker (docker not available in k3d nodes)
# Clean up k3s containerd snapshots and images
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
docker exec "$NODE" sh -c "crictl rmp --all 2>/dev/null || true" || true
done
# Clean up old containerd snapshots
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "find /var/lib/rancher/k3s/agent/containerd -type d -name 'snapshots' -exec rm -rf {}/* 2>/dev/null \; || true" || true
done
# Clean up k3s logs and temp files
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true" || true
docker exec "$NODE" sh -c "find /tmp -type f -mtime +0 -delete 2>/dev/null || true" || true
docker exec "$NODE" sh -c "find /var/log -type f -name '*.log' -mtime +0 -delete 2>/dev/null || true" || true
done
# Clean up host docker
docker system prune -af --volumes || true
# Check if disk usage improved
NEW_DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
if [ "$NEW_DISK_USAGE" != "unknown" ] && [ "$PREVIOUS_DISK_USAGE" != "unknown" ]; then
if [ "$NEW_DISK_USAGE" -ge "$PREVIOUS_DISK_USAGE" ] && [ "$i" -ge 3 ]; then
echo "Disk usage not improving (${PREVIOUS_DISK_USAGE}% -> ${NEW_DISK_USAGE}%), breaking cleanup loop and removing taint manually"
break
fi
PREVIOUS_DISK_USAGE=$NEW_DISK_USAGE
fi
sleep 3
else
echo "No disk pressure taints found, proceeding with test"
kubectl describe nodes | grep -i taint || echo "No taints found"
break
fi
done
# If disk pressure taint is still present after cleanup, manually remove it (CI only)
# Try multiple times as Kubernetes may re-add it if condition persists
if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
echo "WARNING: Disk pressure taint still present after cleanup. Manually removing taint for CI..."
NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
for node in $NODE_NAMES; do
# Try removing with NoSchedule effect (most common)
kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
# Also try without effect specifier
kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
# Use patch as fallback
kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
done
sleep 2
echo "Taint removal attempted. Checking nodes..."
kubectl describe nodes | grep -i taint || echo "No taints found"
fi
# Wait for disk pressure condition to clear (not just taint)
echo "Waiting for disk pressure condition to clear on nodes..."
for i in {1..20}; do
HAS_DISK_PRESSURE_CONDITION=$(kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"' && echo "true" || echo "false")
if [ "$HAS_DISK_PRESSURE_CONDITION" = "true" ]; then
echo "Disk pressure condition still present, waiting... ($i/20)"
sleep 2
else
echo "Disk pressure condition cleared, proceeding with test"
break
fi
done
# Final check - if condition still exists, remove taint and wait a bit more
if kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"'; then
echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
for node in $NODE_NAMES; do
# Try removing with NoSchedule effect (most common)
kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
# Also try without effect specifier
kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
# Use patch as fallback to remove all taints
kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
done
sleep 10
# Verify taint is actually removed
if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
echo "ERROR: Taint still present after removal attempts. This may cause pod scheduling issues."
else
echo "Taint successfully removed."
fi
fi
# Simple cleanup - trust k3s to manage resources
echo "Cleaning up test resources..."
docker system prune -f || true
- name: Run cloud-runner-image test (validate image creation)
timeout-minutes: 10
run: yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand
@ -364,7 +194,7 @@ jobs:
versioning: None
KUBE_STORAGE_CLASS: local-path
PROVIDER_STRATEGY: k8s
KUBE_VOLUME_SIZE: 5Gi
KUBE_VOLUME_SIZE: 2Gi
containerCpu: '1000'
containerMemory: '1024'
AWS_ACCESS_KEY_ID: test
@ -495,7 +325,7 @@ jobs:
versioning: None
KUBE_STORAGE_CLASS: local-path
PROVIDER_STRATEGY: k8s
KUBE_VOLUME_SIZE: 5Gi
KUBE_VOLUME_SIZE: 2Gi
ENABLE_K8S_E2E: 'true'
containerCpu: '1000'
containerMemory: '1024'
@ -825,7 +655,7 @@ jobs:
versioning: None
KUBE_STORAGE_CLASS: local-path
PROVIDER_STRATEGY: k8s
KUBE_VOLUME_SIZE: 5Gi
KUBE_VOLUME_SIZE: 2Gi
# Set resource requests for tests - increased memory to prevent OOM kills
containerCpu: '1000'
containerMemory: '1024'
@ -945,7 +775,7 @@ jobs:
versioning: None
KUBE_STORAGE_CLASS: local-path
PROVIDER_STRATEGY: k8s
KUBE_VOLUME_SIZE: 5Gi
KUBE_VOLUME_SIZE: 2Gi
containerCpu: '512'
containerMemory: '512'
AWS_ACCESS_KEY_ID: test

View File

@ -0,0 +1,250 @@
# K8s Integrity Test Failure Diagnosis and Fix Plan
## Executive Summary
The K8s integrity tests on `cloud-runner-develop` have been failing consistently since September 2025. The last
successful runs were in early September 2025 (commits 464a9d1, 98963da). Since then, we've added extensive disk pressure
handling, cleanup logic, and resource management, but tests continue to fail with pod evictions and disk pressure
issues.
## Key Findings
### 1. Successful Configuration (September 2025)
**Workflow Characteristics:**
- **Simple k3d cluster creation**: `k3d cluster create unity-builder --agents 1 --wait`
- **No pre-cleanup**: Cluster created directly without aggressive cleanup
- **No disk pressure handling**: No taint detection or removal logic
- **No image pre-pulling**: Images pulled on-demand during tests
- **Simple test execution**: Direct test runs without intermediate cleanup
- **Kubectl version**: v1.29.0
- **k3d version**: Latest (v5.8.3 equivalent)
**Key Differences:**
```yaml
# Successful version (464a9d1)
- name: Create k3s cluster (k3d)
run: |
k3d cluster create unity-builder --agents 1 --wait
kubectl config current-context | cat
```
### 2. Current Configuration (December 2025)
**Workflow Characteristics:**
- **Complex cleanup before cluster creation**: `k3d cluster delete`, `docker system prune`
- **Extensive disk pressure handling**: Taint detection, removal loops, cleanup retries
- **Image pre-pulling**: Attempts to pre-pull Unity image (3.9GB) into k3d node
- **Aggressive cleanup between tests**: PVC deletion, PV cleanup, containerd cleanup
- **Kubectl version**: v1.34.1 (newer)
- **k3d version**: v5.8.3
**Current Issues:**
1. **Pod evictions due to disk pressure** - Even after cleanup, pods get evicted
2. **PreStopHook failures** - Pods killed before graceful shutdown
3. **Exit code 137** - the container was SIGKILLed, typically an OOM kill (memory pressure) or a disk-pressure eviction
4. **"Collected Logs" missing** - Pods terminated before post-build completes
5. **Disk usage at 96%** - Cleanup not effectively freeing space
## Root Cause Analysis
### Primary Issue: Disk Space Management
**Problem**: GitHub Actions runners have limited disk space (~72GB total), and k3d nodes share this space with:
- Docker images (Unity image: 3.9GB)
- k3s/containerd data
- PVC storage (5Gi per test)
- Logs and temporary files
- System overhead
**Why Current Approach Fails:**
1. **Cleanup happens too late**: Disk pressure taints appear after space is already exhausted
2. **Cleanup is ineffective**: `crictl rmi --prune` and manual cleanup don't free enough space
3. **Image pre-pulling makes it worse**: Pulling 3.9GB image before tests reduces available space
4. **PVC accumulation**: Multiple tests create 5Gi PVCs that aren't cleaned up fast enough
5. **Ephemeral storage requests**: even though the explicit requests were removed for tests, k3s still tracks ephemeral-storage usage on the node
### Secondary Issues
1. **k3d/k3s version compatibility**: Newer k3d (v5.8.3) with k3s v1.31.5 may have different resource management
2. **Kubectl version mismatch**: a v1.34.1 client against a v1.31.5 server is outside kubectl's supported one-minor-version skew and may cause issues
3. **LocalStack connectivity**: `host.k3d.internal` DNS resolution failures in some cases
4. **Test timeout**: 5-minute timeout may be too short for cleanup + test execution
## Fix Plan
### Phase 1: Simplify and Stabilize (Immediate)
**Goal**: Return to a simpler, more reliable configuration similar to successful runs.
#### 1.1 Revert to Simpler k3d Configuration
```yaml
- name: Create k3s cluster (k3d)
run: |
# Only delete if exists, no aggressive cleanup
k3d cluster delete unity-builder || true
# Create with minimal configuration
k3d cluster create unity-builder \
--agents 1 \
--wait \
--k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<5%,memory.available<100Mi@agent:*'
kubectl config current-context | cat
```
**Rationale**:
- Set eviction thresholds explicitly to prevent premature evictions
- Avoid aggressive pre-cleanup (it may cause issues)
- Let k3s manage resources naturally
#### 1.2 Reduce PVC Size
- Change `KUBE_VOLUME_SIZE` from `5Gi` to `2Gi` for tests (see the env sketch below)
- Tests don't need 5 GiB, and the smaller claim reduces disk pressure significantly
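For reference, a minimal sketch of the test environment block after this change (values mirror the workflow diff above; other env entries omitted):
```yaml
env:
  PROVIDER_STRATEGY: k8s
  KUBE_STORAGE_CLASS: local-path
  KUBE_VOLUME_SIZE: 2Gi   # was 5Gi; 2 GiB is plenty for the test projects
```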
#### 1.3 Remove Image Pre-pulling
- Remove the "Pre-pull Unity image" step
- Let images pull on-demand (k3s handles caching)
- Pre-pulling uses space that may be needed later
#### 1.4 Simplify Cleanup Between Tests
- Keep PVC cleanup but remove aggressive containerd cleanup
- Remove disk pressure taint loops (they're not effective)
- Trust k3s to manage resources (see the reduced cleanup step below)
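A sketch of the reduced between-test cleanup, combining the existing cache wipe with the lightweight prune the workflow now uses:
```yaml
- name: Cleanup between tests
  run: |
    # Simple cleanup - trust k3s to manage node resources
    rm -rf ./cloud-runner-cache/* || true
    docker system prune -f || true
```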
#### 1.5 Match Kubectl Version to k3s
- Use kubectl v1.31.x to match k3s v1.31.5
- Or pin k3d to a compatible k3s version (the workflow change is shown below)
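The corresponding workflow change, already applied in the diff above:
```yaml
- name: Set up kubectl
  uses: azure/setup-kubectl@v4
  with:
    version: 'v1.31.0'   # matches the k3s v1.31.x server bundled with k3d v5.8.3
```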
### Phase 2: Resource Optimization (Short-term)
#### 2.1 Use Smaller Test Images
- Consider using a smaller Unity base image for tests
- Or use a minimal test image that doesn't require 3.9GB
#### 2.2 Implement PVC Reuse
- Reuse PVCs across tests instead of creating new ones
- Only create a new PVC if the previous one is still in use (see the sketch below)
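A minimal sketch of the idea at the kubectl level; the namespace and PVC name are hypothetical and would actually come from the cloud-runner K8s provider:
```yaml
- name: Ensure shared test PVC exists
  run: |
    # Hypothetical names for illustration only
    NAMESPACE=cloud-runner-tests
    PVC_NAME=cloud-runner-shared-cache
    if kubectl get pvc "$PVC_NAME" -n "$NAMESPACE" >/dev/null 2>&1; then
      echo "Reusing existing PVC $PVC_NAME"
    else
      kubectl apply -n "$NAMESPACE" -f - <<'EOF'
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: cloud-runner-shared-cache
    spec:
      accessModes: [ReadWriteOnce]
      storageClassName: local-path
      resources:
        requests:
          storage: 2Gi
    EOF
    fi
```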
#### 2.3 Add Resource Limits
- Set explicit resource limits on test pods (see the sketch below)
- Prevent pods from consuming all available resources
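A sketch of explicit limits on the build container; the requests mirror the containerCpu/containerMemory values used in the tests, while the limit values are assumptions to tune:
```yaml
resources:
  requests:
    cpu: 1000m        # containerCpu: '1000'
    memory: 1024Mi    # containerMemory: '1024'
  limits:
    cpu: 1000m
    memory: 2048Mi    # headroom above the request to avoid OOM kills; value is an assumption
```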
#### 2.4 Optimize Job TTL
- Keep `ttlSecondsAfterFinished: 300` (5 minutes), as in the Job sketch below
- Ensure jobs are cleaned up promptly
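A sketch of where the TTL sits in the generated Job manifest (the name and placeholder command are illustrative; the TTL value matches the 300 seconds noted above):
```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: unity-builder-job   # illustrative name
spec:
  ttlSecondsAfterFinished: 300   # Job and its pods are deleted 5 minutes after completion
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: build
          image: unityci/editor:ubuntu-2021.3.45f1-base-3   # image referenced in the workflow
          command: ['sh', '-c', 'echo build placeholder']
```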
### Phase 3: Monitoring and Diagnostics (Medium-term)
#### 3.1 Add Disk Usage Monitoring
- Log disk usage before/after each test (see the step sketch below)
- Track which components use the most space
- Alert when usage exceeds thresholds
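A sketch of a monitoring step that reuses commands already present in the workflow (runner-level `df` plus `docker exec` into the k3d node):
```yaml
- name: Report disk usage
  if: always()
  run: |
    echo "Runner disk usage:"
    df -h / || true
    for NODE in ${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}; do
      echo "Disk usage on $NODE:"
      docker exec "$NODE" sh -c "df -h /" || true
    done
```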
#### 3.2 Improve Error Messages
- Detect evictions explicitly and provide clear errors
- Log disk pressure events with context
- Show available vs. requested resources
#### 3.3 Add Retry Logic
- Retry tests that fail due to infrastructure issues such as evictions (see the sketch below)
- Skip the retry for genuine test failures
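A hedged sketch of an infra-only retry wrapper around one of the existing test commands; the eviction-pattern grep and the single retry are assumptions, not current behaviour:
```yaml
- name: Run test with one infrastructure retry
  run: |
    set -o pipefail
    run_test() {
      yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand 2>&1 | tee test.log
    }
    if ! run_test; then
      # Retry once only when the failure looks like an eviction, not a genuine test failure
      if grep -qiE 'evicted|disk-pressure|ephemeral-storage' test.log; then
        echo "Infrastructure eviction detected, retrying once..."
        run_test
      else
        exit 1
      fi
    fi
```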
## Implementation Steps
### Step 1: Immediate Fixes (High Priority)
1. ✅ Remove image pre-pulling step
2. ✅ Simplify k3d cluster creation (remove aggressive cleanup)
3. ✅ Reduce PVC size to 2Gi
4. ✅ Remove disk pressure taint loops
5. ✅ Match kubectl version to k3s version
### Step 2: Test and Validate
1. Run integrity checks multiple times
2. Monitor disk usage patterns
3. Verify no evictions occur
4. Check test reliability
### Step 3: Iterate Based on Results
1. If still failing, add eviction thresholds
2. If space is issue, implement PVC reuse
3. If timing is issue, increase timeouts
## Expected Outcomes
### Success Criteria
- ✅ All K8s integrity tests pass consistently
- ✅ No pod evictions during test execution
- ✅ Disk usage stays below 85%
- ✅ Tests complete within timeout (5 minutes)
- ✅ "Collected Logs" always present in output
### Metrics to Track
- Test pass rate (target: 100%)
- Average disk usage during tests
- Number of evictions per run
- Test execution time
- Cleanup effectiveness
## Risk Assessment
### Low Risk Changes
- Removing image pre-pulling
- Reducing PVC size
- Simplifying cleanup
### Medium Risk Changes
- Changing k3d configuration
- Modifying eviction thresholds
- Changing kubectl version
### High Risk Changes
- PVC reuse (requires careful state management)
- Changing k3s version
- Major workflow restructuring
## Rollback Plan
If changes make things worse:
1. Revert to commit 464a9d1 workflow configuration
2. Gradually add back only essential changes
3. Test each change individually
## Timeline
- **Phase 1**: 1-2 days (immediate fixes)
- **Phase 2**: 3-5 days (optimization)
- **Phase 3**: 1 week (monitoring)
## Notes
- The successful September runs used a much simpler approach
- Complexity has increased without solving the root problem
- Simplification is likely the key to reliability
- GitHub Actions runners have limited resources - we must work within constraints

3
dist/index.js vendored
View File

@ -4772,7 +4772,8 @@ class KubernetesTaskRunner {
}
else if (!output.includes('Collected Logs')) {
// We have some output but missing "Collected Logs" - append the fallback message
output += '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
output +=
'\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
}
}
}

2
dist/index.js.map vendored

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

View File

@ -217,7 +217,7 @@ class KubernetesTaskRunner {
// If output is empty, we need to be more aggressive about getting logs
const needsFallback = output.trim().length === 0;
const missingCollectedLogs = !output.includes('Collected Logs');
if (needsFallback) {
CloudRunnerLogger.log('Output is empty, attempting aggressive log collection fallback...');
// Give the pod a moment to finish writing logs before we try to read them
@ -234,8 +234,8 @@ class KubernetesTaskRunner {
const reason = needsFallback
? 'output is empty'
: missingCollectedLogs
? 'Collected Logs missing from output'
: 'pod is terminated';
? 'Collected Logs missing from output'
: 'pod is terminated';
CloudRunnerLogger.log(
`Pod is ${isPodStillRunning ? 'running' : 'terminated'} and ${reason}, reading log file as fallback...`,
);
@ -354,7 +354,8 @@ class KubernetesTaskRunner {
output = 'Pod logs unavailable - pod may have been terminated before logs could be collected.\n';
} else if (!output.includes('Collected Logs')) {
// We have some output but missing "Collected Logs" - append the fallback message
output += '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
output +=
'\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
}
}
} catch (fallbackError: any) {
@ -473,7 +474,7 @@ class KubernetesTaskRunner {
// Pod is complete if it's not Pending or Unknown - it might be Running, Succeeded, or Failed
// For Failed/Succeeded pods, we still want to try to get logs, so we mark as complete
waitComplete = phase !== 'Pending' && phase !== 'Unknown';
// If pod completed (Succeeded/Failed), log it but don't throw - we'll try to get logs
if (waitComplete && phase !== 'Running') {
CloudRunnerLogger.log(`Pod ${podName} completed with phase: ${phase}. Will attempt to retrieve logs.`);
@ -481,7 +482,7 @@ class KubernetesTaskRunner {
if (phase === 'Pending') {
consecutivePendingCount++;
// Check for scheduling failures in events (faster than waiting for conditions)
try {
const events = await kubeClient.listNamespacedEvent(namespace);
@ -489,7 +490,7 @@ class KubernetesTaskRunner {
const failedSchedulingEvents = podEvents.filter(
(x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated',
);
if (failedSchedulingEvents.length > 0) {
const schedulingMessage = failedSchedulingEvents
.map((x) => `${x.reason}: ${x.message || ''}`)
@ -502,11 +503,11 @@ class KubernetesTaskRunner {
} catch {
// Ignore event fetch errors
}
// For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
const isTest = process.env['cloudRunnerTests'] === 'true';
const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production
if (consecutivePendingCount >= maxPendingChecks) {
message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
// Get events for context
@ -526,7 +527,7 @@ class KubernetesTaskRunner {
waitComplete = false;
return true; // Exit wait loop to throw error
}
// Log diagnostic info every 4 checks (1 minute) if still pending
if (consecutivePendingCount % 4 === 0) {
const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;

View File

@ -16,3 +16,21 @@
[Client] Error: Command failed: rclone lsf local:./temp/rclone-remote
2025/12/29 16:36:40 CRITICAL: Failed to create file system for "local:./temp/rclone-remote": didn't find section in config file ("local")
[Client] bash -lc 'mkdir -p /data/cache/$CACHE_KEY/Library/ ; mkdir -p /data/cache/$CACHE_KEY/lfs/ ; if command -v rclone > /dev/null 2>&1; then ; rclone copy local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/Library /data/cache/$CACHE_KEY/Library/ || true ; rclone copy local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/lfs /data/cache/$CACHE_KEY/lfs/ || true ; else ; echo "rclone not available, skipping rclone-pull-cache" ; fi'
[Client] [0]
[Client] The system cannot find the path specified.
[Client]
[Client] bash -lc 'echo "cloud runner build workflow starting" ; # skipping apt-get in local-docker or non-container provider ; # skipping toolchain setup in local-docker or non-container provider ; export GITHUB_WORKSPACE="/data/0-linux64-pl38/repo" ; # skipping df on /data in non-container provider ; export LOG_FILE=$(pwd)/temp/job-log.txt ; export GIT_DISCOVERY_ACROSS_FILESYSTEM=1 ; mkdir -p "$(dirname "$LOG_FILE")" ; echo "log start" >> "$LOG_FILE" ; echo "CACHE_KEY=$CACHE_KEY" ; echo "game ci start" ; echo "game ci start" >> "$LOG_FILE" ; timeout 3s node C:/Users/Mark/OneDrive/Documents/unity-builder/dist/index.js -m remote-cli-log-stream --logFile "$LOG_FILE" || true ; node C:/Users/Mark/OneDrive/Documents/unity-builder/dist/index.js -m remote-cli-post-build'
[Client] [0]
[Client]
[Client] bash -lc 'if command -v rclone > /dev/null 2>&1; then ; rclone copy /data/cache/$CACHE_KEY/build/build-0-linux64-pl38.tar local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/build/ || true ; rm /data/cache/$CACHE_KEY/build/build-0-linux64-pl38.tar || true ; else ; echo "rclone not available, skipping rclone-upload-build" ; fi'
[Client] [0]
[Client] The system cannot find the path specified.
[Client]
[Client] bash -lc 'if command -v rclone > /dev/null 2>&1; then ; rclone copy /data/cache/$CACHE_KEY/lfs local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/lfs || true ; rm -r /data/cache/$CACHE_KEY/lfs || true ; rclone copy /data/cache/$CACHE_KEY/Library local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/Library || true ; rm -r /data/cache/$CACHE_KEY/Library || true ; else ; echo "rclone not available, skipping rclone-upload-cache" ; fi'
[Client] [0]
[Client] The system cannot find the path specified.
[Client]
[Client] Error: Command failed: rclone lsf local:./temp/rclone-remote
2026/01/03 15:36:12 CRITICAL: Failed to create file system for "local:./temp/rclone-remote": didn't find section in config file ("local")