pr feedback
parent
9dc0888c46
commit
4f59e1729d
@@ -1 +1,2 @@
 cloud runner build workflow starting
+cloud runner build workflow starting
@@ -29,7 +29,7 @@ jobs:
     name: Cloud Runner Tests (K8s)
     runs-on: ubuntu-latest
     env:
-      K3D_NODE_CONTAINERS: "k3d-unity-builder-agent-0"
+      K3D_NODE_CONTAINERS: 'k3d-unity-builder-agent-0'
     steps:
       - uses: actions/checkout@v4
         with:
@@ -38,7 +38,7 @@ jobs:
       - name: Set up kubectl
         uses: azure/setup-kubectl@v4
         with:
-          version: 'v1.34.1'
+          version: 'v1.31.0'
       - name: Install k3d
         run: |
           curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
@@ -62,12 +62,14 @@ jobs:
       - name: Create k3s cluster (k3d)
         timeout-minutes: 5
         run: |
-          # Clean up any existing cluster and free disk space before creating new one
+          # Only delete if exists - don't aggressively clean up (may cause issues)
           k3d cluster delete unity-builder || true
-          docker system prune -af --volumes || true
-          # Create cluster - host.k3d.internal will allow pods to access host services
-          # No port mapping needed - LocalStack is on host, accessible via host.k3d.internal:4566
-          k3d cluster create unity-builder --agents 1 --wait
+          # Create cluster with explicit eviction thresholds to prevent premature evictions
+          # host.k3d.internal will allow pods to access host services (LocalStack)
+          k3d cluster create unity-builder \
+            --agents 1 \
+            --wait \
+            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<5%,memory.available<100Mi@agent:*'
           kubectl config current-context | cat
       - name: Verify cluster readiness and LocalStack connectivity
         timeout-minutes: 2
@@ -111,40 +113,14 @@ jobs:
           }
           cleanup_k3d_nodes
           docker system prune -af --volumes || true
-          # Wait for disk pressure taints to clear (with timeout)
+          # Check for disk pressure taints (informational only - k3s will manage)
           echo "Checking for disk pressure taints on nodes..."
-          for i in {1..30}; do
-            if kubectl describe nodes | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "Disk pressure detected, waiting for it to clear... ($i/30)"
-              cleanup_k3d_nodes
-              docker system prune -af --volumes || true
-              sleep 2
-            else
-              echo "No disk pressure taints found"
-              break
-            fi
-          done
-          kubectl describe nodes | grep -i taint || echo "No taints found"
-      - name: Pre-pull Unity image into k3d node
-        timeout-minutes: 5
-        run: |
-          echo "Pre-pulling Unity image into k3d node to avoid evictions during tests..."
-          # Clean up old images first to make space
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
-          done
-          # Pre-pull the Unity image that will be used in tests
-          # This ensures it's cached and doesn't need to be pulled during test execution
-          UNITY_IMAGE="unityci/editor:ubuntu-2021.3.45f1-base-3"
-          echo "Pulling ${UNITY_IMAGE} into k3d node..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl pull ${UNITY_IMAGE} 2>&1 || echo 'Image pull failed or already exists'" || true
-          done
-          echo "Image pre-pull completed. Checking disk space..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "df -h / | tail -1" || true
-          done
+          if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
+            echo "WARNING: Disk pressure taint detected. k3s will manage this automatically."
+            kubectl describe nodes | grep -i taint || true
+          else
+            echo "No disk pressure taints found"
+          fi
       - uses: actions/setup-node@v4
         with:
           node-version: 20
@@ -202,155 +178,9 @@ jobs:
           # Clean up disk space - aggressive cleanup to prevent evictions
           rm -rf ./cloud-runner-cache/* || true
           docker system prune -af --volumes || true
-          # Clean up disk space on k3d node to prevent ephemeral-storage evictions and disk pressure
-          echo "Cleaning up disk space on k3d node..."
-          # Use containerd/crictl commands (docker not available in k3d nodes)
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          cleanup_k3d_nodes() {
-            for NODE in $K3D_NODE_CONTAINERS; do
-              docker exec "$NODE" sh -c "
-                crictl rmi --prune 2>/dev/null || true
-                crictl rmp --all 2>/dev/null || true
-                crictl images -q | xargs -r crictl rmi 2>/dev/null || true
-                find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                find /var/log -type f -name '*.log' -delete 2>/dev/null || true
-                find /tmp -type f -delete 2>/dev/null || true
-                find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true
-                df -h /
-              " || true
-            done
-          }
-          cleanup_k3d_nodes
-          # Clean up containerd snapshots and images more aggressively
-          cleanup_k3d_nodes
-          # Wait for disk pressure taints to clear before proceeding
-          echo "Checking for disk pressure taints..."
-          for i in {1..20}; do
-            if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "Disk pressure detected, cleaning up and waiting... ($i/20)"
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "
-                  crictl rmi --prune 2>/dev/null || true
-                  crictl rmp --all 2>/dev/null || true
-                  crictl images -q | xargs -r crictl rmi 2>/dev/null || true
-                  find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                  find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                " || true
-              done
-              docker system prune -af --volumes || true
-              sleep 3
-            else
-              echo "No disk pressure taints found, proceeding with test"
-              break
-            fi
-          done
-      - name: Ensure disk pressure cleared before test
-        timeout-minutes: 3
-        run: |
-          echo "Ensuring disk pressure is cleared before test..."
-          rm -rf ./cloud-runner-cache/* || true
-          docker system prune -af --volumes || true
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "docker system prune -af --volumes 2>/dev/null || true" || true
-          done
-          # Wait for disk pressure taints to clear (with aggressive cleanup)
-          # Limit to 10 attempts to avoid timeout - if cleanup doesn't work, just remove the taint
-          PREVIOUS_DISK_USAGE=100
-          for i in {1..10}; do
-            HAS_DISK_PRESSURE=$(kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure" && echo "true" || echo "false")
-            if [ "$HAS_DISK_PRESSURE" = "true" ]; then
-              echo "Disk pressure detected, cleaning up aggressively... ($i/10)"
-              # Check actual disk usage on the node
-              PRIMARY_NODE=$(echo "$K3D_NODE_CONTAINERS" | awk '{print $1}')
-              DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
-              echo "Current disk usage on k3d node: ${DISK_USAGE}%"
-
-              # Use k3s/containerd commands instead of docker (docker not available in k3d nodes)
-              # Clean up k3s containerd snapshots and images
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "crictl rmp --all 2>/dev/null || true" || true
-              done
-              # Clean up old containerd snapshots
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "find /var/lib/rancher/k3s/agent/containerd -type d -name 'snapshots' -exec rm -rf {}/* 2>/dev/null \; || true" || true
-              done
-              # Clean up k3s logs and temp files
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "find /tmp -type f -mtime +0 -delete 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "find /var/log -type f -name '*.log' -mtime +0 -delete 2>/dev/null || true" || true
-              done
-              # Clean up host docker
-              docker system prune -af --volumes || true
-
-              # Check if disk usage improved
-              NEW_DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
-              if [ "$NEW_DISK_USAGE" != "unknown" ] && [ "$PREVIOUS_DISK_USAGE" != "unknown" ]; then
-                if [ "$NEW_DISK_USAGE" -ge "$PREVIOUS_DISK_USAGE" ] && [ "$i" -ge 3 ]; then
-                  echo "Disk usage not improving (${PREVIOUS_DISK_USAGE}% -> ${NEW_DISK_USAGE}%), breaking cleanup loop and removing taint manually"
-                  break
-                fi
-                PREVIOUS_DISK_USAGE=$NEW_DISK_USAGE
-              fi
-              sleep 3
-            else
-              echo "No disk pressure taints found, proceeding with test"
-              kubectl describe nodes | grep -i taint || echo "No taints found"
-              break
-            fi
-          done
-          # If disk pressure taint is still present after cleanup, manually remove it (CI only)
-          # Try multiple times as Kubernetes may re-add it if condition persists
-          if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-            echo "WARNING: Disk pressure taint still present after cleanup. Manually removing taint for CI..."
-            NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
-            for node in $NODE_NAMES; do
-              # Try removing with NoSchedule effect (most common)
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
-              # Also try without effect specifier
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
-              # Use patch as fallback
-              kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
-            done
-            sleep 2
-            echo "Taint removal attempted. Checking nodes..."
-            kubectl describe nodes | grep -i taint || echo "No taints found"
-          fi
-          # Wait for disk pressure condition to clear (not just taint)
-          echo "Waiting for disk pressure condition to clear on nodes..."
-          for i in {1..20}; do
-            HAS_DISK_PRESSURE_CONDITION=$(kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"' && echo "true" || echo "false")
-            if [ "$HAS_DISK_PRESSURE_CONDITION" = "true" ]; then
-              echo "Disk pressure condition still present, waiting... ($i/20)"
-              sleep 2
-            else
-              echo "Disk pressure condition cleared, proceeding with test"
-              break
-            fi
-          done
-          # Final check - if condition still exists, remove taint and wait a bit more
-          if kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"'; then
-            echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
-            NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
-            for node in $NODE_NAMES; do
-              # Try removing with NoSchedule effect (most common)
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
-              # Also try without effect specifier
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
-              # Use patch as fallback to remove all taints
-              kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
-            done
-            sleep 10
-            # Verify taint is actually removed
-            if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "ERROR: Taint still present after removal attempts. This may cause pod scheduling issues."
-            else
-              echo "Taint successfully removed."
-            fi
-          fi
+          # Simple cleanup - trust k3s to manage resources
+          echo "Cleaning up test resources..."
+          docker system prune -f || true
       - name: Run cloud-runner-image test (validate image creation)
         timeout-minutes: 10
         run: yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand
@@ -364,7 +194,7 @@ jobs:
           versioning: None
           KUBE_STORAGE_CLASS: local-path
           PROVIDER_STRATEGY: k8s
-          KUBE_VOLUME_SIZE: 5Gi
+          KUBE_VOLUME_SIZE: 2Gi
           containerCpu: '1000'
           containerMemory: '1024'
           AWS_ACCESS_KEY_ID: test
@@ -495,7 +325,7 @@ jobs:
           versioning: None
           KUBE_STORAGE_CLASS: local-path
           PROVIDER_STRATEGY: k8s
-          KUBE_VOLUME_SIZE: 5Gi
+          KUBE_VOLUME_SIZE: 2Gi
           ENABLE_K8S_E2E: 'true'
           containerCpu: '1000'
           containerMemory: '1024'
@@ -825,7 +655,7 @@ jobs:
           versioning: None
           KUBE_STORAGE_CLASS: local-path
           PROVIDER_STRATEGY: k8s
-          KUBE_VOLUME_SIZE: 5Gi
+          KUBE_VOLUME_SIZE: 2Gi
           # Set resource requests for tests - increased memory to prevent OOM kills
           containerCpu: '1000'
           containerMemory: '1024'
@@ -945,7 +775,7 @@ jobs:
           versioning: None
           KUBE_STORAGE_CLASS: local-path
           PROVIDER_STRATEGY: k8s
-          KUBE_VOLUME_SIZE: 5Gi
+          KUBE_VOLUME_SIZE: 2Gi
           containerCpu: '512'
           containerMemory: '512'
           AWS_ACCESS_KEY_ID: test

@@ -0,0 +1,250 @@

# K8s Integrity Test Failure Diagnosis and Fix Plan

## Executive Summary

The K8s integrity tests on `cloud-runner-develop` have been failing consistently since September 2025. The last
successful runs were in early September 2025 (commits 464a9d1, 98963da). Since then, we've added extensive disk pressure
handling, cleanup logic, and resource management, but the tests continue to fail with pod evictions and disk pressure
issues.

## Key Findings

### 1. Successful Configuration (September 2025)

**Workflow Characteristics:**

- **Simple k3d cluster creation**: `k3d cluster create unity-builder --agents 1 --wait`
- **No pre-cleanup**: Cluster created directly without aggressive cleanup
- **No disk pressure handling**: No taint detection or removal logic
- **No image pre-pulling**: Images pulled on-demand during tests
- **Simple test execution**: Direct test runs without intermediate cleanup
- **Kubectl version**: v1.29.0
- **k3d version**: Latest (v5.8.3 equivalent)

**Key Differences:**

```yaml
# Successful version (464a9d1)
- name: Create k3s cluster (k3d)
  run: |
    k3d cluster create unity-builder --agents 1 --wait
    kubectl config current-context | cat
```

### 2. Current Configuration (December 2025)

**Workflow Characteristics:**

- **Complex cleanup before cluster creation**: `k3d cluster delete`, `docker system prune`
- **Extensive disk pressure handling**: Taint detection, removal loops, cleanup retries
- **Image pre-pulling**: Attempts to pre-pull the Unity image (3.9GB) into the k3d node
- **Aggressive cleanup between tests**: PVC deletion, PV cleanup, containerd cleanup
- **Kubectl version**: v1.34.1 (newer)
- **k3d version**: v5.8.3

**Current Issues:**

1. **Pod evictions due to disk pressure** - Even after cleanup, pods get evicted
2. **PreStopHook failures** - Pods killed before graceful shutdown
3. **Exit code 137** - OOM kills (memory pressure) or disk evictions
4. **"Collected Logs" missing** - Pods terminated before post-build completes
5. **Disk usage at 96%** - Cleanup not effectively freeing space

## Root Cause Analysis

### Primary Issue: Disk Space Management

**Problem**: GitHub Actions runners have limited disk space (~72GB total), and k3d nodes share this space with:

- Docker images (Unity image: 3.9GB)
- k3s/containerd data
- PVC storage (5Gi per test)
- Logs and temporary files
- System overhead

**Why the Current Approach Fails:**

1. **Cleanup happens too late**: Disk pressure taints appear after space is already exhausted
2. **Cleanup is ineffective**: `crictl rmi --prune` and manual cleanup don't free enough space
3. **Image pre-pulling makes it worse**: Pulling a 3.9GB image before tests reduces available space
4. **PVC accumulation**: Multiple tests create 5Gi PVCs that aren't cleaned up fast enough
5. **Ephemeral storage requests**: Even though removed for tests, k3s still tracks usage

### Secondary Issues

1. **k3d/k3s version compatibility**: Newer k3d (v5.8.3) with k3s v1.31.5 may have different resource management
2. **Kubectl version mismatch**: A v1.34.1 client with a v1.31.5 server may cause issues
3. **LocalStack connectivity**: `host.k3d.internal` DNS resolution failures in some cases
4. **Test timeout**: The 5-minute timeout may be too short for cleanup plus test execution

## Fix Plan

### Phase 1: Simplify and Stabilize (Immediate)

**Goal**: Return to a simpler, more reliable configuration similar to the successful runs.

#### 1.1 Revert to Simpler k3d Configuration

```yaml
- name: Create k3s cluster (k3d)
  run: |
    # Only delete if exists, no aggressive cleanup
    k3d cluster delete unity-builder || true
    # Create with minimal configuration
    k3d cluster create unity-builder \
      --agents 1 \
      --wait \
      --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<5%,memory.available<100Mi@agent:*'
    kubectl config current-context | cat
```

**Rationale**:

- Set eviction thresholds explicitly to prevent premature evictions
- Don't pre-clean aggressively (it may cause issues)
- Let k3s manage resources naturally

#### 1.2 Reduce PVC Size

- Change `KUBE_VOLUME_SIZE` from `5Gi` to `2Gi` for tests (see the sketch below)
- Tests don't need 5GB, and this reduces pressure significantly

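As a concrete illustration, this is roughly the env change involved. Only `KUBE_VOLUME_SIZE` changes under this item; the surrounding keys simply mirror the existing test-job env shown in the workflow diff above:

```yaml
env:
  KUBE_STORAGE_CLASS: local-path
  PROVIDER_STRATEGY: k8s
  KUBE_VOLUME_SIZE: 2Gi # was 5Gi; tests do not need 5GB of scratch space
  containerCpu: '1000'
  containerMemory: '1024'
```
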
#### 1.3 Remove Image Pre-pulling

- Remove the "Pre-pull Unity image" step
- Let images pull on-demand (k3s handles caching)
- Pre-pulling uses space that may be needed later

#### 1.4 Simplify Cleanup Between Tests

- Keep PVC cleanup but remove aggressive containerd cleanup
- Remove disk pressure taint loops (they're not effective)
- Trust k3s to manage resources

#### 1.5 Match Kubectl Version to k3s

- Use kubectl v1.31.x to match k3s v1.31.5 (see the sketch after this list)
- Or pin k3d to use a compatible k3s version

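The corresponding workflow change is small; a sketch of the setup step, using the same `azure/setup-kubectl@v4` action and the `v1.31.0` pin that the workflow diff above switches to:

```yaml
- name: Set up kubectl
  uses: azure/setup-kubectl@v4
  with:
    # Keep the client minor version aligned with the k3s server (v1.31.x)
    version: 'v1.31.0'
```
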
### Phase 2: Resource Optimization (Short-term)

#### 2.1 Use Smaller Test Images

- Consider using a smaller Unity base image for tests
- Or use a minimal test image that doesn't require 3.9GB

#### 2.2 Implement PVC Reuse

- Reuse PVCs across tests instead of creating new ones (see the sketch below)
- Only create a new PVC if the previous one is still in use

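A minimal sketch of what a reuse check could look like as a workflow step. The PVC name `cloud-runner-test-cache` and the manifest path are placeholders for illustration, not names the project currently uses:

```yaml
- name: Ensure shared test PVC exists
  run: |
    # Hypothetical PVC name and manifest path - illustrative only.
    if kubectl get pvc cloud-runner-test-cache > /dev/null 2>&1; then
      echo "Reusing existing PVC cloud-runner-test-cache"
    else
      kubectl apply -f ./test-manifests/cloud-runner-test-cache-pvc.yaml
    fi
```
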
#### 2.3 Add Resource Limits

- Set explicit resource limits on test pods (see the sketch below)
- Prevent pods from consuming all available resources

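An illustrative shape for a limits block on the test pod's container spec; the numbers simply mirror the `containerCpu: '1000'` / `containerMemory: '1024'` requests already used in the workflow and are not tuned values:

```yaml
resources:
  requests:
    cpu: 1000m
    memory: 1024Mi
  limits:
    cpu: 1000m
    memory: 1024Mi
```
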
#### 2.4 Optimize Job TTL

- Keep `ttlSecondsAfterFinished: 300` (5 minutes) - see the sketch below for where it sits
- Ensure jobs are cleaned up promptly

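For reference, where the TTL sits on a Kubernetes Job manifest. Everything except `ttlSecondsAfterFinished: 300` is a generic skeleton with placeholder names; the image tag is the one already referenced in the workflow:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: unity-build-job # placeholder name
spec:
  ttlSecondsAfterFinished: 300 # delete the finished Job (and its pods) after 5 minutes
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: unity-builder # placeholder container name
          image: unityci/editor:ubuntu-2021.3.45f1-base-3
```
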
### Phase 3: Monitoring and Diagnostics (Medium-term)

#### 3.1 Add Disk Usage Monitoring

- Log disk usage before/after each test (see the sketch below)
- Track which components use the most space
- Alert when usage exceeds thresholds

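A sketch of a simple monitoring step; it reuses the `df -h` and `docker exec ... df -h` checks against `$K3D_NODE_CONTAINERS` that already appear in the workflow's history:

```yaml
- name: Log disk usage
  if: always()
  run: |
    echo "Host disk usage:"
    df -h / | tail -1
    for NODE in ${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}; do
      echo "Disk usage inside ${NODE}:"
      docker exec "$NODE" sh -c "df -h / | tail -1" || true
    done
```
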
#### 3.2 Improve Error Messages

- Detect evictions explicitly and provide clear errors (a diagnostic step is sketched below)
- Log disk pressure events with context
- Show available vs. requested resources

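One way to surface this in CI, sketched as an on-failure diagnostic step. The step name and the grep pattern are assumptions, not existing workflow content:

```yaml
- name: Report evictions and disk pressure events
  if: failure()
  run: |
    # List failed pods and any eviction / disk-pressure related events.
    kubectl get pods --all-namespaces --field-selector=status.phase=Failed -o wide || true
    kubectl get events --all-namespaces --sort-by=.lastTimestamp \
      | grep -Ei 'evict|disk-pressure|oomkill' || echo "No eviction-related events found"
```
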
#### 3.3 Add Retry Logic

- Retry tests that fail due to infrastructure issues such as evictions (see the sketch below)
- Skip the retry for actual test failures

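A rough sketch of a single-retry wrapper at the workflow level. The test target reuses the existing `cloud-runner-image` command from the workflow, while the failure-classification grep is a placeholder heuristic that would need tuning:

```yaml
- name: Run cloud-runner-image test (with one infrastructure retry)
  run: |
    for attempt in 1 2; do
      yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand | tee test-output.log
      status=${PIPESTATUS[0]}
      if [ "$status" -eq 0 ]; then
        exit 0
      fi
      # Only retry when the failure looks like an infrastructure problem (placeholder heuristic).
      if [ "$attempt" -lt 2 ] && grep -qEi 'evict|disk-pressure' test-output.log; then
        echo "Infrastructure failure detected, retrying once..."
        continue
      fi
      exit "$status"
    done
```
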
## Implementation Steps

### Step 1: Immediate Fixes (High Priority)

1. ✅ Remove image pre-pulling step
2. ✅ Simplify k3d cluster creation (remove aggressive cleanup)
3. ✅ Reduce PVC size to 2Gi
4. ✅ Remove disk pressure taint loops
5. ✅ Match kubectl version to k3s version

### Step 2: Test and Validate

1. Run integrity checks multiple times
2. Monitor disk usage patterns
3. Verify no evictions occur
4. Check test reliability

### Step 3: Iterate Based on Results

1. If tests still fail, add eviction thresholds
2. If space is the issue, implement PVC reuse
3. If timing is the issue, increase timeouts

## Expected Outcomes

### Success Criteria

- ✅ All K8s integrity tests pass consistently
- ✅ No pod evictions during test execution
- ✅ Disk usage stays below 85%
- ✅ Tests complete within the timeout (5 minutes)
- ✅ "Collected Logs" always present in output

### Metrics to Track

- Test pass rate (target: 100%)
- Average disk usage during tests
- Number of evictions per run
- Test execution time
- Cleanup effectiveness

## Risk Assessment

### Low Risk Changes

- Removing image pre-pulling
- Reducing PVC size
- Simplifying cleanup

### Medium Risk Changes

- Changing the k3d configuration
- Modifying eviction thresholds
- Changing the kubectl version

### High Risk Changes

- PVC reuse (requires careful state management)
- Changing the k3s version
- Major workflow restructuring

## Rollback Plan

If these changes make things worse:

1. Revert to the commit 464a9d1 workflow configuration
2. Gradually add back only essential changes
3. Test each change individually

## Timeline

- **Phase 1**: 1-2 days (immediate fixes)
- **Phase 2**: 3-5 days (optimization)
- **Phase 3**: 1 week (monitoring)

## Notes

- The successful September runs used a much simpler approach
- Complexity has increased without solving the root problem
- Simplification is likely the key to reliability
- GitHub Actions runners have limited resources - we must work within those constraints

@@ -4772,7 +4772,8 @@ class KubernetesTaskRunner {
             }
             else if (!output.includes('Collected Logs')) {
                 // We have some output but missing "Collected Logs" - append the fallback message
-                output += '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
+                output +=
+                    '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
             }
         }
     }
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
@@ -217,7 +217,7 @@ class KubernetesTaskRunner {
     // If output is empty, we need to be more aggressive about getting logs
     const needsFallback = output.trim().length === 0;
     const missingCollectedLogs = !output.includes('Collected Logs');

     if (needsFallback) {
       CloudRunnerLogger.log('Output is empty, attempting aggressive log collection fallback...');
       // Give the pod a moment to finish writing logs before we try to read them
@@ -234,8 +234,8 @@ class KubernetesTaskRunner {
       const reason = needsFallback
         ? 'output is empty'
         : missingCollectedLogs
          ? 'Collected Logs missing from output'
          : 'pod is terminated';
       CloudRunnerLogger.log(
         `Pod is ${isPodStillRunning ? 'running' : 'terminated'} and ${reason}, reading log file as fallback...`,
       );
@@ -354,7 +354,8 @@ class KubernetesTaskRunner {
           output = 'Pod logs unavailable - pod may have been terminated before logs could be collected.\n';
         } else if (!output.includes('Collected Logs')) {
           // We have some output but missing "Collected Logs" - append the fallback message
-          output += '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
+          output +=
+            '\nPod logs incomplete - "Collected Logs" marker not found. Pod may have been terminated before post-build completed.\n';
         }
       }
     } catch (fallbackError: any) {
@@ -473,7 +474,7 @@ class KubernetesTaskRunner {
         // Pod is complete if it's not Pending or Unknown - it might be Running, Succeeded, or Failed
         // For Failed/Succeeded pods, we still want to try to get logs, so we mark as complete
         waitComplete = phase !== 'Pending' && phase !== 'Unknown';

         // If pod completed (Succeeded/Failed), log it but don't throw - we'll try to get logs
         if (waitComplete && phase !== 'Running') {
           CloudRunnerLogger.log(`Pod ${podName} completed with phase: ${phase}. Will attempt to retrieve logs.`);
@@ -481,7 +482,7 @@ class KubernetesTaskRunner {

         if (phase === 'Pending') {
           consecutivePendingCount++;

           // Check for scheduling failures in events (faster than waiting for conditions)
           try {
             const events = await kubeClient.listNamespacedEvent(namespace);
@@ -489,7 +490,7 @@ class KubernetesTaskRunner {
             const failedSchedulingEvents = podEvents.filter(
               (x) => x.reason === 'FailedScheduling' || x.reason === 'SchedulingGated',
             );

             if (failedSchedulingEvents.length > 0) {
               const schedulingMessage = failedSchedulingEvents
                 .map((x) => `${x.reason}: ${x.message || ''}`)
@@ -502,11 +503,11 @@ class KubernetesTaskRunner {
           } catch {
             // Ignore event fetch errors
           }

           // For tests, fail faster if stuck in Pending (2 minutes = 8 checks at 15s interval)
           const isTest = process.env['cloudRunnerTests'] === 'true';
           const maxPendingChecks = isTest ? 8 : 80; // 2 minutes for tests, 20 minutes for production

           if (consecutivePendingCount >= maxPendingChecks) {
             message = `Pod ${podName} stuck in Pending state for too long (${consecutivePendingCount} checks). This indicates a scheduling problem.`;
             // Get events for context
@@ -526,7 +527,7 @@ class KubernetesTaskRunner {
             waitComplete = false;
             return true; // Exit wait loop to throw error
           }

           // Log diagnostic info every 4 checks (1 minute) if still pending
           if (consecutivePendingCount % 4 === 0) {
             const pendingMessage = `Pod ${podName} still Pending (check ${consecutivePendingCount}/${maxPendingChecks}). Phase: ${phase}`;
@@ -16,3 +16,21 @@
 [Client] Error: Command failed: rclone lsf local:./temp/rclone-remote
 2025/12/29 16:36:40 CRITICAL: Failed to create file system for "local:./temp/rclone-remote": didn't find section in config file ("local")

+[Client] bash -lc 'mkdir -p /data/cache/$CACHE_KEY/Library/ ; mkdir -p /data/cache/$CACHE_KEY/lfs/ ; if command -v rclone > /dev/null 2>&1; then ; rclone copy local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/Library /data/cache/$CACHE_KEY/Library/ || true ; rclone copy local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/lfs /data/cache/$CACHE_KEY/lfs/ || true ; else ; echo "rclone not available, skipping rclone-pull-cache" ; fi'
+[Client] [0]
+[Client] The system cannot find the path specified.
+[Client]
+[Client] bash -lc 'echo "cloud runner build workflow starting" ; # skipping apt-get in local-docker or non-container provider ; # skipping toolchain setup in local-docker or non-container provider ; export GITHUB_WORKSPACE="/data/0-linux64-pl38/repo" ; # skipping df on /data in non-container provider ; export LOG_FILE=$(pwd)/temp/job-log.txt ; export GIT_DISCOVERY_ACROSS_FILESYSTEM=1 ; mkdir -p "$(dirname "$LOG_FILE")" ; echo "log start" >> "$LOG_FILE" ; echo "CACHE_KEY=$CACHE_KEY" ; echo "game ci start" ; echo "game ci start" >> "$LOG_FILE" ; timeout 3s node C:/Users/Mark/OneDrive/Documents/unity-builder/dist/index.js -m remote-cli-log-stream --logFile "$LOG_FILE" || true ; node C:/Users/Mark/OneDrive/Documents/unity-builder/dist/index.js -m remote-cli-post-build'
+[Client] [0]
+[Client]
+[Client] bash -lc 'if command -v rclone > /dev/null 2>&1; then ; rclone copy /data/cache/$CACHE_KEY/build/build-0-linux64-pl38.tar local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/build/ || true ; rm /data/cache/$CACHE_KEY/build/build-0-linux64-pl38.tar || true ; else ; echo "rclone not available, skipping rclone-upload-build" ; fi'
+[Client] [0]
+[Client] The system cannot find the path specified.
+[Client]
+[Client] bash -lc 'if command -v rclone > /dev/null 2>&1; then ; rclone copy /data/cache/$CACHE_KEY/lfs local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/lfs || true ; rm -r /data/cache/$CACHE_KEY/lfs || true ; rclone copy /data/cache/$CACHE_KEY/Library local:./temp/rclone-remote/cloud-runner-cache/$CACHE_KEY/Library || true ; rm -r /data/cache/$CACHE_KEY/Library || true ; else ; echo "rclone not available, skipping rclone-upload-cache" ; fi'
+[Client] [0]
+[Client] The system cannot find the path specified.
+[Client]
+[Client] Error: Command failed: rclone lsf local:./temp/rclone-remote
+2026/01/03 15:36:12 CRITICAL: Failed to create file system for "local:./temp/rclone-remote": didn't find section in config file ("local")