pr feedback - handle evictions and wait for disk pressure condition
parent ed0d2c13b6
commit f4d28fa6d2

@@ -242,6 +242,27 @@ jobs:
             echo "Taint removed. Checking nodes..."
             kubectl describe nodes | grep -i taint || echo "No taints found"
           fi
+          # Wait for disk pressure condition to clear (not just taint)
+          echo "Waiting for disk pressure condition to clear on nodes..."
+          for i in {1..20}; do
+            HAS_DISK_PRESSURE_CONDITION=$(kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"' && echo "true" || echo "false")
+            if [ "$HAS_DISK_PRESSURE_CONDITION" = "true" ]; then
+              echo "Disk pressure condition still present, waiting... ($i/20)"
+              sleep 2
+            else
+              echo "Disk pressure condition cleared, proceeding with test"
+              break
+            fi
+          done
+          # Final check - if condition still exists, remove taint and wait a bit more
+          if kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"'; then
+            echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
+            NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
+            for node in $NODE_NAMES; do
+              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
+            done
+            sleep 10
+          fi
       - name: Run cloud-runner-image test (validate image creation)
         timeout-minutes: 10
         run: yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand
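Review note on the new wait loop: every node always reports a DiskPressure entry under status.conditions (with status "True" or "False"), and kubectl's pretty-printed JSON renders it as "type": "DiskPressure" with a space after the colon, so the space-less grep pattern above may never match; checking the condition's status field is the more robust test. A minimal sketch of the same wait loop, assuming the 0.x @kubernetes/client-node API (calls resolve to { body }) and a hypothetical helper name:

import { KubeConfig, CoreV1Api } from '@kubernetes/client-node';

// Hypothetical helper mirroring the workflow's wait loop: poll every node's
// DiskPressure condition until none of them reports status "True".
async function waitForDiskPressureToClear(attempts = 20, delayMs = 2000): Promise<boolean> {
  const kubeConfig = new KubeConfig();
  kubeConfig.loadFromDefault();
  const api = kubeConfig.makeApiClient(CoreV1Api);
  for (let attempt = 1; attempt <= attempts; attempt++) {
    const nodes = await api.listNode();
    const underPressure = nodes.body.items.some((node) =>
      (node.status?.conditions ?? []).some(
        (condition) => condition.type === 'DiskPressure' && condition.status === 'True',
      ),
    );
    if (!underPressure) {
      return true; // condition cleared on every node
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  return false; // still under disk pressure after all attempts
}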

@@ -471,6 +492,27 @@ jobs:
echo "Taint removed. Checking nodes..."
|
echo "Taint removed. Checking nodes..."
|
||||||
kubectl describe nodes | grep -i taint || echo "No taints found"
|
kubectl describe nodes | grep -i taint || echo "No taints found"
|
||||||
fi
|
fi
|
||||||
|
# Wait for disk pressure condition to clear (not just taint)
|
||||||
|
echo "Waiting for disk pressure condition to clear on nodes..."
|
||||||
|
for i in {1..20}; do
|
||||||
|
HAS_DISK_PRESSURE_CONDITION=$(kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"' && echo "true" || echo "false")
|
||||||
|
if [ "$HAS_DISK_PRESSURE_CONDITION" = "true" ]; then
|
||||||
|
echo "Disk pressure condition still present, waiting... ($i/20)"
|
||||||
|
sleep 2
|
||||||
|
else
|
||||||
|
echo "Disk pressure condition cleared, proceeding with test"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
# Final check - if condition still exists, remove taint and wait a bit more
|
||||||
|
if kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"'; then
|
||||||
|
echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
|
||||||
|
NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
|
||||||
|
for node in $NODE_NAMES; do
|
||||||
|
kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
|
||||||
|
done
|
||||||
|
sleep 10
|
||||||
|
fi
|
||||||
- name: Run cloud-runner-s3-steps test (validate S3 operations with K8s)
|
- name: Run cloud-runner-s3-steps test (validate S3 operations with K8s)
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
run: yarn run test "cloud-runner-s3-steps" --detectOpenHandles --forceExit --runInBand
|
run: yarn run test "cloud-runner-s3-steps" --detectOpenHandles --forceExit --runInBand
|
||||||
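The trailing dash in the fallback step's kubectl taint command removes the node.kubernetes.io/disk-pressure taint rather than adding it. The kubelet re-adds that taint for as long as the DiskPressure condition itself is True, which is exactly why the step waits on the condition and not just the taint. A rough client-side equivalent of the taint removal, again assuming the 0.x @kubernetes/client-node API (the helper name is hypothetical):

import { KubeConfig, CoreV1Api } from '@kubernetes/client-node';

// Hypothetical equivalent of `kubectl taint nodes <node> node.kubernetes.io/disk-pressure-`:
// read the node, drop the matching taint, and write the node back.
async function removeDiskPressureTaint(nodeName: string): Promise<void> {
  const kubeConfig = new KubeConfig();
  kubeConfig.loadFromDefault();
  const api = kubeConfig.makeApiClient(CoreV1Api);
  const { body: node } = await api.readNode(nodeName);
  const taints = node.spec?.taints ?? [];
  const remaining = taints.filter((taint) => taint.key !== 'node.kubernetes.io/disk-pressure');
  if (remaining.length !== taints.length) {
    node.spec = { ...node.spec, taints: remaining };
    await api.replaceNode(nodeName, node); // full replace; kubectl uses a patch instead
  }
}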

@@ -4147,6 +4147,14 @@ class KubernetesPods {
             cloud_runner_logger_1.default.logWarning(`Pod ${podName} has PreStopHook failure but no container failure detected. Treating as non-fatal.`);
             return false; // PreStopHook failure alone is not fatal if container status is unclear
         }
+        // Check if pod was evicted due to disk pressure - this is an infrastructure issue
+        const wasEvicted = errorDetails.some((detail) => detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'));
+        if (wasEvicted) {
+            const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
+            cloud_runner_logger_1.default.logWarning(evictionMessage);
+            cloud_runner_logger_1.default.log(`Pod details: ${errorDetails.join('\n')}`);
+            throw new Error(`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`);
+        }
         // Exit code 137 (128 + 9) means SIGKILL - container was killed by system (often OOM)
         // If this happened with PreStopHook failure, it might be a resource issue, not a real failure
         // Be lenient if we only have PreStopHook/ExceededGracePeriod issues

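The exit-code comment in the surrounding context encodes a general rule: a container exit code above 128 means the process died from a signal, with code = 128 + signal number, so 137 maps to signal 9 (SIGKILL). A one-line sketch of that decoding (hypothetical helper):

// Exit codes above 128 encode a fatal signal: code = 128 + signal number.
// signalFromExitCode(137) === 9, i.e. SIGKILL.
function signalFromExitCode(exitCode: number): number | undefined {
  return exitCode > 128 ? exitCode - 128 : undefined;
}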
File diff suppressed because one or more lines are too long

@@ -147,6 +147,19 @@ class KubernetesPods {
       return false; // PreStopHook failure alone is not fatal if container status is unclear
     }

+    // Check if pod was evicted due to disk pressure - this is an infrastructure issue
+    const wasEvicted = errorDetails.some((detail) =>
+      detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
+    );
+    if (wasEvicted) {
+      const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
+      CloudRunnerLogger.logWarning(evictionMessage);
+      CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
+      throw new Error(
+        `${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`,
+      );
+    }
+
     // Exit code 137 (128 + 9) means SIGKILL - container was killed by system (often OOM)
     // If this happened with PreStopHook failure, it might be a resource issue, not a real failure
     // Be lenient if we only have PreStopHook/ExceededGracePeriod issues
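For context on what the wasEvicted check matches: an evicted pod's status carries phase "Failed", reason "Evicted", and a message that typically names the starved resource or the DiskPressure condition, which is where strings like "evicted" and "diskpressure" would surface in errorDetails. A sketch of collecting such details, assuming the 0.x @kubernetes/client-node API (the helper name is hypothetical):

import { KubeConfig, CoreV1Api } from '@kubernetes/client-node';

// Hypothetical illustration of where eviction strings come from: pod-level
// reason/message plus per-container terminated states.
async function collectErrorDetails(podName: string, namespace: string): Promise<string[]> {
  const kubeConfig = new KubeConfig();
  kubeConfig.loadFromDefault();
  const api = kubeConfig.makeApiClient(CoreV1Api);
  const { body: pod } = await api.readNamespacedPodStatus(podName, namespace);
  const details: string[] = [];
  if (pod.status?.reason) details.push(`Reason: ${pod.status.reason}`); // e.g. "Evicted"
  if (pod.status?.message) details.push(`Message: ${pod.status.message}`);
  for (const containerStatus of pod.status?.containerStatuses ?? []) {
    const terminated = containerStatus.state?.terminated;
    if (terminated) {
      details.push(`${containerStatus.name}: exit ${terminated.exitCode} (${terminated.reason ?? 'unknown'})`);
    }
  }
  return details;
}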