Skip to content

Commit

Permalink
CI: simplify script that adds virtual GPUs to test cluster (#246)
Browse files Browse the repository at this point in the history
  • Loading branch information
dgrove-oss authored Sep 16, 2024
1 parent 5b628c1 commit a1c6ed3
Showing 1 changed file with 3 additions and 45 deletions.
48 changes: 3 additions & 45 deletions hack/e2e-util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -206,54 +206,12 @@ function wait_for_appwrapper_controller {

# Patch every node's status so that it advertises GPU capacity even though
# no physical GPUs are present. This is intended to allow testing of our
# autopilot integration.
#
# Arguments: none
# Outputs:   progress messages on stdout (plus whatever kubectl prints);
#            failure messages on stderr
# Exits:     with status 1 if the nodes cannot be listed or any node fails
#            to be patched
function add_virtual_GPUs {
    # "~1" is the JSON-Pointer (RFC 6901) escape for "/", so this path
    # addresses the "nvidia.com/gpu" key under /status/capacity.
    local resource_path="/status/capacity/nvidia.com~1gpu"
    local resource_count="8"
    local patch node_names node_name

    patch='[{"op":"add","path":"'"${resource_path}"'","value":"'"${resource_count}"'"}]'

    # List the nodes up front so that a failing kubectl is reported instead
    # of silently producing an empty loop.
    if ! node_names=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name"); then
        echo "Failed to list the cluster nodes; cannot add virtual GPUs" >&2
        exit 1
    fi

    echo "Adding virtual GPUs to all nodes"
    while IFS= read -r node_name; do
        # Skip blank lines (e.g. the here-string of an empty node list).
        [[ -n "${node_name}" ]] || continue

        echo "- Patching node (add): ${node_name}"

        # Patch the status subresource directly through kubectl; no
        # background 'kubectl proxy' or scratch file is needed.
        # stdin is redirected so kubectl cannot consume the node list
        # that feeds this loop.
        if ! kubectl patch node "${node_name}" --subresource=status --type=json -p="${patch}" < /dev/null; then
            echo "Failed to patch node '${node_name}' with GPU resources" >&2
            exit 1
        fi

        echo "Patching done!"
    done <<< "${node_names}"
}

# clean up
Expand Down

0 comments on commit a1c6ed3

Please sign in to comment.