From a5a6d39647287d1b2a8612d6c2f583ee2cc1ed0b Mon Sep 17 00:00:00 2001 From: abhijeet-dhumal Date: Mon, 23 Dec 2024 11:57:20 +0530 Subject: [PATCH] Added resources list for CPU-based use case, updated requirements.txt to resolve fsspec/numpy package compatibility issue and added license in MNIST script --- tests/kfto/kfto_mnist_training_test.go | 40 ++++++++++++++-------- tests/kfto/resources/mnist.py | 14 ++++++++ tests/kfto/resources/requirements-rocm.txt | 4 ++- tests/kfto/resources/requirements.txt | 4 ++- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go index 63596c07..e11575ac 100644 --- a/tests/kfto/kfto_mnist_training_test.go +++ b/tests/kfto/kfto_mnist_training_test.go @@ -33,7 +33,6 @@ import ( func TestPyTorchJobMnistCpu(t *testing.T) { runKFTOPyTorchMnistJob(t, 0, 2, "", GetCudaTrainingImage(), "resources/requirements.txt") } - func TestPyTorchJobMnistWithCuda(t *testing.T) { runKFTOPyTorchMnistJob(t, 1, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt") } @@ -133,7 +132,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config "/bin/bash", "-c", fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \ pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \ - python /mnt/files/mnist.py --epochs 1 --save-model --output-path /mnt/output --backend %s`, backend), + python /mnt/files/mnist.py --epochs 3 --save-model --output-path /mnt/output --backend %s`, backend), }, VolumeMounts: []corev1.VolumeMount{ { @@ -149,6 +148,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config MountPath: "/mnt/output", }, }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), + }, + }, }, }, Volumes: []corev1.Volume{ @@ -214,7 +219,7 @@ 
func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config "/bin/bash", "-c", fmt.Sprintf(`mkdir -p /tmp/lib && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \ pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \ - python /mnt/files/mnist.py --epochs 1 --save-model --backend %s`, backend), + python /mnt/files/mnist.py --epochs 3 --save-model --backend %s`, backend), }, VolumeMounts: []corev1.VolumeMount{ { @@ -226,6 +231,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config MountPath: "/tmp", }, }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), + }, + }, }, }, Volumes: []corev1.Volume{ @@ -255,19 +266,20 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config } if useGPU { - // Update resource lists - tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("8Gi"), - corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)), + // Update resource lists for GPU (NVIDIA/ROCm) usecase + tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus)) + tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources.Limits[corev1.ResourceName(gpuLabel)] = resource.MustParse(fmt.Sprint(numGpus)) + + tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Env = []corev1.EnvVar{ + { + Name: "NCCL_DEBUG", + Value: "INFO", }, } - tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - 
corev1.ResourceMemory: resource.MustParse("8Gi"), - corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)), + tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Env = []corev1.EnvVar{ + { + Name: "NCCL_DEBUG", + Value: "INFO", }, } diff --git a/tests/kfto/resources/mnist.py b/tests/kfto/resources/mnist.py index 7d8d445d..91b1cbd3 100644 --- a/tests/kfto/resources/mnist.py +++ b/tests/kfto/resources/mnist.py @@ -1,3 +1,17 @@ +# Copyright 2023. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os diff --git a/tests/kfto/resources/requirements-rocm.txt b/tests/kfto/resources/requirements-rocm.txt index 1880dc8f..6e2f7b93 100644 --- a/tests/kfto/resources/requirements-rocm.txt +++ b/tests/kfto/resources/requirements-rocm.txt @@ -1,3 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/rocm6.1 torchvision==0.19.0 -tensorboard==2.18.0 \ No newline at end of file +tensorboard==2.18.0 +fsspec[http]==2024.6.1 +numpy==2.0.2 \ No newline at end of file diff --git a/tests/kfto/resources/requirements.txt b/tests/kfto/resources/requirements.txt index e3ae7b3e..9352f8b6 100644 --- a/tests/kfto/resources/requirements.txt +++ b/tests/kfto/resources/requirements.txt @@ -1,2 +1,4 @@ torchvision==0.19.0 -tensorboard==2.18.0 \ No newline at end of file +tensorboard==2.18.0 +fsspec[http]==2024.6.1 +numpy==2.0.2 \ No newline at end of file