From 9dee8e92db75b1ad670fbc861a8da0be05856de4 Mon Sep 17 00:00:00 2001 From: J-shang <33053116+J-shang@users.noreply.github.com> Date: Mon, 18 Apr 2022 17:33:01 +0800 Subject: [PATCH] fix kubeflow pipeline (#4767) --- docs/source/index.rst | 2 +- docs/source/index_zh.rst | 3 ++- docs/source/release.rst | 2 +- .../training_services/frameworkcontroller.py | 1 + .../config/training_services/kubeflow.py | 1 + nni/tools/nnictl/config_schema.py | 2 ++ .../config/examples/cifar10_search_space.json | 2 +- test/config/training_service.yml | 4 +++- test/config/training_service_v2.yml | 2 ++ test/nni_test/nnitest/utils.py | 14 +++++++++---- ts/nni_manager/common/experimentConfig.ts | 3 ++- ts/nni_manager/config/aml/amlUtil.py | 3 ++- .../kubernetes/adl/adlApiClient.ts | 2 +- .../frameworkcontrollerTrainingService.ts | 11 +++++----- .../kubernetes/kubeflow/kubeflowApiClient.ts | 16 +++++++-------- .../kubernetes/kubeflow/kubeflowConfig.ts | 20 +++++++++++-------- .../kubeflow/kubeflowJobRestServer.ts | 4 ++-- .../kubeflow/kubeflowTrainingService.ts | 10 ++++++---- .../kubernetes/kubernetesApiClient.ts | 1 + .../kubernetes/kubernetesTrainingService.ts | 4 ++-- .../remoteMachineTrainingService.ts | 4 ++-- .../frameworkcontrollerEnvironmentService.ts | 3 ++- .../kubernetes/kubeflowEnvironmentService.ts | 5 +++-- .../kubernetesEnvironmentService.ts | 4 ++-- 24 files changed, 75 insertions(+), 48 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b55922911..5e8a96f584 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -128,7 +128,7 @@ NNI makes AutoML techniques plug-and-play .. codesnippetcard:: :icon: ../img/thumbnails/quantization-small.svg :title: Quantization - :link: tutorials/quantization_speedup + :link: tutorials/quantization_quick_start_mnist .. code-block:: diff --git a/docs/source/index_zh.rst b/docs/source/index_zh.rst index f0826332be..f87d2dee11 100644 --- a/docs/source/index_zh.rst +++ b/docs/source/index_zh.rst @@ -1,4 +1,5 @@ -.. f2a86f83def6c4b2e35ba50ce2487deb +.. dbd41cab307bcd76cc747b3d478709b8 + NNI 文档 ================= diff --git a/docs/source/release.rst b/docs/source/release.rst index 93d69d9c39..3d53727307 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,7 +5,7 @@ Change Log ========== -Release 2.7 - 4/14/2022 +Release 2.7 - 4/18/2022 ----------------------- Documentation diff --git a/nni/experiment/config/training_services/frameworkcontroller.py b/nni/experiment/config/training_services/frameworkcontroller.py index 5676127174..e196ac708c 100644 --- a/nni/experiment/config/training_services/frameworkcontroller.py +++ b/nni/experiment/config/training_services/frameworkcontroller.py @@ -46,6 +46,7 @@ class FrameworkControllerConfig(TrainingServiceConfig): service_account_name: Optional[str] task_roles: List[FrameworkControllerRoleConfig] reuse_mode: Optional[bool] = True + namespace: str = 'default' def _canonicalize(self, parents): super()._canonicalize(parents) diff --git a/nni/experiment/config/training_services/kubeflow.py b/nni/experiment/config/training_services/kubeflow.py index 15d1981698..9c76e649c1 100644 --- a/nni/experiment/config/training_services/kubeflow.py +++ b/nni/experiment/config/training_services/kubeflow.py @@ -43,6 +43,7 @@ class KubeflowConfig(TrainingServiceConfig): ps: Optional[KubeflowRoleConfig] = None master: Optional[KubeflowRoleConfig] = None reuse_mode: Optional[bool] = True #set reuse mode as true for v2 config + namespace: str = 'default' def _canonicalize(self, parents): super()._canonicalize(parents) diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index 997ad1fa36..a659ce5e1a 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -359,6 +359,7 @@ def validate(self, data): 'path': setType('path', str) }, Optional('reuse'): setType('reuse', bool), + Optional('namespace'): setType('namespace', str), }, { 'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'), 'apiVersion': setType('apiVersion', str), @@ -377,6 +378,7 @@ def validate(self, data): }, Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999), Optional('reuse'): setType('reuse', bool), + Optional('namespace'): setType('namespace', str), }) } diff --git a/test/config/examples/cifar10_search_space.json b/test/config/examples/cifar10_search_space.json index ca1c0d2034..5cd40c18fb 100644 --- a/test/config/examples/cifar10_search_space.json +++ b/test/config/examples/cifar10_search_space.json @@ -1,5 +1,5 @@ { "lr":{"_type":"choice", "_value":[0.1, 0.01, 0.001, 0.0001]}, "optimizer":{"_type":"choice", "_value":["SGD", "Adadelta", "Adagrad", "Adam", "Adamax"]}, - "model":{"_type":"choice", "_value":["vgg", "resnet18"]} + "model":{"_type":"choice", "_value":["vgg"]} } diff --git a/test/config/training_service.yml b/test/config/training_service.yml index 07e809414f..4e52786deb 100644 --- a/test/config/training_service.yml +++ b/test/config/training_service.yml @@ -18,6 +18,7 @@ kubeflow: azureStorage: accountName: azureShare: + namespace: kubeflow trial: worker: replicas: 1 @@ -35,7 +36,7 @@ frameworkcontroller: maxTrialNum: 2 trialConcurrency: 2 frameworkcontrollerConfig: - serviceAccountName: frameworkbarrier + serviceAccountName: frameworkcontroller storage: azureStorage keyVault: vaultName: @@ -43,6 +44,7 @@ frameworkcontroller: azureStorage: accountName: azureShare: + namespace: kubeflow trial: taskRoles: - name: worker diff --git a/test/config/training_service_v2.yml b/test/config/training_service_v2.yml index 85ee637817..0623ca587a 100644 --- a/test/config/training_service_v2.yml +++ b/test/config/training_service_v2.yml @@ -20,6 +20,7 @@ kubeflow: trainingService: reuseMode: true platform: kubeflow + namespace: kubeflow worker: command: code_directory: @@ -44,6 +45,7 @@ frameworkcontroller: trainingService: reuseMode: true platform: frameworkcontroller + namespace: kubeflow serviceAccountName: frameworkcontroller taskRoles: - name: worker diff --git a/test/nni_test/nnitest/utils.py b/test/nni_test/nnitest/utils.py index 300799003f..e3f5276036 100644 --- a/test/nni_test/nnitest/utils.py +++ b/test/nni_test/nnitest/utils.py @@ -122,12 +122,18 @@ def print_file_content(filepath): print(content, flush=True) def print_trial_job_log(training_service, trial_jobs_url): - trial_jobs = get_trial_jobs(trial_jobs_url) - for trial_job in trial_jobs: - trial_log_dir = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['trialJobId']) + trial_log_root = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials') + if not os.path.exists(trial_log_root): + print('trial log folder does not exist: {}'.format(trial_log_root), flush=True) + return + folders = os.listdir(trial_log_root) + for name in folders: + trial_log_dir = os.path.join(trial_log_root, name) log_files = ['stderr', 'trial.log'] if training_service == 'local' else ['stdout_log_collection.log'] for log_file in log_files: - print_file_content(os.path.join(trial_log_dir, log_file)) + log_file_path = os.path.join(trial_log_dir, log_file) + if os.path.exists(log_file_path): + print_file_content(log_file_path) def print_experiment_log(experiment_id): log_dir = get_nni_log_dir(experiment_id=experiment_id) diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index a4a38dcbff..a5454d6a7a 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -132,6 +132,7 @@ export interface KubeflowConfig extends TrainingServiceConfig { master?: KubeflowRoleConfig; reuseMode: boolean; maxTrialNumberPerGpu?: number; + namespace?: string; } export interface FrameworkControllerTaskRoleConfig { @@ -156,7 +157,7 @@ export interface FrameworkControllerConfig extends TrainingServiceConfig { taskRoles: FrameworkControllerTaskRoleConfig[]; reuseMode: boolean; maxTrialNumberPerGpu?: number; - namespace?: 'default'; + namespace?: string; apiVersion?: string; } diff --git a/ts/nni_manager/config/aml/amlUtil.py b/ts/nni_manager/config/aml/amlUtil.py index a5b2a6bc2d..2237506b00 100644 --- a/ts/nni_manager/config/aml/amlUtil.py +++ b/ts/nni_manager/config/aml/amlUtil.py @@ -52,7 +52,8 @@ print('stop_result:failed') exit(0) loop_count += 1 - time.sleep(500) + time.sleep(5) + status = run.get_status() print('stop_result:success') exit(0) elif line == 'receive': diff --git a/ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts b/ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts index a8d8607f57..c93fa48128 100644 --- a/ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts +++ b/ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts @@ -11,7 +11,7 @@ class AdlClientV1 extends KubernetesCRDClient { /** * constructor, to initialize adl CRD definition */ - protected readonly namespace: string; + public readonly namespace: string; public constructor(namespace: string) { super(); diff --git a/ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index 8683e6c6c2..0acb5a0101 100644 --- a/ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ b/ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -118,7 +118,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple } else { configTaskRoles = this.parseCustomTaskRoles(this.fcTemplate.spec.taskRoles) } - const namespace = this.fcClusterConfig.namespace ? this.fcClusterConfig.namespace : "default"; + const namespace = this.fcClusterConfig.namespace ?? "default"; this.genericK8sClient.setNamespace = namespace; if (this.kubernetesRestServerPort === undefined) { @@ -134,7 +134,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple const trialJobId: string = uniqueString(5); // Set trial's NFS working folder const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); - const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); + const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials', trialJobId); let frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase(); let frameworkcontrollerJobConfig: any; @@ -204,6 +204,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple let namespace: string | undefined; this.fcClusterConfig = FrameworkControllerClusterConfigFactory .generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject); + this.genericK8sClient.setNamespace = this.fcClusterConfig.namespace ?? "default"; if (this.fcClusterConfig.storageType === 'azureStorage') { const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure = this.fcClusterConfig; @@ -346,8 +347,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple for (const taskRole of configTaskRoles) { const runScriptContent: string = await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder, - this.generateCommandScript(configTaskRoles, taskRole.command), form.sequenceId.toString(), - taskRole.name, taskRole.gpuNum ? taskRole.gpuNum : 0); + this.generateCommandScript(configTaskRoles, taskRole.command), + form.sequenceId.toString(), taskRole.name, taskRole.gpuNum ? taskRole.gpuNum : 0); await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, {encoding: 'utf8'}); } @@ -439,7 +440,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple kind: 'Framework', metadata: { name: frameworkcontrollerJobName, - namespace: this.fcClusterConfig.namespace ? this.fcClusterConfig.namespace : "default", + namespace: this.fcClusterConfig.namespace ?? "default", labels: { app: this.NNI_KUBERNETES_TRIAL_LABEL, expId: getExperimentId(), diff --git a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts index 978c924786..02a4a30e1f 100644 --- a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts +++ b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts @@ -17,7 +17,7 @@ class TFOperatorClientV1Alpha2 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').tfjobs; + return this.client.apis['kubeflow.org'].v1alpha2.namespaces(this.namespace).tfjobs; } public get containerName(): string { @@ -36,7 +36,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').tfjobs; + return this.client.apis['kubeflow.org'].v1beta1.namespaces(this.namespace).tfjobs; } public get containerName(): string { @@ -55,7 +55,7 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs; + return this.client.apis['kubeflow.org'].v1beta2.namespaces(this.namespace).tfjobs; } public get containerName(): string { @@ -74,7 +74,7 @@ class TFOperatorClientV1 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1.namespaces('default').tfjobs; + return this.client.apis['kubeflow.org'].v1.namespaces(this.namespace).tfjobs; } public get containerName(): string { @@ -92,7 +92,7 @@ class PyTorchOperatorClientV1 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1.namespaces('default').pytorchjobs; + return this.client.apis['kubeflow.org'].v1.namespaces(this.namespace).pytorchjobs; } public get containerName(): string { @@ -110,7 +110,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').pytorchjobs; + return this.client.apis['kubeflow.org'].v1alpha2.namespaces(this.namespace).pytorchjobs; } public get containerName(): string { @@ -129,7 +129,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').pytorchjobs; + return this.client.apis['kubeflow.org'].v1beta1.namespaces(this.namespace).pytorchjobs; } public get containerName(): string { @@ -148,7 +148,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient { } protected get operator(): any { - return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').pytorchjobs; + return this.client.apis['kubeflow.org'].v1beta2.namespaces(this.namespace).pytorchjobs; } public get containerName(): string { diff --git a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts index cffc88d376..735937f280 100644 --- a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts +++ b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts @@ -18,8 +18,8 @@ export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2' | 'v1'; */ export class KubeflowClusterConfig extends KubernetesClusterConfig { public readonly operator: KubeflowOperator; - constructor(apiVersion: string, operator: KubeflowOperator) { - super(apiVersion); + constructor(apiVersion: string, operator: KubeflowOperator, namespace?: string) { + super(apiVersion, undefined, namespace); this.operator = operator; } } @@ -30,9 +30,10 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { operator: KubeflowOperator, apiVersion: string, nfs: NFSConfig, - storage?: KubernetesStorageKind + storage?: KubernetesStorageKind, + namespace?: string ) { - super(apiVersion, nfs, storage); + super(apiVersion, nfs, storage, namespace); this.operator = operator; } @@ -48,7 +49,8 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { kubeflowClusterConfigObjectNFS.operator, kubeflowClusterConfigObjectNFS.apiVersion, kubeflowClusterConfigObjectNFS.nfs, - kubeflowClusterConfigObjectNFS.storage + kubeflowClusterConfigObjectNFS.storage, + kubeflowClusterConfigObjectNFS.namespace ); } } @@ -61,9 +63,10 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure { apiVersion: string, keyVault: KeyVaultConfig, azureStorage: AzureStorage, - storage?: KubernetesStorageKind + storage?: KubernetesStorageKind, + namespace?: string ) { - super(apiVersion, keyVault, azureStorage, storage); + super(apiVersion, keyVault, azureStorage, storage, undefined, namespace); this.operator = operator; } @@ -79,7 +82,8 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure { kubeflowClusterConfigObjectAzure.apiVersion, kubeflowClusterConfigObjectAzure.keyVault, kubeflowClusterConfigObjectAzure.azureStorage, - kubeflowClusterConfigObjectAzure.storage + kubeflowClusterConfigObjectAzure.storage, + kubeflowClusterConfigObjectAzure.namespace ); } } diff --git a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts index cee5540076..b42e4cb542 100644 --- a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts +++ b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts @@ -14,7 +14,7 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer { /** * constructor to provide NNIRestServer's own rest property, e.g. port */ - constructor() { - super(component.get(KubeflowTrainingService)); + constructor(kubeflowTrainingService: KubeflowTrainingService) { + super(kubeflowTrainingService); } } diff --git a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index 7fee90b2ff..bdcac8645f 100644 --- a/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -69,7 +69,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } if (this.kubernetesRestServerPort === undefined) { - const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer); + const restServer: KubeflowJobRestServer = new KubeflowJobRestServer(this); this.kubernetesRestServerPort = restServer.clusterRestServerPort; } @@ -81,7 +81,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber const trialJobId: string = uniqueString(5); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase(); - const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); + const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials', trialJobId); //prepare the runscript await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form); //upload script files to sotrage @@ -120,6 +120,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: { const kubeflowClusterJsonObject: object = JSON.parse(value); this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject); + this.genericK8sClient.setNamespace = this.kubeflowClusterConfig.namespace ?? "default"; if (this.kubeflowClusterConfig.storageType === 'azureStorage') { const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = this.kubeflowClusterConfig; this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName; @@ -137,6 +138,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } this.kubernetesCRDClient = KubeflowOperatorClientFactory.createClient( this.kubeflowClusterConfig.operator, this.kubeflowClusterConfig.apiVersion); + this.kubernetesCRDClient.namespace = this.kubeflowClusterConfig.namespace ?? "default"; break; } case TrialConfigMetadataKey.TRIAL_CONFIG: { @@ -310,7 +312,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber // Generate kubeflow job resource config object const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources); - + this.log.info('kubeflowJobConfig:', kubeflowJobConfig); return Promise.resolve(kubeflowJobConfig); } @@ -368,7 +370,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber kind: this.kubernetesCRDClient.jobKind, metadata: { name: kubeflowJobName, - namespace: 'default', + namespace: this.kubernetesCRDClient.namespace, labels: { app: this.NNI_KUBERNETES_TRIAL_LABEL, expId: getExperimentId(), diff --git a/ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts b/ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts index f4a735822b..72e839010f 100644 --- a/ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts +++ b/ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts @@ -150,6 +150,7 @@ abstract class KubernetesCRDClient { protected readonly client: any; protected readonly log: Logger = getLogger('KubernetesCRDClient'); protected crdSchema: any; + public namespace: string = 'default'; constructor() { this.client = new Client1_10({config: getKubernetesConfig()}); diff --git a/ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 2682570a74..103b8087f2 100644 --- a/ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -230,7 +230,7 @@ abstract class KubernetesTrainingService { this.azureStorageSecretName = String.Format('nni-secret-{0}', uniqueString(8) .toLowerCase()); - const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default" + const namespace = this.genericK8sClient.getNamespace ?? "default"; await this.genericK8sClient.createSecret( { apiVersion: 'v1', @@ -330,7 +330,7 @@ abstract class KubernetesTrainingService { const body = fs.readFileSync(filePath).toString('base64'); const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8) .toLowerCase()); - const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default" + const namespace = this.genericK8sClient.getNamespace ?? "default"; await this.genericK8sClient.createSecret( { apiVersion: 'v1', diff --git a/ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index e5b205afdd..599f1c54ac 100644 --- a/ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -467,7 +467,7 @@ class RemoteMachineTrainingService implements TrainingService { throw new Error(`Can not get trial job detail for job: ${trialJobId}`); } - const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); + const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials', trialJobId); await executor.createFolder(executor.joinPath(trialJobDetail.workingDirectory, '.nni')); @@ -582,7 +582,7 @@ class RemoteMachineTrainingService implements TrainingService { const executor = await this.getExecutor(trialJobId); const trialWorkingFolder: string = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobId); - const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); + const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const fileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, fileName); diff --git a/ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts index c823b5ca93..012f862337 100644 --- a/ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts @@ -29,6 +29,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment this.config = config; // Create kubernetesCRDClient this.kubernetesCRDClient = FrameworkControllerClientFactory.createClient(this.config.namespace); + this.genericK8sClient.setNamespace = this.config.namespace ?? "default" // Create storage if (this.config.storage.storageType === 'azureStorage') { if (this.config.storage.azureShare === undefined || @@ -194,7 +195,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment kind: 'Framework', metadata: { name: frameworkcontrollerJobName, - namespace: this.config.namespace ? this.config.namespace : "default", + namespace: this.config.namespace ?? "default", labels: { app: this.NNI_KUBERNETES_TRIAL_LABEL, expId: this.experimentId, diff --git a/ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts index 2a0b84dde0..bd295e9ff2 100644 --- a/ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts @@ -26,6 +26,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { // Create kubernetesCRDClient this.kubernetesCRDClient = KubeflowOperatorClientFactory.createClient( this.config.operator, this.config.apiVersion); + this.kubernetesCRDClient.namespace = this.config.namespace ?? "default"; // Create storage if (this.config.storage.storageType === 'azureStorage') { if (this.config.storage.azureShare === undefined || @@ -41,7 +42,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { this.config.operator, this.config.apiVersion, keyValutConfig, azureStorage); this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName; this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare; - + this.genericK8sClient.setNamespace = this.config.namespace ?? "default"; this.createStoragePromise = this.createAzureStorage( azureKubeflowClusterConfig.keyVault.vaultName, azureKubeflowClusterConfig.keyVault.name @@ -186,7 +187,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { kind: this.kubernetesCRDClient.jobKind, metadata: { name: kubeflowJobName, - namespace: 'default', + namespace: this.kubernetesCRDClient.namespace, labels: { app: this.NNI_KUBERNETES_TRIAL_LABEL, expId: this.experimentId, diff --git a/ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts index 95b6d1c609..268a059bf8 100644 --- a/ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts @@ -77,7 +77,7 @@ export class KubernetesEnvironmentService extends EnvironmentService { if (this.genericK8sClient === undefined) { throw new Error("genericK8sClient undefined!"); } - const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default" + const namespace = this.genericK8sClient.getNamespace ?? "default"; await this.genericK8sClient.createSecret( { apiVersion: 'v1', @@ -180,7 +180,7 @@ export class KubernetesEnvironmentService extends EnvironmentService { const body = fs.readFileSync(filePath).toString('base64'); const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8) .toLowerCase()); - const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default" + const namespace = this.genericK8sClient.getNamespace ?? "default"; await this.genericK8sClient.createSecret( { apiVersion: 'v1',