From 1d5ec5df5f487e354823721c14264c4de3ac08cf Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Wed, 2 Jun 2021 09:17:07 -0700 Subject: [PATCH 1/8] Deleted stale file --- playbooks/roles/packages/tasks/el-7.yml | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100755 playbooks/roles/packages/tasks/el-7.yml diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml deleted file mode 100755 index 68cbbad8..00000000 --- a/playbooks/roles/packages/tasks/el-7.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -- name: Make sure python OpenSSL is installed - yum: - name: - - pyOpenSSL - - python2-cryptography - state: latest - - -- name: Make sure python3-oci-cli is installed - yum: - name: - - python3-oci-cli - state: latest From b863ca62dc97067de73ab8a0a5a3de43c74e909d Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Tue, 15 Jun 2021 11:40:27 -0700 Subject: [PATCH 2/8] Improved marketplace handling. New image. --- autoscaling/create_cluster.sh | 3 ++ autoscaling/tf_init/bastion_update.tf | 3 +- autoscaling/tf_init/locals.tf | 8 ++-- autoscaling/tf_init/marketplace.tf | 10 ++--- autoscaling/variables.tpl | 27 ++++++------ bastion.tf | 3 +- cluster-network.tf | 4 ++ instance-pool.tf | 5 ++- inventory.tpl | 2 + locals.tf | 8 ++-- marketplace.tf | 9 ++-- playbooks/roles/etc-hosts/tasks/common.yml | 4 +- playbooks/roles/packages/tasks/el-7.yml | 14 +++++++ .../roles/rdma-interface/tasks/debian.yml | 2 +- playbooks/roles/rdma-interface/tasks/el.yml | 2 +- playbooks/roles/slurm/templates/slurm.conf.j2 | 12 +++++- schema.yaml | 42 ++++++++++++++----- variables.tf | 27 ++++++------ 18 files changed, 122 insertions(+), 63 deletions(-) create mode 100755 playbooks/roles/packages/tasks/el-7.yml diff --git a/autoscaling/create_cluster.sh b/autoscaling/create_cluster.sh index d6190da9..d7feaa98 100755 --- a/autoscaling/create_cluster.sh +++ b/autoscaling/create_cluster.sh @@ -13,6 +13,9 @@ cd $folder/clusters/$2 if [[ $3 == 
VM.Standard.E3.* ]] then sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Standard.E3.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3:15}/g" $folder/tf_init/variables.tf > variables.tf +elif [[ $3 == VM.Optimized3.* ]] +then + sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Optimized3.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3:14}/g" $folder/tf_init/variables.tf > variables.tf elif [[ $3 == VM.Standard.E4.* ]] then sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Standard.E4.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3:15}/g" $folder/tf_init/variables.tf > variables.tf diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 8ca94588..50ad5f66 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -43,7 +43,8 @@ resource "local_file" "inventory" { cluster_mount_ip = local.mount_ip, cluster_name = local.cluster_name, shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus=var.instance_pool_ocpus + instance_pool_ocpus=var.instance_pool_ocpus, + unsupported = var.unsupported }) filename = "${local.bastion_path}/inventory" } diff --git a/autoscaling/tf_init/locals.tf b/autoscaling/tf_init/locals.tf index f613cc7e..ef8ebaaa 100755 --- a/autoscaling/tf_init/locals.tf +++ b/autoscaling/tf_init/locals.tf @@ -2,7 +2,7 @@ locals { // display names of instances cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name - + image_ocid = var.unsupported ? var.image_ocid : var.image // ips of the instances cluster_instances_ips = var.cluster_network ? 
data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -14,14 +14,14 @@ locals { cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id - cluster_network_image = var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : var.image + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid - instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : var.image + instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id // is_bastion_flex_shape = var.bastion_shape == "VM.Standard.E3.Flex" ? [var.bastion_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] // bastion_mount_ip = var.bastion_block ? 
element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" diff --git a/autoscaling/tf_init/marketplace.tf b/autoscaling/tf_init/marketplace.tf index 88ee7b15..2971c739 100755 --- a/autoscaling/tf_init/marketplace.tf +++ b/autoscaling/tf_init/marketplace.tf @@ -1,6 +1,6 @@ locals { - listing_number = split(".", var.marketplace_listing)[0] - mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id[local.listing_number] : "" + mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id : "" + mp_version_id = split(".", var.marketplace_listing)[0] } /* @@ -25,12 +25,11 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "mp_image_agr count = var.use_marketplace_image ? 1 : 0 listing_id = local.mp_listing_id - listing_resource_version = data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_version - + listing_resource_version = var.marketplace_version_id[local.mp_version_id] } resource "oci_core_app_catalog_subscription" "mp_image_subscription" { - count = var.use_marketplace_image ? 1 : 0 + count = var.use_marketplace_image && var.node_count > 0 ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].listing_id @@ -43,4 +42,3 @@ resource "oci_core_app_catalog_subscription" "mp_image_subscription" { create = "20m" } } - diff --git a/autoscaling/variables.tpl b/autoscaling/variables.tpl index 29c5bc69..0b79bef8 100755 --- a/autoscaling/variables.tpl +++ b/autoscaling/variables.tpl @@ -27,23 +27,24 @@ variable "spack" { default = ${spack} } variable "instance_pool_ocpus" { default = "##OCPU##"} variable "instance_pool_memory" { default = ${instance_pool_memory} } variable "instance_pool_custom_memory" { default = ${instance_pool_custom_memory} } -variable "marketplace_listing" { -/* - Allowed values - "1. Oracle Linux 7.8 OFED5.0" - "2. Oracle Linux 7.5 OFED4.4" -*/ - default = "${marketplace_listing}" -} -variable "marketplace_listing_id" { +variable "marketplace_listing" { + default = "${marketplace_listing}" +} + +variable "marketplace_version_id" { type = map(string) default = { - "1" = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" - "2" = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" + "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" + "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" + "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" } } +variable "marketplace_listing_id" { + default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" +} + variable "bastion_block_volume_performance" { /* Allowed values @@ -75,4 +76,6 @@ variable "nfs_source_IP" { default = "${nfs_source_IP}"} variable "nfs_source_path" { default = "${nfs_source_path}"} variable "nfs_options" { default = "${nfs_options}"} -variable "hyperthreading" { default = ${hyperthreading} } \ No newline at end of file +variable "hyperthreading" { default = 
${hyperthreading} } +variable "unsupported" { default = ${unsupported} } +variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } diff --git a/bastion.tf b/bastion.tf index 63cf14ac..983f6ae0 100644 --- a/bastion.tf +++ b/bastion.tf @@ -274,7 +274,8 @@ resource "null_resource" "cluster" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, monitoring = var.monitoring, - hyperthreading = var.hyperthreading + hyperthreading = var.hyperthreading, + unsupported = var.unsupported }) destination = "/home/${var.bastion_username}/autoscaling/tf_init/variables.tf" diff --git a/cluster-network.tf b/cluster-network.tf index 598f5b95..6ab521fa 100755 --- a/cluster-network.tf +++ b/cluster-network.tf @@ -26,6 +26,10 @@ resource "oci_core_cluster_network" "cluster_network" { size = var.node_count display_name = local.cluster_name } + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } placement_configuration { availability_domain = var.ad primary_subnet_id = local.subnet_id diff --git a/instance-pool.tf b/instance-pool.tf index 70d7f583..06498d81 100755 --- a/instance-pool.tf +++ b/instance-pool.tf @@ -25,7 +25,10 @@ resource "oci_core_instance_pool" "instance_pool" { instance_configuration_id = oci_core_instance_configuration.instance_pool_configuration[0].id size = var.node_count display_name = local.cluster_name - + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } placement_configurations { availability_domain = var.ad primary_subnet_id = local.subnet_id diff --git a/inventory.tpl b/inventory.tpl index bd6b6237..5d810b59 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -5,7 +5,9 @@ ${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role ${host} ansible_host=${ip} ansible_user=${compute_username} role=compute %{ endfor ~} [nfs] +%{ if nfs != "" } ${nfs} 
ansible_user=${compute_username} role=nfs +%{ endif } [all:children] bastion compute diff --git a/locals.tf b/locals.tf index a14eda9c..9f026129 100755 --- a/locals.tf +++ b/locals.tf @@ -21,14 +21,14 @@ locals { cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id - cluster_network_image = var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : local.image_ocid + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid - instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : local.image_ocid + instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id - is_bastion_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] + is_bastion_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? 
element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" diff --git a/marketplace.tf b/marketplace.tf index 87e0de25..153c8081 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -1,6 +1,7 @@ locals { - listing_number = split(".", var.marketplace_listing)[0] - mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id[local.listing_number] : "" +// listing_number = split(".", var.marketplace_listing)[0] + mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id : "" + mp_version_id = split(".", var.marketplace_listing)[0] } /* @@ -25,12 +26,12 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "mp_image_agr count = var.use_marketplace_image ? 1 : 0 listing_id = local.mp_listing_id - listing_resource_version = data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_version + listing_resource_version = var.marketplace_version_id[local.mp_version_id] } resource "oci_core_app_catalog_subscription" "mp_image_subscription" { - count = var.use_marketplace_image && var.node_count > 0 ? 1 : 0 + count = var.use_marketplace_image ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].listing_id diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 4d521898..38872a6e 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -30,7 +30,7 @@ marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" delegate_to: 127.0.0.1 run_once: true - when: not destroy|bool + when: not destroy|bool and groups['compute']|length > 0 - name: add cluster nodes to the /etc/hosts file of the bastion blockinfile: @@ -41,7 +41,7 @@ marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" delegate_to: 127.0.0.1 run_once: true - when: not destroy|bool + when: not destroy|bool and groups['compute']|length > 0 - name: move /etc/hosts on all compute nodes become: true diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml new file mode 100755 index 00000000..68cbbad8 --- /dev/null +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -0,0 +1,14 @@ +--- +- name: Make sure python OpenSSL is installed + yum: + name: + - pyOpenSSL + - python2-cryptography + state: latest + + +- name: Make sure python3-oci-cli is installed + yum: + name: + - python3-oci-cli + state: latest diff --git a/playbooks/roles/rdma-interface/tasks/debian.yml b/playbooks/roles/rdma-interface/tasks/debian.yml index ad702349..31de07ce 100644 --- a/playbooks/roles/rdma-interface/tasks/debian.yml +++ b/playbooks/roles/rdma-interface/tasks/debian.yml @@ -36,5 +36,5 @@ - name: restart_interface command: bash -c "ifdown {{ item['item']['device'] }} && ifup {{ item['item']['device'] }}" - when: item.changed + when: ansible_mlx is defined and item.changed|default(false) with_items: "{{ rdma_interface['results'] }}" diff --git 
a/playbooks/roles/rdma-interface/tasks/el.yml b/playbooks/roles/rdma-interface/tasks/el.yml index 51206767..62b0d1d1 100755 --- a/playbooks/roles/rdma-interface/tasks/el.yml +++ b/playbooks/roles/rdma-interface/tasks/el.yml @@ -36,5 +36,5 @@ - name: restart_interface command: bash -c "ifdown {{ item['item']['device'] }} && ifup {{ item['item']['device'] }}" - when: item.changed + when: ansible_mlx is defined and item.changed|default(false) with_items: "{{ rdma_interface['results'] }}" diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index d21e1298..d8363118 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -59,6 +59,10 @@ NodeName=cluster-[1-20]-amd3{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 Co {% endfor %} NodeName=cluster-[1-20]-amd3128-node-[1-20] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%} cluster-size-{{ item }},{%- endfor %}BM.Standard.E3.128 +{% for ocpu in range(1,19) -%} +NodeName=cluster-[1-20]-optimized3{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ ocpu }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%} + cluster-size-{{ item }},{%- endfor %}VM.Optimized3.{{ ocpu }} +{% endfor %} {% for ocpu in range(1,65) -%} NodeName=cluster-[1-20]-amd4{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ ocpu }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%} cluster-size-{{ item }},{%- endfor %}VM.Standard.E4.{{ ocpu }} @@ -109,6 +113,12 @@ NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=2 CoresPerSocket=32 Thread {% elif shape == "VM.Standard.E3.Flex" %} NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in 
range(groups['compute'] | length + 1) -%} cluster-size-{{ item }},{%- endfor %}VM.Standard.E3.{{ instance_pool_ocpus }} +{% elif shape == "VM.Optimized3.Flex" %} +NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%} + cluster-size-{{ item }},{%- endfor %}VM.Optimized3.{{ instance_pool_ocpus }} +{% elif shape == "VM.Standard.E4.Flex" %} +NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%} + cluster-size-{{ item }},{%- endfor %}VM.Standard.E4.{{ instance_pool_ocpus }} {% else %} {% if "BM" in shape %} NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=2 CoresPerSocket={{ (shape.split('.')[-1]|int / 2)|int}} ThreadsPerCore=1 State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%} @@ -119,4 +129,4 @@ NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ shape. 
{% endif %} {% endif %} {% endfor %} -PartitionName=compute Nodes=ALL Default=YES MaxTime=INFINITE State=UP \ No newline at end of file +PartitionName=compute Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/schema.yaml b/schema.yaml index 50407e6e..f1710c6b 100755 --- a/schema.yaml +++ b/schema.yaml @@ -21,7 +21,6 @@ variableGroups: - title: "Cluster configuration" variables: - ${targetCompartment} - - ${ad} - ${ssh_key} - ${use_custom_name} - ${cluster_name} @@ -51,6 +50,7 @@ variableGroups: - ${bastion_block_volume_performance} - title: "Compute node options" variables: + - ${ad} - ${cluster_network} - ${cluster_network_shape} - ${instance_pool_shape} @@ -68,6 +68,13 @@ variableGroups: - ${compute_image_compartment} - ${image} - ${image_ocid} + - title: "API authentication" + variables: + - ${inst_prin} + - ${api_user_ocid} + - ${api_fingerprint} + - ${api_user_key} + visible: ${autoscaling} - title: "Additional file system" variables: - ${add_nfs} @@ -103,13 +110,6 @@ variableGroups: - ${slurm} - ${spack} - ${monitoring} - - title: "API authentication" - variables: - - ${inst_prin} - - ${api_user_ocid} - - ${api_fingerprint} - - ${api_user_key} - visible: ${autoscaling} - title: "Hidden" variables: - ${region} @@ -121,6 +121,7 @@ variableGroups: - ${marketplace_listing_id} - ${ssh_cidr} - ${marketplace_source_images} + - ${marketplace_version_id} visible: false - title: "Debug" variables: @@ -189,6 +190,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" + - eq: + - ${bastion_shape} + - "VM.Optimized3.Flex" required: false bastion_custom_memory: title: Use custom memory size @@ -200,6 +204,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E3.Flex" + - eq: + - ${bastion_shape} + - "VM.Optimized3.Flex" - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" @@ -217,6 +224,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E3.Flex" + - eq: + - ${bastion_shape} + - "VM.Optimized3.Flex" - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" @@ 
-393,6 +403,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E3.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Optimized3.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" @@ -408,6 +421,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E3.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Optimized3.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" @@ -425,6 +441,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E3.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Optimized3.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" @@ -468,9 +487,10 @@ variables: description: "Marketplace listing to use" required: true enum: - - "1. Oracle Linux 7.8 OFED5.0" - - "2. Oracle Linux 7.5 OFED4.4" - default: "1. Oracle Linux 7.8 OFED5.0" + - "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" + - "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826" + - "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229" + default: "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" visible: and: - ${use_marketplace_image} diff --git a/variables.tf b/variables.tf index 0261545e..61bc2c4e 100755 --- a/variables.tf +++ b/variables.tf @@ -46,23 +46,22 @@ variable "instance_pool_custom_memory" { default = false } variable "bastion_memory" { default = 16 } variable "bastion_custom_memory" { default = false } variable "marketplace_listing" { -/* - Allowed values - "1. Oracle Linux 7.8 OFED5.0" - "2. Oracle Linux 7.5 OFED4.4" -*/ - - default = "1. Oracle Linux 7.8 OFED5.0" + default = "1. 
Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" } -variable "marketplace_listing_id" { - type = map(string) - default = { - "1" = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" - "2" = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" +variable "marketplace_version_id" { + type = map(string) + default = { + "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" + "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" + "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" } } +variable "marketplace_listing_id" { + default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" +} + variable "bastion_block_volume_performance" { /* Allowed values @@ -109,12 +108,12 @@ variable "monitoring" { default = true } variable "unsupported" { type=bool - default = "false" + default = false } variable "unsupported_bastion" { type=bool - default = "false" + default = false } variable "bastion_username" { From 81cecb47a1b345cf7405a4e9d6fc4420bf029041 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Tue, 15 Jun 2021 11:41:31 -0700 Subject: [PATCH 3/8] Updated readme --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 029773eb..f70df627 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,17 @@ allow service compute_management to manage compute-management-family in tenancy allow service compute_management to read app-catalog-listing in tenancy allow group user to manage all-resources in compartment compartmentName ``` -## Policies for autoscaling: + +## What is cluster resizing (resize.py) ? +TODO + +## What is cluster autoscaling ? +TODO + +## How is resizing different from autoscaling ? 
+TODO + +## Policies for autoscaling or resizing: As described when you specify your variables, if you select instance-principal as way of authenticating your node, make sure your generate a dynamic group and give the following policies to it: ``` Allow dynamic-group instance_principal to read app-catalog-listing in tenancy @@ -27,6 +37,11 @@ or: `Allow dynamic-group instance_principal to manage all-resources in compartment compartmentName` + +# Resizing (via resize.py or OCI console) +TODO + + # Autoscaling The autoscaling will work in a “cluster per job” approach. This means that for job waiting in the queue, we will launch new cluster specifically for that job. Autoscaling will also take care of spinning down clusters. By default, a cluster is left Idle for 10 minutes before shutting down. Autoscaling is achieved with a cronjob to be able to quickly switch from one scheduler to the next. From cd6fa449a434343a1f62a86696f7f05d2cba8925 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Mon, 21 Jun 2021 17:02:05 -0700 Subject: [PATCH 4/8] add ldap variable to autoscaling --- autoscaling/variables.tpl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autoscaling/variables.tpl b/autoscaling/variables.tpl index 0b79bef8..c8336cb1 100755 --- a/autoscaling/variables.tpl +++ b/autoscaling/variables.tpl @@ -77,5 +77,9 @@ variable "nfs_source_path" { default = "${nfs_source_path}"} variable "nfs_options" { default = "${nfs_options}"} variable "hyperthreading" { default = ${hyperthreading} } +<<<<<<< HEAD variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } +======= +Variable “ldap” { default = “${ldap}” } +>>>>>>> a90a58f (add ldap variable to autoscaling) From 51aa5c6d765ea9efeabab328a8cd12c6a762ecce Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Mon, 21 Jun 2021 17:47:34 -0700 Subject: [PATCH 5/8] Revert "add ldap variable to autoscaling" This 
reverts commit cd6fa449a434343a1f62a86696f7f05d2cba8925. --- autoscaling/variables.tpl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/autoscaling/variables.tpl b/autoscaling/variables.tpl index c8336cb1..0b79bef8 100755 --- a/autoscaling/variables.tpl +++ b/autoscaling/variables.tpl @@ -77,9 +77,5 @@ variable "nfs_source_path" { default = "${nfs_source_path}"} variable "nfs_options" { default = "${nfs_options}"} variable "hyperthreading" { default = ${hyperthreading} } -<<<<<<< HEAD variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } -======= -Variable “ldap” { default = “${ldap}” } ->>>>>>> a90a58f (add ldap variable to autoscaling) From b3e7beae355a9e4d7fd482d3efa49770d9473ee7 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Mon, 21 Jun 2021 17:48:14 -0700 Subject: [PATCH 6/8] add ldap variable to autoscaling --- autoscaling/variables.tpl | 1 + 1 file changed, 1 insertion(+) diff --git a/autoscaling/variables.tpl b/autoscaling/variables.tpl index 0b79bef8..7bb27a80 100755 --- a/autoscaling/variables.tpl +++ b/autoscaling/variables.tpl @@ -79,3 +79,4 @@ variable "nfs_options" { default = "${nfs_options}"} variable "hyperthreading" { default = ${hyperthreading} } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } +variable "ldap" { default = "${ldap}" } From ffdbf05024315ed1fcb41a5f5e1d8c4f9f92eb4e Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Mon, 21 Jun 2021 18:02:58 -0700 Subject: [PATCH 7/8] Run SLURM and SPACK installers before proceeding to compute hosts --- playbooks/site.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/playbooks/site.yml b/playbooks/site.yml index 2a95759a..92628b9f 100755 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -170,7 +170,7 @@ name: grafana 
when: monitoring|default(false)|bool -- hosts: all +- hosts: bastion vars: destroy: false initial: true @@ -179,6 +179,19 @@ - include_role: name: spack when: spack|default(false)|bool + - include_role: + name: telegraf + when: monitoring|default(false)|bool + - include_role: + name: slurm + when: slurm|default(false)|bool + +- hosts: compute + vars: + destroy: false + initial: true + download_path: "{{ '/nfs/cluster/' if cluster_nfs|bool else '/tmp' }}" + tasks: - include_role: name: slurm when: slurm|default(false)|bool From ddd684911444e1188e846f55976dfde809bc84bd Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Fri, 9 Jul 2021 08:59:13 +0200 Subject: [PATCH 8/8] Remove quotes from variable --- autoscaling/variables.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoscaling/variables.tpl b/autoscaling/variables.tpl index 7bb27a80..bbea9f62 100755 --- a/autoscaling/variables.tpl +++ b/autoscaling/variables.tpl @@ -79,4 +79,4 @@ variable "nfs_options" { default = "${nfs_options}"} variable "hyperthreading" { default = ${hyperthreading} } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } -variable "ldap" { default = "${ldap}" } +variable "ldap" { default = ${ldap} }