Skip to content

Commit

Permalink
Merge pull request #28 from oci-hpc/dev
Browse files Browse the repository at this point in the history
Merge current fixes from dev
  • Loading branch information
MarcinZablocki authored Jul 9, 2021
2 parents a90a58f + ddd6849 commit b5c3378
Show file tree
Hide file tree
Showing 14 changed files with 116 additions and 60 deletions.
3 changes: 3 additions & 0 deletions autoscaling/create_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ cd $folder/clusters/$2
if [[ $3 == VM.Standard.E3.* ]]
then
sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Standard.E3.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3:15}/g" $folder/tf_init/variables.tf > variables.tf
elif [[ $3 == VM.Optimized3.* ]]
then
# $3 looks like "VM.Optimized3.<ocpus>". The prefix "VM.Optimized3." is 14 characters long,
# one fewer than "VM.Standard.E3.", so the fixed offset 15 used by the other branches would
# drop the first digit of the OCPU count. Strip the prefix by name to get the full count.
sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Optimized3.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3#VM.Optimized3.}/g" $folder/tf_init/variables.tf > variables.tf
elif [[ $3 == VM.Standard.E4.* ]]
then
sed "s/##NODES##/$1/g;s/##NAME##/$2/g;s/##SHAPE##/VM.Standard.E4.Flex/g;s/##CN##/$4/g;s/##OCPU##/${3:15}/g" $folder/tf_init/variables.tf > variables.tf
Expand Down
3 changes: 2 additions & 1 deletion autoscaling/tf_init/bastion_update.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ resource "local_file" "inventory" {
cluster_mount_ip = local.mount_ip,
cluster_name = local.cluster_name,
shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape,
instance_pool_ocpus=var.instance_pool_ocpus
instance_pool_ocpus=var.instance_pool_ocpus,
unsupported = var.unsupported
})
filename = "${local.bastion_path}/inventory"
}
Expand Down
8 changes: 4 additions & 4 deletions autoscaling/tf_init/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ locals {
// display names of instances
cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id
cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name

image_ocid = var.unsupported ? var.image_ocid : var.image
// ips of the instances
cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip

Expand All @@ -14,14 +14,14 @@ locals {

cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id

cluster_network_image = var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : var.image
cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid

instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : var.image
instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid

// image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id

// is_bastion_flex_shape = var.bastion_shape == "VM.Standard.E3.Flex" ? [var.bastion_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[]

// bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none"

Expand Down
10 changes: 4 additions & 6 deletions autoscaling/tf_init/marketplace.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
locals {
listing_number = split(".", var.marketplace_listing)[0]
mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id[local.listing_number] : ""
mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id : ""
mp_version_id = split(".", var.marketplace_listing)[0]
}

/*
Expand All @@ -25,12 +25,11 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "mp_image_agr
count = var.use_marketplace_image ? 1 : 0

listing_id = local.mp_listing_id
listing_resource_version = data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_version

listing_resource_version = var.marketplace_version_id[local.mp_version_id]
}

resource "oci_core_app_catalog_subscription" "mp_image_subscription" {
count = var.use_marketplace_image ? 1 : 0
count = var.use_marketplace_image && var.node_count > 0 ? 1 : 0
compartment_id = var.targetCompartment
eula_link = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].eula_link
listing_id = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].listing_id
Expand All @@ -43,4 +42,3 @@ resource "oci_core_app_catalog_subscription" "mp_image_subscription" {
create = "20m"
}
}

27 changes: 15 additions & 12 deletions autoscaling/variables.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,24 @@ variable "spack" { default = ${spack} }
variable "instance_pool_ocpus" { default = "##OCPU##"}
variable "instance_pool_memory" { default = ${instance_pool_memory} }
variable "instance_pool_custom_memory" { default = ${instance_pool_custom_memory} }
variable "marketplace_listing" {
/*
Allowed values
"1. Oracle Linux 7.8 OFED5.0"
"2. Oracle Linux 7.5 OFED4.4"
*/
default = "${marketplace_listing}"
}

variable "marketplace_listing_id" {
variable "marketplace_listing" {
default = "${marketplace_listing}"
}

variable "marketplace_version_id" {
type = map(string)
default = {
"1" = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa"
"2" = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq"
"1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607"
"2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826"
"3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229"
}
}

// OCID of the Marketplace app catalog listing. marketplace.tf reads this single value as
// local.mp_listing_id when use_marketplace_image is true; the image version is selected
// separately through marketplace_version_id.
variable "marketplace_listing_id" {
default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa"
}

variable "bastion_block_volume_performance" {
/*
Allowed values
Expand Down Expand Up @@ -76,4 +77,6 @@ variable "nfs_source_path" { default = "${nfs_source_path}"}
variable "nfs_options" { default = "${nfs_options}"}

variable "hyperthreading" { default = ${hyperthreading} }
Variable “ldap” { default = “${ldap}” }
variable "unsupported" { default = ${unsupported} }
variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" }
variable "ldap" { default = ${ldap} }
3 changes: 2 additions & 1 deletion bastion.tf
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,8 @@ resource "null_resource" "cluster" {
nfs_source_path = var.nfs_source_path,
nfs_options = var.nfs_options,
monitoring = var.monitoring,
hyperthreading = var.hyperthreading
hyperthreading = var.hyperthreading,
unsupported = var.unsupported
})

destination = "/home/${var.bastion_username}/autoscaling/tf_init/variables.tf"
Expand Down
4 changes: 4 additions & 0 deletions cluster-network.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ resource "oci_core_cluster_network" "cluster_network" {
size = var.node_count
display_name = local.cluster_name
}
freeform_tags = {
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configuration {
availability_domain = var.ad
primary_subnet_id = local.subnet_id
Expand Down
5 changes: 4 additions & 1 deletion instance-pool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ resource "oci_core_instance_pool" "instance_pool" {
instance_configuration_id = oci_core_instance_configuration.instance_pool_configuration[0].id
size = var.node_count
display_name = local.cluster_name

freeform_tags = {
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configurations {
availability_domain = var.ad
primary_subnet_id = local.subnet_id
Expand Down
8 changes: 4 additions & 4 deletions locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ locals {

cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id

cluster_network_image = var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : local.image_ocid
cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid

instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_id : local.image_ocid
instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid

// image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id

is_bastion_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*E[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[]
is_bastion_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*[3-4].*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[]

bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none"

Expand Down
9 changes: 5 additions & 4 deletions marketplace.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
locals {
listing_number = split(".", var.marketplace_listing)[0]
mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id[local.listing_number] : ""
// listing_number = split(".", var.marketplace_listing)[0]
mp_listing_id = var.use_marketplace_image ? var.marketplace_listing_id : ""
mp_version_id = split(".", var.marketplace_listing)[0]
}

/*
Expand All @@ -25,12 +26,12 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "mp_image_agr
count = var.use_marketplace_image ? 1 : 0

listing_id = local.mp_listing_id
listing_resource_version = data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].app_catalog_listing_resource_versions[0].listing_resource_version
listing_resource_version = var.marketplace_version_id[local.mp_version_id]

}

resource "oci_core_app_catalog_subscription" "mp_image_subscription" {
count = var.use_marketplace_image && var.node_count > 0 ? 1 : 0
count = var.use_marketplace_image ? 1 : 0
compartment_id = var.targetCompartment
eula_link = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].eula_link
listing_id = oci_core_app_catalog_listing_resource_version_agreement.mp_image_agreement[0].listing_id
Expand Down
12 changes: 11 additions & 1 deletion playbooks/roles/slurm/templates/slurm.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ NodeName=cluster-[1-20]-amd3{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 Co
{% endfor %}
NodeName=cluster-[1-20]-amd3128-node-[1-20] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%}
cluster-size-{{ item }},{%- endfor %}BM.Standard.E3.128
{# Pre-declare FUTURE nodes for every VM.Optimized3.Flex size. The shape allows 1 to 18 OCPUs and range() excludes its upper bound, so the bound must be 19 to include the 18-OCPU size (same pattern as the E4 loop, which uses 65 for a 64-OCPU maximum). -#}
{% for ocpu in range(1,19) -%}
NodeName=cluster-[1-20]-optimized3{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ ocpu }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%}
cluster-size-{{ item }},{%- endfor %}VM.Optimized3.{{ ocpu }}
{% endfor %}
{% for ocpu in range(1,65) -%}
NodeName=cluster-[1-20]-amd4{{ ocpu }}-node-[1-10] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ ocpu }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% for item in range(51) -%}
cluster-size-{{ item }},{%- endfor %}VM.Standard.E4.{{ ocpu }}
Expand Down Expand Up @@ -109,6 +113,12 @@ NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=2 CoresPerSocket=32 Thread
{% elif shape == "VM.Standard.E3.Flex" %}
NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%}
cluster-size-{{ item }},{%- endfor %}VM.Standard.E3.{{ instance_pool_ocpus }}
{% elif shape == "VM.Optimized3.Flex" %}
NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%}
cluster-size-{{ item }},{%- endfor %}VM.Optimized3.{{ instance_pool_ocpus }}
{% elif shape == "VM.Standard.E4.Flex" %}
NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance_pool_ocpus }} ThreadsPerCore={{threadspercore}} State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%}
cluster-size-{{ item }},{%- endfor %}VM.Standard.E4.{{ instance_pool_ocpus }}
{% else %}
{% if "BM" in shape %}
NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=2 CoresPerSocket={{ (shape.split('.')[-1]|int / 2)|int}} ThreadsPerCore=1 State=UNKNOWN Features={% for item in range(groups['compute'] | length + 1) -%}
Expand All @@ -119,4 +129,4 @@ NodeName={{ short_name[0] }} Boards=1 SocketsPerBoard=1 CoresPerSocket={{ shape.
{% endif %}
{% endif %}
{% endfor %}
PartitionName=compute Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=compute Nodes=ALL Default=YES MaxTime=INFINITE State=UP
15 changes: 14 additions & 1 deletion playbooks/site.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@
name: grafana
when: monitoring|default(false)|bool

- hosts: all
- hosts: bastion
vars:
destroy: false
initial: true
Expand All @@ -179,6 +179,19 @@
- include_role:
name: spack
when: spack|default(false)|bool
- include_role:
name: telegraf
when: monitoring|default(false)|bool
- include_role:
name: slurm
when: slurm|default(false)|bool

- hosts: compute
vars:
destroy: false
initial: true
download_path: "{{ '/nfs/cluster/' if cluster_nfs|bool else '/tmp' }}"
tasks:
- include_role:
name: slurm
when: slurm|default(false)|bool
Expand Down
42 changes: 31 additions & 11 deletions schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ variableGroups:
- title: "Cluster configuration"
variables:
- ${targetCompartment}
- ${ad}
- ${ssh_key}
- ${use_custom_name}
- ${cluster_name}
Expand Down Expand Up @@ -51,6 +50,7 @@ variableGroups:
- ${bastion_block_volume_performance}
- title: "Compute node options"
variables:
- ${ad}
- ${cluster_network}
- ${cluster_network_shape}
- ${instance_pool_shape}
Expand All @@ -68,6 +68,13 @@ variableGroups:
- ${compute_image_compartment}
- ${image}
- ${image_ocid}
- title: "API authentication"
variables:
- ${inst_prin}
- ${api_user_ocid}
- ${api_fingerprint}
- ${api_user_key}
visible: ${autoscaling}
- title: "Additional file system"
variables:
- ${add_nfs}
Expand Down Expand Up @@ -103,13 +110,6 @@ variableGroups:
- ${slurm}
- ${spack}
- ${monitoring}
- title: "API authentication"
variables:
- ${inst_prin}
- ${api_user_ocid}
- ${api_fingerprint}
- ${api_user_key}
visible: ${autoscaling}
- title: "Hidden"
variables:
- ${region}
Expand All @@ -121,6 +121,7 @@ variableGroups:
- ${marketplace_listing_id}
- ${ssh_cidr}
- ${marketplace_source_images}
- ${marketplace_version_id}
visible: false
- title: "Debug"
variables:
Expand Down Expand Up @@ -189,6 +190,9 @@ variables:
- eq:
- ${bastion_shape}
- "VM.Standard.E4.Flex"
- eq:
- ${bastion_shape}
- "VM.Optimized3.Flex"
required: false
bastion_custom_memory:
title: Use custom memory size
Expand All @@ -200,6 +204,9 @@ variables:
- eq:
- ${bastion_shape}
- "VM.Standard.E3.Flex"
- eq:
- ${bastion_shape}
- "VM.Optimized3.Flex"
- eq:
- ${bastion_shape}
- "VM.Standard.E4.Flex"
Expand All @@ -217,6 +224,9 @@ variables:
- eq:
- ${bastion_shape}
- "VM.Standard.E3.Flex"
- eq:
- ${bastion_shape}
- "VM.Optimized3.Flex"
- eq:
- ${bastion_shape}
- "VM.Standard.E4.Flex"
Expand Down Expand Up @@ -393,6 +403,9 @@ variables:
- eq:
- ${instance_pool_shape}
- "VM.Standard.E3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Optimized3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Standard.E4.Flex"
Expand All @@ -408,6 +421,9 @@ variables:
- eq:
- ${instance_pool_shape}
- "VM.Standard.E3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Optimized3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Standard.E4.Flex"
Expand All @@ -425,6 +441,9 @@ variables:
- eq:
- ${instance_pool_shape}
- "VM.Standard.E3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Optimized3.Flex"
- eq:
- ${instance_pool_shape}
- "VM.Standard.E4.Flex"
Expand Down Expand Up @@ -468,9 +487,10 @@ variables:
description: "Marketplace listing to use"
required: true
enum:
- "1. Oracle Linux 7.8 OFED5.0"
- "2. Oracle Linux 7.5 OFED4.4"
default: "1. Oracle Linux 7.8 OFED5.0"
- "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607"
- "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826"
- "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229"
default: "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607"
visible:
and:
- ${use_marketplace_image}
Expand Down
Loading

0 comments on commit b5c3378

Please sign in to comment.