From 38808f2f864ef9894a1e56bdbb78a3c7468c1a49 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 17 Nov 2023 11:55:29 +0000
Subject: [PATCH 01/14] Ability to generate a debug bundle and upload as a CI
 artifact (#59)

* Add steps to upload a log bundle

* Add generic support for creating debug bundles

* Do not swallow output from seed-ssh commands

* Revert incorrect change

* Fix typo

* Fix broken tar command

* Fix typo in bundle filenames
---
 .github/actions/test/action.yml |  20 +++++++
 bin/create-debug-bundle         | 102 ++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100755 bin/create-debug-bundle

diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml
index 3963f78..c0fcdae 100644
--- a/.github/actions/test/action.yml
+++ b/.github/actions/test/action.yml
@@ -5,6 +5,10 @@ inputs:
     description: The name to use for the test report artifact.
     required: true
     default: test-report
+  debug-bundle-artifact-name:
+    description: The name to use for the debug bundle artifact.
+    required: true
+    default: debug-bundle
 runs:
   using: composite
   steps:
@@ -49,3 +53,19 @@ runs:
           log.html
           report.html
       if: ${{ always() }}
+
+    - name: Create debug bundle
+      shell: bash
+      run: |
+        set -e
+        source ./ci.env
+        source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
+        ./bin/create-debug-bundle
+      if: ${{ always() }}
+
+    - name: Upload debug bundle
+      uses: actions/upload-artifact@v3
+      with:
+        name: ${{ inputs.debug-bundle-artifact-name }}
+        path: debug-bundle.tar.gz
+      if: ${{ always() }}
diff --git a/bin/create-debug-bundle b/bin/create-debug-bundle
new file mode 100755
index 0000000..94f21de
--- /dev/null
+++ b/bin/create-debug-bundle
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+
+#####
+# This script produces an archive containing useful information for debugging
+#####
+
+set -eo pipefail
+
+
+if [ -z "$AZIMUTH_CONFIG_ROOT" ] || [ -z "$AZIMUTH_CONFIG_ENVIRONMENT_ROOT" ]; then
+    echo "Please activate an environment" >&2
+    exit 1
+fi
+
+
+ansible_variable() {
+    ANSIBLE_LOAD_CALLBACK_PLUGINS=true \
+    ANSIBLE_STDOUT_CALLBACK=json \
+    ansible -m debug -a "var=$1" all | \
+    jq -r ".plays[0].tasks[0].hosts.localhost.$1"
+}
+
+
+bundle_name="${1:-debug-bundle}"
+
+
+# Get the install_mode that is in use
+install_mode="$(ansible_variable install_mode)"
+
+# For HA installations, we want to collect some additional information from the seed before
+# collecting the regular information from the HA cluster
+if [ "$install_mode" = "ha" ]; then
+    cluster_name="$(ansible_variable capi_cluster_release_name)"
+    additional_commands="$(
+cat <<EOF
+kubectl get pods --all-namespaces -o wide > debug-bundle/seed-list-pods.txt 2>&1
+kubectl get pvc --all-namespaces -o wide > debug-bundle/seed-list-pvcs.txt 2>&1
+kubectl get cluster-api -o wide > debug-bundle/seed-list-cluster-api.txt 2>&1
+kubectl -n capi-addon-system logs --since=1h deploy/cluster-api-addon-provider > debug-bundle/seed-logs-cluster-api-addon-provider.txt 2>&1
+kubectl -n capi-janitor-system logs --since=1h deploy/cluster-api-janitor-openstack > debug-bundle/seed-logs-cluster-api-janitor-openstack.txt 2>&1
+kubectl -n capi-kubeadm-bootstrap-system logs --since=1h deploy/capi-kubeadm-bootstrap-controller-manager > debug-bundle/seed-logs-capi-kubeadm-bootstrap-controller-manager.txt 2>&1
+kubectl -n capi-kubeadm-control-plane-system logs --since=1h deploy/capi-kubeadm-control-plane-controller-manager > debug-bundle/seed-logs-capi-kubeadm-control-plane-controller-manager.txt 2>&1
+kubectl -n capi-system logs --since=1h deploy/capi-controller-manager > debug-bundle/seed-logs-capi-controller-manager.txt 2>&1
+kubectl -n capo-system logs --since=1h deploy/capo-controller-manager > debug-bundle/seed-logs-capo-controller-manager.txt 2>&1
+
+export KUBECONFIG=./kubeconfig-${cluster_name}.yaml
+EOF
+    )"
+fi
+
+
+# We produce an archive on the seed, then pull it down
+# For some reason, cat-ing the archive at the end of the main command corrupts the file
+# but using a separate command doesn't... :shrugs:
+
+# Things that we include:
+#   * List of pods
+#   * List of PVCs
+#   * List of installed CaaS cluster types and clusters
+#   * List of installed Kubernetes templates and clusters
+#   * List of installed app templates
+#   * List of installed Cluster API resources
+#   * The last hour of logs from some key components
+echo "[INFO] Collecting debug information"
+"$AZIMUTH_CONFIG_ROOT/bin/seed-ssh" <<EOF
+set -e
+
+mkdir -p debug-bundle
+
+$additional_commands
+
+kubectl get pods --all-namespaces -o wide > debug-bundle/list-pods.txt 2>&1
+kubectl get pvc --all-namespaces -o wide > debug-bundle/list-pvcs.txt 2>&1
+kubectl get clustertypes.caas -o wide > debug-bundle/list-caas-clustertypes.txt 2>&1
+kubectl get clusters.caas --all-namespaces -o wide > debug-bundle/list-caas-clusters.txt 2>&1
+kubectl get clustertemplates -o wide > debug-bundle/list-kube-templates.txt 2>&1
+kubectl get clusters --all-namespaces -o wide > debug-bundle/list-kube-clusters.txt 2>&1
+kubectl get apptemplates --all-namespaces -o wide > debug-bundle/list-kube-apptemplates.txt 2>&1
+kubectl get cluster-api --all-namespaces -o wide > debug-bundle/list-cluster-api.txt 2>&1
+
+kubectl -n azimuth logs --since=1h deploy/azimuth-api > debug-bundle/logs-azimuth-api.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/azimuth-caas-operator > debug-bundle/logs-azimuth-caas-operator.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/azimuth-capi-operator > debug-bundle/logs-azimuth-capi-operator.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/azimuth-identity-operator > debug-bundle/logs-azimuth-identity-operator.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/azimuth-ui > debug-bundle/logs-azimuth-ui.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/zenith-server-registrar > debug-bundle/logs-zenith-server-registrar.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/zenith-server-sshd > debug-bundle/logs-zenith-server-sshd.txt 2>&1
+kubectl -n azimuth logs --since=1h deploy/zenith-server-sync > debug-bundle/logs-zenith-server-sync.txt 2>&1
+kubectl -n capi-addon-system logs --since=1h deploy/cluster-api-addon-provider > debug-bundle/logs-cluster-api-addon-provider.txt 2>&1
+kubectl -n capi-janitor-system logs --since=1h deploy/cluster-api-janitor-openstack > debug-bundle/logs-cluster-api-janitor-openstack.txt 2>&1
+kubectl -n capi-kubeadm-bootstrap-system logs --since=1h deploy/capi-kubeadm-bootstrap-controller-manager > debug-bundle/logs-capi-kubeadm-bootstrap-controller-manager.txt 2>&1
+kubectl -n capi-kubeadm-control-plane-system logs --since=1h deploy/capi-kubeadm-control-plane-controller-manager > debug-bundle/logs-capi-kubeadm-control-plane-controller-manager.txt 2>&1
+kubectl -n capi-system logs --since=1h deploy/capi-controller-manager > debug-bundle/logs-capi-controller-manager.txt 2>&1
+kubectl -n capo-system logs --since=1h deploy/capo-controller-manager > debug-bundle/logs-capo-controller-manager.txt 2>&1
+
+tar -czf debug-bundle.tar.gz -C debug-bundle \$(ls -A debug-bundle)
+EOF
+echo "[INFO] Fetching debug bundle"
+"$AZIMUTH_CONFIG_ROOT/bin/seed-ssh" -- cat debug-bundle.tar.gz > "$bundle_name.tar.gz"
+echo "[INFO] Cleaning up interim files"
+"$AZIMUTH_CONFIG_ROOT/bin/seed-ssh" -- rm -rf debug-bundle debug-bundle.tar.gz
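
The `ansible_variable` helper in the script above is the piece that lets CI scripts read values out of the active Azimuth environment: it runs Ansible's `debug` module with the JSON stdout callback and extracts the rendered value with `jq`. Expanded into a standalone command, the equivalent one-off query looks like this (a sketch using `install_mode`, the same variable the script itself reads):

```sh
# Render a single inventory variable from the active environment.
# The JSON callback nests results under plays/tasks/hosts, hence the jq path.
ANSIBLE_LOAD_CALLBACK_PLUGINS=true \
ANSIBLE_STDOUT_CALLBACK=json \
ansible -m debug -a "var=install_mode" all \
    | jq -r '.plays[0].tasks[0].hosts.localhost.install_mode'
```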
From c3a3b105374a0c1a405691518c276972d2e464f3 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 1 Dec 2023 11:57:07 +0000
Subject: [PATCH 02/14] Specify CI network by name, not ID (#62)

---
 environments/ci/inventory/group_vars/all/variables.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/ci/inventory/group_vars/all/variables.yml b/environments/ci/inventory/group_vars/all/variables.yml
index 115d3ae..c70a08f 100644
--- a/environments/ci/inventory/group_vars/all/variables.yml
+++ b/environments/ci/inventory/group_vars/all/variables.yml
@@ -1,5 +1,5 @@
-# Use a pre-existing network so that we don't need to steal a router
-infra_network_id: 4b6b2722-ee5b-40ec-8e52-a6610e14cc51
+# Use the pre-existing portal-internal network so that we don't need to steal a router
+infra_network_id: "{{ lookup('pipe', 'openstack network show portal-internal -f value -c id') }}"
 
 # Flavor auto-detection picks the wrong flavors on Arcus, so override them
 # The flavor to use for the Azimuth AIO VM (vm.ska.cpu.general.eighth)
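
Resolving the network by name at deploy time works by having Ansible's `pipe` lookup shell out to the OpenStack CLI, so the ID is discovered from whichever cloud the credentials point at. To sanity-check what the lookup will return (assuming `python-openstackclient` is installed and `OS_CLOUD`/`OS_CLIENT_CONFIG_FILE` are set, as they are in CI):

```sh
# This is exactly the command the lookup pipes through - it should print a single UUID
openstack network show portal-internal -f value -c id
```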
From 4d828341d287bed3b6ceeef38eab65bdfa19f589 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 1 Dec 2023 18:12:22 +0000
Subject: [PATCH 03/14] Add chore to update azimuth-ops version (#63)

---
 .github/workflows/update-dependencies.yml | 65 +++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 .github/workflows/update-dependencies.yml

diff --git a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml
new file mode 100644
index 0000000..3ec31b5
--- /dev/null
+++ b/.github/workflows/update-dependencies.yml
@@ -0,0 +1,65 @@
+# This workflow proposes updates to the dependencies that dependabot cannot
+name: Update dependencies
+
+on:
+  # Allow manual executions
+  workflow_dispatch:
+  # Run nightly
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  propose_github_release_updates:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - key: azimuth-ops
+            path: ./requirements.yml
+            repository: stackhpc/ansible-collection-azimuth-ops
+            prereleases: "yes"
+            version_jsonpath: collections[0].version
+
+    name: ${{ matrix.key }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Check for most recent GitHub release
+        id: next
+        uses: stackhpc/github-actions/github-latest-release@master
+        with:
+          repository: ${{ matrix.repository }}
+          prereleases: ${{ matrix.prereleases || 'no' }}
+
+      - name: Update dependency key
+        uses: stackhpc/github-actions/config-update@master
+        with:
+          path: ${{ matrix.path }}
+          updates: |
+            ${{ matrix.version_jsonpath }}=${{ steps.next.outputs.version }}
+
+      - name: Generate app token for PR
+        uses: stackhpc/github-actions/generate-app-token@master
+        id: generate-app-token
+        with:
+          repository: ${{ github.repository }}
+          app-id: ${{ secrets.APP_ID }}
+          app-private-key: ${{ secrets.APP_PRIVATE_KEY }}
+
+      - name: Propose changes via PR if required
+        uses: peter-evans/create-pull-request@v5
+        with:
+          token: ${{ steps.generate-app-token.outputs.token }}
+          commit-message: >-
+            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
+          branch: update-dependency/${{ matrix.key }}
+          delete-branch: true
+          title: >-
+            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
+          body: >
+            This PR was created automatically to update
+            ${{ matrix.key }} to ${{ steps.next.outputs.version }}.
+          labels: |
+            automation
+            dependency-update

From 0c2e6092a0782fd56ea385a4323b59d741fb4917 Mon Sep 17 00:00:00 2001
From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com>
Date: Tue, 5 Dec 2023 13:06:14 +0000
Subject: [PATCH 04/14] Update azimuth-ops to 0.2.0-rc.1 (#64)

Co-authored-by: mkjpryor
---
 requirements.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.yml b/requirements.yml
index 7c1ec24..18f9de6 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -3,7 +3,7 @@ collections:
 
   - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
     type: git
-    version: 0.1.0
+    version: 0.2.0-rc.1
     # For local development
     # - type: dir
     #   source: ../ansible-collection-azimuth-ops
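
The chore is driven by the `version_jsonpath` of `collections[0].version`: as the follow-up patch shows, the azimuth-ops collection is the first entry under `collections` in `requirements.yml`, so the JSONPath resolves to its `version` key and the config-update action rewrites it in place. Because the workflow also declares `workflow_dispatch`, it can be exercised by hand; a sketch assuming the GitHub CLI (`gh`) is installed and authenticated against the repository:

```sh
# Trigger the nightly dependency chore manually by workflow name
gh workflow run "Update dependencies"
```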
From 61e8d5e71acf1c833da24bdda9497a07ba9efa07 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Tue, 5 Dec 2023 15:07:21 +0000
Subject: [PATCH 05/14] [skip ci] Add a HA test case that runs on tags only
 (#65)

---
 .github/actions/setup/action.yml           |  6 ++-
 .github/workflows/test-pr.yml              | 15 +++++-
 .github/workflows/test-tag.yml             | 46 +++++++++++++++++++
 .../base/inventory/group_vars/all.yml      |  3 ++
 environments/ci-ha/ansible.cfg             |  9 ++++
 .../inventory/group_vars/all/variables.yml |  6 +++
 environments/ci-ha/inventory/hosts         |  2 +
 7 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/test-tag.yml
 create mode 100644 environments/ci-ha/ansible.cfg
 create mode 100644 environments/ci-ha/inventory/group_vars/all/variables.yml
 create mode 100644 environments/ci-ha/inventory/hosts

diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index 188b18d..2ee7f04 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -9,6 +9,10 @@ inputs:
     description: The ref to use for the Azimuth configuration.
     required: true
    default: main
+  config-environment:
+    description: The config environment to use.
+    required: true
+    default: ci
   azimuth-ops-version:
     description: >
       The azimuth-ops version to use. If not given, the default version is used.
@@ -55,7 +59,7 @@ runs:
         CI_ENV: |
           export OS_CLOUD="${{ inputs.os-cloud-name }}"
           export OS_CLIENT_CONFIG_FILE="$PWD/clouds.yaml"
-          export AZIMUTH_CONFIG_ENVIRONMENT=ci
+          export AZIMUTH_CONFIG_ENVIRONMENT=${{ inputs.config-environment }}
           export AZIMUTH_ENVIRONMENT="${{ inputs.environment-prefix }}-${{ github.run_id }}"
           export ANSIBLE_FORCE_COLOR=true
diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml
index e9dba31..fa3a3fa 100644
--- a/.github/workflows/test-pr.yml
+++ b/.github/workflows/test-pr.yml
@@ -24,8 +24,21 @@ jobs:
     - name: PR must be from a branch in the azimuth-config repo
       run: exit ${{ github.repository == 'stackhpc/azimuth-config' && '0' || '1' }}
 
-  run_azimuth_tests:
+  # We want jobs to wait in a queue for a slot to run, so as not to overload the test infra
+  # GitHub concurrency _almost_ does this, except the queue length is one :-(
+  # There is a feature request for what we need https://github.com/orgs/community/discussions/12835
+  # Until that is implemented, the only other viable option is a busy wait
+  wait_in_queue:
     needs: [fail_on_remote]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Wait for an available slot
+        uses: stackhpc/github-actions/workflow-concurrency@master
+        with:
+          max-concurrency: 1
+
+  run_azimuth_tests:
+    needs: [wait_in_queue]
     if: ${{ !github.event.pull_request.draft }}
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/test-tag.yml b/.github/workflows/test-tag.yml
new file mode 100644
index 0000000..934f382
--- /dev/null
+++ b/.github/workflows/test-tag.yml
@@ -0,0 +1,46 @@
+name: Test Azimuth deployment
+
+on:
+  push:
+    tags:
+      - "*"
+
+jobs:
+  # We want jobs to wait in a queue for a slot to run, so as not to overload the test infra
+  # GitHub concurrency _almost_ does this, except the queue length is one :-(
+  # There is a feature request for what we need https://github.com/orgs/community/discussions/12835
+  # Until that is implemented, the only other viable option is a busy wait
+  wait_in_queue:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Wait for an available slot
+        uses: stackhpc/github-actions/workflow-concurrency@master
+        with:
+          max-concurrency: 1
+
+  # For tags, we run a full HA test (for now)
+  run_azimuth_tests:
+    needs: [wait_in_queue]
+    runs-on: ubuntu-latest
+    steps:
+      # We need to check out the code under test first in order to use local actions
+      - name: Checkout code under test
+        uses: actions/checkout@v3
+
+      - name: Set up Azimuth environment
+        uses: ./.github/actions/setup
+        with:
+          os-clouds: ${{ secrets.CLOUD }}
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          config-environment: ci-ha
+
+      - name: Provision Azimuth
+        uses: ./.github/actions/provision
+
+      - name: Run Azimuth tests
+        uses: ./.github/actions/test
+
+      - name: Destroy Azimuth
+        uses: ./.github/actions/destroy
+        if: ${{ always() }}
diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml
index def726c..923ec5f 100644
--- a/environments/base/inventory/group_vars/all.yml
+++ b/environments/base/inventory/group_vars/all.yml
@@ -55,6 +55,9 @@ harbor_proxy_cache_projects: >-
 # Indicates whether to install a Grafana to show cloud metrics
 cloud_metrics_enabled: no
 
+# Indicates whether to enable Velero for backup and restore
+velero_enabled: no
+
 # Azimuth features to enable
 azimuth_apps_enabled: yes
 azimuth_kubernetes_enabled: yes
diff --git a/environments/ci-ha/ansible.cfg b/environments/ci-ha/ansible.cfg
new file mode 100644
index 0000000..3a71ec0
--- /dev/null
+++ b/environments/ci-ha/ansible.cfg
@@ -0,0 +1,9 @@
+[defaults]
+inventory = ../base/inventory,../ha/inventory,../demo/inventory,../ci/inventory,./inventory
+roles_path = ../../.ansible/roles
+collections_path = ../../.ansible/collections
+
+host_key_checking = False
+
+[ssh_connection]
+retries = 3
diff --git a/environments/ci-ha/inventory/group_vars/all/variables.yml b/environments/ci-ha/inventory/group_vars/all/variables.yml
new file mode 100644
index 0000000..d82c9c9
--- /dev/null
+++ b/environments/ci-ha/inventory/group_vars/all/variables.yml
@@ -0,0 +1,6 @@
+# Flavor auto-detection picks the wrong flavors on Arcus, so override them
+# The flavor to use for the seed VM (vm.ska.cpu.general.small)
+infra_flavor_id: c8b72062-5d52-4590-9d7a-68a670b44442
+# The flavor to use for the control plane nodes
+capi_cluster_control_plane_flavor: vm.ska.cpu.general.small
+capi_cluster_worker_flavor: vm.ska.cpu.general.eighth
diff --git a/environments/ci-ha/inventory/hosts b/environments/ci-ha/inventory/hosts
new file mode 100644
index 0000000..9dcf1df
--- /dev/null
+++ b/environments/ci-ha/inventory/hosts
@@ -0,0 +1,2 @@
+[terraform_provision]
+localhost ansible_connection=local ansible_python_interpreter="{{ ansible_playbook_python }}"
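
Because the HA test case is driven purely by `on: push: tags: "*"`, any tag pushed to the repository starts a run. A quick sketch of kicking one off by hand (the tag name is arbitrary):

```sh
# Push a tag to trigger the tag-only HA test workflow
git tag ha-test-1
git push origin ha-test-1
```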
From 0c8b46d65045bde14640e2db2ddab7eeac4e8ff1 Mon Sep 17 00:00:00 2001
From: Scott Davidson <49713135+sd109@users.noreply.github.com>
Date: Tue, 5 Dec 2023 15:29:05 +0000
Subject: [PATCH 06/14] Documentation for backup and restore process (#60)

* WIP: Velero docs

* First draft of Velero docs

* Add TTL note

* Update backup and restore docs

* Update backup and restore docs

* Format + trim documentation

* [skip ci] Add disaster recovery to best practices

---------

Co-authored-by: Scott Davidson
Co-authored-by: Matt Pryor
---
 docs/best-practice.md                      |   9 ++
 docs/configuration/14-disaster-recovery.md | 115 +++++++++++++++++++++
 mkdocs.yml                                 |   1 +
 3 files changed, 125 insertions(+)
 create mode 100644 docs/configuration/14-disaster-recovery.md

diff --git a/docs/best-practice.md b/docs/best-practice.md
index 10006e1..0ab30e2 100644
--- a/docs/best-practice.md
+++ b/docs/best-practice.md
@@ -103,6 +103,15 @@ A
 [sample GitLab CI/CD configuration](https://github.com/stackhpc/azimuth-config/tree/main/.gitlab-ci.yml.sample)
 is provided that implements this workflow for GitLab-hosted repositories.
 
+## Disaster recovery
+
+Azimuth uses [Velero](https://velero.io/) to back up the data that is required to restore an
+Azimuth instance in the event of a catastrophic failure. This functionality is not enabled by
+default, as it requires credentials for an S3 bucket in which the backups will be stored.
+
+It is recommended that [disaster recovery is enabled](./configuration/14-disaster-recovery.md) for
+a production deployment.
+
 ## Configuration
 
 You are now ready to begin adding configuration to your environments. When building an environment
diff --git a/docs/configuration/14-disaster-recovery.md b/docs/configuration/14-disaster-recovery.md
new file mode 100644
index 0000000..1935611
--- /dev/null
+++ b/docs/configuration/14-disaster-recovery.md
@@ -0,0 +1,115 @@
+# Disaster Recovery
+
+Azimuth uses [Velero](https://velero.io) as a disaster recovery solution. Velero provides the
+ability to back up Kubernetes API resources to an object store and has a plugin-based system
+to enable snapshotting of a cluster's persistent volumes.
+
+!!! warning
+
+    Backup and restore is only available for production-grade HA installations of Azimuth.
+
+The Azimuth playbooks install Velero on the HA management cluster and the Velero CLI tool
+on the seed node. Once configured with the appropriate credentials, the installation process
+will create a [Schedule](https://velero.io/docs/latest/api-types/schedule/) on the HA cluster,
+which triggers a daily backup at midnight and cleans up backups which are more than 1 week old.
+
+The [AWS Velero plugin](https://github.com/vmware-tanzu/velero-plugin-for-aws) is used for S3
+support and the [CSI plugin](https://github.com/vmware-tanzu/velero-plugin-for-csi) for volume
+snapshots. The CSI plugin uses Kubernetes generic support for
+[Volume Snapshots](https://kubernetes.io/docs/concepts/storage/volume-snapshots/), which is
+implemented for OpenStack by the
+[Cinder CSI plugin](https://github.com/kubernetes/cloud-provider-openstack).
+
+## Configuration
+
+To enable backup and restore functionality, the following variables should be set in your
+environment:
+
+```yaml title="environments/my-site/inventory/group_vars/all/variables.yml"
+velero_enabled: true
+velero_s3_url: <endpoint URL of the S3 service>
+velero_bucket_name: <name of the bucket to use for backups>
+```
+
+```yaml title="environments/my-site/inventory/group_vars/all/secrets.yml"
+velero_aws_access_key_id: <access key ID for the S3 credential>
+velero_aws_secret_access_key: <secret access key for the S3 credential>
+```
+
+!!! danger
+
+    The S3 credentials should be kept secret. If you want to keep them in Git - which is
+    recommended - then they [must be encrypted](../repository/secrets.md).
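+
+For example, one way to encrypt the secrets file before it is committed - a sketch assuming
+`ansible-vault` is used, as described in the linked guide - is:
+
+```sh
+# Encrypts the file in place; you will be prompted for a vault password
+ansible-vault encrypt environments/my-site/inventory/group_vars/all/secrets.yml
+```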
+
+## Velero CLI
+
+The Velero installation process also installs the Velero CLI on the Azimuth seed node, which
+can be used to inspect the state of the backups:
+
+```sh title="On the seed node, with the kubeconfig for the HA cluster exported"
+# List the configured backup locations
+velero backup-location get
+
+# List the backups and their statuses
+velero backup get
+```
+
+See `velero -h` for other useful commands.
+
+## Restoring from a backup
+
+To restore from a backup, you must first know the name of the target backup. This can be
+inferred from the object names in S3 if the Velero CLI is no longer available.
+
+Once you have the name of the backup to restore, run the following command with your
+environment activated (similar to a provision):
+
+```bash
+ansible-playbook stackhpc.azimuth_ops.restore \
+    -e velero_restore_backup_name=<backup name>
+```
+
+This will provision a new HA cluster, restore the backup onto it and then bring the
+installation up-to-date with your configuration.
+
+## Performing ad-hoc backups
+
+In order to perform ad-hoc backups using the same config parameters as the installed backup
+schedule, run the following Velero CLI command from the seed node:
+
+```sh title="On the seed node, with the kubeconfig for the HA cluster exported"
+velero backup create --from-schedule default
+```
+
+This will begin the backup process in the background. The status of this backup (and others)
+can be viewed with the `velero backup get` command shown above.
+
+!!! tip
+
+    Ad-hoc backups will have the same time-to-live as the configured schedule backups
+    (default = 7 days). To change this, pass the `--ttl <duration>` option to the
+    `velero backup create` command.
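+
+For instance, an ad-hoc backup kept for only one day might look like the following (the
+backup name here is purely illustrative):
+
+```sh title="On the seed node, with the kubeconfig for the HA cluster exported"
+velero backup create pre-upgrade --from-schedule default --ttl 24h0m0s
+```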
+
+## Modifying the backup schedule
+
+The following config options are available for modifying the regular backup schedule:
+
+```yaml title="environments/my-site/inventory/group_vars/all/variables.yml"
+# Whether or not to perform scheduled backups
+velero_backup_schedule_enabled: true
+# Name for backup schedule kubernetes resource
+velero_backup_schedule_name: default
+# Schedule to use for backups (defaults to every day at midnight)
+# See https://en.wikipedia.org/wiki/Cron for format options
+velero_backup_schedule_timings: "0 0 * * *"
+# Time-to-live for existing backups (defaults to 1 week)
+# See https://pkg.go.dev/time#ParseDuration for duration format options
+velero_backup_schedule_ttl: "168h"
+```
+
+!!! note
+
+    Setting `velero_backup_schedule_enabled: false` does not prevent the backup schedule from
+    being installed - instead it sets the schedule state to `paused`.
+
+    This allows for ad-hoc backups to still be run on demand using the configured backup
+    parameters.
diff --git a/mkdocs.yml b/mkdocs.yml
index c922965..6ee6b34 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -27,6 +27,7 @@ nav:
       - configuration/11-kubernetes-apps.md
       - configuration/12-caas.md
       - configuration/13-monitoring.md
+      - configuration/14-disaster-recovery.md
   - Deployment:
       - deployment/index.md
       - deployment/automation.md
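
The backup schedule itself is just a Velero `Schedule` resource: `velero_backup_schedule_timings` is a standard five-field cron expression and `velero_backup_schedule_ttl` a Go-style duration. Roughly, the resource the installation creates looks like the sketch below (illustrative only - the exact manifest, including its namespace, is generated by azimuth-ops):

```yaml
apiVersion: velero.io/v1
kind: Schedule
metadata:
  name: default          # velero_backup_schedule_name
  namespace: velero      # assumed namespace, for illustration
spec:
  schedule: "0 0 * * *"  # velero_backup_schedule_timings
  template:
    ttl: 168h            # velero_backup_schedule_ttl
```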
From 161a26b101e7b3616d573407c884fec33dd0bdf6 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Thu, 7 Dec 2023 10:55:58 +0000
Subject: [PATCH 07/14] Allow docs build to be manually triggered

---
 .github/workflows/publish-docs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
index 9024063..55173fb 100644
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -3,6 +3,7 @@ on:
   push:
     branches:
       - main
+  workflow_dispatch:
 
 jobs:
   deploy:

From d2924782b5401906065c3619d9b999a599b6f07f Mon Sep 17 00:00:00 2001
From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com>
Date: Thu, 7 Dec 2023 13:57:28 +0000
Subject: [PATCH 08/14] Update azimuth-ops to 0.2.0 (#66)

Co-authored-by: mkjpryor
---
 requirements.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.yml b/requirements.yml
index 18f9de6..b7477cf 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -3,7 +3,7 @@ collections:
 
   - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
     type: git
-    version: 0.2.0-rc.1
+    version: 0.2.0
     # For local development
     # - type: dir
     #   source: ../ansible-collection-azimuth-ops

From a292e2603b15336c61a876949f6d9064e3590615 Mon Sep 17 00:00:00 2001
From: Scott Davidson <49713135+sd109@users.noreply.github.com>
Date: Thu, 7 Dec 2023 16:35:54 +0000
Subject: [PATCH 09/14] Set expected alert count depending on CI deployment
 type (#68)

* Set expected alert count depending on deployment type

* Fix typo

* Use ansible variable to determine expected alert count

---------

Co-authored-by: sd109
Co-authored-by: Matt Pryor
---
 bin/check-alerts | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/bin/check-alerts b/bin/check-alerts
index 1660dce..49b7776 100755
--- a/bin/check-alerts
+++ b/bin/check-alerts
@@ -5,6 +5,34 @@
 # to check that the expected number of alerts are firing after deployment
 #####
 
+set -eo pipefail
+
+
+if [ -z "$AZIMUTH_CONFIG_ROOT" ] || [ -z "$AZIMUTH_CONFIG_ENVIRONMENT_ROOT" ]; then
+    echo "Please activate an environment" >&2
+    exit 1
+fi
+
+
+ansible_variable() {
+    ANSIBLE_LOAD_CALLBACK_PLUGINS=true \
+    ANSIBLE_STDOUT_CALLBACK=json \
+    ansible -m debug -a "var=$1" all | \
+    jq -r ".plays[0].tasks[0].hosts.localhost.$1"
+}
+
+
+# Get the install_mode that is in use
+install_mode="$(ansible_variable install_mode)"
+
+# For HA installs, there should only be the watchdog
+# Single-node deployments have other expected alerts
+if [ "$install_mode" = "ha" ]; then
+    EXPECTED_ALERT_COUNT=1
+else
+    EXPECTED_ALERT_COUNT=4
+fi
+
 echo "Starting port-forward for Prometheus API"
 ./bin/port-forward prometheus 9090 > /dev/null 2>&1 &
 PID="$!"
@@ -43,11 +71,7 @@ while true; do
     echo "$ALERTS" | jq -r '.data.alerts[] | "  " + .labels.alertname + " is " + .state'
     echo ""
 
-    # Demo deployment should have 4 pending alerts:
-    #   watchdog + Kube{ControllerManager,Scheduler,Proxy}Down
-    #   since k3s doesn't have distinct pods for these services
-    # TODO(mkjpryor) Fix these alerts in K3S deployment :D
-    if [[ $ALERT_COUNT == 4 ]]; then
+    if [[ ${ALERT_COUNT} -eq ${EXPECTED_ALERT_COUNT} ]]; then
         exit
     elif [[ ${RETRIES} -eq ${RETRY_LIMIT} ]]; then
         exit 1
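
`check-alerts` drives everything through the Prometheus HTTP API behind the port-forward it starts. To inspect the same data by hand while the port-forward is up, something like this (the `jq` filter mirrors the script's counting logic) shows how many alerts are currently pending or firing:

```sh
# Count the alerts reported by the Prometheus API on the forwarded port
curl -s http://localhost:9090/api/v1/alerts | jq '.data.alerts | length'
```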
From cce9fa16b7e2491c61f0297ee44db825334addad Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Thu, 7 Dec 2023 17:07:31 +0000
Subject: [PATCH 10/14] Reduce resource requirements for HA test

---
 environments/ci-ha/inventory/group_vars/all/variables.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/environments/ci-ha/inventory/group_vars/all/variables.yml b/environments/ci-ha/inventory/group_vars/all/variables.yml
index d82c9c9..31f1a9e 100644
--- a/environments/ci-ha/inventory/group_vars/all/variables.yml
+++ b/environments/ci-ha/inventory/group_vars/all/variables.yml
@@ -4,3 +4,8 @@ infra_flavor_id: c8b72062-5d52-4590-9d7a-68a670b44442
 # The flavor to use for the control plane nodes
 capi_cluster_control_plane_flavor: vm.ska.cpu.general.small
 capi_cluster_worker_flavor: vm.ska.cpu.general.eighth
+# Although this is a "HA" test, what we are really testing is the spawning
+# of the CAPI cluster and deployment of Azimuth onto that
+# So one control plane node and two workers is sufficient for that
+capi_cluster_control_plane_count: 1
+capi_cluster_worker_count: 2

From 22fcace1b447326b00dd8702a83653db7e4aea9f Mon Sep 17 00:00:00 2001
From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com>
Date: Thu, 7 Dec 2023 18:28:23 +0000
Subject: [PATCH 11/14] Update azimuth-ops to 0.2.1 (#69)

Co-authored-by: mkjpryor
---
 requirements.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.yml b/requirements.yml
index b7477cf..af8c8d7 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -3,7 +3,7 @@ collections:
 
   - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
     type: git
-    version: 0.2.0
+    version: 0.2.1
     # For local development
     # - type: dir
     #   source: ../ansible-collection-azimuth-ops

From 7b80ac8bb7137b6275edda756b6cba9a022285e7 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Thu, 7 Dec 2023 18:32:06 +0000
Subject: [PATCH 12/14] Use Docker Hub mirror for HA test

---
 .github/workflows/test-tag.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/test-tag.yml b/.github/workflows/test-tag.yml
index 934f382..904d13c 100644
--- a/.github/workflows/test-tag.yml
+++ b/.github/workflows/test-tag.yml
@@ -34,6 +34,11 @@ jobs:
           repository: ${{ github.repository }}
           ref: ${{ github.ref }}
           config-environment: ci-ha
+          # Use the configured Docker Hub mirror for the HA cluster for now
+          extra-vars: |
+            capi_cluster_registry_mirrors:
+              docker.io:
+                - "${{ secrets.DOCKER_HUB_MIRROR_URL }}"

From 2d34f24f4be224c88f824ab748eb37a310c621de Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 8 Dec 2023 12:38:08 +0000
Subject: [PATCH 13/14] Fix Consul affinity in HA test (#70)

---
 .github/workflows/test-tag.yml                            | 5 -----
 environments/ci-ha/inventory/group_vars/all/variables.yml | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test-tag.yml b/.github/workflows/test-tag.yml
index 904d13c..934f382 100644
--- a/.github/workflows/test-tag.yml
+++ b/.github/workflows/test-tag.yml
@@ -34,11 +34,6 @@ jobs:
           repository: ${{ github.repository }}
           ref: ${{ github.ref }}
           config-environment: ci-ha
-          # Use the configured Docker Hub mirror for the HA cluster for now
-          extra-vars: |
-            capi_cluster_registry_mirrors:
-              docker.io:
-                - "${{ secrets.DOCKER_HUB_MIRROR_URL }}"
diff --git a/environments/ci-ha/inventory/group_vars/all/variables.yml b/environments/ci-ha/inventory/group_vars/all/variables.yml
index 31f1a9e..83be0d3 100644
--- a/environments/ci-ha/inventory/group_vars/all/variables.yml
+++ b/environments/ci-ha/inventory/group_vars/all/variables.yml
@@ -1,3 +1,6 @@
+# Unset the network ID so that a network + router are provisioned
+infra_network_id:
+
 # Flavor auto-detection picks the wrong flavors on Arcus, so override them
 # The flavor to use for the seed VM (vm.ska.cpu.general.small)
 infra_flavor_id: c8b72062-5d52-4590-9d7a-68a670b44442
@@ -9,3 +12,7 @@ capi_cluster_worker_flavor: vm.ska.cpu.general.eighth
 # So one control plane node and two workers is sufficient for that
 capi_cluster_control_plane_count: 1
 capi_cluster_worker_count: 2
+# Disable affinity for the Consul server so we can have 3 pods on two nodes
+consul_release_overrides:
+  server:
+    affinity: ""

From 0b103b9480a47b6b284d0da691c9be7a49a62cf9 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 8 Dec 2023 17:25:15 +0000
Subject: [PATCH 14/14] Move docs to readthedocs (#71)

---
 .github/workflows/publish-docs.yml | 20 --------------------
 .readthedocs.yaml                  | 13 +++++++++++++
 requirements-docs.txt              |  3 +++
 3 files changed, 16 insertions(+), 20 deletions(-)
 delete mode 100644 .github/workflows/publish-docs.yml
 create mode 100644 .readthedocs.yaml
 create mode 100644 requirements-docs.txt

diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
deleted file mode 100644
index 55173fb..0000000
--- a/.github/workflows/publish-docs.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: Publish docs via GitHub Pages
-on:
-  push:
-    branches:
-      - main
-  workflow_dispatch:
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.x
-
-      - run: pip install mkdocs==1.3.0 mkdocs-material==8.3.8 mkdocs-git-revision-date-localized-plugin
-
-      - run: mkdocs gh-deploy --force
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..7c247e0
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,13 @@
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3"
+
+mkdocs:
+  configuration: mkdocs.yml
+
+python:
+  install:
+    - requirements: requirements-docs.txt
diff --git a/requirements-docs.txt b/requirements-docs.txt
new file mode 100644
index 0000000..a9c1f19
--- /dev/null
+++ b/requirements-docs.txt
@@ -0,0 +1,3 @@
+mkdocs==1.3.0
+mkdocs-material==8.3.8
+mkdocs-git-revision-date-localized-plugin
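
With publishing moved to Read the Docs, the docs toolchain is fully described by `requirements-docs.txt`, so a local preview can use the same pinned versions. A sketch (any virtualenv workflow will do):

```sh
# Build and serve the documentation locally with the pinned toolchain
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements-docs.txt
mkdocs serve
```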