From b2062ed3729d27a56dc0cd505a732122417c3346 Mon Sep 17 00:00:00 2001 From: floptical Date: Mon, 12 Feb 2024 16:16:21 +0000 Subject: [PATCH] completely stop staging ECS containers before pg_restore --- ais/engine/bin/build_and_deploy.sh | 62 +++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/ais/engine/bin/build_and_deploy.sh b/ais/engine/bin/build_and_deploy.sh index 00d41e0..e084c95 100755 --- a/ais/engine/bin/build_and_deploy.sh +++ b/ais/engine/bin/build_and_deploy.sh @@ -32,8 +32,8 @@ set -o errtrace # Cleanup command that will run if something in the script fails. function cleanup { echo "Error! Exited prematurely at $BASH_COMMAND!!" - echo "running reenable_alarm.." - reenable_alarm + echo "running reenable_taskin_alarm.." + reenable_taskin_alarm } trap cleanup ERR @@ -325,7 +325,7 @@ check_rds_instance() { local stage_instance_identifier="ais-engine-blue" fi - local max_attempts=30 + local max_attempts=90 local attempt=1 while [ $attempt -le $max_attempts ]; do instance_status=$(aws rds describe-db-instances --region "us-east-1" \ @@ -349,6 +349,26 @@ check_rds_instance() { } +modify_stage_scaling_out() { + # either disable or enable + action=$1 + + # 1 disable staging taskout action that scales out containers + if [[ "$action" == 'disable' ]]; then + echo 'Disabling ECS tasks and scale out..' + # 1. disable staging taskout action that scales out containers + aws cloudwatch disable-alarm-actions --alarm-names ais-${staging_color}-api-taskout + # 2. Set desired tasks to 0 so they don't blow up the db with health checks while restoring. + aws ecs update-service --cluster ais-blue-cluster --service ais-${staging_color}-api-service --desired-count 0 + elif [[ "$action" == 'enable' ]]; then + echo 'Reenabling ECS tasks and scale out..' + aws cloudwatch enable-alarm-actions --alarm-names ais-${staging_color}-api-taskout + # Must allow back at least 1 instance so our later checks on the target groups works. + aws ecs update-service --cluster ais-blue-cluster --service ais-${staging_color}-api-service --desired-count 1 + fi +} + + # Update (Restore) AWS RDS instance to staging database # Note: you can somewhat track restore progress by looking at the db size: #SELECT pg_size_pretty( pg_database_size('ais_engine') ); @@ -381,18 +401,18 @@ restore_db_to_staging() { # and https://stackoverflow.com/a/75147585 # This command actually modifies the parameter group "ais-restore-parameters" each time. Just nice to have the changes it makes explicitly in code. - aws rds modify-db-parameter-group \ - --db-parameter-group-name ais-restore-parameters \ - --parameters "ParameterName=max_wal_size,ParameterValue=5120,ApplyMethod='immediate'" \ - --parameters "ParameterName=max_wal_senders,ParameterValue=0,ApplyMethod='immediate'" \ - --parameters "ParameterName=wal_keep_segments,ParameterValue=0,ApplyMethod='immediate'" \ - --parameters "ParameterName=autovacuum,ParameterValue=off,ApplyMethod='immediate'" \ - --parameters "ParameterName=shared_buffers,ParameterValue='{DBInstanceClassMemory/65536}',ApplyMethod='pending-reboot'" \ - --parameters "ParameterName=synchronous_commit,ParameterValue=off,ApplyMethod='immediate'" \ - --no-cli-pager + #aws rds modify-db-parameter-group \ + # --db-parameter-group-name ais-restore-parameters \ + # --parameters "ParameterName=max_wal_size,ParameterValue=5120,ApplyMethod='immediate'" \ + # --parameters "ParameterName=max_wal_senders,ParameterValue=0,ApplyMethod='immediate'" \ + # --parameters "ParameterName=wal_keep_segments,ParameterValue=0,ApplyMethod='immediate'" \ + # --parameters "ParameterName=autovacuum,ParameterValue=off,ApplyMethod='immediate'" \ + # --parameters "ParameterName=shared_buffers,ParameterValue='{DBInstanceClassMemory/65536}',ApplyMethod='pending-reboot'" \ + # --parameters "ParameterName=synchronous_commit,ParameterValue=off,ApplyMethod='immediate'" \ + # --no-cli-pager # modify stage rds to use restore parameter group - aws rds modify-db-instance --db-instance-identifier $stage_instance_identifier --db-parameter-group-name ais-restore-parameters --apply-immediately --no-cli-pager + #aws rds modify-db-instance --db-instance-identifier $stage_instance_identifier --db-parameter-group-name ais-restore-parameters --apply-immediately --no-cli-pager # Wait for instance status to be "available" and not "modifying". check_rds_instance $stage_instance_identifier @@ -420,6 +440,7 @@ restore_db_to_staging() { # Store output so we can determine if errors are actually bad restore_output=$(time pg_restore -v -j 6 -h $staging_db_uri -d ais_engine -U ais_engine -c $DB_DUMP_FILE_LOC || true) #echo $restore_output | grep 'errors ignored on restore' + sleep 10 # Check size after restore export PGPASSWORD=$PG_ENGINE_DB_PASS @@ -436,14 +457,12 @@ restore_db_to_staging() { fi # After restore, switch back to default RDS parameter group - aws rds modify-db-instance --db-instance-identifier $stage_instance_identifier --db-parameter-group-name default.postgres12 --apply-immediately --no-cli-pager + #aws rds modify-db-instance --db-instance-identifier $stage_instance_identifier --db-parameter-group-name default.postgres12 --apply-immediately --no-cli-pager sleep 60 - # Wait for instance status to be "available" and not "modifying". + # Wait for instance status to be "available" and not "modifying" or "backing-up". Can be triggered by restores it seems. check_rds_instance $stage_instance_identifier - # Final reboot just because - restart_staging_db sleep 60 } @@ -586,7 +605,7 @@ swap_cnames() { } -reenable_alarm() { +reenable_taskin_alarm() { echo -e "\nSleeping for 5 minutes, then running scale-in alarm re-enable command..." sleep 300 aws cloudwatch enable-alarm-actions --alarm-names ais-${staging_color}-api-taskin @@ -629,10 +648,14 @@ api_tests dump_local_db +modify_stage_scaling_out "disable" + restart_staging_db restore_db_to_staging +modify_stage_scaling_out "enable" + docker_tests scale_up_staging @@ -645,7 +668,8 @@ warmup_lb swap_cnames -c $staging_color -reenable_alarm + +reenable_taskin_alarm make_reports_tables