diff --git a/CHANGELOG/CHANGELOG-1.21.md b/CHANGELOG/CHANGELOG-1.21.md index 240204af1..4ddc03a96 100644 --- a/CHANGELOG/CHANGELOG-1.21.md +++ b/CHANGELOG/CHANGELOG-1.21.md @@ -15,4 +15,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen ## unreleased - +* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MadusaBakupJob did not fully succeed diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index 5e9399a5e..35b913080 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -72,9 +72,9 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } - backup := instance.DeepCopy() + backupJob := instance.DeepCopy() - cassdcKey := types.NamespacedName{Namespace: backup.Namespace, Name: backup.Spec.CassandraDatacenter} + cassdcKey := types.NamespacedName{Namespace: backupJob.Namespace, Name: backupJob.Spec.CassandraDatacenter} cassdc := &cassdcapi.CassandraDatacenter{} err = r.Get(ctx, cassdcKey, cassdc) if err != nil { @@ -83,12 +83,12 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ } // Set an owner reference on the backup job so that it can be cleaned up when the cassandra datacenter is deleted - if backup.OwnerReferences == nil { - if err = controllerutil.SetControllerReference(cassdc, backup, r.Scheme); err != nil { + if backupJob.OwnerReferences == nil { + if err = controllerutil.SetControllerReference(cassdc, backupJob, r.Scheme); err != nil { logger.Error(err, "failed to set controller reference", "CassandraDatacenter", cassdcKey) return ctrl.Result{}, err } - if err = r.Update(ctx, backup); err != nil { + if err = r.Update(ctx, backupJob); err != nil { logger.Error(err, "failed to update MedusaBackupJob with owner reference", "CassandraDatacenter", cassdcKey) return ctrl.Result{}, err } else { @@ -104,16 +104,16 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ } // If there is anything in progress, simply requeue the request until each pod has finished or errored - if len(backup.Status.InProgress) > 0 { + if len(backupJob.Status.InProgress) > 0 { logger.Info("There are backups in progress, checking them..") - progress := make([]string, 0, len(backup.Status.InProgress)) - patch := client.MergeFrom(backup.DeepCopy()) + progress := make([]string, 0, len(backupJob.Status.InProgress)) + patch := client.MergeFrom(backupJob.DeepCopy()) StatusCheck: - for _, podName := range backup.Status.InProgress { + for _, podName := range backupJob.Status.InProgress { for _, pod := range pods { if podName == pod.Name { - status, err := backupStatus(ctx, backup.ObjectMeta.Name, &pod, r.ClientFactory, logger) + status, err := backupStatus(ctx, backupJob.ObjectMeta.Name, &pod, r.ClientFactory, logger) if err != nil { return ctrl.Result{}, err } @@ -121,9 +121,9 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ if status == medusa.StatusType_IN_PROGRESS { progress = append(progress, podName) } else if status == medusa.StatusType_SUCCESS { - backup.Status.Finished = append(backup.Status.Finished, podName) + backupJob.Status.Finished = append(backupJob.Status.Finished, podName) } else if status == medusa.StatusType_FAILED || status == medusa.StatusType_UNKNOWN { - backup.Status.Failed = append(backup.Status.Failed, podName) + backupJob.Status.Failed = append(backupJob.Status.Failed, podName) } continue StatusCheck @@ -131,9 +131,9 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } - if len(backup.Status.InProgress) != len(progress) { - backup.Status.InProgress = progress - if err := r.Status().Patch(ctx, backup, patch); err != nil { + if len(backupJob.Status.InProgress) != len(progress) { + backupJob.Status.InProgress = progress + if err := r.Status().Patch(ctx, backupJob, patch); err != nil { logger.Error(err, "failed to patch status") return ctrl.Result{}, err } @@ -147,42 +147,49 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ } // If the backup is already finished, there is nothing to do. - if medusaBackupFinished(backup) { + if medusaBackupFinished(backupJob) { logger.Info("Backup operation is already finished") return ctrl.Result{Requeue: false}, nil } // First check to see if the backup is already in progress - if !backup.Status.StartTime.IsZero() { + if !backupJob.Status.StartTime.IsZero() { // If there is anything in progress, simply requeue the request - if len(backup.Status.InProgress) > 0 { + if len(backupJob.Status.InProgress) > 0 { logger.Info("Backup is still in progress") return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil } + // there is nothing in progress, so the job is finished (not yet sure if successfully) + // Regardless of the success, we set the job finish time + // Note that the time here is not accurate, but that is ok. For now we are just + // using it as a completion marker. + patch := client.MergeFrom(backupJob.DeepCopy()) + backupJob.Status.FinishTime = metav1.Now() + if err := r.Status().Patch(ctx, backupJob, patch); err != nil { + logger.Error(err, "failed to patch status with finish time") + return ctrl.Result{}, err + } + + // if there are failures, we will end here and not proceed with creating a backup object + if len(backupJob.Status.Failed) > 0 { + logger.Info("Backup failed on some nodes", "BackupName", backupJob.Name, "Failed", backupJob.Status.Failed) + return ctrl.Result{Requeue: false}, nil + } + logger.Info("backup complete") - // The MedusaBackupJob is finished and we now need to create the MedusaBackup object. - backupSummary, err := r.getBackupSummary(ctx, backup, pods, logger) + // The MedusaBackupJob is finished successfully and we now need to create the MedusaBackup object. + backupSummary, err := r.getBackupSummary(ctx, backupJob, pods, logger) if err != nil { logger.Error(err, "Failed to get backup summary") return ctrl.Result{}, err } - if err := r.createMedusaBackup(ctx, backup, backupSummary, logger); err != nil { + if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil { logger.Error(err, "Failed to create MedusaBackup") return ctrl.Result{}, err } - // Set the finish time - // Note that the time here is not accurate, but that is ok. For now we are just - // using it as a completion marker. - patch := client.MergeFrom(backup.DeepCopy()) - backup.Status.FinishTime = metav1.Now() - if err := r.Status().Patch(ctx, backup, patch); err != nil { - logger.Error(err, "failed to patch status with finish time") - return ctrl.Result{}, err - } - return ctrl.Result{Requeue: false}, nil } @@ -195,31 +202,31 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, medusa.BackupSidecarNotFound } - patch := client.MergeFromWithOptions(backup.DeepCopy(), client.MergeFromWithOptimisticLock{}) + patch := client.MergeFromWithOptions(backupJob.DeepCopy(), client.MergeFromWithOptimisticLock{}) - backup.Status.StartTime = metav1.Now() + backupJob.Status.StartTime = metav1.Now() - if err := r.Status().Patch(ctx, backup, patch); err != nil { + if err := r.Status().Patch(ctx, backupJob, patch); err != nil { logger.Error(err, "Failed to patch status") // We received a stale object, requeue for next processing return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil } logger.Info("Starting backups") - patch = client.MergeFrom(backup.DeepCopy()) + patch = client.MergeFrom(backupJob.DeepCopy()) for _, p := range pods { logger.Info("starting backup", "CassandraPod", p.Name) - _, err := doMedusaBackup(ctx, backup.ObjectMeta.Name, backup.Spec.Type, &p, r.ClientFactory, logger) + _, err := doMedusaBackup(ctx, backupJob.ObjectMeta.Name, backupJob.Spec.Type, &p, r.ClientFactory, logger) if err != nil { logger.Error(err, "backup failed", "CassandraPod", p.Name) } - backup.Status.InProgress = append(backup.Status.InProgress, p.Name) + backupJob.Status.InProgress = append(backupJob.Status.InProgress, p.Name) } // logger.Info("finished backup operations") - if err := r.Status().Patch(context.Background(), backup, patch); err != nil { - logger.Error(err, "failed to patch status", "Backup", fmt.Sprintf("%s/%s", backup.Name, backup.Namespace)) + if err := r.Status().Patch(context.Background(), backupJob, patch); err != nil { + logger.Error(err, "failed to patch status", "Backup", fmt.Sprintf("%s/%s", backupJob.Name, backupJob.Namespace)) } return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index 163382133..fa4580e51 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -26,15 +26,20 @@ import ( ) const ( - medusaImageRepo = "test/medusa" - cassandraUserSecret = "medusa-secret" - defaultBackupName = "backup1" - dc1PodPrefix = "192.168.1." - dc2PodPrefix = "192.168.2." - fakeBackupFileCount = int64(13) - fakeBackupByteSize = int64(42) - fakeBackupHumanSize = "42.00 B" - fakeMaxBackupCount = 1 + medusaImageRepo = "test/medusa" + cassandraUserSecret = "medusa-secret" + successfulBackupName = "good-backup" + failingBackupName = "bad-backup" + dc1PodPrefix = "192.168.1." + dc2PodPrefix = "192.168.2." + fakeBackupFileCount = int64(13) + fakeBackupByteSize = int64(42) + fakeBackupHumanSize = "42.00 B" + fakeMaxBackupCount = 1 +) + +var ( + alreadyReportedFailingBackup = false ) func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) { @@ -142,16 +147,19 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework. }) require.NoError(err, "failed to update dc1 status to ready") - backupCreated := createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, defaultBackupName) + backupCreated := createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, successfulBackupName) require.True(backupCreated, "failed to create backup") t.Log("verify that medusa gRPC clients are invoked") require.Equal(map[string][]string{ - fmt.Sprintf("%s:%d", getPodIpAddress(0, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName}, - fmt.Sprintf("%s:%d", getPodIpAddress(1, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName}, - fmt.Sprintf("%s:%d", getPodIpAddress(2, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName}, + fmt.Sprintf("%s:%d", getPodIpAddress(0, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName}, + fmt.Sprintf("%s:%d", getPodIpAddress(1, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName}, + fmt.Sprintf("%s:%d", getPodIpAddress(2, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName}, }, medusaClientFactory.GetRequestedBackups(dc1.DatacenterName())) + backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, failingBackupName) + require.False(backupCreated, "the backup object shouldn't have been created") + err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval) require.NoError(err, "failed to delete K8ssandraCluster") verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{}) @@ -231,16 +239,22 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa return false } t.Logf("backup finish time: %v", updated.Status.FinishTime) + t.Logf("backup failed: %v", updated.Status.Failed) t.Logf("backup finished: %v", updated.Status.Finished) t.Logf("backup in progress: %v", updated.Status.InProgress) - return !updated.Status.FinishTime.IsZero() && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0 + return !updated.Status.FinishTime.IsZero() // && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0 }, timeout, interval) - t.Log("verify that the MedusaBackup is created") + t.Log("check for the MedusaBackup being created") medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName) medusaBackup := &api.MedusaBackup{} err = f.Get(ctx, medusaBackupKey, medusaBackup) - require.NoError(err, "failed to get MedusaBackup") + if err != nil { + if errors.IsNotFound(err) { + return false + } + } + t.Log("verify the MedusaBackup is correct") require.Equal(medusaBackup.Status.TotalNodes, dc.Spec.Size, "backup total nodes doesn't match dc nodes") require.Equal(medusaBackup.Status.FinishedNodes, dc.Spec.Size, "backup finished nodes doesn't match dc nodes") require.Equal(len(medusaBackup.Status.Nodes), int(dc.Spec.Size), "backup topology doesn't match dc topology") @@ -387,8 +401,21 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm } func (c *fakeMedusaClient) BackupStatus(ctx context.Context, name string) (*medusa.BackupStatusResponse, error) { + var status medusa.StatusType + if name == successfulBackupName || strings.HasPrefix(successfulBackupName, name) { + status = medusa.StatusType_SUCCESS + } else if name == failingBackupName { + if !alreadyReportedFailingBackup { + status = medusa.StatusType_FAILED + alreadyReportedFailingBackup = true + } else { + status = medusa.StatusType_SUCCESS + } + } else { + status = medusa.StatusType_IN_PROGRESS + } return &medusa.BackupStatusResponse{ - Status: medusa.StatusType_SUCCESS, + Status: status, }, nil } diff --git a/controllers/medusa/medusatask_controller_test.go b/controllers/medusa/medusatask_controller_test.go index 7cb8b6357..3dea1f07e 100644 --- a/controllers/medusa/medusatask_controller_test.go +++ b/controllers/medusa/medusatask_controller_test.go @@ -16,10 +16,10 @@ import ( ) const ( - backup1 = "backup1" - backup2 = "backup2" - backup3 = "backup3" - backup4 = "backup4" + backup1 = "good-backup1" + backup2 = "good-backup2" + backup3 = "good-backup3" + backup4 = "good-backup4" ) func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) {