From 53e04bebf7bf2469b3713f89c2700730bf97f631 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:24:59 -0700
Subject: [PATCH 1/2] Update migrator to keep `DataObject.was_generated_by` in
 sync with updated `id`

---
 .../migrator_from_10_3_0_to_10_4_0.py         | 94 +++++++++++++++++--
 1 file changed, 86 insertions(+), 8 deletions(-)

diff --git a/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py b/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
index 433bddcecc..8657e58c1c 100644
--- a/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
+++ b/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
@@ -1,4 +1,5 @@
 import re
+from typing import Dict
 
 from nmdc_schema.migrators.migrator_base import MigratorBase
 
@@ -10,13 +11,14 @@ class Migrator(MigratorBase):
     Note: In schema version 10.3.0, the `NomAnalysisActivity.id` slot had this `structured_pattern.syntax`:
           "{id_nmdc_prefix}:wfnom-{id_shoulder}-{id_blade}{id_version}{id_locus}"
 
-          In schema version 10.4.0, it had _this_ `structured_pattern.syntax`:
+          In schema version 10.4.0, it has _this_ `structured_pattern.syntax`:
           "{id_nmdc_prefix}:wfnom-{id_shoulder}-{id_blade}{id_version}"
 
           That specific change was made in commit `a1d5686`:
           https://github.com/microbiomedata/nmdc-schema/commit/a1d5686
 
-          This migrator was designed to migrate `NomAnalysisActivity.id` values to conform to the latter pattern.
+          This migrator was designed to migrate `NomAnalysisActivity.id` values to conform to the latter pattern,
+          and to update the `was_generated_by` fields of the associated `data_object_set` documents accordingly.
     """
 
     _from_version = "10.3.0"
@@ -30,6 +32,9 @@ class Migrator(MigratorBase):
         r"(^(nmdc):wfnom-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\.[0-9]{1,})$)"
     )
 
+    # A mapping from `DataObject.id` value to `NomAnalysisActivity.id` value.
+    data_object_id_to_noma_activity_id_map: Dict[str, str] = dict()
+
     def upgrade(self) -> None:
         r"""Migrates the database from conforming to the original schema, to conforming to the new schema."""
 
@@ -41,6 +46,28 @@ def upgrade(self) -> None:
             ],
         )
 
+        self.adapter.process_each_document(collection_name="data_object_set",
+                                           pipeline=[self.update_was_generated_by_field])
+
+    def update_was_generated_by_field(self, data_object: dict) -> dict:
+        r"""
+        Updates the `was_generated_by` field of the specified `data_object_set` document so that it contains
+        the `id` of the `nom_analysis_activity_set` document that "generated" that `data_object_set` document.
+
+        >>> m = Migrator()
+        >>> m.data_object_id_to_noma_activity_id_map = {'do1': 'nom1.1'}
+        >>> m.update_was_generated_by_field({'id': 'do1', 'was_generated_by': 'nom1'})  # changes `was_generated_by`
+        {'id': 'do1', 'was_generated_by': 'nom1.1'}
+        >>> m.update_was_generated_by_field({'id': 'do2', 'was_generated_by': 'nom2'})  # no change (not in dictionary)
+        {'id': 'do2', 'was_generated_by': 'nom2'}
+        """
+
+        data_object_id = data_object["id"]
+        if data_object_id in self.data_object_id_to_noma_activity_id_map.keys():
+            data_object["was_generated_by"] = self.data_object_id_to_noma_activity_id_map[data_object_id]
+
+        return data_object
+
     def is_valid_id(self, s: str) -> bool:
         r"""
         Helper function that checks whether a string is a valid ID.
@@ -91,8 +118,16 @@ def populate_alternative_identifiers(self, nom_analysis_activity: dict) -> dict:
 
     def update_nom_analysis_activity_id(self, nom_analysis_activity: dict) -> dict:
         r"""
-        Updates a nom_analysis_activity's id to contain a version number (.1),
-        if it has no version number.
+        If the specified `nom_analysis_activity` document has an invalid `id`,
+        this function does two things:
+        1. Make the `id` valid (by appending a version number to it); and
+        2. If it has a `has_output` field (which is a list) consisting of a single item,
+           add an entry to the `data_object_id_to_noma_activity_id_map` dictionary, using
+           the `has_output` value as the key and the newly-valid `id` as the value.
+           Raise an exception if the `has_output` list contains more than 1 item.
+
+        If the specified `nom_analysis_activity` document already has a valid `id`,
+        this function does nothing.
 
         >>> m = Migrator()
         >>> # test: adds version number to id
@@ -102,13 +137,56 @@ def update_nom_analysis_activity_id(self, nom_analysis_activity: dict) -> dict:
         >>> m.update_nom_analysis_activity_id({'id': 'nmdc:wfnom-13-7yf9qj85.1'})
         {'id': 'nmdc:wfnom-13-7yf9qj85.1'}
 
+        Test the production of the ID map.
+
+        >>> m.data_object_id_to_noma_activity_id_map = {}
+        >>> m.update_nom_analysis_activity_id({'id': 'nmdc:wfnom-13-7yf9qj85',
+        ...                                    'has_output': ['do1']})  # creates dict entry
+        {'id': 'nmdc:wfnom-13-7yf9qj85.1', 'has_output': ['do1']}
+        >>> len(m.data_object_id_to_noma_activity_id_map.keys())
+        1
+        >>> m.data_object_id_to_noma_activity_id_map['do1']
+        'nmdc:wfnom-13-7yf9qj85.1'
+        >>> m.update_nom_analysis_activity_id({'id': 'nmdc:wfnom-13-7yf9qj86',
+        ...                                    'has_output': ['do2']})  # creates other dict entry
+        {'id': 'nmdc:wfnom-13-7yf9qj86.1', 'has_output': ['do2']}
+        >>> len(m.data_object_id_to_noma_activity_id_map.keys())
+        2
+        >>> m.data_object_id_to_noma_activity_id_map['do1']
+        'nmdc:wfnom-13-7yf9qj85.1'
+        >>> m.data_object_id_to_noma_activity_id_map['do2']
+        'nmdc:wfnom-13-7yf9qj86.1'
         """
+        nom_analysis_activity_id = nom_analysis_activity["id"]
+
         self.logger.info(
-            f"Checking ID for NomAnalysisActivity: {nom_analysis_activity['id']}"
+            f"Checking ID for NomAnalysisActivity: {nom_analysis_activity_id}"
         )
 
-        # If there's no version number, add one
-        if not self.is_valid_id(nom_analysis_activity["id"]):
-            nom_analysis_activity["id"] = str(nom_analysis_activity["id"]) + ".1"
+        # Get the `has_output` value, if any, of the `nom_analysis_activity` document;
+        # treating `null` values as empty lists.
+        nom_outputs = []
+        if "has_output" in nom_analysis_activity.keys():
+            if nom_analysis_activity["has_output"] is not None:
+                nom_outputs = nom_analysis_activity["has_output"]
+
+        # If the `id` is invalid, do two things:
+        # 1. Append a version number to it; and
+        # 2. Record the `id` of the `DataObject`, if any, that is the output of this `NomAnalysisActivity`.
+        if not self.is_valid_id(nom_analysis_activity_id):
+            nom_analysis_activity_id_new = f"{nom_analysis_activity_id}.1"
+            nom_analysis_activity["id"] = nom_analysis_activity_id_new
+
+            if len(nom_outputs) > 1:
+                raise ValueError(f"NomAnalysisActivity ({nom_analysis_activity_id}) 'has_output' field "
+                                 f"contains more than one item. Contains: {nom_outputs}")
+            elif len(nom_outputs) == 1:
+                data_object_id = nom_outputs[0]
+                if data_object_id in self.data_object_id_to_noma_activity_id_map.keys():
+                    raise ValueError(f"Multiple NomAnalysisActivity instances have same DataObject ({data_object_id}) "
+                                     f"instance as their output.")
+                self.data_object_id_to_noma_activity_id_map[data_object_id] = nom_analysis_activity_id_new
+            else:  # This `NomAnalysisActivity` instance didn't have any output.
+                pass
 
         return nom_analysis_activity

From e62f31e912d5e000ec13df92768468fa10cce1fe Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:30:09 -0700
Subject: [PATCH 2/2] Simplify variable name

---
 .../migrator_from_10_3_0_to_10_4_0.py         | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py b/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
index 8657e58c1c..a5f05d4ce5 100644
--- a/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
+++ b/nmdc_schema/migrators/migrator_from_10_3_0_to_10_4_0.py
@@ -33,7 +33,7 @@ class Migrator(MigratorBase):
     )
 
     # A mapping from `DataObject.id` value to `NomAnalysisActivity.id` value.
-    data_object_id_to_noma_activity_id_map: Dict[str, str] = dict()
+    data_object_id_to_nom_id_map: Dict[str, str] = dict()
 
     def upgrade(self) -> None:
         r"""Migrates the database from conforming to the original schema, to conforming to the new schema."""
@@ -55,7 +55,7 @@ def update_was_generated_by_field(self, data_object: dict) -> dict:
         the `id` of the `nom_analysis_activity_set` document that "generated" that `data_object_set` document.
 
         >>> m = Migrator()
-        >>> m.data_object_id_to_noma_activity_id_map = {'do1': 'nom1.1'}
+        >>> m.data_object_id_to_nom_id_map = {'do1': 'nom1.1'}
         >>> m.update_was_generated_by_field({'id': 'do1', 'was_generated_by': 'nom1'})  # changes `was_generated_by`
         {'id': 'do1', 'was_generated_by': 'nom1.1'}
         >>> m.update_was_generated_by_field({'id': 'do2', 'was_generated_by': 'nom2'})  # no change (not in dictionary)
@@ -63,8 +63,8 @@ def update_was_generated_by_field(self, data_object: dict) -> dict:
         """
 
         data_object_id = data_object["id"]
-        if data_object_id in self.data_object_id_to_noma_activity_id_map.keys():
-            data_object["was_generated_by"] = self.data_object_id_to_noma_activity_id_map[data_object_id]
+        if data_object_id in self.data_object_id_to_nom_id_map.keys():
+            data_object["was_generated_by"] = self.data_object_id_to_nom_id_map[data_object_id]
 
         return data_object
 
@@ -122,7 +122,7 @@ def update_nom_analysis_activity_id(self, nom_analysis_activity: dict) -> dict:
         this function does two things:
         1. Make the `id` valid (by appending a version number to it); and
         2. If it has a `has_output` field (which is a list) consisting of a single item,
-           add an entry to the `data_object_id_to_noma_activity_id_map` dictionary, using
+           add an entry to the `data_object_id_to_nom_id_map` dictionary, using
            the `has_output` value as the key and the newly-valid `id` as the value.
            Raise an exception if the `has_output` list contains more than 1 item.
 
@@ -139,22 +139,22 @@ def update_nom_analysis_activity_id(self, nom_analysis_activity: dict) -> dict:
 
         Test the production of the ID map.
 
-        >>> m.data_object_id_to_noma_activity_id_map = {}
+        >>> m.data_object_id_to_nom_id_map = {}
         >>> m.update_nom_analysis_activity_id({'id': 'nmdc:wfnom-13-7yf9qj85',
         ...                                    'has_output': ['do1']})  # creates dict entry
         {'id': 'nmdc:wfnom-13-7yf9qj85.1', 'has_output': ['do1']}
-        >>> len(m.data_object_id_to_noma_activity_id_map.keys())
+        >>> len(m.data_object_id_to_nom_id_map.keys())
         1
-        >>> m.data_object_id_to_noma_activity_id_map['do1']
+        >>> m.data_object_id_to_nom_id_map['do1']
         'nmdc:wfnom-13-7yf9qj85.1'
         >>> m.update_nom_analysis_activity_id({'id': 'nmdc:wfnom-13-7yf9qj86',
         ...                                    'has_output': ['do2']})  # creates other dict entry
         {'id': 'nmdc:wfnom-13-7yf9qj86.1', 'has_output': ['do2']}
-        >>> len(m.data_object_id_to_noma_activity_id_map.keys())
+        >>> len(m.data_object_id_to_nom_id_map.keys())
         2
-        >>> m.data_object_id_to_noma_activity_id_map['do1']
+        >>> m.data_object_id_to_nom_id_map['do1']
         'nmdc:wfnom-13-7yf9qj85.1'
-        >>> m.data_object_id_to_noma_activity_id_map['do2']
+        >>> m.data_object_id_to_nom_id_map['do2']
         'nmdc:wfnom-13-7yf9qj86.1'
         """
         nom_analysis_activity_id = nom_analysis_activity["id"]
@@ -182,10 +182,10 @@ def update_nom_analysis_activity_id(self, nom_analysis_activity: dict) -> dict:
                                  f"contains more than one item. Contains: {nom_outputs}")
             elif len(nom_outputs) == 1:
                 data_object_id = nom_outputs[0]
-                if data_object_id in self.data_object_id_to_noma_activity_id_map.keys():
+                if data_object_id in self.data_object_id_to_nom_id_map.keys():
                     raise ValueError(f"Multiple NomAnalysisActivity instances have same DataObject ({data_object_id}) "
                                      f"instance as their output.")
-                self.data_object_id_to_noma_activity_id_map[data_object_id] = nom_analysis_activity_id_new
+                self.data_object_id_to_nom_id_map[data_object_id] = nom_analysis_activity_id_new
             else:  # This `NomAnalysisActivity` instance didn't have any output.
                 pass