
Commit d12a05b: Merge pull request #94 from lanl/develop
Develop
MaksimEkin authored Mar 7, 2024 (2 parents: 3106597 + 53cb644)
Showing 96 changed files with 1,266 additions and 289 deletions.
CITATION.cff (2 changes: 1 addition & 1 deletion)

@@ -20,7 +20,7 @@ authors:
   - family-names: Alexandrov
     given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.8
+version: 0.0.9
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04
README.md (3 changes: 2 additions & 1 deletion)

@@ -63,6 +63,7 @@ conda develop .
 ### Step 2: Install Spacy NLP model and NLTK Packages
 ```shell
 python -m spacy download en_core_web_lg
+python -m spacy download en_core_web_trf
 python -m nltk.downloader wordnet omw-1.4
 ```
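
A quick sanity check after running the commands above (a minimal sketch; it only assumes the two spaCy models and the NLTK data from this hunk were installed):

```python
import spacy
from nltk.corpus import wordnet

# Loading each model confirms the spaCy downloads succeeded.
for model in ("en_core_web_lg", "en_core_web_trf"):
    spacy.load(model)

# Accessing wordnet raises LookupError if the NLTK download is missing.
assert wordnet.synsets("test")
```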

@@ -151,7 +152,7 @@ If you use T-ELF please cite.

 **APA:**
 ```latex
-Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.8) [Computer software]. https://doi.org/10.5281/zenodo.10257897
+Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.9) [Computer software]. https://doi.org/10.5281/zenodo.10257897
 ```

 **BibTeX:**
TELF/factorization/decompositions/nmf_fro_mu.py (4 changes: 2 additions & 2 deletions)

@@ -179,8 +179,8 @@ def nmf(X, W, H,

     for i in tqdm(range(niter), disable=nmf_verbose == False):

-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)

         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
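
The change above threads the `mask` argument through both update steps so masked entries of X no longer influence the factors. For readers unfamiliar with masked NMF, here is a generic sketch of how a binary mask typically enters the Frobenius multiplicative updates; it is illustrative only, not T-ELF's `H_update`/`W_update` implementation:

```python
import numpy as np

def masked_mu_step(X, W, H, mask, eps=1e-16):
    """One masked multiplicative-update step for ||mask * (X - W @ H)||_F.

    mask is 1 where X is observed and 0 where it should be ignored,
    so unobserved entries contribute nothing to either update.
    """
    H = H * (W.T @ (mask * X)) / (W.T @ (mask * (W @ H)) + eps)
    W = W * ((mask * X) @ H.T) / ((mask * (W @ H)) @ H.T + eps)
    return W, H
```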
TELF/factorization/decompositions/nmf_kl_mu.py (6 changes: 3 additions & 3 deletions)

@@ -39,7 +39,7 @@ def H_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=None):
     """
     if mask is not None:
         mask = mask.T
-    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=None, nz_rows=nz_cols, nz_cols=nz_rows).T
+    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=mask, nz_rows=nz_cols, nz_cols=nz_rows).T


 def W_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=None):

@@ -203,8 +203,8 @@ def nmf(X, W, H,
     inc = 0

     for i in tqdm(range(niter), disable=nmf_verbose == False):
-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)
         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
             W = np.maximum(W.astype(dtype), eps)
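
The first hunk fixes a genuine bug: `H_update` already transposed the mask but then discarded it by passing `mask=None` to the underlying `W_update`. The delegation relies on the usual NMF transpose duality, sketched below with hypothetical shapes:

```python
import numpy as np

# H_update solves the same problem as W_update on transposed data:
# X ~ W @ H implies X.T ~ H.T @ W.T, with H.T playing the role of "W".
# A mask aligned with X (shape m x n) must therefore be transposed to
# align with X.T (shape n x m) before being forwarded; the old code
# transposed it but never passed it along.
m, n = 4, 6
X = np.random.rand(m, n)
mask = (np.random.rand(m, n) > 0.3)
assert mask.T.shape == X.T.shape  # the transposed mask matches X.T
```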
TELF/pre_processing/Vulture/modules/__init__.py (3 changes: 2 additions & 1 deletion)

@@ -2,4 +2,5 @@
 from .simple_clean import SimpleCleaner
 from .lemmatize import LemmatizeCleaner
 from .substitute import SubstitutionCleaner
-from .detect_nonenglish import RemoveNonEnglishCleaner
+from .detect_nonenglish import RemoveNonEnglishCleaner
+from .ner import NEDetector
TELF/pre_processing/Vulture/modules/detect_nonenglish.py (1 change: 1 addition & 0 deletions)

@@ -37,6 +37,7 @@ class RemoveNonEnglishCleaner(VultureModuleBase):

     def __init__(self, ascii_ratio=0.9, stopwords_ratio=0.2, frozen=None):
         super().__init__(frozen)  # initialize the base class with the frozen set
+        self.module_type = "CLEANER"
         self.ascii_ratio = ascii_ratio
         self.stopwords_ratio = stopwords_ratio
TELF/pre_processing/Vulture/modules/lemmatize.py (1 change: 1 addition & 0 deletions)

@@ -28,6 +28,7 @@ class LemmatizeCleaner(VultureModuleBase):

     def __init__(self, library, frozen=None):
         super().__init__(frozen)
+        self.module_type = "CLEANER"
         self.library = library
         self.backend = None
TELF/pre_processing/Vulture/modules/ner.py (new file, 99 additions)

@@ -0,0 +1,99 @@
import re
import spacy
import warnings

from TELF.pre_processing.Vulture.modules import VultureModuleBase


class NEDetector(VultureModuleBase):
    """
    An operator that detects named entities in text.

    Attributes
    ----------
    library: str
        The name of the library that is used for the NER backend
    """
    # supported NER backend models
    BACKEND_LIBRARIES = ['en_core_web_trf', 'en_core_web_lg']

    def __init__(self, library="en_core_web_trf", frozen=None):
        super().__init__(frozen)
        self.module_type = "OPERATOR"
        self.library = library
        self.backend = None

    def __call__(self, document):
        return self.run(document)

    def run(self, document):
        """
        Run the NER detection

        Parameters
        ----------
        document: tuple
            A (document id, document text) pair on which to perform NER detection

        Returns
        -------
        tuple
            Tuple of document id and operation result
        """
        doc_id, doc_text = document
        doc_operation_result = self._detect_NER(doc_text)
        return (doc_id, doc_operation_result)

    def _detect_NER(self, text):
        """
        Detect named entities in a given string

        Parameters
        ----------
        text: str
            The string on which to perform NER detection

        Returns
        -------
        dict
            Dictionary mapping each entity label to its corresponding set of entities
        """
        if self.backend is None:
            self._init_backend()

        doc = self.backend(text)
        label_entities = {}
        for ent in doc.ents:
            if ent.label_ not in label_entities:
                label_entities[ent.label_] = set()
            label_entities[ent.label_].add(ent.text)

        # map each entity label to the set of entity strings found
        return label_entities

    def _init_backend(self):
        """
        Load the spaCy model that backs the NER detection
        """
        self.backend = spacy.load(self.library)

    # GETTERS / SETTERS

    @property
    def library(self):
        return self._library

    @library.setter
    def library(self, library):
        if not isinstance(library, str):
            raise TypeError('Expected type str for `library`!')
        if library not in self.BACKEND_LIBRARIES:
            raise ValueError(f'Unknown library "{library}"! Supported options are {self.BACKEND_LIBRARIES}.')
        self._library = library
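
A minimal usage sketch for the new operator (the sample text and printed entities are illustrative, and the chosen spaCy model must already be downloaded):

```python
from TELF.pre_processing.Vulture.modules import NEDetector

detector = NEDetector(library="en_core_web_lg")

# NEDetector consumes a (document id, document text) tuple.
doc_id, entities = detector(("doc-1", "Los Alamos National Laboratory is in New Mexico."))
print(entities)
# Expected shape of the result (labels and strings depend on the model):
# {'ORG': {'Los Alamos National Laboratory'}, 'GPE': {'New Mexico'}}
```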
TELF/pre_processing/Vulture/modules/simple_clean.py (1 change: 1 addition & 0 deletions)

@@ -103,6 +103,7 @@ class SimpleCleaner(VultureModuleBase):

     def __init__(self, custom_patterns=None, stop_words=None, stop_phrases=None, min_characters=2, order=None, frozen=None):
         self._frozen = set()
+        self.module_type = "CLEANER"
         self.effective_stop_words = None
         self.patterns = self.DEFAULT_PATTERNS.copy()
         self.custom_patterns = custom_patterns
TELF/pre_processing/Vulture/modules/substitute.py (1 change: 1 addition & 0 deletions)

@@ -42,6 +42,7 @@ class SubstitutionCleaner(VultureModuleBase):

     def __init__(self, substitution_map, permute=False, lower=False, lemmatize=False, frozen=None):
         super().__init__(frozen)  # initialize the base class with the frozen set

+        self.module_type = "CLEANER"
         self.lower = lower
         self.permute = permute
         self.lemmatize = lemmatize
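
Taken together, these one-line additions tag every cleaning module with `module_type = "CLEANER"`, while the new `NEDetector` declares itself an `"OPERATOR"`. As the `vulture.py` changes below show, `clean()` and `operate()` check this attribute so a pipeline cannot mix the two kinds of module.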
TELF/pre_processing/Vulture/vulture.py (83 changes: 73 additions & 10 deletions)

@@ -27,6 +27,7 @@
 from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
 from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
 from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
+from TELF.pre_processing.Vulture.modules import NEDetector
 from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
 from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

@@ -74,6 +75,9 @@ class Vulture:
     Vultures are nature's cleaners!
     """
     PARALLEL_BACKEND_OPTIONS = {'loky', 'multiprocessing', 'threading'}
+    DEFAULT_OPERATOR_PIPELINE = [
+        NEDetector(library="en_core_web_trf")
+    ]
     DEFAULT_PIPELINE = [
         SimpleCleaner(stop_words = STOP_WORDS,
                       stop_phrases = STOP_PHRASES,

@@ -129,12 +133,60 @@ def __init__(self, *, n_jobs = -1, n_nodes = 1, parallel_backend = "multiprocessing",
         # broadcast unique_id from root process to all other processes
         if self.comm is not None:
             self.unique_id = self.comm.bcast(self.unique_id, root=0)

+    def operate(self, documents, steps=None, save_path=None, file_name=""):
+        if steps is None:
+            steps = self.DEFAULT_OPERATOR_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "OPERATOR", "This method can only be used with an OPERATOR type module."
+
+        # transform documents into list of tuples
+        operate_documents = list(documents.items())
+        if self.verbose and self.rank == 0:
+            print(f'[Vulture]: Operating on {len(operate_documents)} documents', file=sys.stderr)
+
+        # prepare for MPI by chunking data and saving chunks (assuming DFS)
+        if self.use_mpi():
+            self._mpi_init(operate_documents)
+            self.comm.Barrier()
+            operate_documents = self._mpi_load_chunk_from_disk(self.rank, is_clean=False)
+
+        # perform each operation, one step at a time
+        all_results = []
+        for step in steps:
+            if save_path is not None:
+                self.save_path = f'{save_path}/{file_name}_{step.__class__.__name__}.p'
+            else:
+                self.save_path = save_path
+
+            curr_operated_documents = self._clean_helper(operate_documents, [step])
+            if self.use_mpi():
+                self._mpi_save_chunk_to_disk(curr_operated_documents, self.rank, is_clean=True, custom_fn=f'{step.__class__.__name__}')
+                self.comm.Barrier()
+                curr_operated_documents = self._mpi_combine(custom_fn=f'{step.__class__.__name__}')
+
+            if self.save_path is not None:
+                self._save_documents(dict(curr_operated_documents))
+            else:
+                all_results.append((f'{step.__class__.__name__}', dict(curr_operated_documents)))
+
+        if self.save_path is None:
+            return all_results
+
     def clean(self, documents, steps=None, substitutions=None, save_path=None):
         self.save_path = save_path
         if steps is None:
             steps = self.DEFAULT_PIPELINE.copy()

+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         if substitutions is not None:
             assert isinstance(substitutions, dict), '`substitutions` must be a dict!'
             initial_sub = SubstitutionCleaner(substitutions, permute=True, lower=True, lemmatize=True)

@@ -174,6 +226,13 @@ def clean_dataframe(self, df, columns, steps=None, substitutions=None,
         if not all(col in df.columns for col in columns):  # make sure columns exist
             raise ValueError("One or more columns are invalid!")

+        if steps is None:
+            steps = self.DEFAULT_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         # make a copy of the DataFrame to prevent changing original and fill nans with empty strings
         if append_to_original_df:
             df = df.copy()

@@ -212,29 +271,33 @@ def _mpi_init(self, documents):
             self._mpi_save_chunk_to_disk(chunk, idx, is_clean=False)

-    def _mpi_get_name(self, rank, is_clean):
-        if is_clean:
-            return f'vulture_{self.unique_id}_{rank}_clean.p'
-        else:
-            return f'vulture_{self.unique_id}_{rank}.p'
+    def _mpi_get_name(self, rank, is_clean, custom_fn=None):
+        if not custom_fn:
+            if is_clean:
+                return f'vulture_{self.unique_id}_{rank}_clean.p'
+            else:
+                return f'vulture_{self.unique_id}_{rank}.p'
+        else:
+            return f'vulture_{self.unique_id}_{rank}_{custom_fn}.p'

-    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn)

         with open(os.path.join(self.cache, fn), 'wb') as fh:
             pickle.dump(data, fh)

-    def _mpi_load_chunk_from_disk(self, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_load_chunk_from_disk(self, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn=custom_fn)
         with open(os.path.join(self.cache, fn), 'rb') as fh:
             return pickle.load(fh)

-    def _mpi_combine(self):
+    def _mpi_combine(self, custom_fn=None):
         clean_documents = []
         for rank in range(self.n_nodes):
-            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True)
+            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True, custom_fn=custom_fn)
         return clean_documents
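
To see how the new pieces fit together, here is a minimal usage sketch. It assumes `Vulture` is importable from `TELF.pre_processing` and that the `en_core_web_trf` model from the README step above has been downloaded; the sample documents are illustrative:

```python
from TELF.pre_processing import Vulture

documents = {
    0: "Los Alamos National Laboratory is located in New Mexico.",
    1: "Tensor factorization extracts latent features from text.",
}

vulture = Vulture(n_jobs=1, verbose=True)

# clean() accepts only CLEANER modules and runs the default cleaning pipeline.
cleaned = vulture.clean(documents)

# operate() accepts only OPERATOR modules; by default it runs NEDetector and
# returns a list of (operator name, {document id: result}) pairs.
results = vulture.operate(documents)
```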
TELF/version.py (2 changes: 1 addition & 1 deletion)

@@ -1 +1 @@
-__version__ = '0.0.8'
+__version__ = '0.0.9'
docs/Beaver.html (8 changes: 4 additions & 4 deletions)

@@ -8,7 +8,7 @@
 <meta charset="utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

-<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.8 documentation</title>
+<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.9 documentation</title>

@@ -28,7 +28,7 @@
 <link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
 <link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />

-<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=a746c00c" />
+<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=362ab14a" />
 <link rel="stylesheet" href="_static/styles/sphinx-book-theme.css?digest=14f4ca6b54d191a8c7657f6c759bf11a5fb86285" type="text/css" />
 <link rel="stylesheet" type="text/css" href="_static/graphviz.css?v=eafc0fe6" />

@@ -37,7 +37,7 @@
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
 <script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

-<script src="_static/documentation_options.js?v=820a49c8"></script>
+<script src="_static/documentation_options.js?v=39f6cbd5"></script>
 <script src="_static/doctools.js?v=888ff710"></script>
 <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 <script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>

@@ -127,7 +127,7 @@
-<p class="title logo__title">TELF 0.0.8 documentation</p>
+<p class="title logo__title">TELF 0.0.9 documentation</p>
… (remaining changed files not shown)
