diff --git a/CITATION.cff b/CITATION.cff
index 4877c347..209c95dc 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -20,7 +20,7 @@ authors:
 - family-names: Alexandrov
   given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.8
+version: 0.0.9
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04
diff --git a/README.md b/README.md
index a67e2678..9885f2c3 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ conda develop .
 ### Step 2: Install Spacy NLP model and NLTK Packages
 ```shell
 python -m spacy download en_core_web_lg
+python -m spacy download en_core_web_trf
 python -m nltk.downloader wordnet omw-1.4
 ```

@@ -151,7 +152,7 @@ If you use T-ELF please cite.

 **APA:**
 ```latex
-Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.8) [Computer software]. https://doi.org/10.5281/zenodo.10257897
+Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.9) [Computer software]. https://doi.org/10.5281/zenodo.10257897
 ```

 **BibTeX:**
diff --git a/TELF/factorization/decompositions/nmf_fro_mu.py b/TELF/factorization/decompositions/nmf_fro_mu.py
index 374cbbcd..c0bc0493 100644
--- a/TELF/factorization/decompositions/nmf_fro_mu.py
+++ b/TELF/factorization/decompositions/nmf_fro_mu.py
@@ -179,8 +179,8 @@ def nmf(X, W, H,

     for i in tqdm(range(niter), disable=nmf_verbose == False):

-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)

         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
diff --git a/TELF/factorization/decompositions/nmf_kl_mu.py b/TELF/factorization/decompositions/nmf_kl_mu.py
index e7ea3865..9062cf15 100644
--- a/TELF/factorization/decompositions/nmf_kl_mu.py
+++ b/TELF/factorization/decompositions/nmf_kl_mu.py
@@ -39,7 +39,7 @@ def H_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=
     """
     if mask is not None:
         mask = mask.T
-    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=None, nz_rows=nz_cols, nz_cols=nz_rows).T
+    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=mask, nz_rows=nz_cols, nz_cols=nz_rows).T


 def W_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=None):
@@ -203,8 +203,8 @@ def nmf(X, W, H,
     inc = 0
     for i in tqdm(range(niter), disable=nmf_verbose == False):
-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)

         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
             W = np.maximum(W.astype(dtype), eps)
diff --git a/TELF/pre_processing/Vulture/modules/__init__.py b/TELF/pre_processing/Vulture/modules/__init__.py
index 3e94701f..0cf6b53e 100644
--- a/TELF/pre_processing/Vulture/modules/__init__.py
+++ b/TELF/pre_processing/Vulture/modules/__init__.py
@@ -2,4 +2,5 @@
 from .simple_clean import SimpleCleaner
 from .lemmatize import LemmatizeCleaner
 from .substitute import SubstitutionCleaner
-from .detect_nonenglish import RemoveNonEnglishCleaner
\ No newline at end of file
+from .detect_nonenglish import RemoveNonEnglishCleaner
+from .ner import NEDetector
\ No newline at end of file
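
For reference, the `mask` changes in `nmf_fro_mu.py` and `nmf_kl_mu.py` above make the multiplicative updates respect a binary mask over the observed entries of `X`; previously the mask was in scope inside `nmf()` but never forwarded to the per-factor updates. The snippet below is a minimal NumPy sketch of that idea for the Frobenius objective only; it is an illustration, not T-ELF's internal `H_update`/`W_update` code.

```python
import numpy as np

def masked_mu_step(X, W, H, mask, eps=1e-16):
    # One masked multiplicative-update step for min ||mask * (X - W @ H)||_F^2,
    # where mask is 1 for observed entries of X and 0 for missing ones.
    Xm = mask * X
    H = H * (W.T @ Xm) / (W.T @ (mask * (W @ H)) + eps)
    W = W * (Xm @ H.T) / ((mask * (W @ H)) @ H.T + eps)
    return W, H

rng = np.random.default_rng(0)
X = rng.random((20, 15))
mask = (rng.random(X.shape) > 0.2).astype(float)   # roughly 80% of entries observed
W, H = rng.random((20, 4)), rng.random((4, 15))
for _ in range(50):
    W, H = masked_mu_step(X, W, H, mask)
```
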
diff --git a/TELF/pre_processing/Vulture/modules/detect_nonenglish.py b/TELF/pre_processing/Vulture/modules/detect_nonenglish.py
index 9a6b64eb..65d290f2 100644
--- a/TELF/pre_processing/Vulture/modules/detect_nonenglish.py
+++ b/TELF/pre_processing/Vulture/modules/detect_nonenglish.py
@@ -37,6 +37,7 @@ class RemoveNonEnglishCleaner(VultureModuleBase):

     def __init__(self, ascii_ratio=0.9, stopwords_ratio=0.2, frozen=None):
         super().__init__(frozen)  # initialize the base class with the preserve
+        self.module_type = "CLEANER"
         self.ascii_ratio = ascii_ratio
         self.stopwords_ratio = stopwords_ratio
diff --git a/TELF/pre_processing/Vulture/modules/lemmatize.py b/TELF/pre_processing/Vulture/modules/lemmatize.py
index 607b5847..0f960fa9 100644
--- a/TELF/pre_processing/Vulture/modules/lemmatize.py
+++ b/TELF/pre_processing/Vulture/modules/lemmatize.py
@@ -28,6 +28,7 @@ class LemmatizeCleaner(VultureModuleBase):

     def __init__(self, library, frozen=None):
         super().__init__(frozen)
+        self.module_type = "CLEANER"
         self.library = library
         self.backend = None
diff --git a/TELF/pre_processing/Vulture/modules/ner.py b/TELF/pre_processing/Vulture/modules/ner.py
new file mode 100644
index 00000000..50d2a343
--- /dev/null
+++ b/TELF/pre_processing/Vulture/modules/ner.py
@@ -0,0 +1,99 @@
+import re
+import spacy
+import warnings
+
+from TELF.pre_processing.Vulture.modules import VultureModuleBase
+
+
+class NEDetector(VultureModuleBase):
+    """
+    An operator that detects Named Entities in text.
+
+    Attributes:
+    -----------
+    library: str
+        The name of the library that is used for the NER backend
+    """
+    # supported NER backend libraries
+    BACKEND_LIBRARIES = ['en_core_web_trf', 'en_core_web_lg']
+
+    def __init__(self, library="en_core_web_trf", frozen=None):
+        super().__init__(frozen)
+        self.module_type = "OPERATOR"
+        self.library = library
+        self.backend = None
+
+    def __call__(self, document):
+        return self.run(document)
+
+
+    def run(self, document):
+        """
+        Run the NER detection
+
+        Parameters
+        ----------
+        document: tuple
+            A (document id, document text) pair on which to perform NER detection
+
+        Returns
+        -------
+        tuple
+            Tuple of document id and operation result
+        """
+        doc_id, doc_text = document
+        doc_operation_result = self._detect_NER(doc_text)
+        return (doc_id, doc_operation_result)
+
+
+    def _detect_NER(self, text):
+        """
+        Detect named entities in a given string
+
+        Parameters
+        ----------
+        text: str
+            A string on which to perform NER detection
+
+        Returns
+        -------
+        dict
+            Dictionary mapping each entity label to the corresponding set of entities
+        """
+        if self.backend is None:
+            self._init_backend()
+
+        doc = self.backend(text)
+        entities_set = set()
+        label_entities = {}
+        for ent in doc.ents:
+            entities_set.add((ent.text, ent.label_))
+            if ent.label_ not in label_entities:
+                label_entities[ent.label_] = set()
+            label_entities[ent.label_].add(ent.text)
+
+        # map each entity label to its set of detected entity strings
+        return label_entities
+
+
+    def _init_backend(self):
+        """
+        Initialize the NER detection backend for the selected library
+        """
+        self.backend = spacy.load(self.library)
+
+
+    # GETTERS / SETTERS
+
+
+    @property
+    def library(self):
+        return self._library
+
+    @library.setter
+    def library(self, library):
+        if not isinstance(library, str):
+            raise TypeError('Expected type str for `library`!')
+        if library not in self.BACKEND_LIBRARIES:
+            raise ValueError(f'Unknown library "{library}"! Supported options are {self.BACKEND_LIBRARIES}.')
+        self._library = library
\ No newline at end of file
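
A quick usage sketch of the new `NEDetector` operator (illustrative only; it assumes the `en_core_web_lg` model from the README step above is installed):

```python
from TELF.pre_processing.Vulture.modules import NEDetector

detector = NEDetector(library="en_core_web_lg")  # spaCy model is loaded lazily on first use
doc_id, entities = detector((0, "Los Alamos National Laboratory is located in New Mexico."))
print(doc_id, entities)
# e.g. 0 {'ORG': {'Los Alamos National Laboratory'}, 'GPE': {'New Mexico'}}
```

Because `NEDetector` tags itself as an OPERATOR rather than a CLEANER, it is routed through the new `Vulture.operate()` path shown further below rather than `clean()`.
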
diff --git a/TELF/pre_processing/Vulture/modules/simple_clean.py b/TELF/pre_processing/Vulture/modules/simple_clean.py
index 5366c4d6..22cb2374 100644
--- a/TELF/pre_processing/Vulture/modules/simple_clean.py
+++ b/TELF/pre_processing/Vulture/modules/simple_clean.py
@@ -103,6 +103,7 @@ class SimpleCleaner(VultureModuleBase):

     def __init__(self, custom_patterns=None, stop_words=None, stop_phrases=None, min_characters=2, order=None, frozen=None):
         self._frozen = set()
+        self.module_type = "CLEANER"
         self.effective_stop_words = None
         self.patterns = self.DEFAULT_PATTERNS.copy()
         self.custom_patterns = custom_patterns
diff --git a/TELF/pre_processing/Vulture/modules/substitute.py b/TELF/pre_processing/Vulture/modules/substitute.py
index 23dd64a5..bfe3bafd 100644
--- a/TELF/pre_processing/Vulture/modules/substitute.py
+++ b/TELF/pre_processing/Vulture/modules/substitute.py
@@ -42,6 +42,7 @@ class SubstitutionCleaner(VultureModuleBase):

     def __init__(self, substitution_map, permute=False, lower=False, lemmatize=False, frozen=None):
         super().__init__(frozen)  # initialize the base class with the preserve
+        self.module_type = "CLEANER"
         self.lower = lower
         self.permute = permute
         self.lemmatize = lemmatize
diff --git a/TELF/pre_processing/Vulture/vulture.py b/TELF/pre_processing/Vulture/vulture.py
index 645754bc..767be7bc 100644
--- a/TELF/pre_processing/Vulture/vulture.py
+++ b/TELF/pre_processing/Vulture/vulture.py
@@ -27,6 +27,7 @@
 from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
 from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
 from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
+from TELF.pre_processing.Vulture.modules import NEDetector

 from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
 from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES
@@ -74,6 +75,9 @@ class Vulture:
     Vultures are natures' cleaners!
     """
     PARALLEL_BACKEND_OPTIONS = {'loky', 'multiprocessing', 'threading'}
+    DEFAULT_OPERATOR_PIPELINE = [
+        NEDetector(library="en_core_web_trf")
+    ]
     DEFAULT_PIPELINE = [
         SimpleCleaner(stop_words = STOP_WORDS,
                       stop_phrases = STOP_PHRASES,
@@ -129,12 +133,60 @@ def __init__(self, *, n_jobs = -1, n_nodes = 1, parallel_backend = "multiprocess
         # broadcast unique_id from root process to all other processes
         if self.comm is not None:
             self.unique_id = self.comm.bcast(self.unique_id, root=0)
+
+
+    def operate(self, documents, steps=None, save_path=None, file_name=""):
+
+        if steps is None:
+            steps = self.DEFAULT_OPERATOR_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "OPERATOR", "This method can only be used with an OPERATOR type module."
+
+        # transform documents into list of tuples
+        operate_documents = list(documents.items())
+        if self.verbose and self.rank == 0:
+            print(f'[Vulture]: Operating on {len(operate_documents)} documents', file=sys.stderr)
+
+
+        # prepare for MPI by chunking data and saving chunks (assuming DFS)
+        if self.use_mpi():
+            self._mpi_init(operate_documents)
+            self.comm.Barrier()
+            operate_documents = self._mpi_load_chunk_from_disk(self.rank, is_clean=False)
+
+        # perform operation
+        all_results = []
+        for step in steps:
+            if save_path is not None:
+                self.save_path = f'{save_path}/{file_name}_{step.__class__.__name__}.p'
+            else:
+                self.save_path = save_path
+
+            curr_operated_documents = self._clean_helper(operate_documents, [step])
+            if self.use_mpi():
+                self._mpi_save_chunk_to_disk(curr_operated_documents, self.rank, is_clean=True, custom_fn=f'{step.__class__.__name__}')
+                self.comm.Barrier()
+                curr_operated_documents = self._mpi_combine(custom_fn=f'{step.__class__.__name__}')
+
+            if self.save_path is not None:
+                self._save_documents(dict(curr_operated_documents))
+            else:
+                all_results.append((f'{step.__class__.__name__}', dict(curr_operated_documents)))
+        if self.save_path is None:
+            return all_results


     def clean(self, documents, steps=None, substitutions=None, save_path=None):
         self.save_path = save_path
         if steps is None:
             steps = self.DEFAULT_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         if substitutions is not None:
             assert isinstance(substitutions, dict), '`substitutions` must be a dict!'
             initial_sub = SubstitutionCleaner(substitutions, permute=True, lower=True, lemmatize=True)
@@ -174,6 +226,13 @@ def clean_dataframe(self, df, columns, steps=None, substitutions=None,
         if not all(col in df.columns for col in columns):  # make sure columns exist
             raise ValueError("One or more columns are invalid!")

+        if steps is None:
+            steps = self.DEFAULT_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         # make a copy of the DataFrame to prevent changing original and fill nans with empty strings
         if append_to_original_df:
             df = df.copy()
@@ -212,29 +271,33 @@ def _mpi_init(self, documents):
             self._mpi_save_chunk_to_disk(chunk, idx, is_clean=False)


-    def _mpi_get_name(self, rank, is_clean):
-        if is_clean:
-            return f'vulture_{self.unique_id}_{rank}_clean.p'
+    def _mpi_get_name(self, rank, is_clean, custom_fn=None):
+        if not custom_fn:
+            if is_clean:
+                return f'vulture_{self.unique_id}_{rank}_clean.p'
+            else:
+                return f'vulture_{self.unique_id}_{rank}.p'
         else:
-            return f'vulture_{self.unique_id}_{rank}.p'
+            return f'vulture_{self.unique_id}_{rank}_{custom_fn}.p'


-    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn)
+
         with open(os.path.join(self.cache, fn), 'wb') as fh:
             pickle.dump(data, fh)


-    def _mpi_load_chunk_from_disk(self, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_load_chunk_from_disk(self, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn=custom_fn)
         with open(os.path.join(self.cache, fn), 'rb') as fh:
             return pickle.load(fh)


-    def _mpi_combine(self):
+    def _mpi_combine(self, custom_fn=None):
         clean_documents = []
         for rank in range(self.n_nodes):
-            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True)
+            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True, custom_fn=custom_fn)
         return clean_documents
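
To show how the new pieces fit together, here is a hypothetical single-node use of `Vulture.operate()` with the new operator (a sketch under the assumption that `en_core_web_lg` is installed; the constructor arguments and document contents are illustrative):

```python
from TELF.pre_processing.Vulture.vulture import Vulture
from TELF.pre_processing.Vulture.modules import NEDetector

documents = {
    0: "Los Alamos National Laboratory is located in New Mexico.",
    1: "The library implements non-negative matrix and tensor factorization tools.",
}

vulture = Vulture(n_jobs=1)
results = vulture.operate(documents, steps=[NEDetector(library="en_core_web_lg")])
# With save_path=None, operate() returns a list of (step name, {doc_id: result}) pairs,
# where each NEDetector result maps an entity label to the set of detected entities.
# With save_path set, each step's output is pickled to {save_path}/{file_name}_{step}.p instead.
```

Note the symmetry with `clean()`: both paths now check `module_type`, so passing a CLEANER to `operate()`, or an OPERATOR such as `NEDetector` to `clean()`/`clean_dataframe()`, fails the corresponding assertion.
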
diff --git a/TELF/version.py b/TELF/version.py
index 13e35bf9..6b5e9fa9 100644
--- a/TELF/version.py
+++ b/TELF/version.py
@@ -1 +1 @@
-__version__ = '0.0.8'
\ No newline at end of file
+__version__ = '0.0.9'
\ No newline at end of file
diff --git a/docs/Beaver.html b/docs/Beaver.html
index b22b71ad..148b73fe 100644
--- a/docs/Beaver.html
+++ b/docs/Beaver.html
@@ -8,7 +8,7 @@
-TELF 0.0.8 documentation
+TELF 0.0.9 documentation