
Commit d12a05b: Merge pull request #94 from lanl/develop
Develop
MaksimEkin authored Mar 7, 2024 (2 parents: 3106597 + 53cb644)
Showing 96 changed files with 1,266 additions and 289 deletions.
CITATION.cff (2 changes: 1 addition & 1 deletion)

@@ -20,7 +20,7 @@ authors:
   - family-names: Alexandrov
     given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.8
+version: 0.0.9
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04
README.md (3 changes: 2 additions & 1 deletion)

@@ -63,6 +63,7 @@ conda develop .
 ### Step 2: Install Spacy NLP model and NLTK Packages
 ```shell
 python -m spacy download en_core_web_lg
+python -m spacy download en_core_web_trf
 python -m nltk.downloader wordnet omw-1.4
 ```
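
A quick sanity check after running the commands above (a minimal sketch; it only assumes the two spaCy models and the NLTK data from this hunk were installed):

```python
import spacy
from nltk.corpus import wordnet

# Loading each model confirms the spaCy downloads succeeded.
for model in ("en_core_web_lg", "en_core_web_trf"):
    spacy.load(model)

# Accessing wordnet raises LookupError if the NLTK download is missing.
assert wordnet.synsets("test")
```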

@@ -151,7 +152,7 @@ If you use T-ELF please cite.

 **APA:**
 ```latex
-Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.8) [Computer software]. https://doi.org/10.5281/zenodo.10257897
+Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.9) [Computer software]. https://doi.org/10.5281/zenodo.10257897
 ```

 **BibTeX:**
TELF/factorization/decompositions/nmf_fro_mu.py (4 changes: 2 additions & 2 deletions)

@@ -179,8 +179,8 @@ def nmf(X, W, H,

     for i in tqdm(range(niter), disable=nmf_verbose == False):

-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)

         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
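
The change above threads the `mask` argument through both update steps so masked entries of X no longer influence the factors. For readers unfamiliar with masked NMF, here is a generic sketch of how a binary mask typically enters the Frobenius multiplicative updates; it is illustrative only, not T-ELF's `H_update`/`W_update` implementation:

```python
import numpy as np

def masked_mu_step(X, W, H, mask, eps=1e-16):
    """One masked multiplicative-update step for ||mask * (X - W @ H)||_F.

    mask is 1 where X is observed and 0 where it should be ignored,
    so unobserved entries contribute nothing to either update.
    """
    H = H * (W.T @ (mask * X)) / (W.T @ (mask * (W @ H)) + eps)
    W = W * ((mask * X) @ H.T) / ((mask * (W @ H)) @ H.T + eps)
    return W, H
```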
TELF/factorization/decompositions/nmf_kl_mu.py (6 changes: 3 additions & 3 deletions)

@@ -39,7 +39,7 @@ def H_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=None):
     """
     if mask is not None:
         mask = mask.T
-    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=None, nz_rows=nz_cols, nz_cols=nz_rows).T
+    return W_update(X.T, H.T, W.T, opts=opts, use_gpu=use_gpu, mask=mask, nz_rows=nz_cols, nz_cols=nz_rows).T


 def W_update(X, W, H, opts=None, nz_rows=None, nz_cols=None, use_gpu=True, mask=None):

@@ -203,8 +203,8 @@ def nmf(X, W, H,
     inc = 0

     for i in tqdm(range(niter), disable=nmf_verbose == False):
-        H = H_update(X, W, H, H_opts, use_gpu=use_gpu)
-        W = W_update(X, W, H, W_opts, use_gpu=use_gpu)
+        H = H_update(X, W, H, H_opts, use_gpu=use_gpu, mask=mask)
+        W = W_update(X, W, H, W_opts, use_gpu=use_gpu, mask=mask)
         if i % 10 == 0:
             H = np.maximum(H.astype(dtype), eps)
             W = np.maximum(W.astype(dtype), eps)
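
The first hunk fixes a genuine bug: `H_update` already transposed the mask but then discarded it by passing `mask=None` to the underlying `W_update`. The delegation relies on the usual NMF transpose duality, sketched below with hypothetical shapes:

```python
import numpy as np

# H_update solves the same problem as W_update on transposed data:
# X ~ W @ H implies X.T ~ H.T @ W.T, with H.T playing the role of "W".
# A mask aligned with X (shape m x n) must therefore be transposed to
# align with X.T (shape n x m) before being forwarded; the old code
# transposed it but never passed it along.
m, n = 4, 6
X = np.random.rand(m, n)
mask = (np.random.rand(m, n) > 0.3)
assert mask.T.shape == X.T.shape  # the transposed mask matches X.T
```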
TELF/pre_processing/Vulture/modules/__init__.py (3 changes: 2 additions & 1 deletion)

@@ -2,4 +2,5 @@
 from .simple_clean import SimpleCleaner
 from .lemmatize import LemmatizeCleaner
 from .substitute import SubstitutionCleaner
-from .detect_nonenglish import RemoveNonEnglishCleaner
+from .detect_nonenglish import RemoveNonEnglishCleaner
+from .ner import NEDetector
TELF/pre_processing/Vulture/modules/detect_nonenglish.py (1 change: 1 addition & 0 deletions)

@@ -37,6 +37,7 @@ class RemoveNonEnglishCleaner(VultureModuleBase):

     def __init__(self, ascii_ratio=0.9, stopwords_ratio=0.2, frozen=None):
         super().__init__(frozen)  # initialize the base class with the frozen set
+        self.module_type = "CLEANER"
         self.ascii_ratio = ascii_ratio
         self.stopwords_ratio = stopwords_ratio
TELF/pre_processing/Vulture/modules/lemmatize.py (1 change: 1 addition & 0 deletions)

@@ -28,6 +28,7 @@ class LemmatizeCleaner(VultureModuleBase):

     def __init__(self, library, frozen=None):
         super().__init__(frozen)
+        self.module_type = "CLEANER"
         self.library = library
         self.backend = None
TELF/pre_processing/Vulture/modules/ner.py (new file, 99 additions)

@@ -0,0 +1,99 @@
import re
import spacy
import warnings

from TELF.pre_processing.Vulture.modules import VultureModuleBase


class NEDetector(VultureModuleBase):
    """
    An operator that detects named entities in text.

    Attributes
    ----------
    library: str
        The name of the library that is used for the NER backend
    """
    # supported NER backend models
    BACKEND_LIBRARIES = ['en_core_web_trf', 'en_core_web_lg']

    def __init__(self, library="en_core_web_trf", frozen=None):
        super().__init__(frozen)
        self.module_type = "OPERATOR"
        self.library = library
        self.backend = None

    def __call__(self, document):
        return self.run(document)

    def run(self, document):
        """
        Run the NER detection

        Parameters
        ----------
        document: tuple
            A (document id, document text) pair on which to perform NER detection

        Returns
        -------
        tuple
            Tuple of document id and operation result
        """
        doc_id, doc_text = document
        doc_operation_result = self._detect_NER(doc_text)
        return (doc_id, doc_operation_result)

    def _detect_NER(self, text):
        """
        Detect named entities in a given string

        Parameters
        ----------
        text: str
            The string on which to perform NER detection

        Returns
        -------
        dict
            Dictionary mapping each entity label to its corresponding set of entities
        """
        if self.backend is None:
            self._init_backend()

        doc = self.backend(text)
        label_entities = {}
        for ent in doc.ents:
            if ent.label_ not in label_entities:
                label_entities[ent.label_] = set()
            label_entities[ent.label_].add(ent.text)

        # map each entity label to the set of entity strings found
        return label_entities

    def _init_backend(self):
        """
        Load the spaCy model that backs the NER detection
        """
        self.backend = spacy.load(self.library)

    # GETTERS / SETTERS

    @property
    def library(self):
        return self._library

    @library.setter
    def library(self, library):
        if not isinstance(library, str):
            raise TypeError('Expected type str for `library`!')
        if library not in self.BACKEND_LIBRARIES:
            raise ValueError(f'Unknown library "{library}"! Supported options are {self.BACKEND_LIBRARIES}.')
        self._library = library
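
A minimal usage sketch for the new operator (the sample text and printed entities are illustrative, and the chosen spaCy model must already be downloaded):

```python
from TELF.pre_processing.Vulture.modules import NEDetector

detector = NEDetector(library="en_core_web_lg")

# NEDetector consumes a (document id, document text) tuple.
doc_id, entities = detector(("doc-1", "Los Alamos National Laboratory is in New Mexico."))
print(entities)
# Expected shape of the result (labels and strings depend on the model):
# {'ORG': {'Los Alamos National Laboratory'}, 'GPE': {'New Mexico'}}
```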
TELF/pre_processing/Vulture/modules/simple_clean.py (1 change: 1 addition & 0 deletions)

@@ -103,6 +103,7 @@ class SimpleCleaner(VultureModuleBase):

     def __init__(self, custom_patterns=None, stop_words=None, stop_phrases=None, min_characters=2, order=None, frozen=None):
         self._frozen = set()
+        self.module_type = "CLEANER"
         self.effective_stop_words = None
         self.patterns = self.DEFAULT_PATTERNS.copy()
         self.custom_patterns = custom_patterns
TELF/pre_processing/Vulture/modules/substitute.py (1 change: 1 addition & 0 deletions)

@@ -42,6 +42,7 @@ class SubstitutionCleaner(VultureModuleBase):

     def __init__(self, substitution_map, permute=False, lower=False, lemmatize=False, frozen=None):
         super().__init__(frozen)  # initialize the base class with the frozen set

+        self.module_type = "CLEANER"
         self.lower = lower
         self.permute = permute
         self.lemmatize = lemmatize
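
Taken together, these one-line additions tag every cleaning module with `module_type = "CLEANER"`, while the new `NEDetector` declares itself an `"OPERATOR"`. As the `vulture.py` changes below show, `clean()` and `operate()` check this attribute so a pipeline cannot mix the two kinds of module.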
TELF/pre_processing/Vulture/vulture.py (83 changes: 73 additions & 10 deletions)

@@ -27,6 +27,7 @@
 from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
 from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
 from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
+from TELF.pre_processing.Vulture.modules import NEDetector
 from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
 from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

@@ -74,6 +75,9 @@ class Vulture:
     Vultures are nature's cleaners!
     """
     PARALLEL_BACKEND_OPTIONS = {'loky', 'multiprocessing', 'threading'}
+    DEFAULT_OPERATOR_PIPELINE = [
+        NEDetector(library="en_core_web_trf")
+    ]
     DEFAULT_PIPELINE = [
         SimpleCleaner(stop_words = STOP_WORDS,
                       stop_phrases = STOP_PHRASES,

@@ -129,12 +133,60 @@ def __init__(self, *, n_jobs = -1, n_nodes = 1, parallel_backend = "multiprocessing",
         # broadcast unique_id from root process to all other processes
         if self.comm is not None:
             self.unique_id = self.comm.bcast(self.unique_id, root=0)

+    def operate(self, documents, steps=None, save_path=None, file_name=""):
+        if steps is None:
+            steps = self.DEFAULT_OPERATOR_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "OPERATOR", "This method can only be used with an OPERATOR type module."
+
+        # transform documents into list of tuples
+        operate_documents = list(documents.items())
+        if self.verbose and self.rank == 0:
+            print(f'[Vulture]: Operating on {len(operate_documents)} documents', file=sys.stderr)
+
+        # prepare for MPI by chunking data and saving chunks (assuming DFS)
+        if self.use_mpi():
+            self._mpi_init(operate_documents)
+            self.comm.Barrier()
+            operate_documents = self._mpi_load_chunk_from_disk(self.rank, is_clean=False)
+
+        # perform each operation, one step at a time
+        all_results = []
+        for step in steps:
+            if save_path is not None:
+                self.save_path = f'{save_path}/{file_name}_{step.__class__.__name__}.p'
+            else:
+                self.save_path = save_path
+
+            curr_operated_documents = self._clean_helper(operate_documents, [step])
+            if self.use_mpi():
+                self._mpi_save_chunk_to_disk(curr_operated_documents, self.rank, is_clean=True, custom_fn=f'{step.__class__.__name__}')
+                self.comm.Barrier()
+                curr_operated_documents = self._mpi_combine(custom_fn=f'{step.__class__.__name__}')
+
+            if self.save_path is not None:
+                self._save_documents(dict(curr_operated_documents))
+            else:
+                all_results.append((f'{step.__class__.__name__}', dict(curr_operated_documents)))
+
+        if self.save_path is None:
+            return all_results
+
     def clean(self, documents, steps=None, substitutions=None, save_path=None):
         self.save_path = save_path
         if steps is None:
             steps = self.DEFAULT_PIPELINE.copy()

+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         if substitutions is not None:
             assert isinstance(substitutions, dict), '`substitutions` must be a dict!'
             initial_sub = SubstitutionCleaner(substitutions, permute=True, lower=True, lemmatize=True)

@@ -174,6 +226,13 @@ def clean_dataframe(self, df, columns, steps=None, substitutions=None,
         if not all(col in df.columns for col in columns):  # make sure columns exist
             raise ValueError("One or more columns are invalid!")

+        if steps is None:
+            steps = self.DEFAULT_PIPELINE.copy()
+
+        for step in steps:
+            if "module_type" in vars(step):
+                assert step.module_type == "CLEANER", "This method can only be used with a CLEANER type module."
+
         # make a copy of the DataFrame to prevent changing original and fill nans with empty strings
         if append_to_original_df:
             df = df.copy()

@@ -212,29 +271,33 @@ def _mpi_init(self, documents):
             self._mpi_save_chunk_to_disk(chunk, idx, is_clean=False)

-    def _mpi_get_name(self, rank, is_clean):
-        if is_clean:
-            return f'vulture_{self.unique_id}_{rank}_clean.p'
-        else:
-            return f'vulture_{self.unique_id}_{rank}.p'
+    def _mpi_get_name(self, rank, is_clean, custom_fn=None):
+        if not custom_fn:
+            if is_clean:
+                return f'vulture_{self.unique_id}_{rank}_clean.p'
+            else:
+                return f'vulture_{self.unique_id}_{rank}.p'
+        else:
+            return f'vulture_{self.unique_id}_{rank}_{custom_fn}.p'

-    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_save_chunk_to_disk(self, data, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn)

         with open(os.path.join(self.cache, fn), 'wb') as fh:
             pickle.dump(data, fh)

-    def _mpi_load_chunk_from_disk(self, rank, *, is_clean):
-        fn = self._mpi_get_name(rank, is_clean)
+    def _mpi_load_chunk_from_disk(self, rank, *, is_clean, custom_fn=None):
+        fn = self._mpi_get_name(rank, is_clean, custom_fn=custom_fn)
         with open(os.path.join(self.cache, fn), 'rb') as fh:
             return pickle.load(fh)

-    def _mpi_combine(self):
+    def _mpi_combine(self, custom_fn=None):
         clean_documents = []
         for rank in range(self.n_nodes):
-            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True)
+            clean_documents += self._mpi_load_chunk_from_disk(rank, is_clean=True, custom_fn=custom_fn)
         return clean_documents
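
To see how the new pieces fit together, here is a minimal usage sketch. It assumes `Vulture` is importable from `TELF.pre_processing` and that the `en_core_web_trf` model from the README step above has been downloaded; the sample documents are illustrative:

```python
from TELF.pre_processing import Vulture

documents = {
    0: "Los Alamos National Laboratory is located in New Mexico.",
    1: "Tensor factorization extracts latent features from text.",
}

vulture = Vulture(n_jobs=1, verbose=True)

# clean() accepts only CLEANER modules and runs the default cleaning pipeline.
cleaned = vulture.clean(documents)

# operate() accepts only OPERATOR modules; by default it runs NEDetector and
# returns a list of (operator name, {document id: result}) pairs.
results = vulture.operate(documents)
```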
TELF/version.py (2 changes: 1 addition & 1 deletion)

@@ -1 +1 @@
-__version__ = '0.0.8'
+__version__ = '0.0.9'
docs/Beaver.html (8 changes: 4 additions & 4 deletions)

@@ -8,7 +8,7 @@
 <meta charset="utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

-<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.8 documentation</title>
+<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.9 documentation</title>

@@ -28,7 +28,7 @@
 <link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
 <link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />

-<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=a746c00c" />
+<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=362ab14a" />
 <link rel="stylesheet" href="_static/styles/sphinx-book-theme.css?digest=14f4ca6b54d191a8c7657f6c759bf11a5fb86285" type="text/css" />
 <link rel="stylesheet" type="text/css" href="_static/graphviz.css?v=eafc0fe6" />

@@ -37,7 +37,7 @@
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
 <script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

-<script src="_static/documentation_options.js?v=820a49c8"></script>
+<script src="_static/documentation_options.js?v=39f6cbd5"></script>
 <script src="_static/doctools.js?v=888ff710"></script>
 <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 <script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>

@@ -127,7 +127,7 @@
-<p class="title logo__title">TELF 0.0.8 documentation</p>
+<p class="title logo__title">TELF 0.0.9 documentation</p>
… (remaining changed files not shown)
