Skip to content

Commit

Permalink
Merge pull request #176 from lanl/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
MaksimEkin authored Jul 24, 2024
2 parents 309eb02 + 979a22a commit 581cceb
Show file tree
Hide file tree
Showing 93 changed files with 380 additions and 227 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ authors:
- family-names: Alexandrov
given-names: Boian
title: "Tensor Extraction of Latent Features (T-ELF)"
version: 0.0.19
version: 0.0.20
url: https://github.com/lanl/T-ELF
doi: 10.5281/zenodo.10257897
date-released: 2023-12-04
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ If you use T-ELF please cite.

**APA:**
```latex
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.19) [Computer software]. https://doi.org/10.5281/zenodo.10257897
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.20) [Computer software]. https://doi.org/10.5281/zenodo.10257897
```

**BibTeX:**
Expand Down
6 changes: 4 additions & 2 deletions TELF/factorization/HNMFk.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,8 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
if len(cluster_c_indices) == 0:
continue

extracted_indicies = [current_node.original_indices[i] for i in cluster_c_indices]

# save current results
next_name = str(uuid.uuid1())
current_node.child_node_names.append(next_name)
Expand All @@ -544,8 +546,8 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
next_job = {
"parent_node_name":node_name,
"node_name":next_name,
"Ks":self._get_curr_Ks(node_k=current_node.k, num_samples=len(cluster_c_indices)),
"original_indices":cluster_c_indices.copy(),
"Ks":self._get_curr_Ks(node_k=current_node.k, num_samples=len(extracted_indicies)),
"original_indices":extracted_indicies.copy(),
"depth":current_node.depth+1,
"parent_topic":c,
}
Expand Down
161 changes: 161 additions & 0 deletions TELF/pre_processing/Vulture/tokens_analysis/stem_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
from nltk.stem import PorterStemmer
import re
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from TELF.pre_processing.Vulture.tokens_analysis.levenstein import compare_keys

"""
SAMPLE USAGE
----------
stem_processor = StemProcessor(vocabulary)
subs_stemed, new_vocabulary = stem_processor()
"""
class StemProcessor:
    """
    Consolidates vocabulary words that share a stem into a single variant.

    Words are first grouped by their Porter stem, then stems that look alike
    once a common English suffix is removed are merged. Calling the instance
    returns a substitution map (variant -> representative word) together
    with the reduced vocabulary.
    """

    # Common English suffixes, ordered longest to shortest so that
    # ``strip_suffixes`` always tries the longest candidate first.
    SUFFIXES = ['acity', 'ation', 'ative', 'cracy', 'craft', 'esque', 'able',
                'ance', 'ancy', 'cide', 'ence', 'ency', 'hood', 'ible', 'less',
                'like', 'ment', 'ness', 'ship', 'sion', 'ster', 'tion', 'ward',
                'ware', 'wise', 'acy', 'ant', 'ary', 'ate', 'dom', 'ent', 'ern',
                'ese', 'ess', 'est', 'ful', 'ian', 'ice', 'ify', 'ing', 'ion',
                'ish', 'ism', 'ist', 'ity', 'ive', 'ize', 'ory', 'ous', 'ac',
                'al', 'ar', 'ed', 'ee', 'en', 'er', 'fy', 'ic', 'ly', 'or', 'ty',
                'y']

    def __init__(self, vocabulary, suffixes=None):
        """
        Store values for processing in functions

        Parameters
        ----------
        vocabulary : list
            words from the corpus
        suffixes : list, optional
            suffixes to strip before comparing stems. They are re-ordered
            longest first. When omitted (or empty), ``StemProcessor.SUFFIXES``
            is used.
        """
        if suffixes:
            self.suffixes = sorted(suffixes, key=len, reverse=True)
        else:
            self.suffixes = StemProcessor.SUFFIXES
        self.vocabulary = vocabulary

    def strip_suffixes(self, word):
        """
        Removes the longest matching suffix from a word.

        Only one suffix is removed: the suffixes are tried longest to
        shortest and the first match wins.

        Parameters
        ----------
        word : str
            word (or stem) to strip

        Returns
        -------
        word : str
            word without its longest matching suffix, or the word unchanged
            when no suffix matches
        """
        for suffix in self.suffixes:
            if word.endswith(suffix):
                return word[:-len(suffix)]
        return word

    @staticmethod
    def _merge_similar_stems(vocab_stems, similar_pairs):
        """
        Merges the longer stem of every similar pair into the shorter one.

        Parameters
        ----------
        vocab_stems : dict (str:dict)
            stem -> {'src': list of words, 'dest': representative word}.
            Modified in place.
        similar_pairs : iterable of (str, str)
            pairs of stems judged similar, processed in order

        Returns
        -------
        vocab_stems : dict (str:dict)
            the same dictionary, with merged stems removed
        """
        # Maps every stem that has been merged away to the stem it was merged into.
        merged_into = {}
        for stem_i, stem_j in similar_pairs:
            shortest_stem = min(stem_i, stem_j, key=len)
            longest_stem = stem_j if shortest_stem == stem_i else stem_i

            if longest_stem not in vocab_stems:
                # Already absorbed by an earlier pair; nothing left to move.
                continue

            # The shorter stem may itself have been merged away earlier.
            # Follow the chain to the entry that still exists instead of
            # indexing a key that is gone (which used to raise KeyError and
            # drop the popped words).
            destination = shortest_stem
            while destination in merged_into:
                destination = merged_into[destination]

            if destination == longest_stem:
                # Both stems already belong to the same surviving entry.
                continue

            # NOTE(review): 'dest' of the surviving entry is not recomputed
            # after the merge, so it stays the shortest word of the original
            # group only -- confirm this is intended.
            vocab_stems[destination]['src'].extend(vocab_stems.pop(longest_stem)['src'])
            merged_into[longest_stem] = destination

        return vocab_stems

    def unify_common_stems(self, vocab_stems, similarity_threshold=0.9, min_word_length=5, n_jobs=None):
        """
        Finds stems that are the same without endings and merges them.

        Parameters
        ----------
        vocab_stems : dict (str:dict)
            stem -> {'src': list of words, 'dest': representative word},
            as produced by ``build_stem_map``. Modified in place.
        similarity_threshold : float
            similarity cutoff passed to ``compare_keys``
        min_word_length : int
            only stems strictly longer than this are compared
        n_jobs : int
            number of worker threads (``None`` lets the executor decide)

        Returns
        -------
        vocab_stems : dict (str:dict)
            the same dictionary with similar stems merged into the shorter one
        """
        # Local import: the module does not import itertools at file level.
        from itertools import combinations

        def compare_stems(stem_pair):
            stem_i, stem_j = stem_pair
            if len(stem_i) > min_word_length and len(stem_j) > min_word_length:
                compare_i = self.strip_suffixes(stem_i)
                compare_j = self.strip_suffixes(stem_j)
                similar, _ = compare_keys(compare_i, compare_j, threshold=similarity_threshold)
                if similar:
                    return (stem_i, stem_j)
            return None

        stems = list(vocab_stems)
        # Only stems sharing a first character are candidates. Slicing with
        # [:1] instead of indexing [0] keeps an empty stem from raising.
        stem_pairs = [
            (stem_i, stem_j)
            for stem_i, stem_j in combinations(stems, 2)
            if stem_i[:1] == stem_j[:1]
        ]

        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            results = list(tqdm(executor.map(compare_stems, stem_pairs), total=len(stem_pairs)))

        similar = [result for result in results if result is not None]
        return self._merge_similar_stems(vocab_stems, similar)

    def build_stem_map(self):
        """
        Stems vocabulary map, ununified

        Returns
        -------
        vocab_stems : dict (str:dict)
            Porter stem -> {'src': words with that stem, in vocabulary order,
            'dest': the shortest of those words (first one on ties)}
        """
        ps = PorterStemmer()
        vocab_stems = {}
        for word in self.vocabulary:
            stem = ps.stem(word)
            entry = vocab_stems.get(stem)
            if entry is None:
                vocab_stems[stem] = {'src': [word], 'dest': word}
            else:
                entry['src'].append(word)
                # Strict comparison keeps the earliest word on ties, matching
                # what min(src, key=len) would return.
                if len(word) < len(entry['dest']):
                    entry['dest'] = word

        return vocab_stems

    def __call__(self):
        """
        Stems vocabulary, constructs map of all variants to their representative word.

        Returns
        -------
        subs_stemed : dict (str:str)
            maps each variant to the representative word of its group;
            representatives themselves are not included
        shortened_vocabulary : list
            new vocabulary post-consolidation (order is not guaranteed)
        """
        subs_stemed = {}
        vocab_stems = self.build_stem_map()
        vocab_stems = self.unify_common_stems(vocab_stems)
        shortened_vocabulary = set()

        for info in vocab_stems.values():
            destination_word = info['dest']
            shortened_vocabulary.add(destination_word)
            for src in info['src']:
                if src != destination_word:
                    subs_stemed[src] = destination_word

        return subs_stemed, list(shortened_vocabulary)
2 changes: 1 addition & 1 deletion TELF/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.19'
__version__ = '0.0.20'
6 changes: 3 additions & 3 deletions docs/Beaver.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.19 documentation</title>
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/Cheetah.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.19 documentation</title>
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/HNMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/NMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/RESCALk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/SymNMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.SymNMFk: Symmetric Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
<title>TELF.factorization.SymNMFk: Symmetric Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/TELF.factorization.decompositions.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.decompositions package &#8212; TELF 0.0.19 documentation</title>
<title>TELF.factorization.decompositions package &#8212; TELF 0.0.20 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=f00aad14"></script>
<script src="_static/documentation_options.js?v=30839ccb"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -128,7 +128,7 @@



<p class="title logo__title">TELF 0.0.19 documentation</p>
<p class="title logo__title">TELF 0.0.20 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
Loading

0 comments on commit 581cceb

Please sign in to comment.