Skip to content

Commit

Permalink
add embedding vectors support
Browse files Browse the repository at this point in the history
  • Loading branch information
goru001 committed May 17, 2019
1 parent b558753 commit 57ff183
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 1 deletion.
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,34 @@ from inltk.inltk import tokenize
tokenize(text ,'<code-of-language>') // where text is string in <code-of-language>
```
**Get Embedding Vectors**
This returns a list of embedding vectors, containing one 400-dimensional representation for
every token in the text.
```
from inltk.inltk import get_embedding_vectors

vectors = get_embedding_vectors(text, '<code-of-language>') // where text is string in <code-of-language>

Example:

>> vectors = get_embedding_vectors('भारत', 'hi')
>> vectors[0].shape
(400,)

>> get_embedding_vectors('ਜਿਹਨਾਂ ਤੋਂ ਧਾਤਵੀ ਅਲੌਹ ਦਾ ਆਰਥਕ','pa')
[array([-0.894777, -0.140635, -0.030086, -0.669998, ..., 0.859898, 1.940608, 0.09252 , 1.043363], dtype=float32), array([ 0.290839, 1.459981, -0.582347, 0.27822 , ..., -0.736542, -0.259388, 0.086048, 0.736173], dtype=float32), array([ 0.069481, -0.069362, 0.17558 , -0.349333, ..., 0.390819, 0.117293, -0.194081, 2.492722], dtype=float32), array([-0.37837 , -0.549682, -0.497131, 0.161678, ..., 0.048844, -1.090546, 0.154555, 0.925028], dtype=float32), array([ 0.219287, 0.759776, 0.695487, 1.097593, ..., 0.016115, -0.81602 , 0.333799, 1.162199], dtype=float32), array([-0.31529 , -0.281649, -0.207479, 0.177357, ..., 0.729619, -0.161499, -0.270225, 2.083801], dtype=float32), array([-0.501414, 1.337661, -0.405563, 0.733806, ..., -0.182045, -1.413752, 0.163339, 0.907111], dtype=float32), array([ 0.185258, -0.429729, 0.060273, 0.232177, ..., -0.537831, -0.51664 , -0.249798, 1.872428], dtype=float32)]
>> vectors = get_embedding_vectors('ਜਿਹਨਾਂ ਤੋਂ ਧਾਤਵੀ ਅਲੌਹ ਦਾ ਆਰਥਕ','pa')
>> len(vectors)
8

```
To get a feel for the embeddings, check out
[this visualization of subset of Hindi Embedding vectors](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config.json)
**Predict Next 'n' words**
```bash
Expand Down
17 changes: 17 additions & 0 deletions inltk/inltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,20 @@ def reset_language_identifying_models():
path = Path(__file__).parent
shutil.rmtree(path / 'models' / 'all')
return


def get_embedding_vectors(input: str, language_code: str):
    """Return the embedding vector of every token in `input`.

    The text is converted to token ids with `LanguageTokenizer.numericalize`,
    and each id is used as a row index into the `'encoder.weight'` matrix of
    the language model stored under `models/<language_code>` next to this
    module.

    Args:
        input: Text to embed.
        language_code: Code of the language; validated with
            `check_input_language` before anything else runs.

    Returns:
        A list holding one row of the embedding matrix per token id, in
        token order.
    """
    check_input_language(language_code)
    ids = LanguageTokenizer(language_code).numericalize(input)

    # The learner is loaded on the CPU.
    defaults.device = torch.device('cpu')
    model_dir = Path(__file__).parent / 'models' / f'{language_code}'
    learner = load_learner(model_dir)

    # The first element of get_model(...) is treated as the encoder; its
    # state dict exposes the embedding matrix under 'encoder.weight'.
    weights = get_model(learner.model)[0].state_dict()['encoder.weight']
    weight_matrix = np.array(weights)

    return [weight_matrix[token_id] for token_id in ids]
3 changes: 3 additions & 0 deletions inltk/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def __init__(self, lang: str):
def tokenizer(self, t: str) -> List[str]:
    """Split `t` into pieces by delegating to `self.sp.EncodeAsPieces`."""
    pieces = self.sp.EncodeAsPieces(t)
    return pieces

def numericalize(self, t: str) -> List[int]:
    """Convert `t` into token ids by delegating to `self.sp.EncodeAsIds`."""
    token_ids = self.sp.EncodeAsIds(t)
    return token_ids

def remove_foreign_tokens(self, t: str):
local_pieces = []
for i in self.sp.EncodeAsIds(t):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="inltk",
version="0.3.0",
version="0.4.0",
author="Gaurav",
author_email="[email protected]",
description="Natural Language Toolkit for Indian Languages (iNLTK)",
Expand Down

0 comments on commit 57ff183

Please sign in to comment.