Skip to content

Commit

Permalink
feat(*): handle UNK created by contextual vectors inside as.vector he…
Browse files Browse the repository at this point in the history
…lper
  • Loading branch information
sanjayaksaxena committed Mar 21, 2024
1 parent 3f76c8f commit 7ffb5b6
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/as.js
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ as.vector = function ( tokens, rdd ) {
const size = rdd.wordVectors.dimensions;
const precision = rdd.wordVectors.precision;
const vectors = rdd.wordVectors.vectors;
const l2NormIndex = rdd.wordVectors.l2NormIndex;

// Set up a new initialized vector of `size`
const v = new Array( size );
Expand All @@ -203,8 +204,11 @@ as.vector = function ( tokens, rdd ) {
for ( let i = 0; i < tokens.length; i += 1 ) {
// Extract token vector for the current token.
const tv = vectors[ tokens[ i ].toLowerCase() ];
// Increment `numOfTokens` if the above operation was successful.
if ( tv !== undefined ) numOfTokens += 1;
// Increment `numOfTokens` if the above operation was successful
// AND l2Norm is non-zero, because for UNK vectors it is set to 0.
// The latter applies to contextual vectors, where in the event
// of a UNK, an all-zero vector is set for the UNK word.
if ( tv !== undefined && tv[ l2NormIndex ] !== 0 ) numOfTokens += 1;
for ( let j = 0; j < size; j += 1 ) {
// Keep summing, eventually it will be divided by `numOfTokens` to obtain the average.
v[ j ] += ( tv === undefined ) ? 0 : tv[ j ];
Expand Down
2 changes: 2 additions & 0 deletions src/doc-v2.js
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,8 @@ var doc = function ( docData, addons ) {
.out( its.lemma )
.map( ( t ) => t.toLowerCase() );

// NOTE: For UNK words an all-zero vector is set up, with `l2Norm = 0`, which may be used in the `as.vector` helper
// to detect a UNK word.
for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = ( awvs[ docTokens[ i ] ] || cv.unkVector ).slice( 0 );
for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = ( awvs[ docTokensLemma[ i ] ] || cv.unkVector ).slice( 0 );
for ( let i = 0; i < specificWordVectors.length; i += 1 ) {
Expand Down

0 comments on commit 7ffb5b6

Please sign in to comment.