From 7ffb5b614012c1fe1f0aab817a365e86f8bedad8 Mon Sep 17 00:00:00 2001 From: Sanjaya Kumar Saxena Date: Thu, 21 Mar 2024 20:18:16 +0530 Subject: [PATCH] feat(*): handle UNK created by contextual vectors inside as.vector helper --- src/as.js | 8 ++++++-- src/doc-v2.js | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/as.js b/src/as.js index 89e0af6..932fcf0 100644 --- a/src/as.js +++ b/src/as.js @@ -193,6 +193,7 @@ as.vector = function ( tokens, rdd ) { const size = rdd.wordVectors.dimensions; const precision = rdd.wordVectors.precision; const vectors = rdd.wordVectors.vectors; + const l2NormIndex = rdd.wordVectors.l2NormIndex; // Set up a new initialized vector of `size` const v = new Array( size ); @@ -203,8 +204,11 @@ as.vector = function ( tokens, rdd ) { for ( let i = 0; i < tokens.length; i += 1 ) { // Extract token vector for the current token. const tv = vectors[ tokens[ i ].toLowerCase() ]; - // Increment `numOfTokens` if the above operation was successful. - if ( tv !== undefined ) numOfTokens += 1; + // Increment `numOfTokens` if the above operation was successful + // AND l2Norm is non-zero, because for UNK vectors it is set to 0. + // The latter is applicable for the contextual vectors, where in the event + // of UNK, an all-zero vector is set for the UNK word. + if ( tv !== undefined && tv[ l2NormIndex ] !== 0 ) numOfTokens += 1; for ( let j = 0; j < size; j += 1 ) { // Keep summing, eventually it will be divided by `numOfTokens` to obtain avareage. v[ j ] += ( tv === undefined ) ? 0 : tv[ j ]; diff --git a/src/doc-v2.js b/src/doc-v2.js index c8c2a1f..5e3d2be 100644 --- a/src/doc-v2.js +++ b/src/doc-v2.js @@ -487,6 +487,8 @@ var doc = function ( docData, addons ) { .out( its.lemma ) .map( ( t ) => t.toLowerCase() ); + // NOTE: For UNK words an all-zero vector is set up, with `l2Norm = 0`, which may be used in as.vector helper + // to detect an UNK word. 
for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = ( awvs[ docTokens[ i ] ] || cv.unkVector ).slice( 0 ); for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = ( awvs[ docTokensLemma[ i ] ] || cv.unkVector ).slice( 0 ); for ( let i = 0; i < specificWordVectors.length; i += 1 ) {