From 59151d5631669bfe32e6dcfa6abd111a1a6ca1ff Mon Sep 17 00:00:00 2001 From: pr0m1th3as Date: Mon, 18 Mar 2024 06:36:39 +0200 Subject: [PATCH] editDistance: add 'uniquetol' like functionality for single input of words (cellstr) and documents (cell array of cellstr) when distance comparison is requested, improved memory performance in the core edit distance computation functions, added 'OutputAllIndices' option for uniquetol like functionality --- src/editDistance.cc | 550 +++++++++++++++++++++++++++++++------------- 1 file changed, 391 insertions(+), 159 deletions(-) diff --git a/src/editDistance.cc b/src/editDistance.cc index fc75afd4..d49de20b 100644 --- a/src/editDistance.cc +++ b/src/editDistance.cc @@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public License along with this program; if not, see . */ +#include #include #include #include @@ -27,6 +28,13 @@ this program; if not, see . #include using namespace std; + +struct UniqueVecOut +{ + ColumnVector IA; + ColumnVector IC; + Cell IA_c; +}; // Function for computing the minimum of three integer values int minimum (int a, int b, int c) @@ -37,114 +45,180 @@ int minimum (int a, int b, int c) return min; } - -// Function for computing the Levenshtein distance -int LevensDist (const string& s1, const string s2) +// Function for computing the Levenshtein distance between two strings +int LevensDistStr (const string& s1, const string& s2) { - const int rows = s1.length() + 1; - const int cols = s2.length() + 1; - // Dynamically allocate distance matrix - int **dist; - dist = new int*[rows]; - for (int i = 0; i curr(cols+1, 0); + int prev; + // Prepopulate 1st column + for (int j = 0; j <= cols; j++) { - dist[i][0] = i; - } - for (int j = 0; j < cols; j++) - { - dist[0][j] = j; + curr[j] = j; } // Compute all other elements in distance matrix - for (int i = 1; i < rows; i++) + for (int i = 1; i <= rows; i++) { - for (int j = 1; j < cols; j++) + prev = curr[0]; + curr[0] = i; + for (int j = 1; j <= cols; j++) { + int temp = curr[j]; if (s1[i - 1] == s2[j - 1]) { - dist[i][j] = dist[i - 1][j - 1]; + curr[j] = prev; } else { - dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1, - dist[i][j - 1] + 1); + curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]); } + prev = temp; } } - int d = dist[rows - 1][cols - 1]; - // Free memory - for(int i = 0; i < rows; i++) - { - delete dist[i]; - } - delete dist; - return d; + return curr[cols]; } -// Function for comparing the Levenshtein distance against a minimum distance -bool minLevensDist (const string& s1, const string& s2, int const minDist) +// Function for computing the Levenshtein distance between two documents +int LevensDistDoc (const Cell& d1, const Cell& d2) { - int rows = s1.length() + 1; - int cols = s2.length() + 1; - // Dynamically allocate distance matrix - int **dist; - dist = new int*[rows]; - for (int i = 0; i curr(cols+1, 0); + int prev; + // Prepopulate 1st column + for (int j = 0; j <= cols; j++) { - dist[0][j] = j; + curr[j] = j; } - // Compute all other elements in distance matrix until minDist is exceeded - for (int i = 1; i < rows; i++) + // Compute all other elements in distance matrix + for (int i = 1; i <= rows; i++) { - for (int j = 1; j < cols; j++) + prev = curr[0]; + curr[0] = i; + for (int j = 1; j <= cols; j++) { - if (s1[i - 1] == s2[j - 1]) + int temp = curr[j]; + if (d1(i - 1).string_value() == d2(i - 1).string_value()) { - dist[i][j] = dist[i - 1][j - 1]; + curr[j] = prev; } else { - dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1, - dist[i][j - 1] + 1); - int d = dist[rows - 1][cols - 1]; - if (d > minDist) - { - // Free memory - for(int i = 0; i < rows; i++) - { - delete dist[i]; - } - delete dist; - return false; - } - } + curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]); + } + prev = temp; } } - // Free memory - for(int i = 0; i < rows; i++) + return curr[cols]; +} + +// Transform a distance triu matric to a boolean triu matrix +boolMatrix double2bool (const Matrix& D, const int& minDist) +{ + const int sz = D.rows(); + boolMatrix Bmat(sz, sz); + for (octave_idx_type i = 0; i < sz - 1; i++) + { + Bmat(i,i) = true; + for (octave_idx_type j = i + 1; j < sz; j++) + { + if (D(i,j) <= minDist) + { + Bmat(i,j) = true; + Bmat(j,i) = false; + } + else + { + Bmat(i,j) = false; + Bmat(j,i) = false; + } + } + } + Bmat(sz - 1,sz - 1) = true; + return Bmat; +} + +// Transform a distance triu matric to a distance vector +Matrix triu2Dvec (const Matrix& D) +{ + const int szA = D.rows(); + const int sz = szA * (szA - 1) / 2; + octave_idx_type idx = 0; + Matrix Dvec(sz, 1); + for (octave_idx_type i = 0; i < szA - 1; i++) + { + for (octave_idx_type j = i + 1; j < szA; j++) + { + Dvec(idx++,0) = D(i,j); + } + } + return Dvec; +} + +// Compute unique indexing IA cell of vectors +vector> IAcellvec (const boolMatrix& B) +{ + int rows = B.rows(); + int cols = B.columns(); + vector> IAcell; + for (int i = 0; i < rows; i++) + { + vector IA_cIdx; + IA_cIdx.push_back(i); + for (int j = i + 1; j < cols; j++) + { + if (B(i,j)) + { + IA_cIdx.push_back(j); + } + } + IAcell.push_back(IA_cIdx); + } + return IAcell; +} + +// Compute unique indexing IA vector +vector IAvector (const vector>& IAcell) +{ + vector IA; + vector IA_done; + for (int i = 0; i < IAcell.size(); i++) { - delete dist[i]; + for (int j = 0; j < IAcell[i].size(); j++) + { + if (binary_search(IA_done.begin(), IA_done.end(), IAcell[i][j])) + { + break; + } + else + { + if (j == 0) + { + IA.push_back(IAcell[i][j]); + } + else + { + IA_done.push_back(IAcell[i][j]); + } + } + } + sort (IA_done.begin(), IA_done.end()); } - delete dist; - return true; + return IA; } DEFUN_DLD(editDistance, args, nargout, "-*- texinfo -*-\n\ @deftypefn {statistics} {@var{d} =} editDistance (@var{str1})\n\ + @deftypefnx {statistics} {@var{d} =} editDistance (@var{doc1})\n\ + @deftypefnx {statistics} {@var{C} =} editDistance (@dots{}, @var{minDist})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{true})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{false})\n\ @deftypefnx {statistics} {@var{d} =} editDistance (@var{str1}, @var{str2})\n\ @deftypefnx {statistics} {@var{d} =} editDistance (@dots{}, @var{minDist})\n\ \n\ @@ -172,6 +246,28 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ @end deftypefn") { int nargin = args.length(); + // Add default options + bool OutputAllIndices = false; + // Parse Name-Value paired arguments + if (nargin > 2 && args(nargin-2).is_string()) + { + string ParamName = "OutputAllIndices"; + if (args(nargin - 2).string_value() == ParamName) + { + const int idx = nargin - 1; + if (args(idx).islogical() && args(idx).numel() == 1) + { + const boolMatrix tmp = args(idx).bool_matrix_value(); + OutputAllIndices = tmp(0); + } + else + { + error ("editDistance: value for OutputAllIndices must be a logical scalar."); + } + nargin--; + nargin--; + } + } // Check for invalid number of input arguments if (nargin > 3) { @@ -183,11 +279,11 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ if (nargin > 1 && args(nargin-1).isnumeric()) { // Check minDist input argument - if (args(nargin-1).numel() != 1) + if (args(nargin -1 ).numel() != 1) { error ("editDistance: minDist must be a scalar value."); } - Matrix tmp = args(nargin-1).matrix_value(); + Matrix tmp = args(nargin - 1).matrix_value(); if (tmp(0,0) < 0 || floor (tmp(0,0)) != tmp(0,0)) { error ("editDistance: minDist must be a nonnegative integer."); @@ -201,7 +297,7 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ doMinDist = false; } // Check cases of string arguments - octave_value_list retval; + octave_value_list retval (nargout); if (nargin == 1) { if (args(0).iscellstr()) @@ -215,36 +311,160 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ retval(0) = double (0); return retval; } - // Compute the distance vector - int sz = szA * (szA - 1) / 2; - octave_idx_type idx = 0; - if (doMinDist) + // Compute the edit distance + Matrix D(szA, szA); + #pragma omp parallel { - boolMatrix D(sz, 1); #pragma omp parallel for for (octave_idx_type i = 0; i < szA - 1; i++) { + D(i,i) = 0; string s1 = strA(i).string_value(); for (octave_idx_type j = i + 1; j < szA; j++) { - D(idx++,0) = minLevensDist (s1, strA(j).string_value(), minDist); + D(i,j) = LevensDistStr (s1, strA(j).string_value()); + } + } + D(szA - 1,szA - 1) = 0; + } + // If minDist is given, revert functionality from 'pdist' to 'uniquetol' + // and return a vector indexing similar strings + if (doMinDist) + { + boolMatrix B = double2bool (D, minDist); + vector> IAc; + vector IAv; + IAc = IAcellvec (B); + IAv = IAvector (IAc); + // Build cellstr with unique elements + Cell C(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + C(i,0) = strA(IAv[i]).string_value(); + } + retval(0) = C; + // Build IA vector output + if (nargout > 1) + { + if (OutputAllIndices) + { + Cell IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + int idx = IAv[i]; + Matrix IAidx(IAc[idx].size(), 1); + for (octave_idx_type j = 0; j < IAc[idx].size(); j++) + { + IAidx(j,0) = IAc[idx][j] + 1; + } + IA(i,0) = IAidx; + } + retval(1) = IA; + } + else + { + Matrix IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + IA(i,0) = IAv[i] + 1; + } + retval(1) = IA; } } - retval(0) = D; } else { - Matrix D(sz, 1); + // Transform to distance vector + retval(0) = triu2Dvec (D); + } + return retval; + } + else if (args(0).iscell()) + { + // Get cellstr input argument + const Cell docA = args(0).cell_value(); + int szA = docA.numel(); + // Check that all cell elements contain cellstr arrays + for (octave_idx_type i = 0; i < szA - 1; i++) + { + Cell tmp = docA.elem(i); + if (! tmp.iscellstr()) + { + error ("editDistance: tokenizedDocument " + "must contain cellstr arrays."); + } + } + // For scalar input return distance to itself, i.e. 0 + if (szA == 1) + { + retval(0) = double (0); + return retval; + } + // Compute the edit distance + Matrix D(szA, szA); + #pragma omp parallel + { #pragma omp parallel for for (octave_idx_type i = 0; i < szA - 1; i++) { - string s1 = strA(i).string_value(); + D(i,i) = 0; + Cell d1 = docA.elem(i); for (octave_idx_type j = i + 1; j < szA; j++) { - D(idx++, 0) = LevensDist (s1, strA(j).string_value()); + D(i,j) = LevensDistDoc (d1, docA.elem(j)); + } + } + D(szA - 1,szA - 1) = 0; + } + // If minDist is given, revert functionality from 'pdist' to 'uniquetol' + // and return a vector indexing similar documents + if (doMinDist) + { + boolMatrix B = double2bool (D, minDist); + vector> IAc; + vector IAv; + IAc = IAcellvec (B); + IAv = IAvector (IAc); + // Build cellstr with unique elements + Cell C(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + C(i,0) = docA(IAv[i]).cell_value(); + } + retval(0) = C; + // Build IA vector output + if (nargout > 1) + { + if (OutputAllIndices) + { + Cell IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + int idx = IAv[i]; + Matrix IAidx(IAc[idx].size(), 1); + for (octave_idx_type j = 0; j < IAc[idx].size(); j++) + { + IAidx(j,0) = IAc[idx][j] + 1; + } + IA(i,0) = IAidx; + } + retval(1) = IA; + } + else + { + Matrix IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + IA(i,0) = IAv[i] + 1; + } + retval(1) = IA; } } - retval(0) = D; + } + else + { + // Transform to distance vector + retval(0) = triu2Dvec (D); } return retval; } @@ -267,88 +487,79 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ { error ("editDistance: cellstr input arguments size mismatch."); } + // Preallocate the distance vector + int szV = szA; + if (szA == 1) + { + szV = szB; + } + Matrix D(szV, 1); // Compute the distance vector if (szA == 1 && szB != 1) { - if (doMinDist) - { - boolMatrix D (szB, 1); - for (octave_idx_type i = 0; i < szB; i++) - { - D(i,0) = minLevensDist (strA(0).string_value(), - strB(i).string_value(), minDist); - } - retval(0) = D; - } - else + for (octave_idx_type i = 0; i < szB; i++) { - Matrix D (szB, 1); - for (octave_idx_type i = 0; i < szB; i++) - { - D(i,0) = LevensDist (strA(0).string_value(), - strB(i).string_value()); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(0).string_value(), + strB(i).string_value()); } + retval(0) = D; } else if (szA != 1 && szB == 1) { - if (doMinDist) + for (octave_idx_type i = 0; i < szA; i++) { - boolMatrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) - { - D(i,0) = minLevensDist (strA(i).string_value(), - strB(0).string_value(), minDist); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(i).string_value(), + strB(0).string_value()); } - else + retval(0) = D; + } + else + { + for (octave_idx_type i = 0; i < szA; i++) { - Matrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) - { - D(i,0) = LevensDist (strA(i).string_value(), - strB(0).string_value()); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(i).string_value(), + strB(i).string_value()); } + retval(0) = D; } - else + if (doMinDist) { - if (doMinDist) + boolMatrix Bvec(szV, 1); + for (octave_idx_type i = 0; i < szV; i++) { - boolMatrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) + if (D(i,0) <= minDist) { - D(i,0) = minLevensDist (strA(i).string_value(), - strB(i).string_value(), minDist); + Bvec(i,0) = true; } - retval(0) = D; - } - else - { - Matrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) + else { - D(i,0) = LevensDist (strA(i).string_value(), - strB(i).string_value()); + Bvec(i,0) = false; } - retval(0) = D; } + retval(0) = Bvec; + } + else + { + retval(0) = D; } - } else if (args(0).is_string() && args(1).is_string()) { + int d = LevensDistStr (args(0).string_value(), args(1).string_value()); if (doMinDist) { - retval(0) = minLevensDist (args(0).string_value(), - args(1).string_value(), minDist); + if (d <= minDist) + { + retval(0) = true; + } + else + { + retval(0) = false; + } } else { - retval(0) = LevensDist (args(0).string_value(), args(1).string_value()); + retval(0) = d; } } else @@ -379,6 +590,10 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! d = editDistance ("string1", "string2", -2); %!error ... %! d = editDistance ("string1", "string2", 1.25); +%!error ... +%! d = editDistance ({{"string1", "string2"}, 2}); +%!error ... +%! d = editDistance ({{"string1", "string2"}, 2}, 2); %!error ... %! d = editDistance ([1, 2, 3]); %!error ... @@ -402,9 +617,26 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! assert (d, [2; 1; 1]); %! assert (class (d), "double"); %!test -%! d = editDistance ({"AS","SD","AD"}, 1); -%! assert (d, logical ([0; 1; 1])); -%! assert (class (d), "logical"); +%! C = editDistance ({"AS","SD","AD"}, 1); +%! assert (iscellstr (C), true); +%! assert (C, {"AS";"SD"}); +%!test +%! [C, IA] = editDistance ({"AS","SD","AD"}, 1); +%! assert (class (IA), "double"); +%! assert (IA, [1;2]); +%!test +%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"}; +%! [C, IA] = editDistance (A, 2, "OutputAllIndices", false); +%! assert (class (IA), "double"); +%! assert (A(IA), C); +%!test +%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"}; +%! [C, IA] = editDistance (A, 2, "OutputAllIndices", true); +%! assert (class (IA), "cell"); +%! assert (C, {"ASS"; "FDE"; "OPA"}); +%! assert (A(IA{1}), {"ASS"; "SDS"; "EDS"}); +%! assert (A(IA{2}), {"FDE"; "EDS"}); +%! assert (A(IA{3}), {"OPA"}); %!test %! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}); %! assert (d, [0; 1; 2]); @@ -418,23 +650,23 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! assert (d, [0; 2; 1]); %! assert (class (d), "double"); %!test -%! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1); -%! assert (d, logical ([1; 1; 0])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1); +%! assert (b, logical ([1; 1; 0])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ({"AS","SD","AD"}, {"AS"}, 1); -%! assert (d, logical ([1; 0; 1])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS","SD","AD"}, {"AS"}, 1); +%! assert (b, logical ([1; 0; 1])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1); -%! assert (d, logical ([1; 1; 0])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1); +%! assert (b, logical ([1; 1; 0])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ("Octave", "octave"); -%! assert (d, 1); -%! assert (class (d), "double"); +%! b = editDistance ("Octave", "octave"); +%! assert (b, 1); +%! assert (class (b), "double"); %!test -%! d = editDistance ("Octave", "octave", 1); -%! assert (d, true); -%! assert (class (d), "logical"); +%! b = editDistance ("Octave", "octave", 1); +%! assert (b, true); +%! assert (class (b), "logical"); */