File Explorer

/proc/self/root/var/runtime/node_modules/@aws-sdk/node_modules/mnemonist

This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.

passjoin-index.js12.7 KB · 519 lines
/** * Mnemonist PassjoinIndex * ======================== * * The PassjoinIndex is an index leveraging the "passjoin" algorithm as a mean * to index strings for Levenshtein distance queries. It features a complexity * related to the Levenshtein query threshold k rather than the number of * strings to test (roughly O(k^3)). * * [References]: * Jiang, Yu, Dong Deng, Jiannan Wang, Guoliang Li, et Jianhua Feng. * « Efficient Parallel Partition-Based Algorithms for Similarity Search and Join * with Edit Distance Constraints ». In Proceedings of the Joint EDBT/ICDT 2013 * Workshops on - EDBT ’13, 341. Genoa, Italy: ACM Press, 2013. * https://doi.org/10.1145/2457317.2457382. * * Li, Guoliang, Dong Deng, et Jianhua Feng. « A Partition-Based Method for * String Similarity Joins with Edit-Distance Constraints ». ACM Transactions on * Database Systems 38, no 2 (1 juin 2013): 133. * https://doi.org/10.1145/2487259.2487261. * * [Urls]: * http://people.csail.mit.edu/dongdeng/projects/passjoin/index.html */var Iterator = require('obliterator/iterator'),    forEach = require('obliterator/foreach'); // TODO: leveraging BagDistance as an upper bound of Levenshtein// TODO: leverage n-grams recursive indexing// TODO: try the MultiArray as a memory backend// TODO: what about damerau levenshtein /** * Helpers. */ /** * Function returning the number of substrings that will be selected by the * multi-match-aware selection scheme for theshold `k`, for a string of length * `s` to match strings of length `l`. * * @param   {number} k - Levenshtein distance threshold. * @param   {number} s - Length of target strings. * @param   {number} l - Length of strings to match. * @returns {number}   - The number of selected substrings. */function countSubstringsL(k, s, l) {  return (((Math.pow(k, 2) - Math.pow(Math.abs(s - l), 2)) / 2) | 0) + k + 1;} /** * Function returning the minimum number of substrings that will be selected by * the multi-match-aware selection scheme for theshold `k`, for a string of * length `s` to match any string of relevant length. * * @param   {number} k - Levenshtein distance threshold. * @param   {number} s - Length of target strings. * @returns {number}   - The number of selected substrings. */function countKeys(k, s) {  var c = 0;   for (var l = 0, m = s + 1; l < m; l++)    c += countSubstringsL(k, s, l);   return c;} /** * Function used to compare two keys in order to sort them first by decreasing * length and then alphabetically as per the "4.2 Effective Indexing Strategy" * point of the paper. * * @param   {number} k - Levenshtein distance threshold. * @param   {number} s - Length of target strings. * @returns {number}   - The number of selected substrings. */function comparator(a, b) {  if (a.length > b.length)    return -1;  if (a.length < b.length)    return 1;   if (a < b)    return -1;  if (a > b)    return 1;   return 0;} /** * Function partitioning a string into k + 1 uneven segments, the shorter * ones, then the longer ones. * * @param   {number} k - Levenshtein distance threshold. * @param   {number} l - Length of the string. * @returns {Array}    - The partition tuples (start, length). */function partition(k, l) {  var m = k + 1,      a = (l / m) | 0,      b = a + 1,      i,      j;   var largeSegments = l - a * m,      smallSegments = m - largeSegments;   var tuples = new Array(k + 1);   for (i = 0; i < smallSegments; i++)    tuples[i] = [i * a, a];   var offset = (i - 1) * a + a;   for (j = 0; j < largeSegments; j++)    tuples[i + j] = [offset + j * b, b];   return tuples;} /** * Function yielding a string's k + 1 passjoin segments to index. * * @param   {number} k      - Levenshtein distance threshold. * @param   {string} string - Target string. * @returns {Array}         - The string's segments. */function segments(k, string) {  var l = string.length,      m = k + 1,      a = (l / m) | 0,      b = a + 1,      o,      i,      j;   var largeSegments = l - a * m,      smallSegments = m - largeSegments;   var S = new Array(k + 1);   for (i = 0; i < smallSegments; i++) {    o = i * a;    S[i] = string.slice(o, o + a);  }   var offset = (i - 1) * a + a;   for (j = 0; j < largeSegments; j++) {    o = offset + j * b;    S[i + j] = string.slice(o, o + b);  }   return S;} // TODO: jsdocsfunction segmentPos(k, i, string) {  if (i === 0)    return 0;   var l = string.length;   var m = k + 1,      a = (l / m) | 0,      b = a + 1;   var largeSegments = l - a * m,      smallSegments = m - largeSegments;   if (i <= smallSegments - 1)    return i * a;   var offset = i - smallSegments;   return smallSegments * a + offset * b;} /** * Function returning the interval of relevant substrings to lookup using the * multi-match-aware substring selection scheme described in the paper. * * @param   {number} k      - Levenshtein distance threshold. * @param   {number} delta  - Signed length difference between both considered strings. * @param   {number} i      - k + 1 segment index. * @param   {number} s      - String's length. * @param   {number} pi     - k + 1 segment position in target string. * @param   {number} li     - k + 1 segment length. * @returns {Array}         - The interval (start, stop). */function multiMatchAwareInterval(k, delta, i, s, pi, li) {  var start1 = pi - i,      end1 = pi + i;   var o = k - i;   var start2 = pi + delta - o,      end2 = pi + delta + o;   var end3 = s - li;   return [Math.max(0, start1, start2), Math.min(end1, end2, end3)];} /** * Function yielding relevant substrings to lookup using the multi-match-aware * substring selection scheme described in the paper. * * @param   {number} k      - Levenshtein distance threshold. * @param   {string} string  - Target string. * @param   {number} l      - Length of strings to match. * @param   {number} i      - k + 1 segment index. * @param   {number} pi     - k + 1 segment position in target string. * @param   {number} li     - k + 1 segment length. * @returns {Array}         - The contiguous substrings. */function multiMatchAwareSubstrings(k, string, l, i, pi, li) {  var s = string.length;   // Note that we need to keep the non-absolute delta for this function  // to work in both directions, up & down  var delta = s - l;   var interval = multiMatchAwareInterval(k, delta, i, s, pi, li);   var start = interval[0],      stop = interval[1];   var currentSubstring = '';   var substrings = [];   var substring, j, m;   for (j = start, m = stop + 1; j < m; j++) {    substring = string.slice(j, j + li);     // We skip identical consecutive substrings (to avoid repetition in case    // of contiguous letter duplication)    if (substring === currentSubstring)      continue;     substrings.push(substring);     currentSubstring = substring;  }   return substrings;} /** * PassjoinIndex. * * @note I tried to apply the paper's optimizations regarding Levenshtein * distance computations but it did not provide a performance boost, quite * the contrary. This is because since we are mostly using the index for small k * here, most of the strings we work on are quite small and the bookkeeping * induced by Ukkonen's method and the paper's one are slowing us down more than * they actually help us go faster. * * @note This implementation does not try to ensure that you add the same string * more than once. * * @constructor * @param {function} levenshtein - Levenshtein distance function. * @param {number}   k           - Levenshtein distance threshold. */function PassjoinIndex(levenshtein, k) {  if (typeof levenshtein !== 'function')    throw new Error('mnemonist/passjoin-index: `levenshtein` should be a function returning edit distance between two strings.');   if (typeof k !== 'number' || k < 1)    throw new Error('mnemonist/passjoin-index: `k` should be a number > 0');   this.levenshtein = levenshtein;  this.k = k;  this.clear();} /** * Method used to clear the structure. * * @return {undefined} */PassjoinIndex.prototype.clear = function() {   // Properties  this.size = 0;  this.strings = [];  this.invertedIndices = {};}; /** * Method used to add a new value to the index. * * @param  {string|Array} value - Value to add. * @return {PassjoinIndex} */PassjoinIndex.prototype.add = function(value) {  var l = value.length;   var stringIndex = this.size;   this.strings.push(value);  this.size++;   var S = segments(this.k, value);   var Ll = this.invertedIndices[l];   if (typeof Ll === 'undefined') {    Ll = {};    this.invertedIndices[l] = Ll;  }   var segment,      matches,      key,      i,      m;   for (i = 0, m = S.length; i < m; i++) {    segment = S[i];    key = segment + i;    matches = Ll[key];     if (typeof matches === 'undefined') {      matches = [stringIndex];      Ll[key] = matches;    }    else {      matches.push(stringIndex);    }  }   return this;}; /** * Method used to search for string matching the given query. * * @param  {string|Array} query - Query string. * @return {Array} */PassjoinIndex.prototype.search = function(query) {  var s = query.length,      k = this.k;   var M = new Set();   var candidates,      candidate,      queryPos,      querySegmentLength,      key,      S,      P,      l,      m,      i,      n1,      j,      n2,      y,      n3;   for (l = Math.max(0, s - k), m = s + k + 1; l < m; l++) {    var Ll = this.invertedIndices[l];     if (typeof Ll === 'undefined')      continue;     P = partition(k, l);     for (i = 0, n1 = P.length; i < n1; i++) {      queryPos = P[i][0];      querySegmentLength = P[i][1];       S = multiMatchAwareSubstrings(        k,        query,        l,        i,        queryPos,        querySegmentLength      );       // Empty string edge case      if (!S.length)        S = [''];       for (j = 0, n2 = S.length; j < n2; j++) {        key = S[j] + i;        candidates = Ll[key];         if (typeof candidates === 'undefined')          continue;         for (y = 0, n3 = candidates.length; y < n3; y++) {          candidate = this.strings[candidates[y]];           // NOTE: first condition is here not to compute Levenshtein          // distance for tiny strings           // NOTE: maintaining a Set of rejected candidate is not really useful          // because it consumes more memory and because non-matches are          // less likely to be candidates agains          if (            s <= k && l <= k ||            (              !M.has(candidate) &&              this.levenshtein(query, candidate) <= k            )          )            M.add(candidate);        }      }    }  }   return M;}; /** * Method used to iterate over the index. * * @param  {function}  callback - Function to call for each item. * @param  {object}    scope    - Optional scope. * @return {undefined} */PassjoinIndex.prototype.forEach = function(callback, scope) {  scope = arguments.length > 1 ? scope : this;   for (var i = 0, l = this.strings.length; i < l; i++)    callback.call(scope, this.strings[i], i, this);}; /** * Method used to create an iterator over a index's values. * * @return {Iterator} */PassjoinIndex.prototype.values = function() {  var strings = this.strings,      l = strings.length,      i = 0;   return new Iterator(function() {    if (i >= l)      return {        done: true      };     var value = strings[i];    i++;     return {      value: value,      done: false    };  });}; /** * Attaching the #.values method to Symbol.iterator if possible. */if (typeof Symbol !== 'undefined')  PassjoinIndex.prototype[Symbol.iterator] = PassjoinIndex.prototype.values; /** * Convenience known methods. */PassjoinIndex.prototype.inspect = function() {  var array = this.strings.slice();   // Trick so that node displays the name of the constructor  Object.defineProperty(array, 'constructor', {    value: PassjoinIndex,    enumerable: false  });   return array;}; if (typeof Symbol !== 'undefined')  PassjoinIndex.prototype[Symbol.for('nodejs.util.inspect.custom')] = PassjoinIndex.prototype.inspect; /** * Static @.from function taking an arbitrary iterable & converting it into * a structure. * * @param  {Iterable} iterable - Target iterable. * @return {PassjoinIndex} */PassjoinIndex.from = function(iterable, levenshtein, k) {  var index = new PassjoinIndex(levenshtein, k);   forEach(iterable, function(string) {    index.add(string);  });   return index;}; /** * Exporting. */PassjoinIndex.countKeys = countKeys;PassjoinIndex.comparator = comparator;PassjoinIndex.partition = partition;PassjoinIndex.segments = segments;PassjoinIndex.segmentPos = segmentPos;PassjoinIndex.multiMatchAwareInterval = multiMatchAwareInterval;PassjoinIndex.multiMatchAwareSubstrings = multiMatchAwareSubstrings; module.exports = PassjoinIndex;