File Explorer

/proc/self/root/proc/1/task/1/cwd/node24/lib/node_modules/npm/node_modules/diff/libesm/diff
This explorer reads the filesystem of the server it runs on, so /workspace/user isn't present here. Browsing and the terminal still work against this server's own disk from /.
0 dirs
8 files
word.js · 14.2 KB · 297 lines
import Diff from './base.js';
import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs } from '../util/string.js';
// Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
//
// Chars/ranges counted as "word" characters by this regex are as follows:
//
// + U+00AD  Soft hyphen
// + 00C0–00FF (letters with diacritics from the Latin-1 Supplement), except:
//   - U+00D7  × Multiplication sign
//   - U+00F7  ÷ Division sign
// + Latin Extended-A, 0100–017F
// + Latin Extended-B, 0180–024F
// + IPA Extensions, 0250–02AF
// + Spacing Modifier Letters, 02B0–02FF, except:
//   - U+02C7  ˇ &#711;  Caron
//   - U+02D8  ˘ &#728;  Breve
//   - U+02D9  ˙ &#729;  Dot Above
//   - U+02DA  ˚ &#730;  Ring Above
//   - U+02DB  ˛ &#731;  Ogonek
//   - U+02DC  ˜ &#732;  Small Tilde
//   - U+02DD  ˝ &#733;  Double Acute Accent
// + Latin Extended Additional, 1E00–1EFF
const extendedWordChars = 'a-zA-Z0-9_\\u{AD}\\u{C0}-\\u{D6}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';
// Each token is one of the following:
// - A punctuation mark plus the surrounding whitespace
// - A word plus the surrounding whitespace
// - Pure whitespace (but only in the special case where the entire text
//   is just whitespace)
//
// We have to include surrounding whitespace in the tokens because the two
// alternative approaches produce horribly broken results:
// * If we just discard the whitespace, we can't fully reproduce the original
//   text from the sequence of tokens and any attempt to render the diff will
//   get the whitespace wrong.
// * If we have separate tokens for whitespace, then in a typical text every
//   second token will be a single space character. But this often results in
//   the optimal diff between two texts being a perverse one that preserves
//   the spaces between words but deletes and reinserts actual common words.
//   See https://github.com/kpdecker/jsdiff/issues/160#issuecomment-1866099640
//   for an example.
//
// Keeping the surrounding whitespace of course has implications for .equals
// and .join, not just .tokenize.
// This regex does NOT fully implement the tokenization rules described above.
// Instead, it gives runs of whitespace their own "token". The tokenize method
// then handles stitching whitespace tokens onto adjacent word or punctuation
// tokens.
const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug');
// Slightly different to the tokenizeIncludingWhitespace regex used above in
// that this one treats each individual newline as a distinct token, rather
// than merging them into other surrounding whitespace. This was requested
// in https://github.com/kpdecker/jsdiff/issues/180 &
//    https://github.com/kpdecker/jsdiff/issues/211
//
// Compiled once at module load rather than on every tokenize() call. It is
// only ever used with String.prototype.match, so sharing the global-flagged
// instance between calls is safe.
const tokenizeWithNewlinesSeparate = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug');
/**
 * Reports whether `text` contains at least one whitespace character.
 *
 * @param {string} text
 * @returns {boolean}
 */
function hasWhitespace(text) {
    return (/\s/).test(text);
}
/**
 * Word-level differ whose tokens carry their surrounding whitespace.
 */
class WordDiff extends Diff {
    /**
     * Compares two tokens, ignoring leading and trailing whitespace.
     *
     * @param {string} left
     * @param {string} right
     * @param {object} options - when `options.ignoreCase` is truthy both
     *   tokens are lower-cased before being compared.
     * @returns {boolean}
     */
    equals(left, right, options) {
        if (options.ignoreCase) {
            left = left.toLowerCase();
            right = right.toLowerCase();
        }
        return left.trim() === right.trim();
    }
    /**
     * Splits `value` into tokens, each being a word or punctuation mark
     * together with the whitespace around it (or a single all-whitespace
     * token when the text starts with whitespace).
     *
     * @param {string} value - text to tokenize.
     * @param {object} [options]
     * @param {object} [options.intlSegmenter] - a segmenter with "word"
     *   granularity to use instead of the built-in regex.
     * @returns {string[]}
     * @throws {Error} if the segmenter's granularity is not "word".
     */
    tokenize(value, options = {}) {
        let parts;
        if (options.intlSegmenter) {
            const segmenter = options.intlSegmenter;
            if (segmenter.resolvedOptions().granularity !== 'word') {
                throw new Error('The segmenter passed must have a granularity of "word"');
            }
            // We want `parts` to be an array whose elements alternate between being
            // pure whitespace and being pure non-whitespace. This is ALMOST what the
            // segments returned by a word-based Intl.Segmenter already look like,
            // and therefore we can ALMOST get what we want by simply doing...
            //     parts = Array.from(segmenter.segment(value), segment => segment.segment);
            // ... but not QUITE, because of one annoying special case: every
            // newline character gets its own segment, instead of sharing a segment
            // with other surrounding whitespace. We therefore need to manually merge
            // consecutive segments of whitespace into a single part:
            parts = [];
            for (const { segment } of segmenter.segment(value)) {
                const lastIndex = parts.length - 1;
                if (parts.length && hasWhitespace(parts[lastIndex]) && hasWhitespace(segment)) {
                    parts[lastIndex] += segment;
                }
                else {
                    parts.push(segment);
                }
            }
        }
        else {
            parts = value.match(tokenizeIncludingWhitespace) || [];
        }
        const tokens = [];
        let prevPart = null;
        for (const part of parts) {
            if (hasWhitespace(part)) {
                if (prevPart == null) {
                    // Leading whitespace becomes a token of its own (for now).
                    tokens.push(part);
                }
                else {
                    // Trailing whitespace is glued onto the token before it.
                    tokens[tokens.length - 1] += part;
                }
            }
            else if (prevPart != null && hasWhitespace(prevPart)) {
                if (tokens[tokens.length - 1] === prevPart) {
                    // The previous whitespace is still a standalone token (start of
                    // text), so it is absorbed as this token's leading whitespace.
                    tokens[tokens.length - 1] += part;
                }
                else {
                    // The whitespace already trails the previous token; repeat it as
                    // this token's leading whitespace too.
                    tokens.push(prevPart + part);
                }
            }
            else {
                tokens.push(part);
            }
            prevPart = part;
        }
        return tokens;
    }
    /**
     * Reassembles consecutive tokens into the text they came from.
     *
     * @param {string[]} tokens
     * @returns {string}
     */
    join(tokens) {
        // Tokens being joined here will always have appeared consecutively in the
        // same text, so we can simply strip off the leading whitespace from all the
        // tokens except the first (and except any whitespace-only tokens - but such
        // a token will always be the first and only token anyway) and then join them
        // and the whitespace around words and punctuation will end up correct.
        return tokens
            .map((token, i) => (i === 0 ? token : token.replace((/^\s+/), '')))
            .join('');
    }
    /**
     * Removes whitespace duplicated between neighbouring change objects.
     * The change objects are modified in place.
     *
     * @param {Array<object>} changes - change objects produced by the diff.
     * @param {object} options - when `options.oneChangePerToken` is truthy the
     *   changes are returned untouched.
     * @returns {Array<object>} the same `changes` value that was passed in.
     */
    postProcess(changes, options) {
        if (!changes || options.oneChangePerToken) {
            return changes;
        }
        let lastKeep = null;
        // Change objects representing any insertion or deletion since the last
        // "keep" change object. There can be at most one of each.
        let insertion = null;
        let deletion = null;
        for (const change of changes) {
            if (change.added) {
                insertion = change;
            }
            else if (change.removed) {
                deletion = change;
            }
            else {
                if (insertion || deletion) { // May be false at start of text
                    dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change);
                }
                lastKeep = change;
                insertion = null;
                deletion = null;
            }
        }
        if (insertion || deletion) {
            dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null);
        }
        return changes;
    }
}
export const wordDiff = new WordDiff();
/**
 * Diffs two strings word by word via `wordDiff.diff`.
 *
 * @param {string} oldStr
 * @param {string} newStr
 * @param {object} [options] - passed through to the underlying differ.
 * @returns whatever the underlying `diff` call returns.
 */
export function diffWords(oldStr, newStr, options) {
    // This option has never been documented and never will be (it's clearer to
    // just call `diffWordsWithSpace` directly if you need that behavior), but
    // has existed in jsdiff for a long time, so we retain support for it here
    // for the sake of backwards compatibility.
    if (options != null && options.ignoreWhitespace != null && !options.ignoreWhitespace) {
        return diffWordsWithSpace(oldStr, newStr, options);
    }
    return wordDiff.diff(oldStr, newStr, options);
}
/**
 * Tidies the whitespace of the change objects found between two "keep"
 * change objects, modifying their `value` properties in place.
 *
 * @param {object|null} startKeep - the preceding "keep" change, if any.
 * @param {object|null} deletion - the "delete" change, if any.
 * @param {object|null} insertion - the "insert" change, if any.
 * @param {object|null} endKeep - the following "keep" change, if any.
 */
function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep) {
    // Before returning, we tidy up the leading and trailing whitespace of the
    // change objects to eliminate cases where trailing whitespace in one object
    // is repeated as leading whitespace in the next.
    // Below are examples of the outcomes we want here to explain the code.
    // I=insert, K=keep, D=delete
    // 1. Diffing 'foo bar baz' vs 'foo baz'
    //    Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz'
    //    After cleanup, we want:   K:'foo ' D:'bar ' K:'baz'
    //
    // 2. Diffing 'foo bar baz' vs 'foo qux baz'
    //    Prior to cleanup, we have K:'foo ' D:' bar ' I:' qux ' K:' baz'
    //    After cleanup, we want K:'foo ' D:'bar' I:'qux' K:' baz'
    //
    // 3. Diffing 'foo\nbar baz' vs 'foo baz'
    //    Prior to cleanup, we have K:'foo ' D:'\nbar ' K:' baz'
    //    After cleanup, we want K:'foo' D:'\nbar' K:' baz'
    //
    // 4. Diffing 'foo baz' vs 'foo\nbar baz'
    //    Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz'
    //    After cleanup, we ideally want K:'foo' I:'\nbar' K:' baz'
    //    but don't actually manage this currently (the pre-cleanup change
    //    objects don't contain enough information to make it possible).
    //
    // 5. Diffing 'foo   bar baz' vs 'foo  baz'
    //    Prior to cleanup, we have K:'foo  ' D:'   bar ' K:'  baz'
    //    After cleanup, we want K:'foo  ' D:' bar ' K:'baz'
    //
    // Our handling is unavoidably imperfect in the case where there's a single
    // indel between keeps and the whitespace has changed. For instance, consider
    // diffing 'foo\tbar\nbaz' vs 'foo baz'. Unless we create an extra change
    // object to represent the insertion of the space character (which isn't even
    // a token), we have no way to avoid losing information about the texts'
    // original whitespace in the result we return. Still, we do our best to
    // output something that will look sensible if we e.g. print it with
    // insertions in green and deletions in red.
    // Between two "keep" change objects (or before the first or after the last
    // change object), we can have either:
    // * A "delete" followed by an "insert"
    // * Just an "insert"
    // * Just a "delete"
    // We handle the three cases separately.
    if (deletion && insertion) {
        const oldWsPrefix = leadingWs(deletion.value);
        const oldWsSuffix = trailingWs(deletion.value);
        const newWsPrefix = leadingWs(insertion.value);
        const newWsSuffix = trailingWs(insertion.value);
        if (startKeep) {
            const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix);
            startKeep.value = replaceSuffix(startKeep.value, newWsPrefix, commonWsPrefix);
            deletion.value = removePrefix(deletion.value, commonWsPrefix);
            insertion.value = removePrefix(insertion.value, commonWsPrefix);
        }
        if (endKeep) {
            const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix);
            endKeep.value = replacePrefix(endKeep.value, newWsSuffix, commonWsSuffix);
            deletion.value = removeSuffix(deletion.value, commonWsSuffix);
            insertion.value = removeSuffix(insertion.value, commonWsSuffix);
        }
    }
    else if (insertion) {
        // The whitespaces all reflect what was in the new text rather than
        // the old, so we essentially have no information about whitespace
        // insertion or deletion. We just want to dedupe the whitespace.
        // We do that by having each change object keep its trailing
        // whitespace and deleting duplicate leading whitespace where
        // present.
        if (startKeep) {
            const ws = leadingWs(insertion.value);
            insertion.value = insertion.value.substring(ws.length);
        }
        if (endKeep) {
            const ws = leadingWs(endKeep.value);
            endKeep.value = endKeep.value.substring(ws.length);
        }
    }
    else if (startKeep && endKeep) {
        // From here on we've got a deletion and no insertion.
        const newWsFull = leadingWs(endKeep.value);
        const delWsStart = leadingWs(deletion.value);
        const delWsEnd = trailingWs(deletion.value);
        // Any whitespace that comes straight after startKeep in both the old and
        // new texts, assign to startKeep and remove from the deletion.
        const newWsStart = longestCommonPrefix(newWsFull, delWsStart);
        deletion.value = removePrefix(deletion.value, newWsStart);
        // Any whitespace that comes straight before endKeep in both the old and
        // new texts, and hasn't already been assigned to startKeep, assign to
        // endKeep and remove from the deletion.
        const newWsEnd = longestCommonSuffix(removePrefix(newWsFull, newWsStart), delWsEnd);
        deletion.value = removeSuffix(deletion.value, newWsEnd);
        endKeep.value = replacePrefix(endKeep.value, newWsFull, newWsEnd);
        // If there's any whitespace from the new text that HASN'T already been
        // assigned, assign it to the start:
        startKeep.value = replaceSuffix(startKeep.value, newWsFull, newWsFull.slice(0, newWsFull.length - newWsEnd.length));
    }
    else if (endKeep) {
        // We are at the start of the text. Preserve all the whitespace on
        // endKeep, and just remove whitespace from the end of deletion to the
        // extent that it overlaps with the start of endKeep.
        const endKeepWsPrefix = leadingWs(endKeep.value);
        const deletionWsSuffix = trailingWs(deletion.value);
        const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix);
        deletion.value = removeSuffix(deletion.value, overlap);
    }
    else if (startKeep) {
        // We are at the END of the text. Preserve all the whitespace on
        // startKeep, and just remove whitespace from the start of deletion to
        // the extent that it overlaps with the end of startKeep.
        const startKeepWsSuffix = trailingWs(startKeep.value);
        const deletionWsPrefix = leadingWs(deletion.value);
        const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix);
        deletion.value = removePrefix(deletion.value, overlap);
    }
}
/**
 * Word-level differ in which whitespace runs and individual newlines are
 * tokens of their own.
 */
class WordsWithSpaceDiff extends Diff {
    /**
     * Splits `value` into words, punctuation marks, newlines and runs of
     * other whitespace.
     *
     * @param {string} value
     * @returns {string[]}
     */
    tokenize(value) {
        return value.match(tokenizeWithNewlinesSeparate) || [];
    }
}
export const wordsWithSpaceDiff = new WordsWithSpaceDiff();
/**
 * Diffs two strings word by word via `wordsWithSpaceDiff.diff`.
 *
 * @param {string} oldStr
 * @param {string} newStr
 * @param {object} [options] - passed through to the underlying differ.
 * @returns whatever the underlying `diff` call returns.
 */
export function diffWordsWithSpace(oldStr, newStr, options) {
    return wordsWithSpaceDiff.diff(oldStr, newStr, options);
}