improvements to lexer and its tests

2025-06-06 18:08:33 +02:00 · 2020-07-21 23:42:59 +02:00 · 2020-07-21 23:42:59 +02:00 · 60e8bd98b9
commit 60e8bd98b9
parent 32ecb43b5c
2 changed files with 39 additions and 29 deletions
--- a/spec/search/lexer.spec.js
+++ b/spec/search/lexer.spec.js
@@ -22,6 +22,16 @@ describe("Lexer fulltext", () => {
            .toEqual(["i can use \" or ` or #~=*", "without", "problem"]);
    });

+    it("quote inside a word does not have a special meaning", () => {
+        const lexResult = lex("d'Artagnan is dead #hero = d'Artagnan");
+
+        expect(lexResult.fulltextTokens.map(t => t.token))
+            .toEqual(["d'artagnan", "is", "dead"]);
+
+        expect(lexResult.expressionTokens.map(t => t.token))
+            .toEqual(['#hero', '=', "d'artagnan"]);
+    });
+
    it("if quote is not ended then it's just one long token", () => {
        expect(lex("'unfinished quote").fulltextTokens.map(t => t.token))
            .toEqual(["unfinished quote"]);
@@ -52,16 +62,16 @@ describe("Lexer expression", () => {
    it("simple label operator with in quotes and without", () => {
        expect(lex("#label*=*'text'").expressionTokens)
            .toEqual([
-                {token: "#label", inQuotes: false},
-                {token: "*=*", inQuotes: false},
-                {token: "text", inQuotes: true}
+                {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5},
+                {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8},
+                {token: "text", inQuotes: true, startIndex: 10, endIndex: 13}
            ]);

        expect(lex("#label*=*text").expressionTokens)
            .toEqual([
-                {token: "#label", inQuotes: false},
-                {token: "*=*", inQuotes: false},
-                {token: "text", inQuotes: false}
+                {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5},
+                {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8},
+                {token: "text", inQuotes: false, startIndex: 9, endIndex: 12}
            ]);
    });

@@ -92,9 +102,8 @@ describe("Lexer invalid queries and edge cases", () => {
            .toEqual(["#label", "~relation"]);
    });

-    it("spaces in attribute names and values", () => {
-        // invalid but should be reported by parser as an error
-        expect(lex(`#'long label'="hello o' world" ~'long relation'`).expressionTokens.map(t => t.token))
-            .toEqual(["#long label", "=", "hello o' world", "~long relation"]);
+    it("trailing escape \\", () => {
+        expect(lex('abc \\').fulltextTokens.map(t => t.token))
+            .toEqual(["abc", "\\"]);
    });
 });
--- a/src/services/search/services/lex.js
+++ b/src/services/search/services/lex.js
@@ -4,31 +4,33 @@ function lex(str) {
    const fulltextTokens = [];
    const expressionTokens = [];

-    let quotes = false;
+    let quotes = false; // otherwise contains used quote - ', " or `
    let fulltextEnded = false;
    let currentWord = '';

-    function isOperatorSymbol(chr) {
+    function isSymbolAnOperator(chr) {
        return ['=', '*', '>', '<', '!'].includes(chr);
    }

-    function previousOperatorSymbol() {
+    function isPreviousSymbolAnOperator() {
        if (currentWord.length === 0) {
            return false;
        }
        else {
-            return isOperatorSymbol(currentWord[currentWord.length - 1]);
+            return isSymbolAnOperator(currentWord[currentWord.length - 1]);
        }
    }

-    function finishWord() {
+    function finishWord(endIndex) {
        if (currentWord === '') {
            return;
        }

        const rec = {
            token: currentWord,
-            inQuotes: !!quotes
+            inQuotes: !!quotes,
+            startIndex: endIndex - currentWord.length + 1,
+            endIndex
        };

        if (fulltextEnded) {
@@ -44,7 +46,7 @@ function lex(str) {
        const chr = str[i];

        if (chr === '\\') {
-            if ((i + 1) < str.length) {
+            if (i + 1 < str.length) {
                i++;

                currentWord += str[i];
@@ -57,10 +59,8 @@ function lex(str) {
        }
        else if (['"', "'", '`'].includes(chr)) {
            if (!quotes) {
-                if (currentWord.length === 0 || fulltextEnded) {
-                    if (previousOperatorSymbol()) {
-                        finishWord();
-                    }
+                if (currentWord.length === 0 || isPreviousSymbolAnOperator()) {
+                    finishWord(i - 1);

                    quotes = chr;
                }
@@ -71,7 +71,7 @@ function lex(str) {
                }
            }
            else if (quotes === chr) {
-                finishWord();
+                finishWord(i - 1);

                quotes = false;
            }
@@ -79,6 +79,7 @@ function lex(str) {
                // it's a quote but within other kind of quotes so it's valid as a literal character
                currentWord += chr;
            }
+
            continue;
        }
        else if (!quotes) {
@@ -87,7 +88,7 @@ function lex(str) {
                    fulltextEnded = true;
                }
                else {
-                    finishWord();
+                    finishWord(i - 1);
                }

                currentWord = chr;
@@ -99,20 +100,20 @@ function lex(str) {
                continue;
            }
            else if (chr === ' ') {
-                finishWord();
+                finishWord(i - 1);
                continue;
            }
            else if (fulltextEnded && ['(', ')', '.'].includes(chr)) {
-                finishWord();
+                finishWord(i - 1);
                currentWord += chr;
-                finishWord();
+                finishWord(i);
                continue;
            }
            else if (fulltextEnded
                && !['#!', '~!'].includes(currentWord)
-                && previousOperatorSymbol() !== isOperatorSymbol(chr)) {
+                && isPreviousSymbolAnOperator() !== isSymbolAnOperator(chr)) {

-                finishWord();
+                finishWord(i - 1);

                currentWord += chr;
                continue;
@@ -122,7 +123,7 @@ function lex(str) {
        currentWord += chr;
    }

-    finishWord();
+    finishWord(str.length - 1);

    return {
        fulltextTokens,