From 60e8bd98b97695506498c223ce571a4ce681ad11 Mon Sep 17 00:00:00 2001
From: zadam <adam.zivner@gmail.com>
Date: Tue, 21 Jul 2020 23:42:59 +0200
Subject: [PATCH] improvements to lexer and its tests

---
 spec/search/lexer.spec.js           | 29 +++++++++++++--------
 src/services/search/services/lex.js | 39 +++++++++++++++--------------
 2 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/spec/search/lexer.spec.js b/spec/search/lexer.spec.js
index 09d27b3d3..8cce5e6c0 100644
--- a/spec/search/lexer.spec.js
+++ b/spec/search/lexer.spec.js
@@ -22,6 +22,16 @@ describe("Lexer fulltext", () => {
             .toEqual(["i can use \" or ` or #~=*", "without", "problem"]);
     });
 
+    it("quote inside a word does not have a special meaning", () => {
+        const lexResult = lex("d'Artagnan is dead #hero = d'Artagnan");
+
+        expect(lexResult.fulltextTokens.map(t => t.token))
+            .toEqual(["d'artagnan", "is", "dead"]);
+
+        expect(lexResult.expressionTokens.map(t => t.token))
+            .toEqual(['#hero', '=', "d'artagnan"]);
+    });
+
     it("if quote is not ended then it's just one long token", () => {
         expect(lex("'unfinished quote").fulltextTokens.map(t => t.token))
             .toEqual(["unfinished quote"]);
@@ -52,16 +62,16 @@ describe("Lexer expression", () => {
     it("simple label operator with in quotes and without", () => {
         expect(lex("#label*=*'text'").expressionTokens)
             .toEqual([
-                {token: "#label", inQuotes: false},
-                {token: "*=*", inQuotes: false},
-                {token: "text", inQuotes: true}
+                {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5},
+                {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8},
+                {token: "text", inQuotes: true, startIndex: 10, endIndex: 13}
             ]);
 
         expect(lex("#label*=*text").expressionTokens)
             .toEqual([
-                {token: "#label", inQuotes: false},
-                {token: "*=*", inQuotes: false},
-                {token: "text", inQuotes: false}
+                {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5},
+                {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8},
+                {token: "text", inQuotes: false, startIndex: 9, endIndex: 12}
             ]);
     });
 
@@ -92,9 +102,8 @@ describe("Lexer invalid queries and edge cases", () => {
             .toEqual(["#label", "~relation"]);
     });
 
-    it("spaces in attribute names and values", () => {
-        // invalid but should be reported by parser as an error
-        expect(lex(`#'long label'="hello o' world" ~'long relation'`).expressionTokens.map(t => t.token))
-            .toEqual(["#long label", "=", "hello o' world", "~long relation"]);
+    it("trailing escape \\", () => {
+        expect(lex('abc \\').fulltextTokens.map(t => t.token))
+            .toEqual(["abc", "\\"]);
     });
 });
diff --git a/src/services/search/services/lex.js b/src/services/search/services/lex.js
index 65ff8823a..821a3e2fc 100644
--- a/src/services/search/services/lex.js
+++ b/src/services/search/services/lex.js
@@ -4,31 +4,33 @@ function lex(str) {
     const fulltextTokens = [];
     const expressionTokens = [];
 
-    let quotes = false;
+    let quotes = false; // false while outside quotes; otherwise holds the opening quote character - ', " or `
     let fulltextEnded = false;
     let currentWord = '';
 
-    function isOperatorSymbol(chr) {
+    function isSymbolAnOperator(chr) {
         return ['=', '*', '>', '<', '!'].includes(chr);
     }
 
-    function previousOperatorSymbol() {
+    function isPreviousSymbolAnOperator() {
         if (currentWord.length === 0) {
             return false;
         }
         else {
-            return isOperatorSymbol(currentWord[currentWord.length - 1]);
+            return isSymbolAnOperator(currentWord[currentWord.length - 1]);
         }
     }
 
-    function finishWord() {
+    function finishWord(endIndex) {
         if (currentWord === '') {
             return;
         }
 
         const rec = {
             token: currentWord,
-            inQuotes: !!quotes
+            inQuotes: !!quotes,
+            startIndex: endIndex - currentWord.length + 1,
+            endIndex
         };
 
         if (fulltextEnded) {
@@ -44,7 +46,7 @@ function lex(str) {
         const chr = str[i];
 
         if (chr === '\\') {
-            if ((i + 1) < str.length) {
+            if (i + 1 < str.length) {
                 i++;
 
                 currentWord += str[i];
@@ -57,10 +59,8 @@ function lex(str) {
         }
         else if (['"', "'", '`'].includes(chr)) {
             if (!quotes) {
-                if (currentWord.length === 0 || fulltextEnded) {
-                    if (previousOperatorSymbol()) {
-                        finishWord();
-                    }
+                if (currentWord.length === 0 || isPreviousSymbolAnOperator()) {
+                    finishWord(i - 1);
 
                     quotes = chr;
                 }
@@ -71,7 +71,7 @@ function lex(str) {
                 }
             }
             else if (quotes === chr) {
-                finishWord();
+                finishWord(i - 1);
 
                 quotes = false;
             }
@@ -79,6 +79,7 @@ function lex(str) {
                 // it's a quote but within other kind of quotes so it's valid as a literal character
                 currentWord += chr;
             }
+
             continue;
         }
         else if (!quotes) {
@@ -87,7 +88,7 @@ function lex(str) {
                     fulltextEnded = true;
                 }
                 else {
-                    finishWord();
+                    finishWord(i - 1);
                 }
 
                 currentWord = chr;
@@ -99,20 +100,20 @@ function lex(str) {
                 continue;
             }
             else if (chr === ' ') {
-                finishWord();
+                finishWord(i - 1);
                 continue;
             }
             else if (fulltextEnded && ['(', ')', '.'].includes(chr)) {
-                finishWord();
+                finishWord(i - 1);
                 currentWord += chr;
-                finishWord();
+                finishWord(i);
                 continue;
             }
             else if (fulltextEnded
                 && !['#!', '~!'].includes(currentWord)
-                && previousOperatorSymbol() !== isOperatorSymbol(chr)) {
+                && isPreviousSymbolAnOperator() !== isSymbolAnOperator(chr)) {
 
-                finishWord();
+                finishWord(i - 1);
 
                 currentWord += chr;
                 continue;
@@ -122,7 +123,7 @@ function lex(str) {
         currentWord += chr;
     }
 
-    finishWord();
+    finishWord(str.length - 1);
 
     return {
         fulltextTokens,