From 60e8bd98b97695506498c223ce571a4ce681ad11 Mon Sep 17 00:00:00 2001 From: zadam Date: Tue, 21 Jul 2020 23:42:59 +0200 Subject: [PATCH] improvements to lexer and its tests --- spec/search/lexer.spec.js | 29 +++++++++++++-------- src/services/search/services/lex.js | 39 +++++++++++++++-------------- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/spec/search/lexer.spec.js b/spec/search/lexer.spec.js index 09d27b3d3..8cce5e6c0 100644 --- a/spec/search/lexer.spec.js +++ b/spec/search/lexer.spec.js @@ -22,6 +22,16 @@ describe("Lexer fulltext", () => { .toEqual(["i can use \" or ` or #~=*", "without", "problem"]); }); + it("quote inside a word does not have a special meaning", () => { + const lexResult = lex("d'Artagnan is dead #hero = d'Artagnan"); + + expect(lexResult.fulltextTokens.map(t => t.token)) + .toEqual(["d'artagnan", "is", "dead"]); + + expect(lexResult.expressionTokens.map(t => t.token)) + .toEqual(['#hero', '=', "d'artagnan"]); + }); + it("if quote is not ended then it's just one long token", () => { expect(lex("'unfinished quote").fulltextTokens.map(t => t.token)) .toEqual(["unfinished quote"]); @@ -52,16 +62,16 @@ describe("Lexer expression", () => { it("simple label operator with in quotes and without", () => { expect(lex("#label*=*'text'").expressionTokens) .toEqual([ - {token: "#label", inQuotes: false}, - {token: "*=*", inQuotes: false}, - {token: "text", inQuotes: true} + {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5}, + {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8}, + {token: "text", inQuotes: true, startIndex: 10, endIndex: 13} ]); expect(lex("#label*=*text").expressionTokens) .toEqual([ - {token: "#label", inQuotes: false}, - {token: "*=*", inQuotes: false}, - {token: "text", inQuotes: false} + {token: "#label", inQuotes: false, startIndex: 0, endIndex: 5}, + {token: "*=*", inQuotes: false, startIndex: 6, endIndex: 8}, + {token: "text", inQuotes: false, startIndex: 9, endIndex: 12} ]); }); @@ -92,9 +102,8 @@ describe("Lexer invalid queries and edge cases", () => { .toEqual(["#label", "~relation"]); }); - it("spaces in attribute names and values", () => { - // invalid but should be reported by parser as an error - expect(lex(`#'long label'="hello o' world" ~'long relation'`).expressionTokens.map(t => t.token)) - .toEqual(["#long label", "=", "hello o' world", "~long relation"]); + it("trailing escape \\", () => { + expect(lex('abc \\').fulltextTokens.map(t => t.token)) + .toEqual(["abc", "\\"]); }); }); diff --git a/src/services/search/services/lex.js b/src/services/search/services/lex.js index 65ff8823a..821a3e2fc 100644 --- a/src/services/search/services/lex.js +++ b/src/services/search/services/lex.js @@ -4,31 +4,33 @@ function lex(str) { const fulltextTokens = []; const expressionTokens = []; - let quotes = false; + let quotes = false; // otherwise contains used quote - ', " or ` let fulltextEnded = false; let currentWord = ''; - function isOperatorSymbol(chr) { + function isSymbolAnOperator(chr) { return ['=', '*', '>', '<', '!'].includes(chr); } - function previousOperatorSymbol() { + function isPreviousSymbolAnOperator() { if (currentWord.length === 0) { return false; } else { - return isOperatorSymbol(currentWord[currentWord.length - 1]); + return isSymbolAnOperator(currentWord[currentWord.length - 1]); } } - function finishWord() { + function finishWord(endIndex) { if (currentWord === '') { return; } const rec = { token: currentWord, - inQuotes: !!quotes + inQuotes: !!quotes, + startIndex: endIndex - currentWord.length + 1, + endIndex }; if (fulltextEnded) { @@ -44,7 +46,7 @@ function lex(str) { const chr = str[i]; if (chr === '\\') { - if ((i + 1) < str.length) { + if (i + 1 < str.length) { i++; currentWord += str[i]; @@ -57,10 +59,8 @@ function lex(str) { } else if (['"', "'", '`'].includes(chr)) { if (!quotes) { - if (currentWord.length === 0 || fulltextEnded) { - if (previousOperatorSymbol()) { - finishWord(); - } + if (currentWord.length === 0 || isPreviousSymbolAnOperator()) { + finishWord(i - 1); quotes = chr; } @@ -71,7 +71,7 @@ function lex(str) { } } else if (quotes === chr) { - finishWord(); + finishWord(i - 1); quotes = false; } @@ -79,6 +79,7 @@ function lex(str) { // it's a quote but within other kind of quotes so it's valid as a literal character currentWord += chr; } + continue; } else if (!quotes) { @@ -87,7 +88,7 @@ function lex(str) { fulltextEnded = true; } else { - finishWord(); + finishWord(i - 1); } currentWord = chr; @@ -99,20 +100,20 @@ function lex(str) { continue; } else if (chr === ' ') { - finishWord(); + finishWord(i - 1); continue; } else if (fulltextEnded && ['(', ')', '.'].includes(chr)) { - finishWord(); + finishWord(i - 1); currentWord += chr; - finishWord(); + finishWord(i); continue; } else if (fulltextEnded && !['#!', '~!'].includes(currentWord) - && previousOperatorSymbol() !== isOperatorSymbol(chr)) { + && isPreviousSymbolAnOperator() !== isSymbolAnOperator(chr)) { - finishWord(); + finishWord(i - 1); currentWord += chr; continue; @@ -122,7 +123,7 @@ function lex(str) { currentWord += chr; } - finishWord(); + finishWord(str.length - 1); return { fulltextTokens,