search should ignore diacritics, closes #2182

2025-06-06 18:08:33 +02:00 · 2021-09-27 22:09:55 +02:00 · 2021-09-27 22:09:55 +02:00 · 533fcd06e4
commit 533fcd06e4
parent 192a2fe9f9
7 changed files with 30 additions and 15 deletions
--- a/src/becca/entities/note.js
+++ b/src/becca/entities/note.js
@ -649,7 +649,7 @@ class Note extends AbstractEntity {
                this.flatTextCache += ' ';
            }

-            this.flatTextCache = this.flatTextCache.toLowerCase();
+            this.flatTextCache = utils.removeDiacritic(this.flatTextCache.toLowerCase());
        }

        return this.flatTextCache;
--- a/src/services/search/expressions/note_cache_flat_text.js
+++ b/src/services/search/expressions/note_cache_flat_text.js
@ -3,8 +3,9 @@
 const Expression = require('./expression');
 const NoteSet = require('../note_set');
 const becca = require('../../../becca/becca');
+const utils = require("../../utils");

-class BeccaFlatTextExp extends Expression {
+class NoteFlatTextExp extends Expression {
    constructor(tokens) {
        super();

@ -44,15 +45,15 @@ class BeccaFlatTextExp extends Expression {

            for (const attribute of note.ownedAttributes) {
                for (const token of tokens) {
-                    if (attribute.name.toLowerCase().includes(token)
-                        || attribute.value.toLowerCase().includes(token)) {
+                    if (utils.normalize(attribute.name).includes(token)
+                        || utils.normalize(attribute.value).includes(token)) {
                        foundAttrTokens.push(token);
                    }
                }
            }

            for (const parentNote of note.parents) {
-                const title = beccaService.getNoteTitle(note.noteId, parentNote.noteId).toLowerCase();
+                const title = utils.normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
                const foundTokens = foundAttrTokens.slice();

                for (const token of tokens) {
@ -89,8 +90,8 @@ class BeccaFlatTextExp extends Expression {
                }

                for (const attribute of note.ownedAttributes) {
-                    if (attribute.name.toLowerCase().includes(token)
-                        || attribute.value.toLowerCase().includes(token)) {
+                    if (utils.normalize(attribute.name).includes(token)
+                        || utils.normalize(attribute.value).includes(token)) {

                        foundAttrTokens.push(token);
                    }
@ -98,7 +99,7 @@ class BeccaFlatTextExp extends Expression {
            }

            for (const parentNote of note.parents) {
-                const title = beccaService.getNoteTitle(note.noteId, parentNote.noteId).toLowerCase();
+                const title = utils.normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
                const foundTokens = foundAttrTokens.slice();

                for (const token of this.tokens) {
@ -140,4 +141,4 @@ class BeccaFlatTextExp extends Expression {
    }
 }

-module.exports = BeccaFlatTextExp;
+module.exports = NoteFlatTextExp;
--- a/src/services/search/expressions/note_content_protected_fulltext.js
+++ b/src/services/search/expressions/note_content_protected_fulltext.js
@ -6,6 +6,7 @@ const log = require('../../log');
 const becca = require('../../../becca/becca');
 const protectedSessionService = require('../../protected_session');
 const striptags = require('striptags');
+const utils = require("../../utils");

 class NoteContentProtectedFulltextExp extends Expression {
    constructor(operator, tokens, raw) {
@ -45,7 +46,7 @@ class NoteContentProtectedFulltextExp extends Expression {
                continue;
            }

-            content = content.toLowerCase();
+            content = utils.normalize(content);

            if (type === 'text' && mime === 'text/html') {
                if (!this.raw && content.length < 20000) { // striptags is slow for very large notes
--- a/src/services/search/expressions/note_content_unprotected_fulltext.js
+++ b/src/services/search/expressions/note_content_unprotected_fulltext.js
@ -4,6 +4,7 @@ const Expression = require('./expression');
 const NoteSet = require('../note_set');
 const becca = require('../../../becca/becca');
 const striptags = require('striptags');
+const utils = require("../../utils");

 class NoteContentUnprotectedFulltextExp extends Expression {
    constructor(operator, tokens, raw) {
@ -31,7 +32,7 @@ class NoteContentUnprotectedFulltextExp extends Expression {
                continue;
            }

-            content = content.toString().toLowerCase();
+            content = utils.normalize(content.toString());

            if (type === 'text' && mime === 'text/html') {
                if (!this.raw && content.length < 20000) { // striptags is slow for very large notes
--- a/src/services/search/services/parse.js
+++ b/src/services/search/services/parse.js
@ -18,9 +18,10 @@ const OrderByAndLimitExp = require('../expressions/order_by_and_limit');
 const AncestorExp = require("../expressions/ancestor");
 const buildComparator = require('./build_comparator');
 const ValueExtractor = require('../value_extractor');
+const utils = require("../../utils");

 function getFulltext(tokens, searchContext) {
-    tokens = tokens.map(t => t.token);
+    tokens = tokens.map(t => utils.removeDiacritic(t.token));

    searchContext.highlightedTokens.push(...tokens);

--- a/src/services/search/services/search.js
+++ b/src/services/search/services/search.js
@ -223,8 +223,8 @@ function highlightSearchResults(searchResults, highlightedTokens) {
        }

        for (const attr of note.getAttributes()) {
-            if (highlightedTokens.find(token => attr.name.toLowerCase().includes(token)
-                || attr.value.toLowerCase().includes(token))) {
+            if (highlightedTokens.find(token => utils.normalize(attr.name).includes(token)
+                || utils.normalize(attr.value).includes(token))) {

                result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`;
            }
@ -232,6 +232,7 @@ function highlightSearchResults(searchResults, highlightedTokens) {
    }

    for (const token of highlightedTokens) {
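+        // NOTE: search tokens have their diacritics removed before they get here, so this regex won't match (and thus won't highlight) text that still contains diacritics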
        const tokenRegex = new RegExp("(" + utils.escapeRegExp(token) + ")", "gi");

        for (const result of searchResults) {
--- a/src/services/utils.js
+++ b/src/services/utils.js
@ -290,6 +290,14 @@ function deferred() {
    })();
 }

+function removeDiacritic(str) { // Returns str with diacritical marks stripped; str must be a string (calls str.normalize)
+    return str.normalize("NFD").replace(/\p{Diacritic}/gu, ""); // NFD splits base characters from combining marks, then every character with the Unicode Diacritic property is dropped; the result is left in decomposed form
+}
+
+function normalize(str) { // Returns str with diacritics removed and lower-cased; not the same thing as String.prototype.normalize
+    return removeDiacritic(str).toLowerCase(); // strip diacritics first, then lower-case the result
+}
+
 module.exports = {
    randomSecureToken,
    randomString,
@ -321,5 +329,7 @@ module.exports = {
    removeTextFileExtension,
    formatDownloadTitle,
    timeLimit,
-    deferred
+    deferred,
+    removeDiacritic,
+    normalize
 };