search should ignore diacritics, closes #2182

This commit is contained in:
zadam 2021-09-27 22:09:55 +02:00
parent 192a2fe9f9
commit 533fcd06e4
7 changed files with 30 additions and 15 deletions

View File

@ -649,7 +649,7 @@ class Note extends AbstractEntity {
this.flatTextCache += ' '; this.flatTextCache += ' ';
} }
this.flatTextCache = this.flatTextCache.toLowerCase(); this.flatTextCache = utils.removeDiacritic(this.flatTextCache.toLowerCase());
} }
return this.flatTextCache; return this.flatTextCache;

View File

@ -3,8 +3,9 @@
const Expression = require('./expression'); const Expression = require('./expression');
const NoteSet = require('../note_set'); const NoteSet = require('../note_set');
const becca = require('../../../becca/becca'); const becca = require('../../../becca/becca');
const utils = require("../../utils");
class BeccaFlatTextExp extends Expression { class NoteFlatTextExp extends Expression {
constructor(tokens) { constructor(tokens) {
super(); super();
@ -44,15 +45,15 @@ class BeccaFlatTextExp extends Expression {
for (const attribute of note.ownedAttributes) { for (const attribute of note.ownedAttributes) {
for (const token of tokens) { for (const token of tokens) {
if (attribute.name.toLowerCase().includes(token) if (utils.normalize(attribute.name).includes(token)
|| attribute.value.toLowerCase().includes(token)) { || utils.normalize(attribute.value).includes(token)) {
foundAttrTokens.push(token); foundAttrTokens.push(token);
} }
} }
} }
for (const parentNote of note.parents) { for (const parentNote of note.parents) {
const title = beccaService.getNoteTitle(note.noteId, parentNote.noteId).toLowerCase(); const title = utils.normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const foundTokens = foundAttrTokens.slice(); const foundTokens = foundAttrTokens.slice();
for (const token of tokens) { for (const token of tokens) {
@ -89,8 +90,8 @@ class BeccaFlatTextExp extends Expression {
} }
for (const attribute of note.ownedAttributes) { for (const attribute of note.ownedAttributes) {
if (attribute.name.toLowerCase().includes(token) if (utils.normalize(attribute.name).includes(token)
|| attribute.value.toLowerCase().includes(token)) { || utils.normalize(attribute.value).includes(token)) {
foundAttrTokens.push(token); foundAttrTokens.push(token);
} }
@ -98,7 +99,7 @@ class BeccaFlatTextExp extends Expression {
} }
for (const parentNote of note.parents) { for (const parentNote of note.parents) {
const title = beccaService.getNoteTitle(note.noteId, parentNote.noteId).toLowerCase(); const title = utils.normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const foundTokens = foundAttrTokens.slice(); const foundTokens = foundAttrTokens.slice();
for (const token of this.tokens) { for (const token of this.tokens) {
@ -140,4 +141,4 @@ class BeccaFlatTextExp extends Expression {
} }
} }
module.exports = BeccaFlatTextExp; module.exports = NoteFlatTextExp;

View File

@ -6,6 +6,7 @@ const log = require('../../log');
const becca = require('../../../becca/becca'); const becca = require('../../../becca/becca');
const protectedSessionService = require('../../protected_session'); const protectedSessionService = require('../../protected_session');
const striptags = require('striptags'); const striptags = require('striptags');
const utils = require("../../utils");
class NoteContentProtectedFulltextExp extends Expression { class NoteContentProtectedFulltextExp extends Expression {
constructor(operator, tokens, raw) { constructor(operator, tokens, raw) {
@ -45,7 +46,7 @@ class NoteContentProtectedFulltextExp extends Expression {
continue; continue;
} }
content = content.toLowerCase(); content = utils.normalize(content);
if (type === 'text' && mime === 'text/html') { if (type === 'text' && mime === 'text/html') {
if (!this.raw && content.length < 20000) { // striptags is slow for very large notes if (!this.raw && content.length < 20000) { // striptags is slow for very large notes

View File

@ -4,6 +4,7 @@ const Expression = require('./expression');
const NoteSet = require('../note_set'); const NoteSet = require('../note_set');
const becca = require('../../../becca/becca'); const becca = require('../../../becca/becca');
const striptags = require('striptags'); const striptags = require('striptags');
const utils = require("../../utils");
class NoteContentUnprotectedFulltextExp extends Expression { class NoteContentUnprotectedFulltextExp extends Expression {
constructor(operator, tokens, raw) { constructor(operator, tokens, raw) {
@ -31,7 +32,7 @@ class NoteContentUnprotectedFulltextExp extends Expression {
continue; continue;
} }
content = content.toString().toLowerCase(); content = utils.normalize(content.toString());
if (type === 'text' && mime === 'text/html') { if (type === 'text' && mime === 'text/html') {
if (!this.raw && content.length < 20000) { // striptags is slow for very large notes if (!this.raw && content.length < 20000) { // striptags is slow for very large notes

View File

@ -18,9 +18,10 @@ const OrderByAndLimitExp = require('../expressions/order_by_and_limit');
const AncestorExp = require("../expressions/ancestor"); const AncestorExp = require("../expressions/ancestor");
const buildComparator = require('./build_comparator'); const buildComparator = require('./build_comparator');
const ValueExtractor = require('../value_extractor'); const ValueExtractor = require('../value_extractor');
const utils = require("../../utils");
function getFulltext(tokens, searchContext) { function getFulltext(tokens, searchContext) {
tokens = tokens.map(t => t.token); tokens = tokens.map(t => utils.removeDiacritic(t.token));
searchContext.highlightedTokens.push(...tokens); searchContext.highlightedTokens.push(...tokens);

View File

@ -223,8 +223,8 @@ function highlightSearchResults(searchResults, highlightedTokens) {
} }
for (const attr of note.getAttributes()) { for (const attr of note.getAttributes()) {
if (highlightedTokens.find(token => attr.name.toLowerCase().includes(token) if (highlightedTokens.find(token => utils.normalize(attr.name).includes(token)
|| attr.value.toLowerCase().includes(token))) { || utils.normalize(attr.value).includes(token))) {
result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`; result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`;
} }
@ -232,6 +232,7 @@ function highlightSearchResults(searchResults, highlightedTokens) {
} }
for (const token of highlightedTokens) { for (const token of highlightedTokens) {
// this approach won't work for strings with diacritics
const tokenRegex = new RegExp("(" + utils.escapeRegExp(token) + ")", "gi"); const tokenRegex = new RegExp("(" + utils.escapeRegExp(token) + ")", "gi");
for (const result of searchResults) { for (const result of searchResults) {

View File

@ -290,6 +290,14 @@ function deferred() {
})(); })();
} }
/**
 * Strips diacritical marks from the given string.
 *
 * The text is first decomposed into Unicode NFD form, which splits an accented
 * letter into its base letter followed by combining marks. Every code point
 * carrying the Unicode `Diacritic` property is then removed. Letter case is
 * left untouched.
 *
 * NOTE(review): the `Diacritic` property also appears to cover some standalone
 * characters (e.g. "^" and "`"), which would be dropped as well - confirm this
 * is acceptable for all callers.
 *
 * @param {string} str - text to process
 * @returns {string} the text with diacritic code points removed
 */
function removeDiacritic(str) {
    const decomposed = str.normalize("NFD");

    return decomposed.replace(/\p{Diacritic}/gu, "");
}
/**
 * Converts a string into a form suitable for diacritic- and case-insensitive
 * comparison: diacritics are removed via removeDiacritic() and the result is
 * lower-cased.
 *
 * @param {string} str - text to normalize
 * @returns {string} lower-cased text without diacritics
 */
function normalize(str) {
    const withoutDiacritics = removeDiacritic(str);

    return withoutDiacritics.toLowerCase();
}
module.exports = { module.exports = {
randomSecureToken, randomSecureToken,
randomString, randomString,
@ -321,5 +329,7 @@ module.exports = {
removeTextFileExtension, removeTextFileExtension,
formatDownloadTitle, formatDownloadTitle,
timeLimit, timeLimit,
deferred deferred,
removeDiacritic,
normalize
}; };