// trilium/src/services/search/expressions/note_content_fulltext.js

"use strict";

const Expression = require('./expression.js');
const NoteSet = require('../note_set.js');
const log = require('../../log.js');
const becca = require('../../../becca/becca.js');
const protectedSessionService = require('../../protected_session.js');
const striptags = require('striptags');
const utils = require('../../utils.js');

// Operators which can be evaluated against note content; execute() reports a search error
// for anything else.
const ALLOWED_OPERATORS = [
    '=',   // content equals the token
    '!=',  // content differs from the token
    '*=*', // content contains the token
    '*=',  // content ends with the token
    '=*',  // content starts with the token
    '%='   // content matches the token interpreted as a regular expression
];

// Compiled patterns keyed by their source string. A Map is used instead of a plain object so that
// patterns named like inherited Object.prototype members ("constructor", "toString", "__proto__", ...)
// are never mistaken for already cached entries.
const cachedRegexes = new Map();

/**
 * Returns the compiled regular expression for the given pattern source. The pattern is compiled
 * on first use and the same RegExp instance is returned on subsequent calls.
 *
 * @param {string} str - regular expression source
 * @returns {RegExp} regex compiled with the "m" (multiline) and "s" (dot-all) flags
 * @throws {SyntaxError} when str is not a valid regular expression
 */
function getRegex(str) {
    let regex = cachedRegexes.get(str);

    if (regex === undefined) {
        regex = new RegExp(str, 'ms'); // multiline, dot-all
        cachedRegexes.set(str, regex);
    }

    return regex;
}

/**
 * Search expression which matches notes by their content.
 *
 * The content of every non-deleted text, code and mermaid note is read from the database,
 * preprocessed (see preprocessContent()) and compared with the search tokens.
 */
class NoteContentFulltextExp extends Expression {
    /**
     * @param {string} operator - comparison operator; checked against ALLOWED_OPERATORS in execute()
     * @param {object} params
     * @param {string[]} params.tokens - tokens to search for; a single token is compared using the operator,
     *                                   multiple tokens must all be found
     * @param {boolean} [params.raw] - when truthy, HTML tags are not stripped from the content before matching
     * @param {boolean} [params.flatText] - when truthy and multiple tokens are given, a token missing from
     *                                      the content is still accepted if the note's flat text contains it
     */
    constructor(operator, {tokens, raw, flatText}) {
        super();

        this.operator = operator;
        this.tokens = tokens;
        this.raw = !!raw;
        this.flatText = !!flatText;
    }

    /**
     * Scans the content of all candidate notes and collects those which match.
     *
     * @param inputNoteSet - only notes contained in this set can appear in the result
     * @param executionContext - not used by this expression
     * @param searchContext - receives an error when the operator or the regular expression is invalid
     * @returns the set of matching notes; on a reported error the inputNoteSet is returned unchanged
     */
    execute(inputNoteSet, executionContext, searchContext) {
        if (!ALLOWED_OPERATORS.includes(this.operator)) {
            searchContext.addError(`Note content can be searched only with operators: ${ALLOWED_OPERATORS.join(", ")}, operator ${this.operator} given.`);

            return inputNoteSet;
        }

        // The regex is used only in the single-token comparison. Compile it once up front so that
        // an invalid user-supplied pattern is reported as a search error instead of throwing
        // a SyntaxError in the middle of the row iteration.
        if (this.operator === '%=' && this.tokens.length === 1) {
            const [pattern] = this.tokens;

            try {
                getRegex(pattern);
            } catch (e) {
                searchContext.addError(`Invalid regular expression "${pattern}": ${e.message}`);

                return inputNoteSet;
            }
        }

        const resultNoteSet = new NoteSet();
        // NOTE(review): required here rather than at the top of the module - presumably to avoid
        // a circular dependency; confirm before moving it
        const sql = require('../../sql.js');

        for (const row of sql.iterateRows(`
                SELECT noteId, type, mime, content, isProtected
                FROM notes JOIN blobs USING (blobId)
                WHERE type IN ('text', 'code', 'mermaid') AND isDeleted = 0`)) {

            this.findInText(row, inputNoteSet, resultNoteSet);
        }

        return resultNoteSet;
    }

    /**
     * Tests a single database row against the tokens and adds the corresponding becca note
     * to resultNoteSet when it matches.
     *
     * Rows are skipped when the note is not part of inputNoteSet, is not loaded in becca,
     * is protected and cannot be decrypted, or has empty content.
     *
     * @param row - row with noteId, isProtected, content, type and mime columns
     * @param inputNoteSet - notes eligible for matching
     * @param resultNoteSet - set to which the matching note is added
     * @returns {string|undefined} the preprocessed content if the row was compared, undefined if it was skipped
     */
    findInText({noteId, isProtected, content, type, mime}, inputNoteSet, resultNoteSet) {
        if (!inputNoteSet.hasNoteId(noteId) || !(noteId in becca.notes)) {
            return;
        }

        if (isProtected) {
            if (!protectedSessionService.isProtectedSessionAvailable()) {
                return;
            }

            try {
                content = protectedSessionService.decryptString(content);
            } catch (e) {
                // a note which cannot be decrypted is skipped, it must not abort the whole search
                log.info(`Cannot decrypt content of note ${noteId}`);
                return;
            }
        }

        if (!content) {
            return;
        }

        content = this.preprocessContent(content, type, mime);

        if (this.tokens.length === 1) {
            const [token] = this.tokens;

            if ((this.operator === '=' && token === content)
                || (this.operator === '!=' && token !== content)
                || (this.operator === '*=' && content.endsWith(token))
                || (this.operator === '=*' && content.startsWith(token))
                || (this.operator === '*=*' && content.includes(token))
                || (this.operator === '%=' && getRegex(token).test(content))) {

                resultNoteSet.add(becca.notes[noteId]);
            }
        } else {
            // with multiple tokens the operator is not consulted - every token has to be found
            const nonMatchingToken = this.tokens.find(token =>
                !content.includes(token) &&
                (
                    // in case of default fulltext search, we should consider both title, attrs and content
                    // so e.g. "hello world" should match when "hello" is in title and "world" in content
                    !this.flatText
                    || !becca.notes[noteId].getFlatText().includes(token)
                )
            );

            if (!nonMatchingToken) {
                resultNoteSet.add(becca.notes[noteId]);
            }
        }

        return content;
    }

    /**
     * Converts raw note content into the string which is compared with the tokens.
     *
     * The content is converted to string and passed through utils.normalize(). For HTML text notes
     * the tags are additionally stripped (unless in raw mode or the note is too large) and
     * non-breaking space entities are replaced with plain spaces.
     *
     * @param content - note content; anything providing toString()
     * @param {string} type - note type
     * @param {string} mime - note MIME type
     * @returns {string} normalized content without leading and trailing whitespace
     */
    preprocessContent(content, type, mime) {
        content = utils.normalize(content.toString());

        if (type === 'text' && mime === 'text/html') {
            if (!this.raw && content.length < 20000) { // striptags is slow for very large notes
                content = this.stripTags(content);
            }

            content = content.replace(/&nbsp;/g, ' ');
        }

        return content.trim();
    }

    /**
     * Removes HTML markup from the content while keeping link tags so that URLs stay searchable.
     *
     * @param {string} content - HTML content
     * @returns {string} content with tags removed, except for opening link tags
     */
    stripTags(content) {
        // we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
        // we want to insert space in place of block tags (because they imply text separation)
        // but we don't want to insert text for typical formatting inline tags which can occur within one word
        const linkTag = 'a';
        const inlineFormattingTags = ['b', 'strong', 'em', 'i', 'span', 'big', 'small', 'font', 'sub', 'sup'];

        // replace tags which imply text separation with a space
        content = striptags(content, [linkTag, ...inlineFormattingTags], ' ');

        // replace the inline formatting tags (but not links) without a space
        content = striptags(content, [linkTag], '');

        // at least the closing link tag can be easily stripped
        return content.replace(/<\/a>/ig, "");
    }
}

// the class itself is the module's only export
module.exports = NoteContentFulltextExp;