// trilium/apps/server/src/services/search/expressions/note_content_fulltext.ts
"use strict";
import type { NoteRow } from "@triliumnext/commons";
import type SearchContext from "../search_context.js";
import Expression from "./expression.js";
import NoteSet from "../note_set.js";
import log from "../../log.js";
import becca from "../../../becca/becca.js";
import protectedSessionService from "../../protected_session.js";
import striptags from "striptags";
import { normalize } from "../../utils.js";
import sql from "../../sql.js";
import {
normalizeSearchText,
calculateOptimizedEditDistance,
validateFuzzySearchTokens,
validateAndPreprocessContent,
fuzzyMatchWord,
FUZZY_SEARCH_CONFIG
} from "../utils/text_utils.js";
import ftsSearchService, { FTSError, FTSNotAvailableError, FTSQueryError } from "../fts_search.js";
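// Supported content-search operators (see findInText below):
// = exact, != not equal, *=* contains, *= ends with, =* starts with,
// %= regex, ~= fuzzy phrase match, ~* fuzzy token match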
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
// Maximum content size for search processing (2MB)
const MAX_SEARCH_CONTENT_SIZE = 2 * 1024 * 1024;
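// Cache of compiled regular expressions used by the %= operator so the same
// pattern is not recompiled for every scanned row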
const cachedRegexes: Record<string, RegExp> = {};
function getRegex(str: string): RegExp {
if (!(str in cachedRegexes)) {
cachedRegexes[str] = new RegExp(str, "ms"); // multiline, dot-all
}
return cachedRegexes[str];
}
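/**
 * Options for NoteContentFulltextExp: the search tokens, whether to match the
 * raw content (skipping HTML tag stripping), and whether to also match against
 * the note's flat text.
 */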
interface ConstructorOpts {
tokens: string[];
raw?: boolean;
flatText?: boolean;
}
type SearchRow = Pick<NoteRow, "noteId" | "type" | "mime" | "content" | "isProtected">;
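/**
 * Expression that matches notes by the text of their content.
 *
 * When the SQLite FTS5 index is available, substring operators (*=*, *=, =*)
 * are delegated to trigram-backed LIKE queries and the remaining supported
 * operators to MATCH queries; regex (%=) and flat-text searches, as well as
 * recoverable FTS errors, fall back to a row-by-row scan of note blobs.
 *
 * Hypothetical usage sketch (inputNoteSet and searchContext come from the
 * surrounding search service and are assumed here, not shown in this file):
 *
 *     const exp = new NoteContentFulltextExp("*=*", { tokens: ["hello"] });
 *     const matched = exp.execute(inputNoteSet, {}, searchContext);
 */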
class NoteContentFulltextExp extends Expression {
private operator: string;
tokens: string[];
private raw: boolean;
private flatText: boolean;
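/**
 * @throws Error when the operator or tokens are missing, or when fuzzy-search
 *         token validation fails
 */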
constructor(operator: string, { tokens, raw, flatText }: ConstructorOpts) {
super();
if (!operator || !tokens || !Array.isArray(tokens)) {
throw new Error('Invalid parameters: operator and tokens are required');
}
// Validate fuzzy search tokens
const validation = validateFuzzySearchTokens(tokens, operator);
if (!validation.isValid) {
throw new Error(validation.error!);
}
this.operator = operator;
this.tokens = tokens;
this.raw = !!raw;
this.flatText = !!flatText;
}
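/**
 * Runs the content search against the given note set.
 *
 * Strategy: reject unsupported operators, try FTS5 first (handling protected
 * notes separately, since they are not in the FTS index), and fall back to the
 * row-by-row blob scan when FTS5 is unavailable, when the operator is %=,
 * when flat-text matching is requested, or when a recoverable FTS error occurs.
 */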
execute(inputNoteSet: NoteSet, executionContext: {}, searchContext: SearchContext) {
if (!ALLOWED_OPERATORS.has(this.operator)) {
searchContext.addError(`Note content can only be searched with the operators ${Array.from(ALLOWED_OPERATORS).join(", ")}, but operator ${this.operator} was given.`);
return inputNoteSet;
}
const resultNoteSet = new NoteSet();
// Skip FTS5 for empty token searches - traditional search is more efficient.
// An empty token list means no filtering (all notes match), which FTS5 does not optimize.
if (this.tokens.length === 0) {
// Fall through to traditional search below
}
// Try to use FTS5 if available for better performance
else if (ftsSearchService.checkFTS5Availability() && this.canUseFTS5()) {
try {
// Check if we need to search protected notes
const searchProtected = protectedSessionService.isProtectedSessionAvailable();
const noteIdSet = inputNoteSet.getNoteIds();
// Determine which FTS5 method to use based on operator
let ftsResults;
if (this.operator === "*=*" || this.operator === "*=" || this.operator === "=*") {
// Substring operators use LIKE queries (optimized by trigram index)
// Do NOT pass a limit - we want all results to match traditional search behavior
ftsResults = ftsSearchService.searchWithLike(
this.tokens,
this.operator,
noteIdSet.size > 0 ? noteIdSet : undefined,
{
includeSnippets: false,
searchProtected: false
// No limit specified - return all results
},
searchContext // Pass context to track internal timing
);
} else {
// Other operators use MATCH syntax
ftsResults = ftsSearchService.searchSync(
this.tokens,
this.operator,
noteIdSet.size > 0 ? noteIdSet : undefined,
{
includeSnippets: false,
searchProtected: false // FTS5 doesn't index protected notes
},
searchContext // Pass context to track internal timing
);
}
// Add FTS results to note set
for (const result of ftsResults) {
if (becca.notes[result.noteId]) {
resultNoteSet.add(becca.notes[result.noteId]);
}
}
// If we need to search protected notes, use the separate method
if (searchProtected) {
const protectedResults = ftsSearchService.searchProtectedNotesSync(
this.tokens,
this.operator,
noteIdSet.size > 0 ? noteIdSet : undefined,
{
includeSnippets: false
}
);
// Add protected note results
for (const result of protectedResults) {
if (becca.notes[result.noteId]) {
resultNoteSet.add(becca.notes[result.noteId]);
}
}
}
// Handle special cases that FTS5 doesn't support well
if (this.operator === "%=" || this.flatText) {
// Fall back to original implementation for regex and flat text searches
return this.executeWithFallback(inputNoteSet, resultNoteSet, searchContext);
}
return resultNoteSet;
} catch (error) {
// Handle structured errors from FTS service
if (error instanceof FTSError) {
if (error instanceof FTSNotAvailableError) {
log.info("FTS5 not available, using standard search");
} else if (error instanceof FTSQueryError) {
log.error(`FTS5 query error: ${error.message}`);
searchContext.addError(`Search optimization failed: ${error.message}`);
} else {
log.error(`FTS5 error: ${error}`);
}
// Use fallback for recoverable errors
if (error.recoverable) {
log.info("Using fallback search implementation");
} else {
// For non-recoverable errors, return empty result
searchContext.addError(`Search failed: ${error.message}`);
return resultNoteSet;
}
} else {
log.error(`Unexpected error in FTS5 search: ${error}`);
}
// Fall back to original implementation
}
}
// Original implementation for fallback or when FTS5 is not available
for (const row of sql.iterateRows<SearchRow>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
this.findInText(row, inputNoteSet, resultNoteSet);
}
return resultNoteSet;
}
/**
* Determines if the current search can use FTS5
*/
private canUseFTS5(): boolean {
// FTS5 doesn't support regex searches well
if (this.operator === "%=") {
return false;
}
// For now, we'll use FTS5 for most text searches
// but keep the original implementation for complex cases
return true;
}
/**
* Executes search with fallback for special cases
*/
private executeWithFallback(inputNoteSet: NoteSet, resultNoteSet: NoteSet, searchContext: SearchContext): NoteSet {
// Keep existing results from FTS5 and add additional results from fallback
for (const row of sql.iterateRows<SearchRow>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
if (this.operator === "%=" || this.flatText) {
// Only process for special cases
this.findInText(row, inputNoteSet, resultNoteSet);
}
}
return resultNoteSet;
}
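/**
 * Fallback matcher for a single note row: decrypts protected content when a
 * protected session is available, preprocesses it according to note type and
 * mime, and applies the configured operator (including fuzzy and
 * phrase-proximity matching) to decide whether the note joins the result set.
 */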
findInText({ noteId, isProtected, content, type, mime }: SearchRow, inputNoteSet: NoteSet, resultNoteSet: NoteSet) {
if (!inputNoteSet.hasNoteId(noteId) || !(noteId in becca.notes)) {
return;
}
if (isProtected) {
if (!protectedSessionService.isProtectedSessionAvailable() || !content || typeof content !== "string") {
return;
}
try {
content = protectedSessionService.decryptString(content) || undefined;
} catch (e) {
log.info(`Cannot decrypt content of note ${noteId}`);
return;
}
}
if (!content) {
return;
}
content = this.preprocessContent(content, type, mime);
// Apply content size validation and preprocessing
const processedContent = validateAndPreprocessContent(content, noteId);
if (!processedContent) {
return; // Content too large or invalid
}
content = processedContent;
if (this.tokens.length === 1) {
const [token] = this.tokens;
if (
(this.operator === "=" && token === content) ||
(this.operator === "!=" && token !== content) ||
(this.operator === "*=" && content.endsWith(token)) ||
(this.operator === "=*" && content.startsWith(token)) ||
(this.operator === "*=*" && content.includes(token)) ||
(this.operator === "%=" && getRegex(token).test(content)) ||
(this.operator === "~=" && this.matchesWithFuzzy(content, noteId)) ||
(this.operator === "~*" && this.fuzzyMatchToken(normalizeSearchText(token), normalizeSearchText(content)))
) {
resultNoteSet.add(becca.notes[noteId]);
}
} else {
// Multi-token matching with fuzzy support and phrase proximity
if (this.operator === "~=" || this.operator === "~*") {
if (this.matchesWithFuzzy(content, noteId)) {
resultNoteSet.add(becca.notes[noteId]);
}
} else {
const nonMatchingToken = this.tokens.find(
(token) =>
!this.tokenMatchesContent(token, content, noteId)
);
if (!nonMatchingToken) {
resultNoteSet.add(becca.notes[noteId]);
}
}
}
return content;
}
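/**
 * Normalizes raw note content for matching: strips HTML tags from text notes
 * (unless raw mode is requested), extracts topics from mind map JSON and text
 * elements from canvas JSON, and trims the result.
 */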
preprocessContent(content: string | Buffer, type: string, mime: string) {
content = normalize(content.toString());
if (type === "text" && mime === "text/html") {
if (!this.raw) {
// Content size already filtered at DB level, safe to process
content = this.stripTags(content);
}
content = content.replace(/&nbsp;/g, " ");
} else if (type === "mindMap" && mime === "application/json") {
content = processMindmapContent(content);
} else if (type === "canvas" && mime === "application/json") {
interface Element {
type: string;
text?: string; // Optional since not all objects have a `text` property
id: string;
[key: string]: any; // Other properties that may exist
}
try {
let canvasContent = JSON.parse(content);
// Canvas content may not have elements array, use empty array as default
const elements: Element[] = canvasContent.elements || [];
const texts = elements
.filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property
.map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering
content = normalize(texts.join(" "));
} catch (e) {
// Handle JSON parse errors or malformed canvas content
content = "";
}
}
return content.trim();
}
/**
* Checks if a token matches content with optional fuzzy matching
*/
private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
const normalizedToken = normalizeSearchText(token);
const normalizedContent = normalizeSearchText(content);
if (normalizedContent.includes(normalizedToken)) {
return true;
}
// Check flat text for default fulltext search
if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
return false;
}
return true;
}
/**
* Performs fuzzy matching with edit distance and phrase proximity
*/
private matchesWithFuzzy(content: string, noteId: string): boolean {
try {
const normalizedContent = normalizeSearchText(content);
const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
// For phrase matching, check if tokens appear within reasonable proximity
if (this.tokens.length > 1) {
return this.matchesPhrase(normalizedContent, flatText);
}
// Single token fuzzy matching
const token = normalizeSearchText(this.tokens[0]);
return this.fuzzyMatchToken(token, normalizedContent) ||
(this.flatText && this.fuzzyMatchToken(token, flatText));
} catch (error) {
log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
return false;
}
}
/**
* Checks if multiple tokens match as a phrase with proximity consideration
*/
private matchesPhrase(content: string, flatText: string): boolean {
const searchText = this.flatText ? `${content} ${flatText}` : content;
// Apply content size limits for phrase matching
const limitedText = validateAndPreprocessContent(searchText);
if (!limitedText) {
return false;
}
const words = limitedText.toLowerCase().split(/\s+/);
// Only skip phrase matching for truly extreme word counts that could crash the system
if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
return false;
}
// Warn about large word counts but still attempt matching
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
}
// Find positions of each token
const tokenPositions: number[][] = this.tokens.map(token => {
const normalizedToken = normalizeSearchText(token);
const positions: number[] = [];
words.forEach((word, index) => {
if (this.fuzzyMatchSingle(normalizedToken, word)) {
positions.push(index);
}
});
return positions;
});
// Check if we found all tokens
if (tokenPositions.some(positions => positions.length === 0)) {
return false;
}
// Check for phrase proximity using configurable distance
return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
}
/**
* Checks if token positions indicate a phrase match within max distance
*/
private hasProximityMatch(tokenPositions: number[][], maxDistance: number): boolean {
// For 2 tokens, simple proximity check
if (tokenPositions.length === 2) {
const [pos1, pos2] = tokenPositions;
return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
}
// For more tokens, check if we can find a sequence where all tokens are within range
const findSequence = (remaining: number[][], currentPos: number): boolean => {
if (remaining.length === 0) return true;
const [nextPositions, ...rest] = remaining;
return nextPositions.some(pos =>
Math.abs(pos - currentPos) <= maxDistance &&
findSequence(rest, pos)
);
};
const [firstPositions, ...rest] = tokenPositions;
return firstPositions.some(startPos => findSequence(rest, startPos));
}
/**
* Performs fuzzy matching for a single token against content
*/
private fuzzyMatchToken(token: string, content: string): boolean {
if (token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
// For short tokens, require exact match to avoid too many false positives
return content.includes(token);
}
const words = content.split(/\s+/);
// Only limit word processing for truly extreme cases to prevent system instability
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
}
/**
* Fuzzy matches a single token against a single word
*/
private fuzzyMatchSingle(token: string, word: string): boolean {
// Use shared optimized fuzzy matching logic
return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
}
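/**
 * Removes HTML tags while preserving link URLs and avoiding artificial word
 * breaks inside inline formatting tags (see the comments below for details).
 */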
stripTags(content: string) {
// we want to allow the link tag in order to preserve URLs: https://github.com/zadam/trilium/issues/2412
// we want to insert space in place of block tags (because they imply text separation)
// but we don't want to insert text for typical formatting inline tags which can occur within one word
const linkTag = "a";
const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"];
// replace tags which imply text separation with a space
content = striptags(content, [linkTag, ...inlineFormattingTags], " ");
// replace the inline formatting tags (but not links) without a space
content = striptags(content, [linkTag], "");
// at least the closing link tag can be easily stripped
return content.replace(/<\/a>/gi, "");
}
}
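/**
 * Extracts searchable text from mind map JSON content by recursively
 * collecting the topic of every node; returns an empty string for
 * unparseable content.
 */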
export function processMindmapContent(content: string) {
let mindMapcontent;
try {
mindMapcontent = JSON.parse(content);
} catch (e) {
return "";
}
// Define interfaces for the JSON structure
interface MindmapNode {
id: string;
topic: string;
children: MindmapNode[]; // Recursive structure
direction?: number;
expanded?: boolean;
}
interface MindmapData {
nodedata: MindmapNode;
arrows: any[]; // If you know the structure, replace `any` with the correct type
summaries: any[];
direction: number;
theme: {
name: string;
type: string;
palette: string[];
cssvar: Record<string, string>; // Object with string keys and string values
};
}
// Recursive function to collect all topics
function collectTopics(node?: MindmapNode): string[] {
if (!node) {
return [];
}
// Collect the current node's topic
let topics = [node.topic];
// If the node has children, collect topics recursively
if (node.children && node.children.length > 0) {
for (const child of node.children) {
topics = topics.concat(collectTopics(child));
}
}
return topics;
}
// Start extracting from the root node
const topicsArray = collectTopics(mindMapcontent.nodedata);
// Combine topics into a single string
const topicsString = topicsArray.join(", ");
return normalize(topicsString.toString());
}
export default NoteContentFulltextExp;