trilium/apps/server/src/services/search/fts/query_builder.ts

/**
 * FTS5 Query Builder
 *
 * Utilities for converting Trilium search syntax to FTS5 MATCH syntax,
 * sanitizing tokens, and handling text matching operations.
 */

import striptags from "striptags";
import log from "../../log.js";
import { FTSQueryError } from "./errors.js";

/**
 * Converts Trilium search tokens plus an operator into an FTS5 MATCH expression.
 *
 * Tokens are sanitized via sanitizeFTS5Token() and then validated against the
 * trigram tokenizer's three-character minimum. The length check considers both
 * the raw and the sanitized form, so a token that only reaches three characters
 * because of stripped syntax characters (for example `"ab"`) is rejected as well.
 *
 * Every token is emitted as a double-quoted FTS5 string. Quoting keeps tokens
 * that contain punctuation (`foo-bar`, `title:x`) or that spell an FTS5 keyword
 * (`AND`, `OR`, `NOT`, `NEAR`) from being parsed as query syntax.
 *
 * @param tokens - Array of search tokens
 * @param operator - Trilium search operator
 * @returns FTS5 MATCH query string
 * @throws Error when no tokens are supplied or a substring operator is passed
 * @throws FTSQueryError when a token is too short for the trigram tokenizer or
 *         the operator cannot be expressed as a MATCH query
 */
export function convertToFTS5Query(tokens: string[], operator: string): string {
    if (!tokens || tokens.length === 0) {
        throw new Error("No search tokens provided");
    }

    // Substring operators are served by LIKE queries, not MATCH
    if (operator === "*=*" || operator === "*=" || operator === "=*") {
        throw new Error("Substring operators should use searchWithLike(), not MATCH queries");
    }

    // Sanitize first, then validate length: the trigram tokenizer requires at
    // least 3 characters, and sanitization may shorten a token below that.
    const sanitizedTokens: string[] = [];
    const shortTokens: string[] = [];
    for (const token of tokens) {
        const sanitized = sanitizeFTS5Token(token);
        if (token.length < 3 || sanitized.length < 3) {
            shortTokens.push(token);
        }
        sanitizedTokens.push(sanitized);
    }

    if (shortTokens.length > 0) {
        const shortList = shortTokens.join(', ');
        log.info(`Tokens shorter than 3 characters detected (${shortList}) - cannot use trigram FTS5`);
        throw new FTSQueryError(
            `Trigram tokenizer requires tokens of at least 3 characters. Short tokens: ${shortList}`
        );
    }

    // Wrap a value as an FTS5 string literal. Embedded double quotes are doubled
    // (the FTS5 escape form) so the result stays well-formed even if a quote
    // survives sanitization.
    const quote = (value: string): string => `"${value.replace(/"/g, '""')}"`;
    const anyOfTokens = sanitizedTokens.map(quote).join(" OR ");

    // Only handle operators that work with MATCH
    switch (operator) {
        case "=": // Exact phrase match
            return quote(sanitizedTokens.join(" "));

        case "!=": // Does not contain
            // NOTE(review): FTS5 documents NOT as a binary operator; confirm that
            // the caller can execute this unary form as-is.
            return `NOT (${anyOfTokens})`;

        case "~=": // Fuzzy match (use OR)
        case "~*":
            return anyOfTokens;

        case "%=": // Regex - uses traditional SQL iteration fallback
            throw new FTSQueryError("Regex search not supported in FTS5 - use traditional search path");

        default:
            throw new FTSQueryError(`Unsupported MATCH operator: ${operator}`);
    }
}

/**
 * Strips FTS5 syntax characters from a single search token.
 *
 * Double quotes, parentheses and asterisks are removed, runs of whitespace are
 * collapsed to one space, and the result is trimmed. If nothing is left, the
 * event is logged and a fixed placeholder is returned instead of an empty string.
 *
 * @param token - Raw search token
 * @returns The cleaned token, or "__empty_token__" when cleaning leaves nothing
 */
export function sanitizeFTS5Token(token: string): string {
    // Drop quotes, parens and wildcards that could break MATCH syntax
    const withoutSyntaxChars = token.replace(/["\(\)\*]/g, '');
    // Collapse whitespace runs and strip the edges
    const cleaned = withoutSyntaxChars.replace(/\s+/g, ' ').trim();

    if (cleaned.length > 0) {
        return cleaned;
    }

    // Nothing usable remains: hand back a placeholder rather than an empty term
    log.info(`Token became empty after sanitization: "${token}"`);
    return "__empty_token__";
}

/**
 * Escapes LIKE metacharacters in user input so they are matched literally.
 *
 * `%` and `_` are prefixed with a backslash. The backslash itself is escaped
 * too: otherwise an input such as `\%` would become `\\%`, which reads as a
 * literal backslash followed by an active `%` wildcard.
 *
 * Assumes the consuming query declares backslash as the escape character
 * (`LIKE ? ESCAPE '\'`) — confirm against the caller.
 *
 * @param str - User input string
 * @returns String with `\`, `%` and `_` each preceded by a backslash
 */
export function escapeLikeWildcards(str: string): string {
    // "$&" re-inserts the matched character after the escaping backslash
    return str.replace(/[\\%_]/g, '\\$&');
}

/**
 * Checks whether a phrase occurs in text as a run of complete,
 * whitespace-delimited words.
 *
 * Both inputs are lowercased, HTML tags are stripped from the text, and both
 * are split on any whitespace. The phrase matches when its words appear
 * consecutively and in order among the text's words. Words are compared
 * verbatim, so punctuation attached to a word is part of that word.
 *
 * A phrase that contains no words (empty or whitespace-only) never matches.
 *
 * @param phrase - The phrase to search for (case-insensitive)
 * @param text - The text to search in; may contain HTML
 * @returns true if the phrase appears as complete consecutive words, false otherwise
 */
export function containsExactPhrase(phrase: string, text: string | null | undefined): boolean {
    if (!text || !phrase || typeof text !== 'string') {
        return false;
    }

    const isNonEmpty = (word: string): boolean => word.length > 0;

    // Split on any whitespace (not just spaces) so tabs/newlines inside the
    // phrase are treated as word separators, same as in the text.
    const phraseWords = phrase.toLowerCase().split(/\s+/).filter(isNonEmpty);
    if (phraseWords.length === 0) {
        // Whitespace-only phrase: nothing to look for
        return false;
    }

    // Lowercase, strip HTML tags, then tokenize; leading/trailing whitespace
    // would otherwise yield empty-string "words".
    const textWords = striptags(text.toLowerCase()).split(/\s+/).filter(isNonEmpty);

    // Sliding window over the text words; a single-word phrase is just a
    // window of size one.
    const lastStart = textWords.length - phraseWords.length;
    for (let start = 0; start <= lastStart; start++) {
        if (phraseWords.every((word, offset) => textWords[start + offset] === word)) {
            return true;
        }
    }

    return false;
}

/**
 * Generates a plain-text snippet from the beginning of the content.
 *
 * HTML tags are stripped, whitespace runs are collapsed to single spaces and
 * the result is trimmed. The snippet is capped at `maxLength * 10` UTF-16 code
 * units; when the text is longer, it is cut at that point and "..." is
 * appended. The cut never separates the two halves of a surrogate pair.
 *
 * @param content - Source content, may contain HTML
 * @param maxLength - Length budget; the effective character cap is ten times this value
 * @returns The normalized text, truncated with a trailing "..." when it exceeds the cap
 */
export function generateSnippet(content: string, maxLength: number = 30): string {
    // Strip HTML tags, then collapse whitespace and trim
    const normalized = striptags(content).replace(/\s+/g, ' ').trim();

    const limit = maxLength * 10;
    if (normalized.length <= limit) {
        return normalized;
    }

    // If the last kept code unit is a high surrogate, its low surrogate would be
    // cut off; drop the dangling half instead of emitting a broken character.
    let end = limit;
    const lastKept = normalized.charCodeAt(end - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        end -= 1;
    }

    return normalized.substring(0, end) + '...';
}