trilium/apps/server/src/services/search/fts/query_builder.ts

155 lines
5.2 KiB
TypeScript

/**
* FTS5 Query Builder
*
* Utilities for converting Trilium search syntax to FTS5 MATCH syntax,
* sanitizing tokens, and handling text matching operations.
*/
import striptags from "striptags";
import log from "../../log.js";
import { FTSQueryError } from "./errors.js";
/**
* Converts Trilium search syntax to FTS5 MATCH syntax
*
* @param tokens - Array of search tokens
* @param operator - Trilium search operator
* @returns FTS5 MATCH query string
*/
export function convertToFTS5Query(tokens: string[], operator: string): string {
if (!tokens || tokens.length === 0) {
throw new Error("No search tokens provided");
}
// Substring operators (*=*, *=, =*) use LIKE queries now, not MATCH
if (operator === "*=*" || operator === "*=" || operator === "=*") {
throw new Error("Substring operators should use searchWithLike(), not MATCH queries");
}
// Trigram tokenizer requires minimum 3 characters
const shortTokens = tokens.filter(token => token.length < 3);
if (shortTokens.length > 0) {
const shortList = shortTokens.join(', ');
log.info(`Tokens shorter than 3 characters detected (${shortList}) - cannot use trigram FTS5`);
throw new FTSQueryError(
`Trigram tokenizer requires tokens of at least 3 characters. Short tokens: ${shortList}`
);
}
// Sanitize tokens to prevent FTS5 syntax injection
const sanitizedTokens = tokens.map(token => sanitizeFTS5Token(token));
// Only handle operators that work with MATCH
switch (operator) {
case "=": // Exact phrase match
return `"${sanitizedTokens.join(" ")}"`;
case "!=": // Does not contain
return `NOT (${sanitizedTokens.join(" OR ")})`;
case "~=": // Fuzzy match (use OR)
case "~*":
return sanitizedTokens.join(" OR ");
case "%=": // Regex - uses traditional SQL iteration fallback
throw new FTSQueryError("Regex search not supported in FTS5 - use traditional search path");
default:
throw new FTSQueryError(`Unsupported MATCH operator: ${operator}`);
}
}
/**
* Sanitizes a token for safe use in FTS5 queries
* Validates that the token is not empty after sanitization
*/
export function sanitizeFTS5Token(token: string): string {
// Remove special FTS5 characters that could break syntax
const sanitized = token
.replace(/["\(\)\*]/g, '') // Remove quotes, parens, wildcards
.replace(/\s+/g, ' ') // Normalize whitespace
.trim();
// Validate that token is not empty after sanitization
if (!sanitized || sanitized.length === 0) {
log.info(`Token became empty after sanitization: "${token}"`);
// Return a safe placeholder that won't match anything
return "__empty_token__";
}
return sanitized;
}
/**
* Escapes LIKE wildcards (% and _) in user input to treat them as literals
* @param str - User input string
* @returns String with LIKE wildcards escaped
*/
export function escapeLikeWildcards(str: string): string {
return str.replace(/[%_]/g, '\\$&');
}
/**
* Checks if a phrase appears as exact words in text (respecting word boundaries)
* @param phrase - The phrase to search for (case-insensitive)
* @param text - The text to search in
* @returns true if the phrase appears as complete words, false otherwise
*/
export function containsExactPhrase(phrase: string, text: string | null | undefined): boolean {
if (!text || !phrase || typeof text !== 'string') {
return false;
}
// Normalize both to lowercase for case-insensitive comparison
const normalizedPhrase = phrase.toLowerCase().trim();
const normalizedText = text.toLowerCase();
// Strip HTML tags for content matching
const plainText = striptags(normalizedText);
// For single words, use word-boundary matching
if (!normalizedPhrase.includes(' ')) {
// Split text into words and check for exact match
const words = plainText.split(/\s+/);
return words.some(word => word === normalizedPhrase);
}
// For multi-word phrases, check if the phrase appears as consecutive words
// Split text into words, then check if the phrase appears in the word sequence
const textWords = plainText.split(/\s+/);
const phraseWords = normalizedPhrase.split(/\s+/);
// Sliding window to find exact phrase match
for (let i = 0; i <= textWords.length - phraseWords.length; i++) {
let match = true;
for (let j = 0; j < phraseWords.length; j++) {
if (textWords[i + j] !== phraseWords[j]) {
match = false;
break;
}
}
if (match) {
return true;
}
}
return false;
}
/**
* Generates a snippet from content
*/
export function generateSnippet(content: string, maxLength: number = 30): string {
// Strip HTML tags for snippet
const plainText = striptags(content);
// Simple normalization - just trim and collapse whitespace
const normalized = plainText.replace(/\s+/g, ' ').trim();
if (normalized.length <= maxLength * 10) {
return normalized;
}
// Extract snippet around first occurrence
return normalized.substring(0, maxLength * 10) + '...';
}