mirror of
https://github.com/zadam/trilium.git
synced 2025-12-05 06:54:23 +01:00
155 lines
5.2 KiB
TypeScript
155 lines
5.2 KiB
TypeScript
/**
|
|
* FTS5 Query Builder
|
|
*
|
|
* Utilities for converting Trilium search syntax to FTS5 MATCH syntax,
|
|
* sanitizing tokens, and handling text matching operations.
|
|
*/
|
|
|
|
import striptags from "striptags";
|
|
import log from "../../log.js";
|
|
import { FTSQueryError } from "./errors.js";
|
|
|
|
/**
|
|
* Converts Trilium search syntax to FTS5 MATCH syntax
|
|
*
|
|
* @param tokens - Array of search tokens
|
|
* @param operator - Trilium search operator
|
|
* @returns FTS5 MATCH query string
|
|
*/
|
|
export function convertToFTS5Query(tokens: string[], operator: string): string {
|
|
if (!tokens || tokens.length === 0) {
|
|
throw new Error("No search tokens provided");
|
|
}
|
|
|
|
// Substring operators (*=*, *=, =*) use LIKE queries now, not MATCH
|
|
if (operator === "*=*" || operator === "*=" || operator === "=*") {
|
|
throw new Error("Substring operators should use searchWithLike(), not MATCH queries");
|
|
}
|
|
|
|
// Trigram tokenizer requires minimum 3 characters
|
|
const shortTokens = tokens.filter(token => token.length < 3);
|
|
if (shortTokens.length > 0) {
|
|
const shortList = shortTokens.join(', ');
|
|
log.info(`Tokens shorter than 3 characters detected (${shortList}) - cannot use trigram FTS5`);
|
|
throw new FTSQueryError(
|
|
`Trigram tokenizer requires tokens of at least 3 characters. Short tokens: ${shortList}`
|
|
);
|
|
}
|
|
|
|
// Sanitize tokens to prevent FTS5 syntax injection
|
|
const sanitizedTokens = tokens.map(token => sanitizeFTS5Token(token));
|
|
|
|
// Only handle operators that work with MATCH
|
|
switch (operator) {
|
|
case "=": // Exact phrase match
|
|
return `"${sanitizedTokens.join(" ")}"`;
|
|
|
|
case "!=": // Does not contain
|
|
return `NOT (${sanitizedTokens.join(" OR ")})`;
|
|
|
|
case "~=": // Fuzzy match (use OR)
|
|
case "~*":
|
|
return sanitizedTokens.join(" OR ");
|
|
|
|
case "%=": // Regex - uses traditional SQL iteration fallback
|
|
throw new FTSQueryError("Regex search not supported in FTS5 - use traditional search path");
|
|
|
|
default:
|
|
throw new FTSQueryError(`Unsupported MATCH operator: ${operator}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Sanitizes a token for safe use in FTS5 queries
|
|
* Validates that the token is not empty after sanitization
|
|
*/
|
|
export function sanitizeFTS5Token(token: string): string {
|
|
// Remove special FTS5 characters that could break syntax
|
|
const sanitized = token
|
|
.replace(/["\(\)\*]/g, '') // Remove quotes, parens, wildcards
|
|
.replace(/\s+/g, ' ') // Normalize whitespace
|
|
.trim();
|
|
|
|
// Validate that token is not empty after sanitization
|
|
if (!sanitized || sanitized.length === 0) {
|
|
log.info(`Token became empty after sanitization: "${token}"`);
|
|
// Return a safe placeholder that won't match anything
|
|
return "__empty_token__";
|
|
}
|
|
|
|
return sanitized;
|
|
}
|
|
|
|
/**
|
|
* Escapes LIKE wildcards (% and _) in user input to treat them as literals
|
|
* @param str - User input string
|
|
* @returns String with LIKE wildcards escaped
|
|
*/
|
|
export function escapeLikeWildcards(str: string): string {
|
|
return str.replace(/[%_]/g, '\\$&');
|
|
}
|
|
|
|
/**
|
|
* Checks if a phrase appears as exact words in text (respecting word boundaries)
|
|
* @param phrase - The phrase to search for (case-insensitive)
|
|
* @param text - The text to search in
|
|
* @returns true if the phrase appears as complete words, false otherwise
|
|
*/
|
|
export function containsExactPhrase(phrase: string, text: string | null | undefined): boolean {
|
|
if (!text || !phrase || typeof text !== 'string') {
|
|
return false;
|
|
}
|
|
|
|
// Normalize both to lowercase for case-insensitive comparison
|
|
const normalizedPhrase = phrase.toLowerCase().trim();
|
|
const normalizedText = text.toLowerCase();
|
|
|
|
// Strip HTML tags for content matching
|
|
const plainText = striptags(normalizedText);
|
|
|
|
// For single words, use word-boundary matching
|
|
if (!normalizedPhrase.includes(' ')) {
|
|
// Split text into words and check for exact match
|
|
const words = plainText.split(/\s+/);
|
|
return words.some(word => word === normalizedPhrase);
|
|
}
|
|
|
|
// For multi-word phrases, check if the phrase appears as consecutive words
|
|
// Split text into words, then check if the phrase appears in the word sequence
|
|
const textWords = plainText.split(/\s+/);
|
|
const phraseWords = normalizedPhrase.split(/\s+/);
|
|
|
|
// Sliding window to find exact phrase match
|
|
for (let i = 0; i <= textWords.length - phraseWords.length; i++) {
|
|
let match = true;
|
|
for (let j = 0; j < phraseWords.length; j++) {
|
|
if (textWords[i + j] !== phraseWords[j]) {
|
|
match = false;
|
|
break;
|
|
}
|
|
}
|
|
if (match) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Generates a snippet from content
|
|
*/
|
|
export function generateSnippet(content: string, maxLength: number = 30): string {
|
|
// Strip HTML tags for snippet
|
|
const plainText = striptags(content);
|
|
// Simple normalization - just trim and collapse whitespace
|
|
const normalized = plainText.replace(/\s+/g, ' ').trim();
|
|
|
|
if (normalized.length <= maxLength * 10) {
|
|
return normalized;
|
|
}
|
|
|
|
// Extract snippet around first occurrence
|
|
return normalized.substring(0, maxLength * 10) + '...';
|
|
}
|