Paragraph with bold text.
+ + + + `; + const result = db.prepare('SELECT strip_html(?) as text').get(html) as any; + expect(result.text).toContain('Title'); + expect(result.text).toContain('Paragraph with bold text'); + expect(result.text).not.toContain('console.log'); + }); + + it('should handle null input', () => { + const result = db.prepare('SELECT strip_html(?) as text').get(null) as any; + expect(result.text).toBe(''); + }); + }); + + describe('fuzzy_match function', () => { + beforeEach(() => { + service.registerFunctions(db); + }); + + it('should perform exact matches', () => { + const tests = [ + ['hello', 'hello world', 1], + ['world', 'hello world', 1], + ['foo', 'hello world', 0], + ]; + + for (const [needle, haystack, expected] of tests) { + const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(needle, haystack) as any; + expect(result.match).toBe(expected); + } + }); + + it('should perform fuzzy matches within edit distance', () => { + const tests = [ + ['helo', 'hello world', 1], // 1 edit distance + ['wrld', 'hello world', 1], // 1 edit distance + ['hallo', 'hello world', 1], // 1 edit distance + ['xyz', 'hello world', 0], // Too different + ]; + + for (const [needle, haystack, expected] of tests) { + const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(needle, haystack) as any; + expect(result.match).toBe(expected); + } + }); + + it('should handle case insensitive matching', () => { + const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get('HELLO', 'hello world') as any; + expect(result.match).toBe(1); + }); + + it('should handle null inputs', () => { + const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(null, 'test') as any; + expect(result.match).toBe(0); + }); + }); + + describe('Integration with SQL queries', () => { + beforeEach(() => { + service.registerFunctions(db); + + // Create a test table + db.exec(` + CREATE TABLE test_notes ( + id INTEGER PRIMARY KEY, + title TEXT, + content TEXT + ) + `); + + // 
Insert test data + const insert = db.prepare('INSERT INTO test_notes (title, content) VALUES (?, ?)'); + insert.run('Café Meeting', 'Discussion about naïve implementation
'); + insert.run('über wichtig', 'Very important note with HTML & entities'); + insert.run('getUserData', 'Function to get_user_data from database'); + }); + + it('should work in WHERE clauses with normalize_text', () => { + const results = db.prepare(` + SELECT title FROM test_notes + WHERE normalize_text(title) LIKE '%cafe%' + `).all(); + + expect(results).toHaveLength(1); + expect((results[0] as any).title).toBe('Café Meeting'); + }); + + it('should work with fuzzy matching in queries', () => { + const results = db.prepare(` + SELECT title FROM test_notes + WHERE fuzzy_match('getuserdata', normalize_text(title), 2) = 1 + `).all(); + + expect(results).toHaveLength(1); + expect((results[0] as any).title).toBe('getUserData'); + }); + + it('should work with HTML stripping', () => { + const results = db.prepare(` + SELECT strip_html(content) as clean_content + FROM test_notes + WHERE title = 'Café Meeting' + `).all(); + + expect((results[0] as any).clean_content).toBe('Discussion about naïve implementation'); + }); + + it('should work with tokenization', () => { + const result = db.prepare(` + SELECT tokenize_text(title) as tokens + FROM test_notes + WHERE title = 'getUserData' + `).get() as any; + + const tokens = JSON.parse(result.tokens); + expect(tokens).toContain('get'); + expect(tokens).toContain('user'); + expect(tokens).toContain('data'); + }); + }); +}); \ No newline at end of file diff --git a/apps/server/src/services/search/sqlite_functions.ts b/apps/server/src/services/search/sqlite_functions.ts new file mode 100644 index 000000000..904a04507 --- /dev/null +++ b/apps/server/src/services/search/sqlite_functions.ts @@ -0,0 +1,514 @@ +/** + * SQLite Custom Functions Service + * + * This service manages custom SQLite functions that enhance search capabilities. + * Functions are registered with better-sqlite3 to provide native-speed operations + * directly within SQL queries, enabling efficient search indexing and querying. 
*
 * These functions are used by:
 * - Database triggers for automatic search index maintenance
 * - Direct SQL queries for search operations
 * - Migration scripts for initial data population
 */

import type { Database } from "better-sqlite3";
import log from "../log.js";
import { normalize as utilsNormalize, stripTags } from "../utils.js";

/**
 * Configuration for fuzzy search operations
 */
const FUZZY_CONFIG = {
    MAX_EDIT_DISTANCE: 2,
    MIN_TOKEN_LENGTH: 3,
    MAX_STRING_LENGTH: 1000, // Performance guard for edit distance
} as const;

/**
 * Interface for registering a custom SQL function
 */
interface SQLiteFunction {
    /** Name the function is exposed under in SQL. */
    name: string;
    /** JavaScript callback invoked with the SQL arguments. */
    implementation: (...args: any[]) => any;
    /** Options passed straight through to `db.function(name, options, fn)`. */
    options?: {
        deterministic?: boolean;
        varargs?: boolean;
        directOnly?: boolean;
    };
}

/**
 * Manages registration and lifecycle of custom SQLite functions
 */
export class SqliteFunctionsService {
    private static instance: SqliteFunctionsService | null = null;

    /** True once at least one function has been registered and `unregister()` has not been called since. */
    private registered = false;

    /**
     * Connections that already received the custom functions.
     * Registration happens per connection, so the service-wide `registered`
     * flag alone cannot tell whether a *given* connection is ready.
     */
    private readonly registeredConnections = new WeakSet<Database>();

    private functions: SQLiteFunction[] = [];

    private constructor() {
        // Initialize the function definitions
        this.initializeFunctions();
    }

    /**
     * Get singleton instance of the service
     */
    static getInstance(): SqliteFunctionsService {
        if (!SqliteFunctionsService.instance) {
            SqliteFunctionsService.instance = new SqliteFunctionsService();
        }
        return SqliteFunctionsService.instance;
    }

    /**
     * Initialize all custom function definitions
     */
    private initializeFunctions(): void {
        // Bind all methods to preserve 'this' context
        this.functions = [
            {
                name: "normalize_text",
                implementation: this.normalizeText.bind(this),
                options: {
                    deterministic: true,
                    varargs: false
                }
            },
            {
                name: "edit_distance",
                implementation: this.editDistance.bind(this),
                options: {
                    deterministic: true,
                    varargs: true // accepts an optional third argument (maxDistance)
                }
            },
            {
                name: "regex_match",
                implementation: this.regexMatch.bind(this),
                options: {
                    deterministic: true,
                    varargs: true // accepts an optional third argument (flags)
                }
            },
            {
                name: "tokenize_text",
                implementation: this.tokenizeText.bind(this),
                options: {
                    deterministic: true,
                    varargs: false
                }
            },
            {
                name: "strip_html",
                implementation: this.stripHtml.bind(this),
                options: {
                    deterministic: true,
                    varargs: false
                }
            },
            {
                name: "fuzzy_match",
                implementation: this.fuzzyMatch.bind(this),
                options: {
                    deterministic: true,
                    varargs: true // accepts an optional third argument
                }
            }
        ];
    }

    /**
     * Register all custom functions with the database connection
     *
     * Registration is skipped only when this exact connection has already been
     * handled and `unregister()` has not been called since. A different
     * connection always gets its own registration, because `db.function()`
     * affects only the connection it is called on.
     *
     * A single function failing to register is logged and does not abort the
     * remaining registrations.
     *
     * @param db The better-sqlite3 database connection
     * @returns true if at least one function was registered (or the connection
     *          was already registered), false otherwise
     */
    registerFunctions(db: Database): boolean {
        if (this.registered && this.registeredConnections.has(db)) {
            log.info("SQLite custom functions already registered");
            return true;
        }

        try {
            // Test if the database connection is valid first
            // This will throw if the database is closed
            db.pragma("user_version");

            log.info("Registering SQLite custom functions...");

            let successCount = 0;
            for (const func of this.functions) {
                try {
                    db.function(func.name, func.options || {}, func.implementation);
                    log.info(`Registered SQLite function: ${func.name}`);
                    successCount++;
                } catch (error) {
                    log.error(`Failed to register SQLite function ${func.name}: ${error}`);
                    // Continue registering other functions even if one fails
                }
            }

            // Only mark as registered if at least some functions were registered
            if (successCount > 0) {
                this.registered = true;
                this.registeredConnections.add(db);
                log.info(`SQLite custom functions registration completed (${successCount}/${this.functions.length})`);
                return true;
            } else {
                log.error("No SQLite functions could be registered");
                return false;
            }

        } catch (error) {
            log.error(`Failed to register SQLite custom functions: ${error}`);
            return false;
        }
    }

    /**
     * Unregister all custom functions (for
cleanup/testing)
     * Note: better-sqlite3 doesn't provide a way to unregister functions,
     * so this just resets the internal state
     */
    unregister(): void {
        this.registered = false;
    }

    /**
     * Check if functions are currently registered
     */
    isRegistered(): boolean {
        return this.registered;
    }

    // ===== Function Implementations =====

    /**
     * Normalize text by delegating to the shared `normalize()` helper from utils,
     * so SQL-side normalization matches the rest of the codebase.
     *
     * @param text Text to normalize
     * @returns Normalized text, or '' for null/undefined/empty/non-string input
     */
    private normalizeText(text: string | null | undefined): string {
        if (!text || typeof text !== 'string') {
            return '';
        }

        // Use the exact same normalization as the rest of the codebase
        return utilsNormalize(text);
    }

    /**
     * Calculate the Levenshtein edit distance between two strings, capped at
     * `maxDistance`. Uses two rolling rows of the DP table and stops early as
     * soon as the cap is provably exceeded.
     *
     * SQLite will pass 2 or 3 arguments:
     * - 2 args: str1, str2 (uses default maxDistance)
     * - 3 args: str1, str2, maxDistance
     *
     * Non-string inputs are treated as ''. A non-numeric or non-finite
     * maxDistance falls back to the default; otherwise it is floored and
     * clamped to be >= 0. Characters are compared as UTF-16 code units.
     *
     * Strings longer than FUZZY_CONFIG.MAX_STRING_LENGTH are not run through
     * the DP: identical strings yield 0, anything else is reported as
     * exceeding the cap rather than guessing a distance from the lengths.
     *
     * @returns Edit distance, or maxDistance + 1 if it exceeds maxDistance
     */
    private editDistance(...args: unknown[]): number {
        const str1 = typeof args[0] === 'string' ? args[0] : '';
        const str2 = typeof args[1] === 'string' ? args[1] : '';

        // Validate and sanitize maxDistance (non-negative integer)
        const rawMax: unknown = args.length > 2 ? args[2] : FUZZY_CONFIG.MAX_EDIT_DISTANCE;
        const maxDistance = typeof rawMax === 'number' && Number.isFinite(rawMax)
            ? Math.max(0, Math.floor(rawMax))
            : FUZZY_CONFIG.MAX_EDIT_DISTANCE;
        const exceeded = maxDistance + 1;

        // Identical strings need no work, whatever their length
        if (str1 === str2) {
            return 0;
        }

        const len1 = str1.length;
        const len2 = str2.length;

        // The length difference is a lower bound on the edit distance
        if (Math.abs(len1 - len2) > maxDistance) {
            return exceeded;
        }

        // Performance guard: refuse to run the DP on very long strings. The
        // strings are known to differ here, so the true distance is unknown;
        // report "exceeded" instead of passing off the length difference as
        // the distance.
        if (len1 > FUZZY_CONFIG.MAX_STRING_LENGTH || len2 > FUZZY_CONFIG.MAX_STRING_LENGTH) {
            return exceeded;
        }

        // One side empty: distance is the other side's length (already known
        // to be <= maxDistance thanks to the length-difference check above)
        if (len1 === 0) return len2;
        if (len2 === 0) return len1;

        // Two rolling rows of the DP table
        let previousRow: number[] = Array.from({ length: len2 + 1 }, (_, i) => i);
        let currentRow: number[] = new Array<number>(len2 + 1).fill(0);

        for (let i = 1; i <= len1; i++) {
            currentRow[0] = i;
            let minInRow = i;

            for (let j = 1; j <= len2; j++) {
                const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
                currentRow[j] = Math.min(
                    previousRow[j] + 1,        // deletion
                    currentRow[j - 1] + 1,     // insertion
                    previousRow[j - 1] + cost  // substitution
                );

                if (currentRow[j] < minInRow) {
                    minInRow = currentRow[j];
                }
            }

            // Early termination: every cell in this row is already above the
            // cap, and later rows can only grow
            if (minInRow > maxDistance) {
                return exceeded;
            }

            // Swap rows for next iteration
            [previousRow, currentRow] = [currentRow, previousRow];
        }

        const result = previousRow[len2];
        return result <= maxDistance ? result : exceeded;
    }

    /**
     * Test if a string matches a JavaScript regular expression
     *
     * SQLite will pass 2 or 3 arguments:
     * - 2 args: text, pattern (uses default flags 'i')
     * - 3 args: text, pattern, flags
     *
     * Flags must be a string made only of the letters g, i, m, s, u, y with no
     * letter repeated; anything else (including NULL or a number) falls back
     * to 'i'. An explicit empty string means "no flags".
     *
     * Note: falsy text or pattern (NULL, '', 0) short-circuits to 0 before the
     * pattern is evaluated.
     *
     * @returns 1 if match, 0 if no match, null if text/pattern are not strings
     *          or the pattern cannot be compiled
     */
    private regexMatch(...args: unknown[]): number | null {
        const text = args[0];
        const pattern = args[1];

        if (!text || !pattern) {
            return 0;
        }

        if (typeof text !== 'string' || typeof pattern !== 'string') {
            return null;
        }

        // Validate flags up front so a bad flag value is never misreported as
        // a bad pattern by the catch block below
        const requestedFlags: unknown = args.length > 2 ? args[2] : 'i';
        let flags = 'i'; // Fall back to case-insensitive
        if (
            typeof requestedFlags === 'string' &&
            /^[gimsuy]*$/.test(requestedFlags) &&
            new Set(requestedFlags).size === requestedFlags.length
        ) {
            flags = requestedFlags;
        }

        try {
            const regex = new RegExp(pattern, flags);
            return regex.test(text) ? 1 : 0;
        } catch (error) {
            // Invalid regex pattern
            log.error(`Invalid regex pattern in SQL: ${pattern} - ${error}`);
            return null;
        }
    }

    /**
     * Tokenize text into searchable words
     * Handles punctuation, camelCase, and snake_case
     *
     * @param text Text to tokenize
     * @returns JSON array string of tokens
     */
    private tokenizeText(text: string | null | undefined): string {
        if (!text || typeof text !== 'string') {
            return '[]';
        }

        try {
            // Use a Set to avoid duplicates from the start
            const expandedTokens: Set