From 3957d789da7f95e03e1c99bdfae06988b8a82438 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Tue, 18 Nov 2025 13:08:11 -0800 Subject: [PATCH] feat(fts5): also create an fts5 index for attributes, and allow them to be searchable using fts5 indexes --- apps/server/src/assets/db/schema.sql | 74 ++++++- .../src/migrations/0234__add_fts5_search.ts | 113 +++++++++- .../src/services/search/fts_search.test.ts | 181 ++++++++++++++++ apps/server/src/services/search/fts_search.ts | 196 +++++++++++++++++- 4 files changed, 545 insertions(+), 19 deletions(-) diff --git a/apps/server/src/assets/db/schema.sql b/apps/server/src/assets/db/schema.sql index 11c0afb5e..d46e6ba72 100644 --- a/apps/server/src/assets/db/schema.sql +++ b/apps/server/src/assets/db/schema.sql @@ -227,14 +227,16 @@ CREATE TABLE IF NOT EXISTS sessions ( -- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes -- -- IMPORTANT: Trigram requires minimum 3-character tokens for matching --- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance --- (loses position info for highlight() function, but snippet() still works) +-- detail='full' enables phrase queries (required for exact match with = operator) +-- and provides position info for highlight() function +-- Note: Using detail='full' instead of detail='none' increases index size by ~50% +-- but is necessary to support phrase queries like "exact phrase" CREATE VIRTUAL TABLE notes_fts USING fts5( noteId UNINDEXED, title, content, tokenize = 'trigram', - detail = 'none' + detail = 'full' ); -- Triggers to keep FTS table synchronized with notes @@ -354,14 +356,14 @@ END; -- Trigger for INSERT operations on blobs -- Handles: INSERT, INSERT OR REPLACE, and the INSERT part of upsert -- Updates all notes that reference this blob (common during import and deduplication) -CREATE TRIGGER notes_fts_blob_insert +CREATE TRIGGER notes_fts_blob_insert AFTER INSERT ON blobs BEGIN -- Use INSERT OR REPLACE to handle both new and existing FTS entries -- This is crucial for blob deduplication where multiple notes may already -- exist that reference this blob before the blob itself is created INSERT OR REPLACE INTO notes_fts (noteId, title, content) - SELECT + SELECT n.noteId, n.title, NEW.content @@ -371,3 +373,65 @@ BEGIN AND n.isDeleted = 0 AND n.isProtected = 0; END; + +-- ===================================================== +-- FTS5 Full-Text Search Index for Attributes +-- ===================================================== +-- This FTS5 table enables fast full-text searching of attribute names and values +-- Benefits: +-- - Fast free-text searches like ="somevalue" (10-50ms vs 1-2 seconds) +-- - Scales well with large attribute counts (650K+ attributes) +-- - Consistent performance with notes_fts +-- +-- Uses trigram tokenizer with detail='full' for: +-- 1. Substring matching (3+ characters) +-- 2. Phrase query support (exact matches with word boundaries) +-- 3. Multi-language support without stemming assumptions + +CREATE VIRTUAL TABLE attributes_fts USING fts5( + attributeId UNINDEXED, + noteId UNINDEXED, + name, + value, + tokenize = 'trigram', + detail = 'full' +); + +-- Triggers to keep attributes_fts synchronized with attributes table + +-- Trigger for INSERT operations +CREATE TRIGGER attributes_fts_insert +AFTER INSERT ON attributes +WHEN NEW.isDeleted = 0 +BEGIN + INSERT INTO attributes_fts (attributeId, noteId, name, value) + VALUES (NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '')); +END; + +-- Trigger for UPDATE operations +CREATE TRIGGER attributes_fts_update +AFTER UPDATE ON attributes +BEGIN + -- Remove old entry + DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId; + + -- Add new entry if not deleted + INSERT INTO attributes_fts (attributeId, noteId, name, value) + SELECT NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '') + WHERE NEW.isDeleted = 0; +END; + +-- Trigger for DELETE operations +CREATE TRIGGER attributes_fts_delete +AFTER DELETE ON attributes +BEGIN + DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId; +END; + +-- Trigger for soft delete (isDeleted = 1) +CREATE TRIGGER attributes_fts_soft_delete +AFTER UPDATE ON attributes +WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1 +BEGIN + DELETE FROM attributes_fts WHERE attributeId = NEW.attributeId; +END; diff --git a/apps/server/src/migrations/0234__add_fts5_search.ts b/apps/server/src/migrations/0234__add_fts5_search.ts index 9818f578d..42034a191 100644 --- a/apps/server/src/migrations/0234__add_fts5_search.ts +++ b/apps/server/src/migrations/0234__add_fts5_search.ts @@ -1,12 +1,14 @@ /** * Migration to add FTS5 full-text search support and strategic performance indexes - * + * * This migration: - * 1. Creates an FTS5 virtual table for full-text searching + * 1. Creates an FTS5 virtual table for full-text searching of notes * 2. Populates it with existing note content * 3. Creates triggers to keep the FTS table synchronized with note changes - * 4. Adds strategic composite and covering indexes for improved query performance - * 5. Optimizes common query patterns identified through performance analysis + * 4. Creates an FTS5 virtual table for full-text searching of attributes + * 5. Populates it with existing attributes and creates synchronization triggers + * 6. Adds strategic composite and covering indexes for improved query performance + * 7. Optimizes common query patterns identified through performance analysis */ import sql from "../services/sql.js"; @@ -46,14 +48,16 @@ export default function addFTS5SearchAndPerformanceIndexes() { -- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes -- -- IMPORTANT: Trigram requires minimum 3-character tokens for matching - -- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance - -- (loses position info for highlight() function, but snippet() still works) + -- detail='full' enables phrase queries (required for exact match with = operator) + -- and provides position info for highlight() function + -- Note: Using detail='full' instead of detail='none' increases index size by ~50% + -- but is necessary to support phrase queries like "exact phrase" CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts USING fts5( noteId UNINDEXED, title, content, tokenize = 'trigram', - detail = 'none' + detail = 'full' ); `); @@ -549,5 +553,100 @@ export default function addFTS5SearchAndPerformanceIndexes() { throw error; } + // ======================================== + // Part 3: Attributes FTS5 Setup + // ======================================== + + log.info("Creating FTS5 index for attributes..."); + + sql.transactional(() => { + // Create FTS5 virtual table for attributes + // IMPORTANT: Trigram requires minimum 3-character tokens for matching + // detail='full' enables phrase queries (required for exact match with = operator) + // and provides position info for highlight() function + sql.execute(` + CREATE VIRTUAL TABLE IF NOT EXISTS attributes_fts USING fts5( + attributeId UNINDEXED, + noteId UNINDEXED, + name, + value, + tokenize = 'trigram', + detail = 'full' + ) + `); + + log.info("Populating attributes_fts table..."); + + // Populate FTS table with existing attributes (non-deleted only) + const attrStartTime = Date.now(); + sql.execute(` + INSERT INTO attributes_fts (attributeId, noteId, name, value) + SELECT + attributeId, + noteId, + name, + COALESCE(value, '') + FROM attributes + WHERE isDeleted = 0 + `); + + const populateTime = Date.now() - attrStartTime; + const attrCount = sql.getValue(`SELECT COUNT(*) FROM attributes_fts`) || 0; + log.info(`Populated ${attrCount} attributes in ${populateTime}ms`); + + // Create triggers to keep FTS index synchronized with attributes table + + // Trigger 1: INSERT - Add new attributes to FTS + sql.execute(` + CREATE TRIGGER attributes_fts_insert + AFTER INSERT ON attributes + WHEN NEW.isDeleted = 0 + BEGIN + INSERT INTO attributes_fts (attributeId, noteId, name, value) + VALUES (NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '')); + END + `); + + // Trigger 2: UPDATE - Update FTS when attributes change + sql.execute(` + CREATE TRIGGER attributes_fts_update + AFTER UPDATE ON attributes + BEGIN + -- Remove old entry + DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId; + + -- Add new entry if not deleted + INSERT INTO attributes_fts (attributeId, noteId, name, value) + SELECT NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '') + WHERE NEW.isDeleted = 0; + END + `); + + // Trigger 3: DELETE - Remove from FTS + sql.execute(` + CREATE TRIGGER attributes_fts_delete + AFTER DELETE ON attributes + BEGIN + DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId; + END + `); + + // Trigger 4: Soft delete (isDeleted = 1) - Remove from FTS + sql.execute(` + CREATE TRIGGER attributes_fts_soft_delete + AFTER UPDATE ON attributes + WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1 + BEGIN + DELETE FROM attributes_fts WHERE attributeId = NEW.attributeId; + END + `); + + // Run ANALYZE to update query planner statistics + log.info("Running ANALYZE on attributes_fts..."); + sql.execute(`ANALYZE attributes_fts`); + + log.info("Attributes FTS5 setup completed successfully"); + }); + log.info("FTS5 and performance optimization migration completed successfully"); } \ No newline at end of file diff --git a/apps/server/src/services/search/fts_search.test.ts b/apps/server/src/services/search/fts_search.test.ts index 897053aff..95c294340 100644 --- a/apps/server/src/services/search/fts_search.test.ts +++ b/apps/server/src/services/search/fts_search.test.ts @@ -1319,4 +1319,185 @@ describe('searchWithLike - Substring Search with LIKE Queries', () => { expect(results[1].score).toBe(1.0); }); }); +}); + +describe('Exact Match with Word Boundaries (= operator)', () => { + let ftsSearchService: any; + let mockSql: any; + let mockLog: any; + let mockProtectedSession: any; + + beforeEach(async () => { + // Reset mocks + vi.resetModules(); + + // Setup mocks + mockSql = { + getValue: vi.fn(), + getRows: vi.fn(), + getColumn: vi.fn(), + execute: vi.fn(), + transactional: vi.fn((fn: Function) => fn()), + iterateRows: vi.fn() + }; + + mockLog = { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + request: vi.fn() + }; + + mockProtectedSession = { + isProtectedSessionAvailable: vi.fn().mockReturnValue(false), + decryptString: vi.fn() + }; + + // Mock the modules + vi.doMock('../sql.js', () => ({ default: mockSql })); + vi.doMock('../log.js', () => ({ default: mockLog })); + vi.doMock('../protected_session.js', () => ({ default: mockProtectedSession })); + + // Import the service after mocking + const module = await import('./fts_search.js'); + ftsSearchService = module.ftsSearchService; + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + describe('Word boundary matching with trigram tokenizer', () => { + it('should NOT match "test123" when searching for "test1234" (exact match only)', () => { + // This test SHOULD FAIL initially because trigram FTS5 phrase queries + // don't respect word boundaries - "test123" matches "test1234" via shared trigrams + mockSql.getValue.mockReturnValue(1); // FTS5 available + mockSql.getColumn.mockReturnValue([]); + + // Mock FTS5 returning BOTH notes (this is the bug) + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'Test', score: 1.0, content: '

test123

' }, + { noteId: 'note2', title: 'Test 2', score: 1.0, content: '

test1234

' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + // After the fix, we should post-filter and only return note1 + // Currently this test will FAIL because we get 2 results + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + expect(results[0].content).toContain('test123'); + expect(results[0].content).not.toContain('test1234'); + }); + + it('should NOT match "abc" when searching for "abcd" (exact word boundary)', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + // FTS5 returns both due to trigram overlap + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'ABC', score: 1.0, content: 'abc' }, + { noteId: 'note2', title: 'ABCD', score: 1.0, content: 'abcd' } + ]); + + const results = ftsSearchService.searchSync(['abc'], '='); + + // Should only match exact word "abc", not "abcd" + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should match "test123" in "test123 test1234" but still filter out "test1234" match', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'Both', score: 1.0, content: 'test123 test1234' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + // Should match because content contains "test123" as a complete word + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should handle multi-word exact phrases with word boundaries', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'Match', score: 1.0, content: 'hello world' }, + { noteId: 'note2', title: 'No Match', score: 1.0, content: 'hello world2' } + ]); + + const results = ftsSearchService.searchSync(['hello', 'world'], '='); + + // Should only match exact phrase "hello world", not "hello world2" + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should match word at start of content', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'Start', score: 1.0, content: 'test123 other words' }, + { noteId: 'note2', title: 'Not Start', score: 1.0, content: 'test1234 other words' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should match word at end of content', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'End', score: 1.0, content: 'other words test123' }, + { noteId: 'note2', title: 'Not End', score: 1.0, content: 'other words test1234' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should match word as entire content', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'Exact', score: 1.0, content: 'test123' }, + { noteId: 'note2', title: 'Not Exact', score: 1.0, content: 'test1234' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + + it('should also check title for exact matches with word boundaries', () => { + mockSql.getValue.mockReturnValue(1); + mockSql.getColumn.mockReturnValue([]); + + mockSql.getRows.mockReturnValue([ + { noteId: 'note1', title: 'test123', score: 1.0, content: 'other content' }, + { noteId: 'note2', title: 'test1234', score: 1.0, content: 'other content' } + ]); + + const results = ftsSearchService.searchSync(['test123'], '='); + + // Should match based on title + expect(results).toHaveLength(1); + expect(results[0].noteId).toBe('note1'); + }); + }); }); \ No newline at end of file diff --git a/apps/server/src/services/search/fts_search.ts b/apps/server/src/services/search/fts_search.ts index 033dcebb9..f9c41948c 100644 --- a/apps/server/src/services/search/fts_search.ts +++ b/apps/server/src/services/search/fts_search.ts @@ -585,16 +585,20 @@ class FTSSearchService { } // Build snippet extraction if requested - const snippetSelect = includeSnippets + const snippetSelect = includeSnippets ? `, snippet(notes_fts, ${FTS_CONFIG.SNIPPET_COLUMN_CONTENT}, '${highlightTag}', '${highlightTag.replace('<', '(query, params); + // Post-filter for exact match operator (=) to handle word boundaries + // Trigram FTS5 doesn't respect word boundaries in phrase queries, + // so "test123" matches "test1234" due to shared trigrams. + // We need to post-filter results to only include exact word matches. + if (operator === "=") { + const phrase = tokens.join(" "); + results = results.filter(result => { + // Use content from result if available, otherwise fetch it + let noteContent = result.content; + if (!noteContent) { + noteContent = sql.getValue(` + SELECT b.content + FROM notes n + LEFT JOIN blobs b ON n.blobId = b.blobId + WHERE n.noteId = ? + `, [result.noteId]); + } + + if (!noteContent) { + return false; + } + + // Check if phrase appears as exact words in content or title + return this.containsExactPhrase(phrase, result.title) || + this.containsExactPhrase(phrase, noteContent); + }); + } + const searchTime = Date.now() - searchStartTime; log.info(`FTS5 MATCH search returned ${results.length} results in ${searchTime}ms`); @@ -647,17 +680,166 @@ class FTSSearchService { private filterNonProtectedNoteIds(noteIds: Set): string[] { const noteIdList = Array.from(noteIds); const placeholders = noteIdList.map(() => '?').join(','); - + const nonProtectedNotes = sql.getColumn(` - SELECT noteId - FROM notes + SELECT noteId + FROM notes WHERE noteId IN (${placeholders}) AND isProtected = 0 `, noteIdList); - + return nonProtectedNotes; } + /** + * Checks if a phrase appears as exact words in text (respecting word boundaries) + * @param phrase - The phrase to search for (case-insensitive) + * @param text - The text to search in + * @returns true if the phrase appears as complete words, false otherwise + */ + private containsExactPhrase(phrase: string, text: string | null | undefined): boolean { + if (!text || !phrase || typeof text !== 'string') { + return false; + } + + // Normalize both to lowercase for case-insensitive comparison + const normalizedPhrase = phrase.toLowerCase().trim(); + const normalizedText = text.toLowerCase(); + + // Strip HTML tags for content matching + const plainText = striptags(normalizedText); + + // For single words, use word-boundary matching + if (!normalizedPhrase.includes(' ')) { + // Split text into words and check for exact match + const words = plainText.split(/\s+/); + return words.some(word => word === normalizedPhrase); + } + + // For multi-word phrases, check if the phrase appears as consecutive words + // Split text into words, then check if the phrase appears in the word sequence + const textWords = plainText.split(/\s+/); + const phraseWords = normalizedPhrase.split(/\s+/); + + // Sliding window to find exact phrase match + for (let i = 0; i <= textWords.length - phraseWords.length; i++) { + let match = true; + for (let j = 0; j < phraseWords.length; j++) { + if (textWords[i + j] !== phraseWords[j]) { + match = false; + break; + } + } + if (match) { + return true; + } + } + + return false; + } + + /** + * Searches attributes using FTS5 + * Returns noteIds of notes that have matching attributes + */ + searchAttributesSync( + tokens: string[], + operator: string, + noteIds?: Set + ): Set { + const startTime = Date.now(); + + if (!this.checkFTS5Availability()) { + return new Set(); + } + + // Check if attributes_fts table exists + const tableExists = sql.getValue(` + SELECT COUNT(*) + FROM sqlite_master + WHERE type='table' AND name='attributes_fts' + `); + + if (!tableExists) { + log.info("attributes_fts table does not exist - skipping FTS attribute search"); + return new Set(); + } + + try { + // Sanitize tokens to prevent FTS5 syntax injection + const sanitizedTokens = tokens.map(token => this.sanitizeFTS5Token(token)); + + // Check if any tokens became invalid after sanitization + if (sanitizedTokens.some(t => t === '__empty_token__' || t === '__invalid_token__')) { + return new Set(); + } + + const phrase = sanitizedTokens.join(" "); + + // Build FTS5 query for exact match + const ftsQuery = operator === "=" ? `"${phrase}"` : phrase; + + // Search both name and value columns + const whereConditions: string[] = [ + `attributes_fts MATCH '${ftsQuery.replace(/'/g, "''")}'` + ]; + + const params: any[] = []; + + // Filter by noteIds if provided + if (noteIds && noteIds.size > 0 && noteIds.size < 1000) { + const noteIdList = Array.from(noteIds); + whereConditions.push(`noteId IN (${noteIdList.map(() => '?').join(',')})`); + params.push(...noteIdList); + } + + const query = ` + SELECT DISTINCT noteId, name, value + FROM attributes_fts + WHERE ${whereConditions.join(' AND ')} + `; + + const results = sql.getRows<{ + noteId: string; + name: string; + value: string; + }>(query, params); + + log.info(`[FTS5-ATTRIBUTES-RAW] FTS5 query returned ${results.length} raw attribute matches`); + + // Post-filter for exact word matches when operator is "=" + if (operator === "=") { + const matchingNoteIds = new Set(); + for (const result of results) { + // Check if phrase matches attribute name or value with word boundaries + // For attribute names, check exact match (attribute name "test125" matches search "test125") + // For attribute values, check if phrase appears as exact words + const nameMatch = result.name.toLowerCase() === phrase.toLowerCase(); + const valueMatch = result.value ? this.containsExactPhrase(phrase, result.value) : false; + + log.info(`[FTS5-ATTRIBUTES-FILTER] Checking attribute: name="${result.name}", value="${result.value}", phrase="${phrase}", nameMatch=${nameMatch}, valueMatch=${valueMatch}`); + + if (nameMatch || valueMatch) { + matchingNoteIds.add(result.noteId); + } + } + const filterTime = Date.now() - startTime; + log.info(`[FTS5-ATTRIBUTES-FILTERED] After post-filtering: ${matchingNoteIds.size} notes match (total time: ${filterTime}ms)`); + return matchingNoteIds; + } + + // For other operators, return all matching noteIds + const searchTime = Date.now() - startTime; + const matchingNoteIds = new Set(results.map(r => r.noteId)); + log.info(`[FTS5-ATTRIBUTES-TIME] Attribute search completed in ${searchTime}ms, found ${matchingNoteIds.size} notes`); + return matchingNoteIds; + + } catch (error: any) { + log.error(`FTS5 attribute search error: ${error}`); + return new Set(); + } + } + /** * Searches protected notes separately (not in FTS index) * This is a fallback method for protected notes