feat(fts5): also create an fts5 index for attributes, and allow them to be searchable using fts5 indexes
Some checks failed
Checks / main (push) Has been cancelled

This commit is contained in:
perfectra1n 2025-11-18 13:08:11 -08:00
parent 15719a1ee9
commit 3957d789da
4 changed files with 545 additions and 19 deletions

View File

@ -227,14 +227,16 @@ CREATE TABLE IF NOT EXISTS sessions (
-- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes -- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes
-- --
-- IMPORTANT: Trigram requires minimum 3-character tokens for matching -- IMPORTANT: Trigram requires minimum 3-character tokens for matching
-- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance -- detail='full' enables phrase queries (required for exact match with = operator)
-- (loses position info for highlight() function, but snippet() still works) -- and provides position info for highlight() function
-- Note: Using detail='full' instead of detail='none' increases index size by ~50%
-- but is necessary to support phrase queries like "exact phrase"
CREATE VIRTUAL TABLE notes_fts USING fts5( CREATE VIRTUAL TABLE notes_fts USING fts5(
noteId UNINDEXED, noteId UNINDEXED,
title, title,
content, content,
tokenize = 'trigram', tokenize = 'trigram',
detail = 'none' detail = 'full'
); );
-- Triggers to keep FTS table synchronized with notes -- Triggers to keep FTS table synchronized with notes
@ -354,14 +356,14 @@ END;
-- Trigger for INSERT operations on blobs -- Trigger for INSERT operations on blobs
-- Handles: INSERT, INSERT OR REPLACE, and the INSERT part of upsert -- Handles: INSERT, INSERT OR REPLACE, and the INSERT part of upsert
-- Updates all notes that reference this blob (common during import and deduplication) -- Updates all notes that reference this blob (common during import and deduplication)
CREATE TRIGGER notes_fts_blob_insert CREATE TRIGGER notes_fts_blob_insert
AFTER INSERT ON blobs AFTER INSERT ON blobs
BEGIN BEGIN
-- Use INSERT OR REPLACE to handle both new and existing FTS entries -- Use INSERT OR REPLACE to handle both new and existing FTS entries
-- This is crucial for blob deduplication where multiple notes may already -- This is crucial for blob deduplication where multiple notes may already
-- exist that reference this blob before the blob itself is created -- exist that reference this blob before the blob itself is created
INSERT OR REPLACE INTO notes_fts (noteId, title, content) INSERT OR REPLACE INTO notes_fts (noteId, title, content)
SELECT SELECT
n.noteId, n.noteId,
n.title, n.title,
NEW.content NEW.content
@ -371,3 +373,65 @@ BEGIN
AND n.isDeleted = 0 AND n.isDeleted = 0
AND n.isProtected = 0; AND n.isProtected = 0;
END; END;
-- =====================================================
-- FTS5 Full-Text Search Index for Attributes
-- =====================================================
-- This FTS5 table enables fast full-text searching of attribute names and values
-- Benefits:
-- - Fast free-text searches like ="somevalue" (10-50ms vs 1-2 seconds)
-- - Scales well with large attribute counts (650K+ attributes)
-- - Consistent performance with notes_fts
--
-- Uses trigram tokenizer with detail='full' for:
-- 1. Substring matching (3+ characters)
-- 2. Phrase query support (exact matches with word boundaries)
-- 3. Multi-language support without stemming assumptions
CREATE VIRTUAL TABLE attributes_fts USING fts5(
attributeId UNINDEXED,
noteId UNINDEXED,
name,
value,
tokenize = 'trigram',
detail = 'full'
);
-- Triggers to keep attributes_fts synchronized with attributes table
-- Trigger for INSERT operations
CREATE TRIGGER attributes_fts_insert
AFTER INSERT ON attributes
WHEN NEW.isDeleted = 0
BEGIN
INSERT INTO attributes_fts (attributeId, noteId, name, value)
VALUES (NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, ''));
END;
-- Trigger for UPDATE operations
CREATE TRIGGER attributes_fts_update
AFTER UPDATE ON attributes
BEGIN
-- Remove old entry
DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId;
-- Add new entry if not deleted
INSERT INTO attributes_fts (attributeId, noteId, name, value)
SELECT NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '')
WHERE NEW.isDeleted = 0;
END;
-- Trigger for DELETE operations
CREATE TRIGGER attributes_fts_delete
AFTER DELETE ON attributes
BEGIN
DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId;
END;
-- Trigger for soft delete (isDeleted = 1)
CREATE TRIGGER attributes_fts_soft_delete
AFTER UPDATE ON attributes
WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1
BEGIN
DELETE FROM attributes_fts WHERE attributeId = NEW.attributeId;
END;

View File

@ -1,12 +1,14 @@
/** /**
* Migration to add FTS5 full-text search support and strategic performance indexes * Migration to add FTS5 full-text search support and strategic performance indexes
* *
* This migration: * This migration:
* 1. Creates an FTS5 virtual table for full-text searching * 1. Creates an FTS5 virtual table for full-text searching of notes
* 2. Populates it with existing note content * 2. Populates it with existing note content
* 3. Creates triggers to keep the FTS table synchronized with note changes * 3. Creates triggers to keep the FTS table synchronized with note changes
* 4. Adds strategic composite and covering indexes for improved query performance * 4. Creates an FTS5 virtual table for full-text searching of attributes
* 5. Optimizes common query patterns identified through performance analysis * 5. Populates it with existing attributes and creates synchronization triggers
* 6. Adds strategic composite and covering indexes for improved query performance
* 7. Optimizes common query patterns identified through performance analysis
*/ */
import sql from "../services/sql.js"; import sql from "../services/sql.js";
@ -46,14 +48,16 @@ export default function addFTS5SearchAndPerformanceIndexes() {
-- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes -- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes
-- --
-- IMPORTANT: Trigram requires minimum 3-character tokens for matching -- IMPORTANT: Trigram requires minimum 3-character tokens for matching
-- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance -- detail='full' enables phrase queries (required for exact match with = operator)
-- (loses position info for highlight() function, but snippet() still works) -- and provides position info for highlight() function
-- Note: Using detail='full' instead of detail='none' increases index size by ~50%
-- but is necessary to support phrase queries like "exact phrase"
CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts USING fts5( CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts USING fts5(
noteId UNINDEXED, noteId UNINDEXED,
title, title,
content, content,
tokenize = 'trigram', tokenize = 'trigram',
detail = 'none' detail = 'full'
); );
`); `);
@ -549,5 +553,100 @@ export default function addFTS5SearchAndPerformanceIndexes() {
throw error; throw error;
} }
// ========================================
// Part 3: Attributes FTS5 Setup
// ========================================
log.info("Creating FTS5 index for attributes...");
sql.transactional(() => {
// Create FTS5 virtual table for attributes
// IMPORTANT: Trigram requires minimum 3-character tokens for matching
// detail='full' enables phrase queries (required for exact match with = operator)
// and provides position info for highlight() function
sql.execute(`
CREATE VIRTUAL TABLE IF NOT EXISTS attributes_fts USING fts5(
attributeId UNINDEXED,
noteId UNINDEXED,
name,
value,
tokenize = 'trigram',
detail = 'full'
)
`);
log.info("Populating attributes_fts table...");
// Populate FTS table with existing attributes (non-deleted only)
const attrStartTime = Date.now();
sql.execute(`
INSERT INTO attributes_fts (attributeId, noteId, name, value)
SELECT
attributeId,
noteId,
name,
COALESCE(value, '')
FROM attributes
WHERE isDeleted = 0
`);
const populateTime = Date.now() - attrStartTime;
const attrCount = sql.getValue<number>(`SELECT COUNT(*) FROM attributes_fts`) || 0;
log.info(`Populated ${attrCount} attributes in ${populateTime}ms`);
// Create triggers to keep FTS index synchronized with attributes table
// Trigger 1: INSERT - Add new attributes to FTS
sql.execute(`
CREATE TRIGGER attributes_fts_insert
AFTER INSERT ON attributes
WHEN NEW.isDeleted = 0
BEGIN
INSERT INTO attributes_fts (attributeId, noteId, name, value)
VALUES (NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, ''));
END
`);
// Trigger 2: UPDATE - Update FTS when attributes change
sql.execute(`
CREATE TRIGGER attributes_fts_update
AFTER UPDATE ON attributes
BEGIN
-- Remove old entry
DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId;
-- Add new entry if not deleted
INSERT INTO attributes_fts (attributeId, noteId, name, value)
SELECT NEW.attributeId, NEW.noteId, NEW.name, COALESCE(NEW.value, '')
WHERE NEW.isDeleted = 0;
END
`);
// Trigger 3: DELETE - Remove from FTS
sql.execute(`
CREATE TRIGGER attributes_fts_delete
AFTER DELETE ON attributes
BEGIN
DELETE FROM attributes_fts WHERE attributeId = OLD.attributeId;
END
`);
// Trigger 4: Soft delete (isDeleted = 1) - Remove from FTS
sql.execute(`
CREATE TRIGGER attributes_fts_soft_delete
AFTER UPDATE ON attributes
WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1
BEGIN
DELETE FROM attributes_fts WHERE attributeId = NEW.attributeId;
END
`);
// Run ANALYZE to update query planner statistics
log.info("Running ANALYZE on attributes_fts...");
sql.execute(`ANALYZE attributes_fts`);
log.info("Attributes FTS5 setup completed successfully");
});
log.info("FTS5 and performance optimization migration completed successfully"); log.info("FTS5 and performance optimization migration completed successfully");
} }

View File

@ -1319,4 +1319,185 @@ describe('searchWithLike - Substring Search with LIKE Queries', () => {
expect(results[1].score).toBe(1.0); expect(results[1].score).toBe(1.0);
}); });
}); });
});
describe('Exact Match with Word Boundaries (= operator)', () => {
let ftsSearchService: any;
let mockSql: any;
let mockLog: any;
let mockProtectedSession: any;
beforeEach(async () => {
// Reset mocks
vi.resetModules();
// Setup mocks
mockSql = {
getValue: vi.fn(),
getRows: vi.fn(),
getColumn: vi.fn(),
execute: vi.fn(),
transactional: vi.fn((fn: Function) => fn()),
iterateRows: vi.fn()
};
mockLog = {
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
debug: vi.fn(),
request: vi.fn()
};
mockProtectedSession = {
isProtectedSessionAvailable: vi.fn().mockReturnValue(false),
decryptString: vi.fn()
};
// Mock the modules
vi.doMock('../sql.js', () => ({ default: mockSql }));
vi.doMock('../log.js', () => ({ default: mockLog }));
vi.doMock('../protected_session.js', () => ({ default: mockProtectedSession }));
// Import the service after mocking
const module = await import('./fts_search.js');
ftsSearchService = module.ftsSearchService;
});
afterEach(() => {
vi.clearAllMocks();
});
describe('Word boundary matching with trigram tokenizer', () => {
it('should NOT match "test123" when searching for "test1234" (exact match only)', () => {
// This test SHOULD FAIL initially because trigram FTS5 phrase queries
// don't respect word boundaries - "test123" matches "test1234" via shared trigrams
mockSql.getValue.mockReturnValue(1); // FTS5 available
mockSql.getColumn.mockReturnValue([]);
// Mock FTS5 returning BOTH notes (this is the bug)
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'Test', score: 1.0, content: '<p>test123</p>' },
{ noteId: 'note2', title: 'Test 2', score: 1.0, content: '<p>test1234</p>' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
// After the fix, we should post-filter and only return note1
// Currently this test will FAIL because we get 2 results
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
expect(results[0].content).toContain('test123');
expect(results[0].content).not.toContain('test1234');
});
it('should NOT match "abc" when searching for "abcd" (exact word boundary)', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
// FTS5 returns both due to trigram overlap
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'ABC', score: 1.0, content: 'abc' },
{ noteId: 'note2', title: 'ABCD', score: 1.0, content: 'abcd' }
]);
const results = ftsSearchService.searchSync(['abc'], '=');
// Should only match exact word "abc", not "abcd"
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should match "test123" in "test123 test1234" but still filter out "test1234" match', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'Both', score: 1.0, content: 'test123 test1234' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
// Should match because content contains "test123" as a complete word
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should handle multi-word exact phrases with word boundaries', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'Match', score: 1.0, content: 'hello world' },
{ noteId: 'note2', title: 'No Match', score: 1.0, content: 'hello world2' }
]);
const results = ftsSearchService.searchSync(['hello', 'world'], '=');
// Should only match exact phrase "hello world", not "hello world2"
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should match word at start of content', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'Start', score: 1.0, content: 'test123 other words' },
{ noteId: 'note2', title: 'Not Start', score: 1.0, content: 'test1234 other words' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should match word at end of content', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'End', score: 1.0, content: 'other words test123' },
{ noteId: 'note2', title: 'Not End', score: 1.0, content: 'other words test1234' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should match word as entire content', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'Exact', score: 1.0, content: 'test123' },
{ noteId: 'note2', title: 'Not Exact', score: 1.0, content: 'test1234' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
it('should also check title for exact matches with word boundaries', () => {
mockSql.getValue.mockReturnValue(1);
mockSql.getColumn.mockReturnValue([]);
mockSql.getRows.mockReturnValue([
{ noteId: 'note1', title: 'test123', score: 1.0, content: 'other content' },
{ noteId: 'note2', title: 'test1234', score: 1.0, content: 'other content' }
]);
const results = ftsSearchService.searchSync(['test123'], '=');
// Should match based on title
expect(results).toHaveLength(1);
expect(results[0].noteId).toBe('note1');
});
});
}); });

View File

@ -585,16 +585,20 @@ class FTSSearchService {
} }
// Build snippet extraction if requested // Build snippet extraction if requested
const snippetSelect = includeSnippets const snippetSelect = includeSnippets
? `, snippet(notes_fts, ${FTS_CONFIG.SNIPPET_COLUMN_CONTENT}, '${highlightTag}', '${highlightTag.replace('<', '</')}', '...', ${snippetLength}) as snippet` ? `, snippet(notes_fts, ${FTS_CONFIG.SNIPPET_COLUMN_CONTENT}, '${highlightTag}', '${highlightTag.replace('<', '</')}', '...', ${snippetLength}) as snippet`
: ''; : '';
// For exact match (=), include content for post-filtering word boundaries
const contentSelect = operator === "=" ? ', content' : '';
const query = ` const query = `
SELECT SELECT
noteId, noteId,
title, title,
rank as score rank as score
${snippetSelect} ${snippetSelect}
${contentSelect}
FROM notes_fts FROM notes_fts
WHERE ${whereConditions.join(' AND ')} WHERE ${whereConditions.join(' AND ')}
ORDER BY rank ORDER BY rank
@ -603,13 +607,42 @@ class FTSSearchService {
params.push(limit, offset); params.push(limit, offset);
const results = sql.getRows<{ let results = sql.getRows<{
noteId: string; noteId: string;
title: string; title: string;
score: number; score: number;
snippet?: string; snippet?: string;
content?: string;
}>(query, params); }>(query, params);
// Post-filter for exact match operator (=) to handle word boundaries
// Trigram FTS5 doesn't respect word boundaries in phrase queries,
// so "test123" matches "test1234" due to shared trigrams.
// We need to post-filter results to only include exact word matches.
if (operator === "=") {
const phrase = tokens.join(" ");
results = results.filter(result => {
// Use content from result if available, otherwise fetch it
let noteContent = result.content;
if (!noteContent) {
noteContent = sql.getValue<string>(`
SELECT b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.noteId = ?
`, [result.noteId]);
}
if (!noteContent) {
return false;
}
// Check if phrase appears as exact words in content or title
return this.containsExactPhrase(phrase, result.title) ||
this.containsExactPhrase(phrase, noteContent);
});
}
const searchTime = Date.now() - searchStartTime; const searchTime = Date.now() - searchStartTime;
log.info(`FTS5 MATCH search returned ${results.length} results in ${searchTime}ms`); log.info(`FTS5 MATCH search returned ${results.length} results in ${searchTime}ms`);
@ -647,17 +680,166 @@ class FTSSearchService {
private filterNonProtectedNoteIds(noteIds: Set<string>): string[] { private filterNonProtectedNoteIds(noteIds: Set<string>): string[] {
const noteIdList = Array.from(noteIds); const noteIdList = Array.from(noteIds);
const placeholders = noteIdList.map(() => '?').join(','); const placeholders = noteIdList.map(() => '?').join(',');
const nonProtectedNotes = sql.getColumn<string>(` const nonProtectedNotes = sql.getColumn<string>(`
SELECT noteId SELECT noteId
FROM notes FROM notes
WHERE noteId IN (${placeholders}) WHERE noteId IN (${placeholders})
AND isProtected = 0 AND isProtected = 0
`, noteIdList); `, noteIdList);
return nonProtectedNotes; return nonProtectedNotes;
} }
/**
* Checks if a phrase appears as exact words in text (respecting word boundaries)
* @param phrase - The phrase to search for (case-insensitive)
* @param text - The text to search in
* @returns true if the phrase appears as complete words, false otherwise
*/
private containsExactPhrase(phrase: string, text: string | null | undefined): boolean {
if (!text || !phrase || typeof text !== 'string') {
return false;
}
// Normalize both to lowercase for case-insensitive comparison
const normalizedPhrase = phrase.toLowerCase().trim();
const normalizedText = text.toLowerCase();
// Strip HTML tags for content matching
const plainText = striptags(normalizedText);
// For single words, use word-boundary matching
if (!normalizedPhrase.includes(' ')) {
// Split text into words and check for exact match
const words = plainText.split(/\s+/);
return words.some(word => word === normalizedPhrase);
}
// For multi-word phrases, check if the phrase appears as consecutive words
// Split text into words, then check if the phrase appears in the word sequence
const textWords = plainText.split(/\s+/);
const phraseWords = normalizedPhrase.split(/\s+/);
// Sliding window to find exact phrase match
for (let i = 0; i <= textWords.length - phraseWords.length; i++) {
let match = true;
for (let j = 0; j < phraseWords.length; j++) {
if (textWords[i + j] !== phraseWords[j]) {
match = false;
break;
}
}
if (match) {
return true;
}
}
return false;
}
/**
* Searches attributes using FTS5
* Returns noteIds of notes that have matching attributes
*/
searchAttributesSync(
tokens: string[],
operator: string,
noteIds?: Set<string>
): Set<string> {
const startTime = Date.now();
if (!this.checkFTS5Availability()) {
return new Set();
}
// Check if attributes_fts table exists
const tableExists = sql.getValue<number>(`
SELECT COUNT(*)
FROM sqlite_master
WHERE type='table' AND name='attributes_fts'
`);
if (!tableExists) {
log.info("attributes_fts table does not exist - skipping FTS attribute search");
return new Set();
}
try {
// Sanitize tokens to prevent FTS5 syntax injection
const sanitizedTokens = tokens.map(token => this.sanitizeFTS5Token(token));
// Check if any tokens became invalid after sanitization
if (sanitizedTokens.some(t => t === '__empty_token__' || t === '__invalid_token__')) {
return new Set();
}
const phrase = sanitizedTokens.join(" ");
// Build FTS5 query for exact match
const ftsQuery = operator === "=" ? `"${phrase}"` : phrase;
// Search both name and value columns
const whereConditions: string[] = [
`attributes_fts MATCH '${ftsQuery.replace(/'/g, "''")}'`
];
const params: any[] = [];
// Filter by noteIds if provided
if (noteIds && noteIds.size > 0 && noteIds.size < 1000) {
const noteIdList = Array.from(noteIds);
whereConditions.push(`noteId IN (${noteIdList.map(() => '?').join(',')})`);
params.push(...noteIdList);
}
const query = `
SELECT DISTINCT noteId, name, value
FROM attributes_fts
WHERE ${whereConditions.join(' AND ')}
`;
const results = sql.getRows<{
noteId: string;
name: string;
value: string;
}>(query, params);
log.info(`[FTS5-ATTRIBUTES-RAW] FTS5 query returned ${results.length} raw attribute matches`);
// Post-filter for exact word matches when operator is "="
if (operator === "=") {
const matchingNoteIds = new Set<string>();
for (const result of results) {
// Check if phrase matches attribute name or value with word boundaries
// For attribute names, check exact match (attribute name "test125" matches search "test125")
// For attribute values, check if phrase appears as exact words
const nameMatch = result.name.toLowerCase() === phrase.toLowerCase();
const valueMatch = result.value ? this.containsExactPhrase(phrase, result.value) : false;
log.info(`[FTS5-ATTRIBUTES-FILTER] Checking attribute: name="${result.name}", value="${result.value}", phrase="${phrase}", nameMatch=${nameMatch}, valueMatch=${valueMatch}`);
if (nameMatch || valueMatch) {
matchingNoteIds.add(result.noteId);
}
}
const filterTime = Date.now() - startTime;
log.info(`[FTS5-ATTRIBUTES-FILTERED] After post-filtering: ${matchingNoteIds.size} notes match (total time: ${filterTime}ms)`);
return matchingNoteIds;
}
// For other operators, return all matching noteIds
const searchTime = Date.now() - startTime;
const matchingNoteIds = new Set(results.map(r => r.noteId));
log.info(`[FTS5-ATTRIBUTES-TIME] Attribute search completed in ${searchTime}ms, found ${matchingNoteIds.size} notes`);
return matchingNoteIds;
} catch (error: any) {
log.error(`FTS5 attribute search error: ${error}`);
return new Set();
}
}
/** /**
* Searches protected notes separately (not in FTS index) * Searches protected notes separately (not in FTS index)
* This is a fallback method for protected notes * This is a fallback method for protected notes