From 6c79be881db51eaf5c9cc5b7cf588af221392827 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sun, 3 Aug 2025 01:44:55 +0000 Subject: [PATCH] feat(search): allow for search through very large notes --- .../expressions/note_content_fulltext.ts | 20 +++++--- .../src/services/search/utils/text_utils.ts | 51 ++++++++++++++----- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index 6cb016c79..0dfd7ac9d 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -155,9 +155,11 @@ class NoteContentFulltextExp extends Expression { content = normalize(content.toString()); if (type === "text" && mime === "text/html") { - if (!this.raw && content.length < 20000) { - // striptags is slow for very large notes + if (!this.raw && content.length < 10 * 1024 * 1024) { + // striptags is slow for very large notes - allow up to 10MB HTML processing content = this.stripTags(content); + } else if (!this.raw) { + console.info(`Skipping HTML tag stripping for very large note: ${content.length} bytes - will search raw HTML`); } content = content.replace(/&nbsp;/g, " "); @@ -239,11 +241,17 @@ class NoteContentFulltextExp extends Expression { const words = limitedText.toLowerCase().split(/\s+/); - // Early return for oversized word arrays - if (words.length > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) { + // Only skip phrase matching for truly extreme word counts that could crash the system + if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) { + console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`); return false; } + // Warn about large word counts but still attempt matching + if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) { + console.info(`Large word
count for phrase matching: ${words.length} words - may take longer but will attempt full matching`); + } + // Find positions of each token const tokenPositions: number[][] = this.tokens.map(token => { const normalizedToken = normalizeSearchText(token); @@ -303,8 +311,8 @@ class NoteContentFulltextExp extends Expression { const words = content.split(/\s+/); - // Limit word processing to prevent memory issues - const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT); + // Only limit word processing for truly extreme cases to prevent system instability + const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT); return limitedWords.some(word => this.fuzzyMatchSingle(token, word)); } diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 0b2250813..7a850e71b 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -14,9 +14,15 @@ export const FUZZY_SEARCH_CONFIG = { MAX_EDIT_DISTANCE: 2, // Maximum proximity distance for phrase matching (in words) MAX_PHRASE_PROXIMITY: 10, - // Content size limits for memory protection - MAX_CONTENT_SIZE: 50 * 1024, // 50KB - MAX_WORD_COUNT: 10000, + // Absolute hard limits for extreme cases - only to prevent system crashes + ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM + ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing + // Performance warning thresholds - inform user but still attempt search + PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact + PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact + // Progressive processing thresholds for very large content + PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing + PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use 
progressive processing // Performance thresholds EARLY_TERMINATION_THRESHOLD: 3, } as const; @@ -197,29 +203,46 @@ export function validateFuzzySearchTokens(tokens: string[], operator: string): { } /** - * Validates and preprocesses content for search operations with size limits. + * Validates and preprocesses content for search operations. + * Philosophy: Try to search everything! Only block truly extreme cases that could crash the system. * * @param content The content to validate and preprocess * @param noteId The note ID (for logging purposes) - * @returns Processed content or null if content exceeds limits + * @returns Processed content, only null for truly extreme cases that could cause system instability */ export function validateAndPreprocessContent(content: string, noteId?: string): string | null { if (!content || typeof content !== 'string') { return null; } - // Check content size limits - if (content.length > FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE) { - console.warn(`Content size exceeds limit for note ${noteId || 'unknown'}: ${content.length} bytes`); - return content.substring(0, FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE); + // Only block content that could actually crash the system (100MB+) + if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) { + console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`); + // Only in truly extreme cases, truncate to prevent system crash + return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE); } - // Check word count limits for phrase matching + // Warn about very large content but still process it + if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) { + console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`); + } + + // For word count, be even more permissive - only block truly extreme 
cases const wordCount = content.split(/\s+/).length; - if (wordCount > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) { - console.warn(`Word count exceeds limit for note ${noteId || 'unknown'}: ${wordCount} words`); - // Take first MAX_WORD_COUNT words - return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT).join(' '); + if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) { + console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`); + // Only in truly extreme cases, truncate to prevent system crash + return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' '); + } + + // Warn about high word counts but still process them + if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) { + console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`); + } + + // Progressive processing warning for very large content + if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) { + console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`); } return content;