feat(search): allow for search through very large notes

This commit is contained in:
perf3ct 2025-08-03 01:44:55 +00:00
parent 51a8937c64
commit 6c79be881d
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
2 changed files with 51 additions and 20 deletions

View File

@ -155,9 +155,11 @@ class NoteContentFulltextExp extends Expression {
content = normalize(content.toString());
if (type === "text" && mime === "text/html") {
if (!this.raw && content.length < 20000) {
// striptags is slow for very large notes
if (!this.raw && content.length < 10 * 1024 * 1024) {
// striptags is slow for very large notes - allow up to 10MB HTML processing
content = this.stripTags(content);
} else if (!this.raw) {
console.info(`Skipping HTML tag stripping for very large note: ${content.length} bytes - will search raw HTML`);
}
content = content.replace(/&nbsp;/g, " ");
@ -239,11 +241,17 @@ class NoteContentFulltextExp extends Expression {
const words = limitedText.toLowerCase().split(/\s+/);
// Early return for oversized word arrays
if (words.length > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) {
// Only skip phrase matching for truly extreme word counts that could crash the system
if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
return false;
}
// Warn about large word counts but still attempt matching
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
}
// Find positions of each token
const tokenPositions: number[][] = this.tokens.map(token => {
const normalizedToken = normalizeSearchText(token);
@ -303,8 +311,8 @@ class NoteContentFulltextExp extends Expression {
const words = content.split(/\s+/);
// Limit word processing to prevent memory issues
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT);
// Only limit word processing for truly extreme cases to prevent system instability
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
}

View File

@ -14,9 +14,15 @@ export const FUZZY_SEARCH_CONFIG = {
MAX_EDIT_DISTANCE: 2,
// Maximum proximity distance for phrase matching (in words)
MAX_PHRASE_PROXIMITY: 10,
// Content size limits for memory protection
MAX_CONTENT_SIZE: 50 * 1024, // 50KB
MAX_WORD_COUNT: 10000,
// Absolute hard limits for extreme cases - only to prevent system crashes
ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
// Performance warning thresholds - inform user but still attempt search
PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
// Progressive processing thresholds for very large content
PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
// Performance thresholds
EARLY_TERMINATION_THRESHOLD: 3,
} as const;
@ -197,29 +203,46 @@ export function validateFuzzySearchTokens(tokens: string[], operator: string): {
}
/**
* Validates and preprocesses content for search operations with size limits.
* Validates and preprocesses content for search operations.
* Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
*
* @param content The content to validate and preprocess
* @param noteId The note ID (for logging purposes)
* @returns Processed content or null if content exceeds limits
* @returns Processed content, only null for truly extreme cases that could cause system instability
*/
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
if (!content || typeof content !== 'string') {
return null;
}
// Check content size limits
if (content.length > FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE) {
console.warn(`Content size exceeds limit for note ${noteId || 'unknown'}: ${content.length} bytes`);
return content.substring(0, FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE);
// Only block content that could actually crash the system (100MB+)
if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) {
console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`);
// Only in truly extreme cases, truncate to prevent system crash
return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
}
// Check word count limits for phrase matching
// Warn about very large content but still process it
if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
}
// For word count, be even more permissive - only block truly extreme cases
const wordCount = content.split(/\s+/).length;
if (wordCount > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) {
console.warn(`Word count exceeds limit for note ${noteId || 'unknown'}: ${wordCount} words`);
// Take first MAX_WORD_COUNT words
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT).join(' ');
if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
// Only in truly extreme cases, truncate to prevent system crash
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
}
// Warn about high word counts but still process them
if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
}
// Progressive processing warning for very large content
if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
}
return content;