import sql from '../sql.js';
import sanitizeHtml from 'sanitize-html';

/** Plain object as produced by `JSON.parse`. */
type JsonObject = Record<string, unknown>;

/** Narrow a parsed JSON value to a non-null, non-array object. */
function isJsonObject(value: unknown): value is JsonObject {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Extract a printable message from whatever a `catch` clause received. */
function describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
}

/** MIME type fragments and the code-fence language each one maps to. */
const MIME_LANGUAGES: Record<string, string> = {
    'text/x-python': 'python',
    'text/javascript': 'javascript',
    'application/javascript': 'javascript',
    'text/typescript': 'typescript',
    'application/typescript': 'typescript',
    'text/x-java': 'java',
    'text/html': 'html',
    'text/css': 'css',
    'text/x-c': 'c',
    'text/x-c++': 'cpp',
    'text/x-csharp': 'csharp',
    'text/x-go': 'go',
    'text/x-ruby': 'ruby',
    'text/x-php': 'php',
    'text/x-swift': 'swift',
    'text/x-rust': 'rust',
    'text/markdown': 'markdown',
    'text/x-sql': 'sql',
    'text/x-yaml': 'yaml',
    'application/json': 'json',
    'text/x-shell': 'bash'
};

/**
 * MIME fragments ordered longest first, so that a short fragment such as
 * `text/x-c` cannot shadow `text/x-c++` or `text/x-csharp` during substring matching.
 */
const MIME_PATTERNS_LONGEST_FIRST: ReadonlyArray<[string, string]> =
    Object.entries(MIME_LANGUAGES).sort(([a], [b]) => b.length - a.length);

/**
 * Content heuristics, tested in this order against the first lines of a code note.
 * The first matching entry wins, so earlier entries take precedence.
 */
const LANGUAGE_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = [
    ['python', /^(import\s+|from\s+\w+\s+import|def\s+\w+\s*\(|class\s+\w+\s*:)/m],
    ['javascript', /^(const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=|function\s+\w+\s*\(|import\s+.*from\s+)/m],
    ['typescript', /^(interface\s+\w+|type\s+\w+\s*=|class\s+\w+\s*{)/m],
    ['html', /^(<!DOCTYPE html>|<html|<head|<body)/im],
    ['css', /^(\.\w+\s*{|\#\w+\s*{|@media|@import)/m],
    ['java', /^(public\s+class|import\s+java|package\s+)/m],
    ['cpp', /^(#include\s+<\w+>|namespace\s+\w+|void\s+\w+\s*\()/m],
    ['csharp', /^(using\s+System|namespace\s+\w+|public\s+class)/m],
    ['go', /^(package\s+\w+|import\s+\(|func\s+\w+\s*\()/m],
    ['ruby', /^(require\s+|class\s+\w+\s*<|def\s+\w+)/m],
    ['php', /^(<\?php|namespace\s+\w+|use\s+\w+)/m],
    ['sql', /^(SELECT|INSERT|UPDATE|DELETE|CREATE TABLE|ALTER TABLE)/im],
    ['bash', /^(#!\/bin\/sh|#!\/bin\/bash|function\s+\w+\s*\(\))/m],
    ['markdown', /^(#\s+|##\s+|###\s+|\*\s+|-\s+|>\s+)/m],
    ['json', /^({[\s\n]*"|[\s\n]*\[)/m],
    ['yaml', /^(---|\w+:\s+)/m]
];

/** Languages whose blocks are delimited by curly braces. */
const BRACE_LANGUAGES = ['javascript', 'typescript', 'java', 'csharp', 'cpp'];

/**
 * Utility class for extracting context from notes to provide to AI models.
 *
 * It reads notes through the `sql` module, converts their content to plain
 * text / Markdown, and offers chunking, summarising and hierarchy helpers.
 */
export class ContextExtractor {
    /**
     * Get the formatted content of a note.
     *
     * @param noteId - The ID of the note to load
     * @returns The formatted content, or `null` when the query returns no row
     */
    async getNoteContent(noteId: string): Promise<string | null> {
        const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
            `SELECT note_contents.content, notes.type, notes.mime, notes.title
             FROM notes
             JOIN note_contents ON notes.noteId = note_contents.noteId
             WHERE notes.noteId = ?`,
            [noteId]
        );

        if (!note) {
            return null;
        }

        return this.formatNoteContent(note.content, note.type, note.mime, note.title);
    }

    /**
     * Split a large note into smaller, semantically meaningful chunks.
     *
     * @param noteId - The ID of the note to chunk
     * @param maxChunkSize - Target maximum size of each chunk in characters
     * @returns Array of content chunks, or an empty array if the note has no content
     */
    async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        const content = await this.getNoteContent(noteId);
        if (!content) {
            return [];
        }

        return this.splitContentIntoChunks(content, maxChunkSize);
    }

    /**
     * Split text into chunks along Markdown heading boundaries, never splitting
     * inside a fenced code block.
     *
     * Sections are merged greedily while the merged text (including the blank-line
     * separator) stays within `maxChunkSize`. A single section that is larger than
     * `maxChunkSize` is emitted as its own chunk, and sizes are measured while code
     * blocks are replaced by short placeholders, so a chunk may exceed the target.
     *
     * @param content - The text content to split
     * @param maxChunkSize - Target maximum size of each chunk in characters
     * @returns Array of content chunks (empty for blank content)
     */
    private splitContentIntoChunks(content: string, maxChunkSize: number): string[] {
        // No `g` flag: `test()` stays stateless. `m` is kept so `$` still matches
        // before a trailing `\r` on CRLF input.
        const headingPattern = /^#+\s+.+$/m;
        const codeBlockPattern = /```[\s\S]+?```/g;
        const chunkSeparator = '\n\n';

        // Swap code blocks for placeholders so a `#` line inside a block is not
        // mistaken for a heading.
        const codeBlocks: string[] = [];
        const contentWithPlaceholders = content.replace(codeBlockPattern, (match) => {
            codeBlocks.push(match);
            return `__CODE_BLOCK_${codeBlocks.length - 1}__`;
        });

        // Pass 1: cut the text into sections, each starting at a heading line.
        const sections: string[] = [];
        let currentSection = '';
        for (const line of contentWithPlaceholders.split('\n')) {
            if (headingPattern.test(line) && currentSection.trim().length > 0) {
                sections.push(currentSection.trim());
                currentSection = line;
            } else {
                currentSection += (currentSection ? '\n' : '') + line;
            }
        }
        if (currentSection.trim().length > 0) {
            sections.push(currentSection.trim());
        }

        // Pass 2: merge neighbouring sections while they fit into one chunk.
        const chunks: string[] = [];
        let currentChunk = '';
        for (const section of sections) {
            if (currentChunk.length === 0) {
                currentChunk = section;
            } else if (currentChunk.length + chunkSeparator.length + section.length > maxChunkSize) {
                chunks.push(currentChunk);
                currentChunk = section;
            } else {
                currentChunk += chunkSeparator + section;
            }
        }
        if (currentChunk.length > 0) {
            chunks.push(currentChunk);
        }

        // Put the code blocks back. Text that merely looks like a placeholder but
        // has no stored block is left untouched instead of becoming "undefined".
        return chunks.map((chunk) =>
            chunk.replace(/__CODE_BLOCK_(\d+)__/g, (placeholder: string, index: string) =>
                codeBlocks[Number.parseInt(index, 10)] ?? placeholder
            )
        );
    }

    /**
     * Get a note's content, condensed to a summary when it is long.
     *
     * @param noteId - The ID of the note to summarize
     * @param maxLength - Content of at least this many characters is summarized
     * @returns The summary, the original content when shorter than `maxLength`,
     *          or an empty string when the note has no content
     */
    async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        const content = await this.getNoteContent(noteId);
        if (!content || content.length < maxLength) {
            return content || '';
        }

        return this.summarizeContent(content);
    }

    /**
     * Build a heuristic summary: the title, an outline of all Markdown headings,
     * and the first sentence of up to five paragraphs.
     *
     * @param content - The content to summarize
     * @returns A Markdown summary of the content
     */
    private summarizeContent(content: string): string {
        // The first level-1 heading serves as the title.
        const titleMatch = content.match(/^# (.+)$/m);
        const title = titleMatch ? titleMatch[1] : 'Untitled Note';

        // Outline: one bullet per heading, indented by heading level.
        const headings: string[] = [];
        for (const match of content.matchAll(/^(#+)\s+(.+)$/gm)) {
            const level = match[1].length;
            headings.push(`${' '.repeat(level - 1)}- ${match[2]}`);
        }

        // Key points: first sentence of each prose paragraph (headings and code
        // fences are skipped); paragraphs without sentence punctuation are
        // truncated to 150 characters.
        const firstSentences = content
            .split(/\n\s*\n/)
            .filter((paragraph) => {
                const trimmed = paragraph.trim();
                return trimmed.length > 0 && !trimmed.startsWith('#') && !trimmed.startsWith('```');
            })
            .map((paragraph) => {
                const sentenceMatch = paragraph.match(/^[^.!?]+[.!?]/);
                return sentenceMatch
                    ? sentenceMatch[0].trim()
                    : paragraph.substring(0, 150).trim() + '...';
            })
            .slice(0, 5);

        let summary = `# Summary of: ${title}\n\n`;
        if (headings.length > 0) {
            summary += `## Document Outline\n${headings.join('\n')}\n\n`;
        }
        if (firstSentences.length > 0) {
            summary += `## Key Points\n${firstSentences.map((s) => `- ${s}`).join('\n')}\n\n`;
        }
        summary += `(Note: This is an automatically generated summary of a larger document with ${content.length} characters)`;

        return summary;
    }

    /**
     * Render an intro line followed by a bullet list of titles.
     *
     * @param intro - Sentence placed above the list
     * @param titles - Titles to list
     * @returns The rendered list, or an empty string when `titles` is empty
     */
    private formatTitleList(intro: string, titles: string[]): string {
        if (titles.length === 0) {
            return '';
        }
        return `${intro}\n\n${titles.map((title) => `- ${title}\n`).join('')}\n`;
    }

    /**
     * Get the titles of the note's ancestors to provide hierarchical context.
     *
     * @param noteId - The ID of the note
     * @param maxDepth - Maximum number of ancestor levels to walk
     * @returns A bullet list of ancestor titles, or an empty string if there are none
     */
    async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
        const parents = await this.getParentNotes(noteId, maxDepth);
        return this.formatTitleList(
            'Here is the hierarchical context for the current note:',
            parents.map((parent) => parent.title)
        );
    }

    /**
     * Get the titles of child notes to provide additional context.
     *
     * @param noteId - The ID of the parent note
     * @param maxChildren - Maximum number of children to list
     * @returns A bullet list of child titles, or an empty string if there are none
     */
    async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
        // NOTE(review): getParentNotes resolves the hierarchy through the `branches`
        // table, while this query reads `parentNoteId` from `notes` — confirm that
        // column exists in the schema.
        const children = await sql.getRows<{noteId: string, title: string}>(
            `SELECT noteId, title FROM notes
             WHERE parentNoteId = ? AND isDeleted = 0
             LIMIT ?`,
            [noteId, maxChildren]
        );

        return this.formatTitleList(
            'The current note has these child notes:',
            children.map((child) => child.title)
        );
    }

    /**
     * Get the titles of notes this note points to through relation attributes.
     *
     * @param noteId - The ID of the note
     * @param maxLinks - Maximum number of relation attributes to follow
     * @returns A bullet list of linked note titles, or an empty string if there are none
     */
    async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
        const linkedNotes = await sql.getRows<{title: string}>(
            `SELECT title FROM notes
             WHERE noteId IN (
                 SELECT value FROM attributes
                 WHERE noteId = ? AND type = 'relation'
                 LIMIT ?
             )`,
            [noteId, maxLinks]
        );

        return this.formatTitleList(
            'This note has relationships with these notes:',
            linkedNotes.map((linked) => linked.title)
        );
    }

    /**
     * Format the content of a note based on its type.
     *
     * @param content - Raw note content
     * @param type - Note type (`text`, `code`, `canvas`, `mindMap`, ...)
     * @param mime - MIME type of the content
     * @param title - Note title, emitted as a level-1 heading
     * @returns The title heading followed by a text rendering of the content
     */
    private formatNoteContent(content: string, type: string, mime: string, title: string): string {
        const heading = `# ${title}\n\n`;

        switch (type) {
            case 'text':
                // Remove HTML formatting for text notes
                return heading + this.sanitizeHtml(content);

            case 'code': {
                const codeLanguage = this.detectCodeLanguage(content, mime);
                // For large code files, extract structure rather than full content
                if (content.length > 8000) {
                    return heading + this.extractCodeStructure(content, codeLanguage);
                }
                return heading + `\`\`\`${codeLanguage}\n${content}\n\`\`\``;
            }

            case 'canvas':
                return heading + this.formatJsonNote(content, mime, 'canvas', (parsed) => this.renderCanvas(parsed));

            case 'mindMap':
                return heading + this.formatJsonNote(content, mime, 'mind map', (parsed) => this.renderMindMap(parsed));

            case 'relationMap':
                return heading + this.formatJsonNote(content, mime, 'relation map', (parsed) => this.renderRelationMap(parsed));

            case 'geoMap':
                return heading + this.formatJsonNote(content, mime, 'geographic map', (parsed) => this.renderGeoMap(parsed));

            case 'mermaid':
                // Format mermaid diagrams as code blocks
                return heading + '```mermaid\n' + content + '\n```';

            case 'image':
            case 'file':
                return heading + `[${type} attachment]`;

            default:
                // Any other note type is treated like HTML text
                return heading + this.sanitizeHtml(content);
        }
    }

    /**
     * Shared wrapper for the JSON-backed note types: parses the content and hands
     * it to `render`, turning any failure into a bracketed error message.
     *
     * @param content - Raw note content
     * @param mime - MIME type; anything other than `application/json` yields a placeholder
     * @param label - Lower-case human name of the note type, e.g. `mind map`
     * @param render - Converts the parsed JSON into text
     * @returns Rendered text, a `[<Label> content]` placeholder, or an error message
     */
    private formatJsonNote(content: string, mime: string, label: string, render: (parsed: unknown) => string): string {
        if (mime !== 'application/json') {
            return `[${label.charAt(0).toUpperCase()}${label.slice(1)} content]`;
        }

        try {
            return render(JSON.parse(content));
        } catch (e: unknown) {
            return `[Error parsing ${label} content: ${describeError(e)}]`;
        }
    }

    /** List the text of every `text` element of a canvas document. */
    private renderCanvas(parsed: unknown): string {
        const elements = isJsonObject(parsed) ? parsed.elements : undefined;
        if (!Array.isArray(elements)) {
            return '[Empty canvas]';
        }

        const texts = elements
            .filter((element): element is JsonObject =>
                isJsonObject(element) && element.type === 'text' && Boolean(element.text))
            .map((element) => String(element.text));

        return 'Canvas content:\n' + texts.join('\n');
    }

    /** List the text of every mind map node, parents before their children. */
    private renderMindMap(parsed: unknown): string {
        const root = isJsonObject(parsed) ? parsed.root : undefined;
        if (!root) {
            return '[Empty mind map]';
        }

        const texts: string[] = [];
        const collect = (node: unknown): void => {
            if (!isJsonObject(node)) {
                return;
            }
            if (node.text) {
                texts.push(String(node.text));
            }
            if (Array.isArray(node.children)) {
                node.children.forEach(collect);
            }
        };
        collect(root);

        return 'Mind map content:\n' + texts.join('\n');
    }

    /** List the notes of a relation map and its relations as `source → name → target`. */
    private renderRelationMap(parsed: unknown): string {
        const map: JsonObject = isJsonObject(parsed) ? parsed : {};
        const notes = Array.isArray(map.notes) ? map.notes.filter(isJsonObject) : [];
        const displayName = (note: JsonObject): unknown => note.title || note.name;

        let result = 'Relation map content:\n';

        if (Array.isArray(map.notes)) {
            result += 'Notes: ' + notes.map(displayName).filter(Boolean).join(', ') + '\n';
        }

        if (Array.isArray(map.relations)) {
            // Index notes by id once (first occurrence wins) instead of scanning
            // the note list for every relation.
            const nameById = new Map<unknown, unknown>();
            for (const note of notes) {
                if (!nameById.has(note.noteId)) {
                    nameById.set(note.noteId, displayName(note));
                }
            }
            const resolve = (id: unknown): string => String(nameById.get(id) || 'unknown');

            result += 'Relations: ' + map.relations
                .filter(isJsonObject)
                .map((rel) => `${resolve(rel.sourceNoteId)} → ${rel.name || ''} → ${resolve(rel.targetNoteId)}`)
                .join('; ');
        }

        return result;
    }

    /** List the markers of a geographic map, one location per line. */
    private renderGeoMap(parsed: unknown): string {
        const markers = isJsonObject(parsed) ? parsed.markers : undefined;
        const header = 'Geographic map content:\n';

        if (!Array.isArray(markers) || markers.length === 0) {
            return header + 'Empty geographic map';
        }

        return header + markers
            .map((entry) => {
                const marker: JsonObject = isJsonObject(entry) ? entry : {};
                const description = marker.description ? ` - ${marker.description}` : '';
                return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${description}`;
            })
            .join('\n');
    }

    /**
     * Detect the programming language of code content.
     *
     * The MIME type is consulted first: an exact table hit wins, otherwise the
     * longest table entry contained in `mime` is used (substring matching lets
     * values with extra text around the known fragment still resolve). If the
     * MIME type gives no answer, the first 20 lines are tested against
     * `LANGUAGE_PATTERNS` in order.
     *
     * @param content - The code content to analyze
     * @param mime - MIME type (may be empty)
     * @returns The detected language, or an empty string if none matched
     */
    private detectCodeLanguage(content: string, mime: string): string {
        if (mime) {
            const exact = Object.prototype.hasOwnProperty.call(MIME_LANGUAGES, mime)
                ? MIME_LANGUAGES[mime]
                : undefined;
            if (exact) {
                return exact;
            }

            for (const [mimePattern, language] of MIME_PATTERNS_LONGEST_FIRST) {
                if (mime.includes(mimePattern)) {
                    return language;
                }
            }
        }

        const firstLines = content.split('\n', 20).join('\n');
        for (const [language, pattern] of LANGUAGE_PATTERNS) {
            if (pattern.test(firstLines)) {
                return language;
            }
        }

        // Default to empty string if we can't detect the language
        return '';
    }

    /**
     * Extract the structure of a code file rather than its full content:
     * imports, class/interface declarations with their methods, top-level
     * functions, and the first and last 50 lines.
     *
     * @param content - The full code content
     * @param language - The programming language
     * @returns The whole file as one code block when it is within the line limit,
     *          otherwise a Markdown outline of the file
     */
    private extractCodeStructure(content: string, language: string): string {
        const lines = content.split('\n');
        // NOTE(review): the caller switches to this method at 8000 *characters*,
        // while this threshold counts *lines* — confirm which unit is intended.
        const maxLines = 8000;

        // If it's not that much over the limit, just include the whole thing
        if (lines.length <= maxLines * 1.2) {
            return `\`\`\`${language}\n${content}\n\`\`\``;
        }

        let importSection = '';
        const classDefinitions: string[] = [];
        const functionDefinitions: string[] = [];

        if (['javascript', 'typescript', 'python', 'java', 'csharp'].includes(language)) {
            const importPattern = /^(import|from|using|require|#include|package)\s+/;
            const classPattern = /^(class|interface|type)\s+\w+/;
            const methodPattern = /^\s+(function|def|public|private|protected)\s+\w+/;
            const functionPattern = /^(function|def|const\s+\w+\s*=\s*\(|let\s+\w+\s*=\s*\(|var\s+\w+\s*=\s*\()/;

            // Imports are only looked for in the first 100 lines
            for (let i = 0; i < Math.min(100, lines.length); i++) {
                if (importPattern.test(lines[i])) {
                    importSection += lines[i] + '\n';
                }
            }

            for (let i = 0; i < lines.length; i++) {
                if (!classPattern.test(lines[i])) {
                    continue;
                }

                const endLine = this.findMatchingEnd(lines, i, language);

                if (endLine > i && endLine <= i + 10) {
                    // Include small class definitions entirely
                    classDefinitions.push(lines.slice(i, endLine + 1).join('\n'));
                    i = endLine;
                    continue;
                }

                // For larger classes, show the declaration line and the method signatures
                classDefinitions.push(lines[i]);
                for (let j = i + 1; j < Math.min(endLine, lines.length); j++) {
                    if (methodPattern.test(lines[j])) {
                        classDefinitions.push(' ' + lines[j].trim());
                    }
                }

                // Skip past the class body when its end was found
                if (endLine > 0 && endLine < lines.length) {
                    i = endLine;
                }
            }

            // Functions declared at column 0, i.e. not nested inside a class
            for (const line of lines) {
                if (functionPattern.test(line)) {
                    functionDefinitions.push(line);
                }
            }
        }

        let extractedStructure = `# Code Structure (${lines.length} lines total)\n\n`;

        if (importSection) {
            extractedStructure += "## Imports/Dependencies\n```" + language + "\n" + importSection + "```\n\n";
        }

        if (classDefinitions.length > 0) {
            extractedStructure += "## Classes/Interfaces\n```" + language + "\n" + classDefinitions.join('\n\n') + "\n```\n\n";
        }

        if (functionDefinitions.length > 0) {
            extractedStructure += "## Functions\n```" + language + "\n" + functionDefinitions.join('\n\n') + "\n```\n\n";
        }

        // Add beginning and end of the file for context
        extractedStructure += "## Beginning of File\n```" + language + "\n" +
            lines.slice(0, Math.min(50, lines.length)).join('\n') + "\n```\n\n";

        if (lines.length > 100) {
            extractedStructure += "## End of File\n```" + language + "\n" +
                lines.slice(Math.max(0, lines.length - 50)).join('\n') + "\n```\n\n";
        }

        return extractedStructure;
    }

    /**
     * Find the line on which the block that starts at `startLine` ends.
     *
     * Curly-brace languages: the block must open on `startLine` or the line after
     * it; braces are then counted until the opening brace is balanced. Braces in
     * strings and comments are counted too, so the result is a heuristic.
     *
     * Python: the block ends at the last non-blank line that is indented deeper
     * than `startLine`, including when the block runs to the end of the file.
     *
     * @param lines - Array of code lines
     * @param startLine - Index of the line holding the declaration
     * @param language - Programming language
     * @returns Index of the last line of the block, or -1 if it cannot be determined
     */
    private findMatchingEnd(lines: string[], startLine: number, language: string): number {
        if (startLine < 0 || startLine >= lines.length) {
            return -1;
        }

        if (BRACE_LANGUAGES.includes(language)) {
            // Decide up front whether a block opens here, so that a one-line
            // declaration such as `class A {}` is matched on its own line rather
            // than against a later, unrelated block.
            const opensOnStartLine = lines[startLine].includes('{');
            const opensOnNextLine = startLine + 1 < lines.length && lines[startLine + 1].includes('{');
            if (!opensOnStartLine && !opensOnNextLine) {
                return -1;
            }

            let depth = 0;
            for (let i = startLine; i < lines.length; i++) {
                for (const char of lines[i]) {
                    if (char === '{') {
                        depth++;
                    } else if (char === '}' && depth > 0) {
                        depth--;
                        if (depth === 0) {
                            return i;
                        }
                    }
                }
            }
            return -1;
        }

        if (language === 'python') {
            const indentationOf = (line: string): number => line.match(/^\s*/)?.[0].length || 0;
            const baseIndentation = indentationOf(lines[startLine]);

            let lastBodyLine = startLine;
            for (let i = startLine + 1; i < lines.length; i++) {
                // Blank lines belong neither to the block nor to its end
                if (lines[i].trim() === '') {
                    continue;
                }
                // Back at the same or a lower indentation level: the block is over
                if (indentationOf(lines[i]) <= baseIndentation) {
                    break;
                }
                lastBodyLine = i;
            }
            return lastBodyLine;
        }

        return -1;
    }

    /**
     * Convert HTML content to plain text.
     *
     * @param html - HTML markup (may be empty)
     * @returns The text with all tags removed and common entities decoded
     */
    private sanitizeHtml(html: string): string {
        if (!html) {
            return '';
        }

        // Strip every tag and attribute, keeping only text nodes
        const text = sanitizeHtml(html, {
            allowedTags: [],
            allowedAttributes: {},
            textFilter: (fragment) => {
                // Collapse runs of blank lines into a single blank line
                return fragment.replace(/\n\s*\n/g, '\n\n');
            }
        });

        // Decode the entities left in the text. `&amp;` goes last so that an
        // escaped entity such as `&amp;lt;` becomes `&lt;` and not `<`.
        return text
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;|&apos;/g, "'")
            .replace(/&amp;/g, '&');
    }

    /**
     * Walk up the hierarchy and collect the note's ancestors.
     *
     * Only one parent is followed per level (`LIMIT 1`), and the walk stops at
     * the note whose parent is `root` or after `maxDepth` levels.
     *
     * @param noteId - The ID of the note to start from
     * @param maxDepth - Maximum number of levels to walk
     * @returns Ancestors ordered from the most distant one to the direct parent
     */
    private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
        const parentNotes: {noteId: string, title: string}[] = [];
        let currentNoteId = noteId;

        for (let i = 0; i < maxDepth; i++) {
            const parent = await sql.getRow<{parentNoteId: string, title: string}>(
                `SELECT branches.parentNoteId, notes.title
                 FROM branches
                 JOIN notes ON branches.parentNoteId = notes.noteId
                 WHERE branches.noteId = ? AND branches.isDeleted = 0
                 LIMIT 1`,
                [currentNoteId]
            );

            if (!parent || parent.parentNoteId === 'root') {
                break;
            }

            // Prepend so the list reads top-down
            parentNotes.unshift({
                noteId: parent.parentNoteId,
                title: parent.title
            });

            currentNoteId = parent.parentNoteId;
        }

        return parentNotes;
    }

    /**
     * Get the full context for a note: parent hierarchy, content, children and
     * linked notes, separated by blank lines.
     *
     * @param noteId - The ID of the note
     * @returns The combined context, or `'Note not found'` when the note has no content
     */
    async getFullContext(noteId: string): Promise<string> {
        const noteContent = await this.getNoteContent(noteId);
        if (!noteContent) {
            return 'Note not found';
        }

        // The three lookups do not depend on each other
        const [parentContext, childContext, linkedContext] = await Promise.all([
            this.getParentContext(noteId),
            this.getChildContext(noteId),
            this.getLinkedNotesContext(noteId)
        ]);

        return [
            parentContext,
            noteContent,
            childContext,
            linkedContext
        ].filter(Boolean).join('\n\n');
    }

    /**
     * Get context ranked by semantic similarity to a query.
     * Delegates to the semantic context service and falls back to
     * `getFullContext` when the service is missing or throws.
     *
     * @param noteId - The ID of the current note
     * @param query - The user's query to compare against
     * @param maxResults - Maximum number of related notes to include
     * @returns Context returned by the semantic context service, or the full context
     */
    async getSemanticContext(noteId: string, query: string, maxResults = 5): Promise<string> {
        try {
            // Dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getSemanticContext(noteId, query, maxResults);
        } catch (error) {
            // Fall back to regular context if semantic ranking fails
            console.error('Error in semantic context ranking:', error);
            return this.getFullContext(noteId);
        }
    }

    /**
     * Get context whose level of detail depends on `depth`.
     * Delegates to the semantic context service and falls back to
     * `getFullContext` when the service is missing or throws.
     *
     * @param noteId - The ID of the note to get context for
     * @param depth - Depth level passed through to the semantic context service
     * @returns Context returned by the semantic context service, or the full context
     */
    async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        try {
            // Dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getProgressiveContext(noteId, depth);
        } catch (error) {
            // Fall back to regular context if progressive loading fails
            console.error('Error in progressive context loading:', error);
            return this.getFullContext(noteId);
        }
    }

    /**
     * Get context selected by the semantic context service for the given query.
     * Falls back to `getFullContext` when the service is missing or throws.
     *
     * @param noteId - The ID of the note to get context for
     * @param query - The user's query
     * @returns Context returned by the semantic context service, or the full context
     */
    async getSmartContext(noteId: string, query: string): Promise<string> {
        try {
            // Dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getSmartContext(noteId, query);
        } catch (error) {
            // Fall back to regular context if smart context fails
            console.error('Error in smart context selection:', error);
            return this.getFullContext(noteId);
        }
    }
}

// Singleton instance
const contextExtractor = new ContextExtractor();
export default contextExtractor;