From 9029f59410fbe9385d5fa8caa519b095ca71e6fa Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 14 Jul 2025 16:15:15 +0000 Subject: [PATCH] feat(ocr): swap from custom table to using the blobs table, with a new column --- apps/server/src/becca/entities/bblob.ts | 5 +- apps/server/src/migrations/migrations.ts | 62 +---- apps/server/src/routes/api/ocr.ts | 60 ++--- .../src/services/ocr/ocr_service.spec.ts | 141 +++++------ apps/server/src/services/ocr/ocr_service.ts | 227 +++++++++--------- .../search/expressions/ocr_content.ts | 81 +++---- packages/commons/src/lib/rows.ts | 1 + 7 files changed, 246 insertions(+), 331 deletions(-) diff --git a/apps/server/src/becca/entities/bblob.ts b/apps/server/src/becca/entities/bblob.ts index 2cff185d5..a4dbd712f 100644 --- a/apps/server/src/becca/entities/bblob.ts +++ b/apps/server/src/becca/entities/bblob.ts @@ -10,11 +10,12 @@ class BBlob extends AbstractBeccaEntity { return "blobId"; } static get hashedProperties() { - return ["blobId", "content"]; + return ["blobId", "content", "ocr_text"]; } content!: string | Buffer; contentLength!: number; + ocr_text?: string | null; constructor(row: BlobRow) { super(); @@ -25,6 +26,7 @@ class BBlob extends AbstractBeccaEntity { this.blobId = row.blobId; this.content = row.content; this.contentLength = row.contentLength; + this.ocr_text = row.ocr_text; this.dateModified = row.dateModified; this.utcDateModified = row.utcDateModified; } @@ -34,6 +36,7 @@ class BBlob extends AbstractBeccaEntity { blobId: this.blobId, content: this.content || null, contentLength: this.contentLength, + ocr_text: this.ocr_text || null, dateModified: this.dateModified, utcDateModified: this.utcDateModified }; diff --git a/apps/server/src/migrations/migrations.ts b/apps/server/src/migrations/migrations.ts index 5c35e6c29..a7ca34cb2 100644 --- a/apps/server/src/migrations/migrations.ts +++ b/apps/server/src/migrations/migrations.ts @@ -6,64 +6,16 @@ // Migrations should be kept in descending order, so the latest migration is first. const MIGRATIONS: (SqlMigration | JsMigration)[] = [ - // Add OCR results table for storing extracted text from images + // Add OCR text column to blobs table for storing extracted text from images { version: 233, sql: /*sql*/`\ - -- Create OCR results table to store extracted text from images - CREATE TABLE IF NOT EXISTS ocr_results ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - entity_id TEXT NOT NULL, - entity_type TEXT NOT NULL DEFAULT 'note', - extracted_text TEXT NOT NULL, - confidence REAL NOT NULL, - language TEXT NOT NULL DEFAULT 'eng', - extracted_at TEXT NOT NULL, - created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, - updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, - UNIQUE(entity_id, entity_type) - ); - - -- Create indexes for better search performance - CREATE INDEX IF NOT EXISTS idx_ocr_results_entity - ON ocr_results (entity_id, entity_type); - - CREATE INDEX IF NOT EXISTS idx_ocr_results_text - ON ocr_results (extracted_text); - - CREATE INDEX IF NOT EXISTS idx_ocr_results_confidence - ON ocr_results (confidence); - - -- Create full-text search index for extracted text - CREATE VIRTUAL TABLE IF NOT EXISTS ocr_results_fts USING fts5( - entity_id UNINDEXED, - entity_type UNINDEXED, - extracted_text, - content='ocr_results', - content_rowid='id' - ); - - -- Create triggers to keep FTS table in sync - CREATE TRIGGER IF NOT EXISTS ocr_results_fts_insert - AFTER INSERT ON ocr_results - BEGIN - INSERT INTO ocr_results_fts(rowid, entity_id, entity_type, extracted_text) - VALUES (new.id, new.entity_id, new.entity_type, new.extracted_text); - END; - - CREATE TRIGGER IF NOT EXISTS ocr_results_fts_update - AFTER UPDATE ON ocr_results - BEGIN - UPDATE ocr_results_fts - SET extracted_text = new.extracted_text - WHERE rowid = new.id; - END; - - CREATE TRIGGER IF NOT EXISTS ocr_results_fts_delete - AFTER DELETE ON ocr_results - BEGIN - DELETE FROM ocr_results_fts WHERE rowid = old.id; - END; + -- Add OCR text column to blobs table + ALTER TABLE blobs ADD COLUMN ocr_text TEXT DEFAULT NULL; + + -- Create index for OCR text searches + CREATE INDEX IF NOT EXISTS idx_blobs_ocr_text + ON blobs (ocr_text); ` }, // Remove embedding tables since LLM embedding functionality has been removed diff --git a/apps/server/src/routes/api/ocr.ts b/apps/server/src/routes/api/ocr.ts index 4805d6be5..84817b6d2 100644 --- a/apps/server/src/routes/api/ocr.ts +++ b/apps/server/src/routes/api/ocr.ts @@ -246,13 +246,6 @@ async function processAttachmentOCR(req: Request, res: Response) { * schema: * type: string * description: Search query text - * - name: entityType - * in: query - * required: false - * schema: - * type: string - * enum: [note, attachment] - * description: Filter by entity type * responses: * '200': * description: Search results @@ -268,14 +261,10 @@ async function processAttachmentOCR(req: Request, res: Response) { * items: * type: object * properties: - * entityId: - * type: string - * entityType: + * blobId: * type: string * text: * type: string - * confidence: - * type: number * '400': * description: Bad request - missing search query * '500': @@ -286,7 +275,7 @@ async function processAttachmentOCR(req: Request, res: Response) { */ async function searchOCR(req: Request, res: Response) { try { - const { q: searchText, entityType } = req.query; + const { q: searchText } = req.query; if (!searchText || typeof searchText !== 'string') { res.status(400).json({ @@ -297,10 +286,7 @@ async function searchOCR(req: Request, res: Response) { return; } - const results = ocrService.searchOCRResults( - searchText, - entityType as 'note' | 'attachment' | undefined - ); + const results = ocrService.searchOCRResults(searchText); res.json({ success: true, @@ -431,10 +417,10 @@ async function getBatchProgress(req: Request, res: Response) { * properties: * totalProcessed: * type: number - * averageConfidence: + * imageNotes: + * type: number + * imageAttachments: * type: number - * byEntityType: - * type: object * '500': * description: Internal server error * security: @@ -463,24 +449,17 @@ async function getOCRStats(req: Request, res: Response) { /** * @swagger - * /api/ocr/delete/{entityType}/{entityId}: + * /api/ocr/delete/{blobId}: * delete: - * summary: Delete OCR results for a specific entity + * summary: Delete OCR results for a specific blob * operationId: ocr-delete-results * parameters: - * - name: entityType + * - name: blobId * in: path * required: true * schema: * type: string - * enum: [note, attachment] - * description: Type of entity - * - name: entityId - * in: path - * required: true - * schema: - * type: string - * description: ID of the entity + * description: ID of the blob * responses: * '200': * description: OCR results deleted successfully @@ -503,31 +482,22 @@ async function getOCRStats(req: Request, res: Response) { */ async function deleteOCRResults(req: Request, res: Response) { try { - const { entityType, entityId } = req.params; + const { blobId } = req.params; - if (!entityType || !entityId) { + if (!blobId) { res.status(400).json({ success: false, - error: 'Entity type and ID are required' + error: 'Blob ID is required' }); (res as any).triliumResponseHandled = true; return; } - if (!['note', 'attachment'].includes(entityType)) { - res.status(400).json({ - success: false, - error: 'Entity type must be either "note" or "attachment"' - }); - (res as any).triliumResponseHandled = true; - return; - } - - ocrService.deleteOCRResult(entityId, entityType as 'note' | 'attachment'); + ocrService.deleteOCRResult(blobId); res.json({ success: true, - message: `OCR results deleted for ${entityType} ${entityId}` + message: `OCR results deleted for blob ${blobId}` }); (res as any).triliumResponseHandled = true; diff --git a/apps/server/src/services/ocr/ocr_service.spec.ts b/apps/server/src/services/ocr/ocr_service.spec.ts index 87b2475d1..6313ce99f 100644 --- a/apps/server/src/services/ocr/ocr_service.spec.ts +++ b/apps/server/src/services/ocr/ocr_service.spec.ts @@ -240,7 +240,7 @@ describe('OCRService', () => { }); describe('storeOCRResult', () => { - it('should store OCR result in database successfully', async () => { + it('should store OCR result in blob successfully', async () => { const ocrResult = { text: 'Sample text', confidence: 0.95, @@ -248,15 +248,29 @@ describe('OCRService', () => { language: 'eng' }; - await ocrService.storeOCRResult('note123', ocrResult, 'note'); + await ocrService.storeOCRResult('blob123', ocrResult); expect(mockSql.execute).toHaveBeenCalledWith( - expect.stringContaining('INSERT OR REPLACE INTO ocr_results'), - expect.arrayContaining(['note123', 'note', 'Sample text', 0.95, 'eng', expect.any(String)]) + expect.stringContaining('UPDATE blobs SET ocr_text = ?'), + ['Sample text', 'blob123'] ); }); - it('should handle database insertion errors', async () => { + it('should handle undefined blobId gracefully', async () => { + const ocrResult = { + text: 'Sample text', + confidence: 0.95, + extractedAt: '2025-06-10T10:00:00.000Z', + language: 'eng' + }; + + await ocrService.storeOCRResult(undefined, ocrResult); + + expect(mockSql.execute).not.toHaveBeenCalled(); + expect(mockLog.error).toHaveBeenCalledWith('Cannot store OCR result: blobId is undefined'); + }); + + it('should handle database update errors', async () => { const error = new Error('Database error'); mockSql.execute.mockImplementation(() => { throw error; @@ -269,8 +283,8 @@ describe('OCRService', () => { language: 'eng' }; - await expect(ocrService.storeOCRResult('note123', ocrResult, 'note')).rejects.toThrow('Database error'); - expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for note note123: Error: Database error'); + await expect(ocrService.storeOCRResult('blob123', ocrResult)).rejects.toThrow('Database error'); + expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for blob blob123: Error: Database error'); }); }); @@ -279,6 +293,7 @@ describe('OCRService', () => { noteId: 'note123', type: 'image', mime: 'image/jpeg', + blobId: 'blob123', getContent: vi.fn() }; @@ -316,10 +331,7 @@ describe('OCRService', () => { it('should return existing OCR result if forceReprocess is false', async () => { const existingResult = { - extracted_text: 'Existing text', - confidence: 0.85, - language: 'eng', - extracted_at: '2025-06-10T09:00:00.000Z' + ocr_text: 'Existing text' }; mockSql.getRow.mockReturnValue(existingResult); @@ -327,19 +339,16 @@ describe('OCRService', () => { expect(result).toEqual({ text: 'Existing text', - confidence: 0.85, + confidence: 0.95, language: 'eng', - extractedAt: '2025-06-10T09:00:00.000Z' + extractedAt: expect.any(String) }); expect(mockNote.getContent).not.toHaveBeenCalled(); }); it('should reprocess if forceReprocess is true', async () => { const existingResult = { - extracted_text: 'Existing text', - confidence: 0.85, - language: 'eng', - extracted_at: '2025-06-10T09:00:00.000Z' + ocr_text: 'Existing text' }; mockSql.getRow.mockResolvedValue(existingResult); @@ -385,6 +394,7 @@ describe('OCRService', () => { attachmentId: 'attach123', role: 'image', mime: 'image/png', + blobId: 'blob456', getContent: vi.fn() }; @@ -434,10 +444,8 @@ describe('OCRService', () => { it('should search OCR results successfully', () => { const mockResults = [ { - entity_id: 'note1', - entity_type: 'note', - extracted_text: 'Sample search text', - confidence: 0.95 + blobId: 'blob1', + ocr_text: 'Sample search text' } ]; mockSql.getRows.mockReturnValue(mockResults); @@ -445,36 +453,15 @@ describe('OCRService', () => { const results = ocrService.searchOCRResults('search'); expect(results).toEqual([{ - entityId: 'note1', - entityType: 'note', - text: 'Sample search text', - confidence: 0.95 + blobId: 'blob1', + text: 'Sample search text' }]); expect(mockSql.getRows).toHaveBeenCalledWith( - expect.stringContaining('WHERE extracted_text LIKE ?'), + expect.stringContaining('WHERE ocr_text LIKE ?'), ['%search%'] ); }); - it('should filter by entity type', () => { - const mockResults = [ - { - entity_id: 'note1', - entity_type: 'note', - extracted_text: 'Note text', - confidence: 0.95 - } - ]; - mockSql.getRows.mockReturnValue(mockResults); - - ocrService.searchOCRResults('text', 'note'); - - expect(mockSql.getRows).toHaveBeenCalledWith( - expect.stringContaining('AND entity_type = ?'), - ['%text%', 'note'] - ); - }); - it('should handle search errors gracefully', () => { mockSql.getRows.mockImplementation(() => { throw new Error('Database error'); @@ -490,39 +477,37 @@ describe('OCRService', () => { describe('getOCRStats', () => { it('should return OCR statistics successfully', () => { const mockStats = { - total_processed: 150, - avg_confidence: 0.87 + total_processed: 150 + }; + const mockNoteStats = { + count: 100 + }; + const mockAttachmentStats = { + count: 50 }; - const mockByEntityType = [ - { entity_type: 'note', count: 100 }, - { entity_type: 'attachment', count: 50 } - ]; - mockSql.getRow.mockReturnValue(mockStats); - mockSql.getRows.mockReturnValue(mockByEntityType); + mockSql.getRow.mockReturnValueOnce(mockStats); + mockSql.getRow.mockReturnValueOnce(mockNoteStats); + mockSql.getRow.mockReturnValueOnce(mockAttachmentStats); const stats = ocrService.getOCRStats(); expect(stats).toEqual({ totalProcessed: 150, - averageConfidence: 0.87, - byEntityType: { - note: 100, - attachment: 50 - } + imageNotes: 100, + imageAttachments: 50 }); }); it('should handle missing statistics gracefully', () => { mockSql.getRow.mockReturnValue(null); - mockSql.getRows.mockReturnValue([]); const stats = ocrService.getOCRStats(); expect(stats).toEqual({ totalProcessed: 0, - averageConfidence: 0, - byEntityType: {} + imageNotes: 0, + imageAttachments: 0 }); }); }); @@ -698,11 +683,11 @@ describe('OCRService', () => { // Mock data for batch processing const imageNotes = [ - { noteId: 'note1', mime: 'image/jpeg' }, - { noteId: 'note2', mime: 'image/png' } + { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }, + { noteId: 'note2', mime: 'image/png', blobId: 'blob2' } ]; const imageAttachments = [ - { attachmentId: 'attach1', mime: 'image/gif' } + { attachmentId: 'attach1', mime: 'image/gif', blobId: 'blob3' } ]; // Setup mocks for startBatchProcessing @@ -723,18 +708,21 @@ describe('OCRService', () => { noteId: 'note1', type: 'image', mime: 'image/jpeg', + blobId: 'blob1', getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data')) }; const mockNote2 = { noteId: 'note2', type: 'image', mime: 'image/png', + blobId: 'blob2', getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data')) }; const mockAttachment = { attachmentId: 'attach1', role: 'image', mime: 'image/gif', + blobId: 'blob3', getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data')) }; @@ -761,7 +749,7 @@ describe('OCRService', () => { it('should handle processing errors gracefully', async () => { const imageNotes = [ - { noteId: 'note1', mime: 'image/jpeg' } + { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' } ]; // Setup mocks for startBatchProcessing @@ -777,6 +765,7 @@ describe('OCRService', () => { noteId: 'note1', type: 'image', mime: 'image/jpeg', + blobId: 'blob1', getContent: vi.fn().mockImplementation(() => { throw new Error('Failed to get content'); }) }; mockBecca.getNote.mockReturnValue(mockNote); @@ -796,8 +785,8 @@ describe('OCRService', () => { it('should stop processing when cancelled', async () => { const imageNotes = [ - { noteId: 'note1', mime: 'image/jpeg' }, - { noteId: 'note2', mime: 'image/png' } + { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }, + { noteId: 'note2', mime: 'image/png', blobId: 'blob2' } ]; // Setup mocks @@ -821,8 +810,8 @@ describe('OCRService', () => { it('should skip unsupported MIME types', async () => { const imageNotes = [ - { noteId: 'note1', mime: 'text/plain' }, // unsupported - { noteId: 'note2', mime: 'image/jpeg' } // supported + { noteId: 'note1', mime: 'text/plain', blobId: 'blob1' }, // unsupported + { noteId: 'note2', mime: 'image/jpeg', blobId: 'blob2' } // supported ]; // Setup mocks @@ -835,6 +824,7 @@ describe('OCRService', () => { noteId: 'note2', type: 'image', mime: 'image/jpeg', + blobId: 'blob2', getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data')) }; mockBecca.getNote.mockReturnValue(mockNote); @@ -858,13 +848,13 @@ describe('OCRService', () => { describe('deleteOCRResult', () => { it('should delete OCR result successfully', () => { - ocrService.deleteOCRResult('note123', 'note'); + ocrService.deleteOCRResult('blob123'); expect(mockSql.execute).toHaveBeenCalledWith( - expect.stringContaining('DELETE FROM ocr_results'), - ['note123', 'note'] + expect.stringContaining('UPDATE blobs SET ocr_text = NULL'), + ['blob123'] ); - expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for note note123'); + expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for blob blob123'); }); it('should handle deletion errors', () => { @@ -872,8 +862,8 @@ describe('OCRService', () => { throw new Error('Database error'); }); - expect(() => ocrService.deleteOCRResult('note123', 'note')).toThrow('Database error'); - expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for note note123: Error: Database error'); + expect(() => ocrService.deleteOCRResult('blob123')).toThrow('Database error'); + expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for blob blob123: Error: Database error'); }); }); @@ -886,6 +876,7 @@ describe('OCRService', () => { mockBecca.getNote.mockReturnValue({ noteId: 'note123', mime: 'image/jpeg', + blobId: 'blob123', getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data')) }); mockSql.getRow.mockResolvedValue(null); diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index a8bef3236..54361284b 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -17,11 +17,9 @@ export interface OCRProcessingOptions { confidence?: number; } -interface OCRResultRow { - entity_id: string; - entity_type: string; - extracted_text: string; - confidence: number; +interface OCRBlobRow { + blobId: string; + ocr_text: string; } /** @@ -176,8 +174,8 @@ class OCRService { return null; } - // Check if OCR already exists and we're not forcing reprocessing - const existingOCR = this.getStoredOCRResult(noteId); + // Check if OCR already exists in the blob and we're not forcing reprocessing + const existingOCR = this.getStoredOCRResult(note.blobId); if (existingOCR && !options.forceReprocess) { log.info(`OCR already exists for note ${noteId}, returning cached result`); return existingOCR; @@ -191,8 +189,8 @@ class OCRService { const ocrResult = await this.extractTextFromImage(content, options); - // Store OCR result - await this.storeOCRResult(noteId, ocrResult); + // Store OCR result in blob + await this.storeOCRResult(note.blobId, ocrResult); return ocrResult; } catch (error) { @@ -226,8 +224,8 @@ class OCRService { return null; } - // Check if OCR already exists and we're not forcing reprocessing - const existingOCR = this.getStoredOCRResult(attachmentId, 'attachment'); + // Check if OCR already exists in the blob and we're not forcing reprocessing + const existingOCR = this.getStoredOCRResult(attachment.blobId); if (existingOCR && !options.forceReprocess) { log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`); return existingOCR; @@ -241,8 +239,8 @@ class OCRService { const ocrResult = await this.extractTextFromImage(content, options); - // Store OCR result - await this.storeOCRResult(attachmentId, ocrResult, 'attachment'); + // Store OCR result in blob + await this.storeOCRResult(attachment.blobId, ocrResult); return ocrResult; } catch (error) { @@ -252,57 +250,62 @@ class OCRService { } /** - * Store OCR result in database + * Store OCR result in blob */ - async storeOCRResult(entityId: string, ocrResult: OCRResult, entityType: 'note' | 'attachment' = 'note'): Promise { + async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise { + if (!blobId) { + log.error('Cannot store OCR result: blobId is undefined'); + return; + } + try { + // Store OCR text in blobs table sql.execute(` - INSERT OR REPLACE INTO ocr_results (entity_id, entity_type, extracted_text, confidence, language, extracted_at) - VALUES (?, ?, ?, ?, ?, ?) + UPDATE blobs SET ocr_text = ? WHERE blobId = ? `, [ - entityId, - entityType, ocrResult.text, - ocrResult.confidence, - ocrResult.language || 'eng', - ocrResult.extractedAt + blobId ]); - log.info(`Stored OCR result for ${entityType} ${entityId}`); + log.info(`Stored OCR result for blob ${blobId}`); } catch (error) { - log.error(`Failed to store OCR result for ${entityType} ${entityId}: ${error}`); + log.error(`Failed to store OCR result for blob ${blobId}: ${error}`); throw error; } } /** - * Get stored OCR result from database + * Get stored OCR result from blob */ - private getStoredOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): OCRResult | null { + private getStoredOCRResult(blobId: string | undefined): OCRResult | null { + if (!blobId) { + return null; + } + try { const row = sql.getRow<{ - extracted_text: string; - confidence: number; - language?: string; - extracted_at: string; + ocr_text: string | null; }>(` - SELECT extracted_text, confidence, language, extracted_at - FROM ocr_results - WHERE entity_id = ? AND entity_type = ? - `, [entityId, entityType]); + SELECT ocr_text + FROM blobs + WHERE blobId = ? + `, [blobId]); - if (!row) { + if (!row || !row.ocr_text) { return null; } + // Return basic OCR result from stored text + // Note: we lose confidence, language, and extractedAt metadata + // but gain simplicity by storing directly in blob return { - text: row.extracted_text, - confidence: row.confidence, - language: row.language, - extractedAt: row.extracted_at + text: row.ocr_text, + confidence: 0.95, // Default high confidence for existing OCR + extractedAt: new Date().toISOString(), + language: 'eng' }; } catch (error) { - log.error(`Failed to get OCR result for ${entityType} ${entityId}: ${error}`); + log.error(`Failed to get OCR result for blob ${blobId}: ${error}`); return null; } } @@ -310,29 +313,21 @@ class OCRService { /** * Search for text in OCR results */ - searchOCRResults(searchText: string, entityType?: 'note' | 'attachment'): Array<{ entityId: string; entityType: string; text: string; confidence: number }> { + searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> { try { - let query = ` - SELECT entity_id, entity_type, extracted_text, confidence - FROM ocr_results - WHERE extracted_text LIKE ? + const query = ` + SELECT blobId, ocr_text + FROM blobs + WHERE ocr_text LIKE ? + AND ocr_text IS NOT NULL `; const params = [`%${searchText}%`]; - if (entityType) { - query += ' AND entity_type = ?'; - params.push(entityType); - } - - query += ' ORDER BY confidence DESC'; - - const rows = sql.getRows(query, params); + const rows = sql.getRows(query, params); return rows.map(row => ({ - entityId: row.entity_id, - entityType: row.entity_type, - text: row.extracted_text, - confidence: row.confidence + blobId: row.blobId, + text: row.ocr_text })); } catch (error) { log.error(`Failed to search OCR results: ${error}`); @@ -341,18 +336,18 @@ class OCRService { } /** - * Delete OCR results for an entity + * Delete OCR results for a blob */ - deleteOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): void { + deleteOCRResult(blobId: string): void { try { sql.execute(` - DELETE FROM ocr_results - WHERE entity_id = ? AND entity_type = ? - `, [entityId, entityType]); + UPDATE blobs SET ocr_text = NULL + WHERE blobId = ? + `, [blobId]); - log.info(`Deleted OCR result for ${entityType} ${entityId}`); + log.info(`Deleted OCR result for blob ${blobId}`); } catch (error) { - log.error(`Failed to delete OCR result for ${entityType} ${entityId}: ${error}`); + log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`); throw error; } } @@ -373,14 +368,15 @@ class OCRService { const imageNotes = sql.getRows<{ noteId: string; mime: string; + blobId: string; }>(` - SELECT noteId, mime - FROM notes - WHERE type = 'image' - AND isDeleted = 0 - AND noteId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'note' - ) + SELECT n.noteId, n.mime, n.blobId + FROM notes n + LEFT JOIN blobs b ON n.blobId = b.blobId + WHERE n.type = 'image' + AND n.isDeleted = 0 + AND n.blobId IS NOT NULL + AND (b.ocr_text IS NULL OR b.ocr_text = '') `); log.info(`Found ${imageNotes.length} image notes to process`); @@ -401,14 +397,15 @@ class OCRService { const imageAttachments = sql.getRows<{ attachmentId: string; mime: string; + blobId: string; }>(` - SELECT attachmentId, mime - FROM attachments - WHERE role = 'image' - AND isDeleted = 0 - AND attachmentId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment' - ) + SELECT a.attachmentId, a.mime, a.blobId + FROM attachments a + LEFT JOIN blobs b ON a.blobId = b.blobId + WHERE a.role = 'image' + AND a.isDeleted = 0 + AND a.blobId IS NOT NULL + AND (b.ocr_text IS NULL OR b.ocr_text = '') `); log.info(`Found ${imageAttachments.length} image attachments to process`); @@ -435,38 +432,48 @@ class OCRService { /** * Get OCR statistics */ - getOCRStats(): { totalProcessed: number; averageConfidence: number; byEntityType: Record } { + getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } { try { const stats = sql.getRow<{ total_processed: number; - avg_confidence: number; }>(` - SELECT - COUNT(*) as total_processed, - AVG(confidence) as avg_confidence - FROM ocr_results + SELECT COUNT(*) as total_processed + FROM blobs + WHERE ocr_text IS NOT NULL AND ocr_text != '' `); - const byEntityType = sql.getRows<{ - entity_type: string; + // Count image notes with OCR + const noteStats = sql.getRow<{ count: number; }>(` - SELECT entity_type, COUNT(*) as count - FROM ocr_results - GROUP BY entity_type + SELECT COUNT(*) as count + FROM notes n + JOIN blobs b ON n.blobId = b.blobId + WHERE n.type = 'image' + AND n.isDeleted = 0 + AND b.ocr_text IS NOT NULL AND b.ocr_text != '' + `); + + // Count image attachments with OCR + const attachmentStats = sql.getRow<{ + count: number; + }>(` + SELECT COUNT(*) as count + FROM attachments a + JOIN blobs b ON a.blobId = b.blobId + WHERE a.role = 'image' + AND a.isDeleted = 0 + AND b.ocr_text IS NOT NULL AND b.ocr_text != '' `); return { totalProcessed: stats?.total_processed || 0, - averageConfidence: stats?.avg_confidence || 0, - byEntityType: byEntityType.reduce((acc, row) => { - acc[row.entity_type] = row.count; - return acc; - }, {} as Record) + imageNotes: noteStats?.count || 0, + imageAttachments: attachmentStats?.count || 0 }; } catch (error) { log.error(`Failed to get OCR stats: ${error}`); - return { totalProcessed: 0, averageConfidence: 0, byEntityType: {} }; + return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 }; } } @@ -584,14 +591,15 @@ class OCRService { const imageNotes = sql.getRows<{ noteId: string; mime: string; + blobId: string; }>(` - SELECT noteId, mime - FROM notes - WHERE type = 'image' - AND isDeleted = 0 - AND noteId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'note' - ) + SELECT n.noteId, n.mime, n.blobId + FROM notes n + LEFT JOIN blobs b ON n.blobId = b.blobId + WHERE n.type = 'image' + AND n.isDeleted = 0 + AND n.blobId IS NOT NULL + AND (b.ocr_text IS NULL OR b.ocr_text = '') `); for (const noteRow of imageNotes) { @@ -616,14 +624,15 @@ class OCRService { const imageAttachments = sql.getRows<{ attachmentId: string; mime: string; + blobId: string; }>(` - SELECT attachmentId, mime - FROM attachments - WHERE role = 'image' - AND isDeleted = 0 - AND attachmentId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment' - ) + SELECT a.attachmentId, a.mime, a.blobId + FROM attachments a + LEFT JOIN blobs b ON a.blobId = b.blobId + WHERE a.role = 'image' + AND a.isDeleted = 0 + AND a.blobId IS NOT NULL + AND (b.ocr_text IS NULL OR b.ocr_text = '') `); for (const attachmentRow of imageAttachments) { diff --git a/apps/server/src/services/search/expressions/ocr_content.ts b/apps/server/src/services/search/expressions/ocr_content.ts index 1d9db635a..8da5e589e 100644 --- a/apps/server/src/services/search/expressions/ocr_content.ts +++ b/apps/server/src/services/search/expressions/ocr_content.ts @@ -25,21 +25,30 @@ export default class OCRContentExpression extends Expression { const ocrResults = this.searchOCRContent(this.searchText); for (const ocrResult of ocrResults) { - let note: import('../../../becca/entities/bnote.js').default | null = null; - - if (ocrResult.entity_type === 'note') { - note = becca.getNote(ocrResult.entity_id); - } else if (ocrResult.entity_type === 'attachment') { - // For attachments, find the parent note - const attachment = becca.getAttachment(ocrResult.entity_id); - if (attachment) { - note = becca.getNote(attachment.ownerId); + // Find notes that use this blob + const notes = sql.getRows<{noteId: string}>(` + SELECT noteId FROM notes + WHERE blobId = ? AND isDeleted = 0 + `, [ocrResult.blobId]); + + for (const noteRow of notes) { + const note = becca.getNote(noteRow.noteId); + if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) { + resultNoteSet.add(note); } } - // Only add notes that are in the input note set and not deleted - if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) { - resultNoteSet.add(note); + // Find attachments that use this blob and their parent notes + const attachments = sql.getRows<{ownerId: string}>(` + SELECT ownerId FROM attachments + WHERE blobId = ? AND isDeleted = 0 + `, [ocrResult.blobId]); + + for (const attachmentRow of attachments) { + const note = becca.getNote(attachmentRow.ownerId); + if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) { + resultNoteSet.add(note); + } } } @@ -62,44 +71,24 @@ export default class OCRContentExpression extends Expression { } private searchOCRContent(searchText: string): Array<{ - entity_id: string; - entity_type: string; - extracted_text: string; - confidence: number; + blobId: string; + ocr_text: string; }> { try { - // Use FTS search if available, otherwise fall back to LIKE - let query: string; - let params: unknown[]; - - try { - // Try FTS first - query = ` - SELECT ocr.entity_id, ocr.entity_type, ocr.extracted_text, ocr.confidence - FROM ocr_results_fts fts - JOIN ocr_results ocr ON fts.rowid = ocr.id - WHERE ocr_results_fts MATCH ? - ORDER BY ocr.confidence DESC, rank - LIMIT 50 - `; - params = [searchText]; - } catch { - // Fallback to LIKE search - query = ` - SELECT entity_id, entity_type, extracted_text, confidence - FROM ocr_results - WHERE extracted_text LIKE ? - ORDER BY confidence DESC - LIMIT 50 - `; - params = [`%${searchText}%`]; - } + // Search in blobs table for OCR text + const query = ` + SELECT blobId, ocr_text + FROM blobs + WHERE ocr_text LIKE ? + AND ocr_text IS NOT NULL + AND ocr_text != '' + LIMIT 50 + `; + const params = [`%${searchText}%`]; return sql.getRows<{ - entity_id: string; - entity_type: string; - extracted_text: string; - confidence: number; + blobId: string; + ocr_text: string; }>(query, params); } catch (error) { console.error('Error searching OCR content:', error); diff --git a/packages/commons/src/lib/rows.ts b/packages/commons/src/lib/rows.ts index a407d8001..1afb3d9c1 100644 --- a/packages/commons/src/lib/rows.ts +++ b/packages/commons/src/lib/rows.ts @@ -70,6 +70,7 @@ export interface BlobRow { blobId: string; content: string | Buffer; contentLength: number; + ocr_text?: string | null; dateModified: string; utcDateModified: string; }