feat(ocr): basic processing of new files

2025-12-06 23:44:25 +01:00 · 2025-07-26 11:46:28 +03:00 · 2025-07-26 11:46:28 +03:00 · 11e9b097a2
commit 11e9b097a2
parent 2adfc1d32b
3 changed files with 157 additions and 51 deletions
--- a/apps/server/src/services/handlers.ts
+++ b/apps/server/src/services/handlers.ts
@ -6,6 +6,8 @@ import becca from "../becca/becca.js";
 import BAttribute from "../becca/entities/battribute.js";
 import hiddenSubtreeService from "./hidden_subtree.js";
 import oneTimeTimer from "./one_time_timer.js";
 import ocrService from "./ocr/ocr_service.js";
 import log from "./log.js";
 import type BNote from "../becca/entities/bnote.js";
 import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
 import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
@ -137,6 +139,42 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
        }
    } else if (entityName === "notes") {
        runAttachedRelations(entity, "runOnNoteCreation", entity);
        // Automatically process OCR for file notes if OCR is enabled
        if (entity.type === 'file' && ocrService.isOCREnabled()) {
            // Check if the file MIME type is supported by any OCR processor
            const supportedMimeTypes = [
                // Office documents
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                'application/msword',
                'application/vnd.ms-excel',
                'application/vnd.ms-powerpoint',
                'application/rtf',
                // PDFs
                'application/pdf',
                // Images (though these are usually type='image', not 'file')
                'image/jpeg',
                'image/jpg',
                'image/png',
                'image/gif',
                'image/bmp',
                'image/tiff',
                'image/webp'
            ];
            if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
                // Process OCR asynchronously to avoid blocking note creation
                ocrService.processNoteOCR(entity.noteId).then(result => {
                    if (result) {
                        log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
                    }
                }).catch(error => {
                    log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
                });
            }
        }
    }
 });
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@ -50,13 +50,13 @@ class OCRService {
        try {
            log.info('Initializing OCR service with file processors...');
-            
+
            // Initialize file processors
            this.processors.set('image', new ImageProcessor());
            this.processors.set('pdf', new PDFProcessor());
            this.processors.set('tiff', new TIFFProcessor());
            this.processors.set('office', new OfficeProcessor());
-            
+
            this.isInitialized = true;
            log.info('OCR service initialized successfully');
        } catch (error) {
@ -84,10 +84,10 @@ class OCRService {
        if (!mimeType || typeof mimeType !== 'string') {
            return false;
        }
-        
+
        const supportedTypes = [
            'image/jpeg',
-            'image/jpg', 
+            'image/jpg',
            'image/png',
            'image/gif',
            'image/bmp',
@ -116,7 +116,7 @@ class OCRService {
            }
            const result = await processor.extractText(fileBuffer, options);
-            
+
            log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
            return result;
@ -143,13 +143,25 @@ class OCRService {
            return null;
        }
-        if (note.type !== 'image') {
+        if (!this.isInitialized) {
-            log.info(`Note ${noteId} is not an image note, skipping OCR`);
+            await this.initialize();
            return null;
        }
-        if (!this.isSupportedMimeType(note.mime)) {
+        // Check if note type and MIME type are supported for OCR
-            log.info(`Note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
+        if (note.type === 'image') {
            if (!this.isSupportedMimeType(note.mime)) {
                log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
                return null;
            }
        } else if (note.type === 'file') {
            // Check if file MIME type is supported by any processor
            const processor = this.getProcessorForMimeType(note.mime);
            if (!processor) {
                log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
                return null;
            }
        } else {
            log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
            return null;
        }
@ -167,10 +179,10 @@ class OCRService {
            }
            const ocrResult = await this.extractTextFromFile(content, note.mime, options);
-            
+
            // Store OCR result in blob
            await this.storeOCRResult(note.blobId, ocrResult);
-            
+
            return ocrResult;
        } catch (error) {
            log.error(`Failed to process OCR for note ${noteId}: ${error}`);
@ -193,13 +205,25 @@ class OCRService {
            return null;
        }
-        if (attachment.role !== 'image') {
+        if (!this.isInitialized) {
-            log.info(`Attachment ${attachmentId} is not an image, skipping OCR`);
+            await this.initialize();
            return null;
        }
-        if (!this.isSupportedMimeType(attachment.mime)) {
+        // Check if attachment role and MIME type are supported for OCR
-            log.info(`Attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
+        if (attachment.role === 'image') {
            if (!this.isSupportedMimeType(attachment.mime)) {
                log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
                return null;
            }
        } else if (attachment.role === 'file') {
            // Check if file MIME type is supported by any processor
            const processor = this.getProcessorForMimeType(attachment.mime);
            if (!processor) {
                log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
                return null;
            }
        } else {
            log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
            return null;
        }
@ -217,10 +241,10 @@ class OCRService {
            }
            const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
-            
+
            // Store OCR result in blob
            await this.storeOCRResult(attachment.blobId, ocrResult);
-            
+
            return ocrResult;
        } catch (error) {
            log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
@ -240,8 +264,8 @@ class OCRService {
        try {
            // Store OCR text and timestamp in blobs table
            sql.execute(`
-                UPDATE blobs SET 
+                UPDATE blobs SET
-                    ocr_text = ?, 
+                    ocr_text = ?,
                    ocr_last_processed = ?
                WHERE blobId = ?
            `, [
@ -249,7 +273,7 @@ class OCRService {
                new Date().toISOString(),
                blobId
            ]);
-            
+
            log.info(`Stored OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
@ -270,14 +294,14 @@ class OCRService {
                ocr_text: string | null;
            }>(`
                SELECT ocr_text
-                FROM blobs 
+                FROM blobs
                WHERE blobId = ?
            `, [blobId]);
-            
+
            if (!row || !row.ocr_text) {
                return null;
            }
-            
+
            // Return basic OCR result from stored text
            // Note: we lose confidence, language, and extractedAt metadata
            // but gain simplicity by storing directly in blob
@ -300,14 +324,14 @@ class OCRService {
        try {
            const query = `
                SELECT blobId, ocr_text
-                FROM blobs 
+                FROM blobs
                WHERE ocr_text LIKE ?
                AND ocr_text IS NOT NULL
            `;
            const params = [`%${searchText}%`];
-            
+
            const rows = sql.getRows<OCRBlobRow>(query, params);
-            
+
            return rows.map(row => ({
                blobId: row.blobId,
                text: row.ocr_text
@ -324,10 +348,10 @@ class OCRService {
    deleteOCRResult(blobId: string): void {
        try {
            sql.execute(`
-                UPDATE blobs SET ocr_text = NULL 
+                UPDATE blobs SET ocr_text = NULL
                WHERE blobId = ?
            `, [blobId]);
-            
+
            log.info(`Deleted OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
@ -547,7 +571,7 @@ class OCRService {
                ocr_last_processed: string | null;
            }>(`
                SELECT utcDateModified, ocr_last_processed
-                FROM blobs 
+                FROM blobs
                WHERE blobId = ?
            `, [blobId]);
@ -563,7 +587,7 @@ class OCRService {
            // If blob was modified after last OCR processing, it needs re-processing
            const blobModified = new Date(blobInfo.utcDateModified);
            const lastOcrProcessed = new Date(blobInfo.ocr_last_processed);
-            
+
            return blobModified > lastOcrProcessed;
        } catch (error) {
            log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
@ -581,12 +605,12 @@ class OCRService {
        try {
            sql.execute(`
-                UPDATE blobs SET 
+                UPDATE blobs SET
                    ocr_text = NULL,
                    ocr_last_processed = NULL
                WHERE blobId = ?
            `, [blobId]);
-            
+
            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
@ -599,7 +623,7 @@ class OCRService {
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        try {
-            // Get notes with blobs that need OCR
+            // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
            const noteBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@ -608,16 +632,38 @@ class OCRService {
                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
-                WHERE n.type = 'image' 
+                WHERE (
                    n.type = 'image'
                    OR (
                        n.type = 'file'
                        AND n.mime IN (
                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            'application/msword',
                            'application/vnd.ms-excel',
                            'application/vnd.ms-powerpoint',
                            'application/rtf',
                            'application/pdf',
                            'image/jpeg',
                            'image/jpg',
                            'image/png',
                            'image/gif',
                            'image/bmp',
                            'image/tiff',
                            'image/webp'
                        )
                    )
                )
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND (
-                    b.ocr_last_processed IS NULL 
+                    b.ocr_last_processed IS NULL
                    OR b.utcDateModified > b.ocr_last_processed
                )
            `);
-            // Get attachments with blobs that need OCR
+            // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
            const attachmentBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@ -626,11 +672,33 @@ class OCRService {
                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
-                WHERE a.role = 'image'
+                WHERE (
                    a.role = 'image'
                    OR (
                        a.role = 'file'
                        AND a.mime IN (
                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            'application/msword',
                            'application/vnd.ms-excel',
                            'application/vnd.ms-powerpoint',
                            'application/rtf',
                            'application/pdf',
                            'image/jpeg',
                            'image/jpg',
                            'image/png',
                            'image/gif',
                            'image/bmp',
                            'image/tiff',
                            'image/webp'
                        )
                    )
                )
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND (
-                    b.ocr_last_processed IS NULL 
+                    b.ocr_last_processed IS NULL
                    OR b.utcDateModified > b.ocr_last_processed
                )
            `);
@ -641,8 +709,8 @@ class OCRService {
                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
            ];
-            // Filter to only supported MIME types
+            // Return all results: 'file' rows were already MIME-filtered in the query ('image' rows are returned without a MIME filter)
-            return result.filter(blob => this.isSupportedMimeType(blob.mimeType));
+            return result;
        } catch (error) {
            log.error(`Failed to get blobs needing OCR: ${error}`);
            return [];
@ -673,7 +741,7 @@ class OCRService {
                } else {
                    await this.processAttachmentOCR(blobInfo.entityId);
                }
-                
+
                // Add small delay to prevent overwhelming the system
                await new Promise(resolve => setTimeout(resolve, 100));
            } catch (error) {
@ -686,4 +754,4 @@ class OCRService {
    }
 }
-export default new OCRService();
+export default new OCRService();
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@ -25,7 +25,7 @@ export class OfficeProcessor extends FileProcessor {
            'application/vnd.ms-powerpoint', // PPT
            'application/rtf' // RTF
        ];
-        return supportedTypes.includes(mimeType.toLowerCase());
+        return supportedTypes.includes(mimeType);
    }
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
@ -40,7 +40,7 @@ export class OfficeProcessor extends FileProcessor {
            // Extract text from Office document
            const data = await this.parseOfficeDocument(buffer);
-            
+
            // Trim the extracted text; fall back to an empty string when nothing was extracted
            const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
            const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
@ -71,7 +71,7 @@ export class OfficeProcessor extends FileProcessor {
                ignoreNotes: false,
                putNotesAtLast: false
            });
-            
+
            return {
                data: data || ''
            };
@ -113,16 +113,16 @@ export class OfficeProcessor extends FileProcessor {
        if (!language || typeof language !== 'string') {
            return false;
        }
-        
+
        // Split by '+' for multi-language format
        const languages = language.split('+');
-        
+
        // Check each language code: 2-3 letters, optionally followed by '_' and 2-3 more letters (e.g. 'eng', 'chi_sim')
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
-        
+
        return languages.every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
-}
+}