From 11e9b097a26f5711a0c0c3f85f4eebd1a39a2c04 Mon Sep 17 00:00:00 2001
From: Elian Doran <contact@eliandoran.me>
Date: Sat, 26 Jul 2025 11:46:28 +0300
Subject: [PATCH] feat(ocr): basic processing of new files

---
 apps/server/src/services/handlers.ts          |  38 +++++
 apps/server/src/services/ocr/ocr_service.ts   | 156 +++++++++++++-----
 .../ocr/processors/office_processor.ts        |  14 +-
 3 files changed, 157 insertions(+), 51 deletions(-)

diff --git a/apps/server/src/services/handlers.ts b/apps/server/src/services/handlers.ts
index 52e50cbf3..b26fa1271 100644
--- a/apps/server/src/services/handlers.ts
+++ b/apps/server/src/services/handlers.ts
@@ -6,6 +6,8 @@ import becca from "../becca/becca.js";
 import BAttribute from "../becca/entities/battribute.js";
 import hiddenSubtreeService from "./hidden_subtree.js";
 import oneTimeTimer from "./one_time_timer.js";
+import ocrService from "./ocr/ocr_service.js";
+import log from "./log.js";
 import type BNote from "../becca/entities/bnote.js";
 import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
 import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
@@ -137,6 +139,42 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
         }
     } else if (entityName === "notes") {
         runAttachedRelations(entity, "runOnNoteCreation", entity);
+        
+        // Automatically process OCR for file notes if OCR is enabled
+        if (entity.type === 'file' && ocrService.isOCREnabled()) {
+            // Check if the file MIME type is supported by any OCR processor
+            const supportedMimeTypes = [
+                // Office documents
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                'application/msword',
+                'application/vnd.ms-excel',
+                'application/vnd.ms-powerpoint',
+                'application/rtf',
+                // PDFs
+                'application/pdf',
+                // Images (though these are usually type='image', not 'file')
+                'image/jpeg',
+                'image/jpg',
+                'image/png',
+                'image/gif',
+                'image/bmp',
+                'image/tiff',
+                'image/webp'
+            ];
+            
+            if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
+                // Process OCR asynchronously to avoid blocking note creation
+                ocrService.processNoteOCR(entity.noteId).then(result => {
+                    if (result) {
+                        log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
+                    }
+                }).catch(error => {
+                    log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
+                });
+            }
+        }
     }
 });
 
diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts
index 3d5c4aea4..89b420428 100644
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@@ -50,13 +50,13 @@ class OCRService {
 
         try {
             log.info('Initializing OCR service with file processors...');
-            
+
             // Initialize file processors
             this.processors.set('image', new ImageProcessor());
             this.processors.set('pdf', new PDFProcessor());
             this.processors.set('tiff', new TIFFProcessor());
             this.processors.set('office', new OfficeProcessor());
-            
+
             this.isInitialized = true;
             log.info('OCR service initialized successfully');
         } catch (error) {
@@ -84,10 +84,10 @@ class OCRService {
         if (!mimeType || typeof mimeType !== 'string') {
             return false;
         }
-        
+
         const supportedTypes = [
             'image/jpeg',
-            'image/jpg', 
+            'image/jpg',
             'image/png',
             'image/gif',
             'image/bmp',
@@ -116,7 +116,7 @@ class OCRService {
             }
 
             const result = await processor.extractText(fileBuffer, options);
-            
+
             log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
             return result;
 
@@ -143,13 +143,25 @@ class OCRService {
             return null;
         }
 
-        if (note.type !== 'image') {
-            log.info(`Note ${noteId} is not an image note, skipping OCR`);
-            return null;
+        if (!this.isInitialized) {
+            await this.initialize();
         }
 
-        if (!this.isSupportedMimeType(note.mime)) {
-            log.info(`Note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
+        // Check if note type and MIME type are supported for OCR
+        if (note.type === 'image') {
+            if (!this.isSupportedMimeType(note.mime)) {
+                log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
+                return null;
+            }
+        } else if (note.type === 'file') {
+            // Check if file MIME type is supported by any processor
+            const processor = this.getProcessorForMimeType(note.mime);
+            if (!processor) {
+                log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
+                return null;
+            }
+        } else {
+            log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
             return null;
         }
 
@@ -167,10 +179,10 @@ class OCRService {
             }
 
             const ocrResult = await this.extractTextFromFile(content, note.mime, options);
-            
+
             // Store OCR result in blob
             await this.storeOCRResult(note.blobId, ocrResult);
-            
+
             return ocrResult;
         } catch (error) {
             log.error(`Failed to process OCR for note ${noteId}: ${error}`);
@@ -193,13 +205,25 @@ class OCRService {
             return null;
         }
 
-        if (attachment.role !== 'image') {
-            log.info(`Attachment ${attachmentId} is not an image, skipping OCR`);
-            return null;
+        if (!this.isInitialized) {
+            await this.initialize();
         }
 
-        if (!this.isSupportedMimeType(attachment.mime)) {
-            log.info(`Attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
+        // Check if attachment role and MIME type are supported for OCR
+        if (attachment.role === 'image') {
+            if (!this.isSupportedMimeType(attachment.mime)) {
+                log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
+                return null;
+            }
+        } else if (attachment.role === 'file') {
+            // Check if file MIME type is supported by any processor
+            const processor = this.getProcessorForMimeType(attachment.mime);
+            if (!processor) {
+                log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
+                return null;
+            }
+        } else {
+            log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
             return null;
         }
 
@@ -217,10 +241,10 @@ class OCRService {
             }
 
             const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
-            
+
             // Store OCR result in blob
             await this.storeOCRResult(attachment.blobId, ocrResult);
-            
+
             return ocrResult;
         } catch (error) {
             log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
@@ -240,8 +264,8 @@ class OCRService {
         try {
             // Store OCR text and timestamp in blobs table
             sql.execute(`
-                UPDATE blobs SET 
-                    ocr_text = ?, 
+                UPDATE blobs SET
+                    ocr_text = ?,
                     ocr_last_processed = ?
                 WHERE blobId = ?
             `, [
@@ -249,7 +273,7 @@ class OCRService {
                 new Date().toISOString(),
                 blobId
             ]);
-            
+
             log.info(`Stored OCR result for blob ${blobId}`);
         } catch (error) {
             log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
@@ -270,14 +294,14 @@ class OCRService {
                 ocr_text: string | null;
             }>(`
                 SELECT ocr_text
-                FROM blobs 
+                FROM blobs
                 WHERE blobId = ?
             `, [blobId]);
-            
+
             if (!row || !row.ocr_text) {
                 return null;
             }
-            
+
             // Return basic OCR result from stored text
             // Note: we lose confidence, language, and extractedAt metadata
             // but gain simplicity by storing directly in blob
@@ -300,14 +324,14 @@ class OCRService {
         try {
             const query = `
                 SELECT blobId, ocr_text
-                FROM blobs 
+                FROM blobs
                 WHERE ocr_text LIKE ?
                 AND ocr_text IS NOT NULL
             `;
             const params = [`%${searchText}%`];
-            
+
             const rows = sql.getRows<OCRBlobRow>(query, params);
-            
+
             return rows.map(row => ({
                 blobId: row.blobId,
                 text: row.ocr_text
@@ -324,10 +348,10 @@ class OCRService {
     deleteOCRResult(blobId: string): void {
         try {
             sql.execute(`
-                UPDATE blobs SET ocr_text = NULL 
+                UPDATE blobs SET ocr_text = NULL
                 WHERE blobId = ?
             `, [blobId]);
-            
+
             log.info(`Deleted OCR result for blob ${blobId}`);
         } catch (error) {
             log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
@@ -547,7 +571,7 @@ class OCRService {
                 ocr_last_processed: string | null;
             }>(`
                 SELECT utcDateModified, ocr_last_processed
-                FROM blobs 
+                FROM blobs
                 WHERE blobId = ?
             `, [blobId]);
 
@@ -563,7 +587,7 @@ class OCRService {
             // If blob was modified after last OCR processing, it needs re-processing
             const blobModified = new Date(blobInfo.utcDateModified);
             const lastOcrProcessed = new Date(blobInfo.ocr_last_processed);
-            
+
             return blobModified > lastOcrProcessed;
         } catch (error) {
             log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
@@ -581,12 +605,12 @@ class OCRService {
 
         try {
             sql.execute(`
-                UPDATE blobs SET 
+                UPDATE blobs SET
                     ocr_text = NULL,
                     ocr_last_processed = NULL
                 WHERE blobId = ?
             `, [blobId]);
-            
+
             log.info(`Invalidated OCR result for blob ${blobId}`);
         } catch (error) {
             log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
@@ -599,7 +623,7 @@ class OCRService {
      */
     getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
         try {
-            // Get notes with blobs that need OCR
+            // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
             const noteBlobs = sql.getRows<{
                 blobId: string;
                 mimeType: string;
@@ -608,16 +632,38 @@ class OCRService {
                 SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                 FROM notes n
                 JOIN blobs b ON n.blobId = b.blobId
-                WHERE n.type = 'image' 
+                WHERE (
+                    n.type = 'image'
+                    OR (
+                        n.type = 'file'
+                        AND n.mime IN (
+                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                            'application/msword',
+                            'application/vnd.ms-excel',
+                            'application/vnd.ms-powerpoint',
+                            'application/rtf',
+                            'application/pdf',
+                            'image/jpeg',
+                            'image/jpg',
+                            'image/png',
+                            'image/gif',
+                            'image/bmp',
+                            'image/tiff',
+                            'image/webp'
+                        )
+                    )
+                )
                 AND n.isDeleted = 0
                 AND n.blobId IS NOT NULL
                 AND (
-                    b.ocr_last_processed IS NULL 
+                    b.ocr_last_processed IS NULL
                     OR b.utcDateModified > b.ocr_last_processed
                 )
             `);
 
-            // Get attachments with blobs that need OCR
+            // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
             const attachmentBlobs = sql.getRows<{
                 blobId: string;
                 mimeType: string;
@@ -626,11 +672,33 @@ class OCRService {
                 SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                 FROM attachments a
                 JOIN blobs b ON a.blobId = b.blobId
-                WHERE a.role = 'image'
+                WHERE (
+                    a.role = 'image'
+                    OR (
+                        a.role = 'file'
+                        AND a.mime IN (
+                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                            'application/msword',
+                            'application/vnd.ms-excel',
+                            'application/vnd.ms-powerpoint',
+                            'application/rtf',
+                            'application/pdf',
+                            'image/jpeg',
+                            'image/jpg',
+                            'image/png',
+                            'image/gif',
+                            'image/bmp',
+                            'image/tiff',
+                            'image/webp'
+                        )
+                    )
+                )
                 AND a.isDeleted = 0
                 AND a.blobId IS NOT NULL
                 AND (
-                    b.ocr_last_processed IS NULL 
+                    b.ocr_last_processed IS NULL
                     OR b.utcDateModified > b.ocr_last_processed
                 )
             `);
@@ -641,8 +709,8 @@ class OCRService {
                 ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
             ];
 
-            // Filter to only supported MIME types
-            return result.filter(blob => this.isSupportedMimeType(blob.mimeType));
+            // Return all results (no need to filter by MIME type as we already did in the query)
+            return result;
         } catch (error) {
             log.error(`Failed to get blobs needing OCR: ${error}`);
             return [];
@@ -673,7 +741,7 @@ class OCRService {
                 } else {
                     await this.processAttachmentOCR(blobInfo.entityId);
                 }
-                
+
                 // Add small delay to prevent overwhelming the system
                 await new Promise(resolve => setTimeout(resolve, 100));
             } catch (error) {
@@ -686,4 +754,4 @@ class OCRService {
     }
 }
 
-export default new OCRService();
\ No newline at end of file
+export default new OCRService();
diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts
index 794ec52e8..92cb1844f 100644
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@@ -25,7 +25,7 @@ export class OfficeProcessor extends FileProcessor {
             'application/vnd.ms-powerpoint', // PPT
             'application/rtf' // RTF
         ];
-        return supportedTypes.includes(mimeType.toLowerCase());
+        return supportedTypes.includes(mimeType);
     }
 
     async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
@@ -40,7 +40,7 @@ export class OfficeProcessor extends FileProcessor {
 
             // Extract text from Office document
             const data = await this.parseOfficeDocument(buffer);
-            
+
             // Extract text from Office document
             const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
             const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
@@ -71,7 +71,7 @@ export class OfficeProcessor extends FileProcessor {
                 ignoreNotes: false,
                 putNotesAtLast: false
             });
-            
+
             return {
                 data: data || ''
             };
@@ -113,16 +113,16 @@ export class OfficeProcessor extends FileProcessor {
         if (!language || typeof language !== 'string') {
             return false;
         }
-        
+
         // Split by '+' for multi-language format
         const languages = language.split('+');
-        
+
         // Check each language code (should be 2-7 characters, alphanumeric with underscores)
         const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
-        
+
         return languages.every(lang => {
             const trimmed = lang.trim();
             return trimmed.length > 0 && validLanguagePattern.test(trimmed);
         });
     }
-}
\ No newline at end of file
+}