feat(ocr): basic processing of new files

2025-12-06 07:24:25 +01:00 · 2025-07-26 11:46:28 +03:00 · 2025-07-26 11:46:28 +03:00 · 11e9b097a2
commit 11e9b097a2
parent 2adfc1d32b
3 changed files with 157 additions and 51 deletions
--- a/apps/server/src/services/handlers.ts
+++ b/apps/server/src/services/handlers.ts
@ -6,6 +6,8 @@ import becca from "../becca/becca.js";
 import BAttribute from "../becca/entities/battribute.js";
 import hiddenSubtreeService from "./hidden_subtree.js";
 import oneTimeTimer from "./one_time_timer.js";
+import ocrService from "./ocr/ocr_service.js";
+import log from "./log.js";
 import type BNote from "../becca/entities/bnote.js";
 import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
 import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
@ -137,6 +139,42 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
        }
    } else if (entityName === "notes") {
        runAttachedRelations(entity, "runOnNoteCreation", entity);
+        
+        // Automatically process OCR for file notes if OCR is enabled
+        if (entity.type === 'file' && ocrService.isOCREnabled()) {
+            // Check if the file MIME type is supported by any OCR processor
+            const supportedMimeTypes = [
+                // Office documents
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                'application/msword',
+                'application/vnd.ms-excel',
+                'application/vnd.ms-powerpoint',
+                'application/rtf',
+                // PDFs
+                'application/pdf',
+                // Images (though these are usually type='image', not 'file')
+                'image/jpeg',
+                'image/jpg',
+                'image/png',
+                'image/gif',
+                'image/bmp',
+                'image/tiff',
+                'image/webp'
+            ];
+            
+            if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
+                // Process OCR asynchronously to avoid blocking note creation
+                ocrService.processNoteOCR(entity.noteId).then(result => {
+                    if (result) {
+                        log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
+                    }
+                }).catch(error => {
+                    log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
+                });
+            }
+        }
    }
 });

--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@ -143,13 +143,25 @@ class OCRService {
            return null;
        }

-        if (note.type !== 'image') {
-            log.info(`Note ${noteId} is not an image note, skipping OCR`);
-            return null;
+        if (!this.isInitialized) {
+            await this.initialize();
        }

+        // Check if note type and MIME type are supported for OCR
+        if (note.type === 'image') {
            if (!this.isSupportedMimeType(note.mime)) {
-            log.info(`Note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
+                log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
+                return null;
+            }
+        } else if (note.type === 'file') {
+            // Check if file MIME type is supported by any processor
+            const processor = this.getProcessorForMimeType(note.mime);
+            if (!processor) {
+                log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
+                return null;
+            }
+        } else {
+            log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
            return null;
        }

@ -193,13 +205,25 @@ class OCRService {
            return null;
        }

-        if (attachment.role !== 'image') {
-            log.info(`Attachment ${attachmentId} is not an image, skipping OCR`);
-            return null;
+        if (!this.isInitialized) {
+            await this.initialize();
        }

+        // Check if attachment role and MIME type are supported for OCR
+        if (attachment.role === 'image') {
            if (!this.isSupportedMimeType(attachment.mime)) {
-            log.info(`Attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
+                log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
+                return null;
+            }
+        } else if (attachment.role === 'file') {
+            // Check if file MIME type is supported by any processor
+            const processor = this.getProcessorForMimeType(attachment.mime);
+            if (!processor) {
+                log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
+                return null;
+            }
+        } else {
+            log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
            return null;
        }

@ -599,7 +623,7 @@ class OCRService {
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        try {
-            // Get notes with blobs that need OCR
+            // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
            const noteBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@ -608,7 +632,29 @@ class OCRService {
                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
-                WHERE n.type = 'image' 
+                WHERE (
+                    n.type = 'image'
+                    OR (
+                        n.type = 'file'
+                        AND n.mime IN (
+                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                            'application/msword',
+                            'application/vnd.ms-excel',
+                            'application/vnd.ms-powerpoint',
+                            'application/rtf',
+                            'application/pdf',
+                            'image/jpeg',
+                            'image/jpg',
+                            'image/png',
+                            'image/gif',
+                            'image/bmp',
+                            'image/tiff',
+                            'image/webp'
+                        )
+                    )
+                )
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND (
@ -617,7 +663,7 @@ class OCRService {
                )
            `);

-            // Get attachments with blobs that need OCR
+            // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
            const attachmentBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@ -626,7 +672,29 @@ class OCRService {
                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
-                WHERE a.role = 'image'
+                WHERE (
+                    a.role = 'image'
+                    OR (
+                        a.role = 'file'
+                        AND a.mime IN (
+                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                            'application/msword',
+                            'application/vnd.ms-excel',
+                            'application/vnd.ms-powerpoint',
+                            'application/rtf',
+                            'application/pdf',
+                            'image/jpeg',
+                            'image/jpg',
+                            'image/png',
+                            'image/gif',
+                            'image/bmp',
+                            'image/tiff',
+                            'image/webp'
+                        )
+                    )
+                )
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND (
@ -641,8 +709,8 @@ class OCRService {
                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
            ];

-            // Filter to only supported MIME types
-            return result.filter(blob => this.isSupportedMimeType(blob.mimeType));
+            // Return all rows as-is: 'file' rows are already restricted by MIME type in the queries above; 'image' rows are selected by type/role alone, with no MIME restriction
+            return result;
        } catch (error) {
            log.error(`Failed to get blobs needing OCR: ${error}`);
            return [];
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@ -25,7 +25,7 @@ export class OfficeProcessor extends FileProcessor {
            'application/vnd.ms-powerpoint', // PPT
            'application/rtf' // RTF
        ];
-        return supportedTypes.includes(mimeType.toLowerCase());
+        return supportedTypes.includes(mimeType);
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {