From 090b1751521f9798bf3f3abf126583bfdf76774b Mon Sep 17 00:00:00 2001
From: Elian Doran <contact@eliandoran.me>
Date: Sat, 26 Jul 2025 11:51:53 +0300
Subject: [PATCH] refactor(ocr): deduplicate mime types partially

---
 apps/server/src/services/handlers.ts          | 21 +-------
 apps/server/src/services/ocr/ocr_service.ts   | 52 +++++++++++++++++++
 .../services/ocr/processors/file_processor.ts |  5 ++
 .../ocr/processors/image_processor.ts         | 24 +++++----
 .../ocr/processors/office_processor.ts        | 24 +++++----
 .../services/ocr/processors/pdf_processor.ts  |  5 ++
 .../services/ocr/processors/tiff_processor.ts |  5 ++
 7 files changed, 96 insertions(+), 40 deletions(-)
diff --git a/apps/server/src/services/handlers.ts b/apps/server/src/services/handlers.ts
index b26fa1271..6d6108b42 100644
--- a/apps/server/src/services/handlers.ts
+++ b/apps/server/src/services/handlers.ts
@@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
         // Automatically process OCR for file notes if OCR is enabled
         if (entity.type === 'file' && ocrService.isOCREnabled()) {
             // Check if the file MIME type is supported by any OCR processor
-            const supportedMimeTypes = [
-                // Office documents
-                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-                'application/msword',
-                'application/vnd.ms-excel',
-                'application/vnd.ms-powerpoint',
-                'application/rtf',
-                // PDFs
-                'application/pdf',
-                // Images (though these are usually type='image', not 'file')
-                'image/jpeg',
-                'image/jpg',
-                'image/png',
-                'image/gif',
-                'image/bmp',
-                'image/tiff',
-                'image/webp'
-            ];
+            const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
             
             if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
                 // Process OCR asynchronously to avoid blocking note creation
diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts
index 89b420428..a2b64beea 100644
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@@ -557,6 +557,58 @@ class OCRService {
         return null;
     }
 
+    /**
+     * Get the union of MIME types supported by the registered processors (a static fallback list is returned before initialization)
+     */
+    getAllSupportedMimeTypes(): string[] {
+        const supportedTypes = new Set<string>();
+
+        // Processors may not be registered yet; this method does not initialize them itself
+        if (!this.isInitialized) {
+            // Not initialized: return a static list instead, to avoid async issues.
+            // It duplicates the processors' own lists, so keep it in sync with them.
+            return [
+                // Images
+                'image/jpeg',
+                'image/jpg',
+                'image/png',
+                'image/gif',
+                'image/bmp',
+                'image/tiff',
+                'image/tif',
+                'image/webp',
+                // Documents
+                'application/pdf',
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                'application/msword',
+                'application/vnd.ms-excel',
+                'application/vnd.ms-powerpoint',
+                'application/rtf'
+            ];
+        }
+
+        // Gather MIME types from all registered processors
+        for (const processor of this.processors.values()) {
+            const processorTypes = processor.getSupportedMimeTypes();
+            processorTypes.forEach(type => supportedTypes.add(type));
+        }
+
+        return Array.from(supportedTypes);
+    }
+
+    /**
+     * Check if a MIME type is supported by any processor
+     */
+    isSupportedByAnyProcessor(mimeType: string): boolean {
+        if (!mimeType) return false;
+
+        // Check if any processor can handle this MIME type
+        const processor = this.getProcessorForMimeType(mimeType);
+        return processor !== null;
+    }
+
     /**
      * Check if blob needs OCR re-processing due to content changes
      */
diff --git a/apps/server/src/services/ocr/processors/file_processor.ts b/apps/server/src/services/ocr/processors/file_processor.ts
index 98dd3dfd9..d46b823ba 100644
--- a/apps/server/src/services/ocr/processors/file_processor.ts
+++ b/apps/server/src/services/ocr/processors/file_processor.ts
@@ -19,6 +19,11 @@ export abstract class FileProcessor {
      */
     abstract getProcessingType(): string;
 
+    /**
+     * Get list of MIME types supported by this processor
+     */
+    abstract getSupportedMimeTypes(): string[];
+
     /**
      * Clean up any resources
      */
diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts
index 7ca86f50e..2666fe965 100644
--- a/apps/server/src/services/ocr/processors/image_processor.ts
+++ b/apps/server/src/services/ocr/processors/image_processor.ts
@@ -9,18 +9,22 @@ import log from '../../log.js';
 export class ImageProcessor extends FileProcessor {
     private worker: Tesseract.Worker | null = null;
     private isInitialized = false;
+    private readonly supportedTypes = [
+        'image/jpeg',
+        'image/jpg', 
+        'image/png',
+        'image/gif',
+        'image/bmp',
+        'image/tiff',
+        'image/webp'
+    ];
 
     canProcess(mimeType: string): boolean {
-        const supportedTypes = [
-            'image/jpeg',
-            'image/jpg', 
-            'image/png',
-            'image/gif',
-            'image/bmp',
-            'image/tiff',
-            'image/webp'
-        ];
-        return supportedTypes.includes(mimeType.toLowerCase());
+        return this.supportedTypes.includes(mimeType.toLowerCase());
+    }
+
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
     }
 
     async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts
index 92cb1844f..8e99eea55 100644
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@@ -9,6 +9,15 @@ import log from '../../log.js';
  */
 export class OfficeProcessor extends FileProcessor {
     private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = [
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
+        'application/msword', // DOC
+        'application/vnd.ms-excel', // XLS
+        'application/vnd.ms-powerpoint', // PPT
+        'application/rtf' // RTF
+    ];
 
     constructor() {
         super();
@@ -16,16 +25,11 @@ export class OfficeProcessor extends FileProcessor {
     }
 
     canProcess(mimeType: string): boolean {
-        const supportedTypes = [
-            'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
-            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
-            'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
-            'application/msword', // DOC
-            'application/vnd.ms-excel', // XLS
-            'application/vnd.ms-powerpoint', // PPT
-            'application/rtf' // RTF
-        ];
-        return supportedTypes.includes(mimeType);
+        return this.supportedTypes.includes(mimeType);
+    }
+
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
     }
 
     async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts b/apps/server/src/services/ocr/processors/pdf_processor.ts
index 54ca2d4c6..902715900 100644
--- a/apps/server/src/services/ocr/processors/pdf_processor.ts
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@@ -11,6 +11,7 @@ import sharp from 'sharp';
  */
 export class PDFProcessor extends FileProcessor {
     private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = ['application/pdf'];
 
     constructor() {
         super();
@@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor {
         return mimeType.toLowerCase() === 'application/pdf';
     }
 
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
+    }
+
     async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
         try {
             log.info('Starting PDF text extraction...');
diff --git a/apps/server/src/services/ocr/processors/tiff_processor.ts b/apps/server/src/services/ocr/processors/tiff_processor.ts
index 1755d77e2..2fba58ce9 100644
--- a/apps/server/src/services/ocr/processors/tiff_processor.ts
+++ b/apps/server/src/services/ocr/processors/tiff_processor.ts
@@ -9,6 +9,7 @@ import log from '../../log.js';
  */
 export class TIFFProcessor extends FileProcessor {
     private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = ['image/tiff', 'image/tif'];
 
     constructor() {
         super();
@@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor {
         return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
     }
 
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
+    }
+
     async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
         try {
             log.info('Starting TIFF text extraction...');