refactor(ocr): deduplicate mime types partially

2025-12-04 22:44:25 +01:00 · 2025-07-26 11:51:53 +03:00 · 2025-07-26 11:51:53 +03:00 · 090b175152
commit 090b175152
parent 11e9b097a2
7 changed files with 96 additions and 40 deletions
--- a/apps/server/src/services/handlers.ts
+++ b/apps/server/src/services/handlers.ts
@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
        // Automatically process OCR for file notes if OCR is enabled
        if (entity.type === 'file' && ocrService.isOCREnabled()) {
            // Check if the file MIME type is supported by any OCR processor
-            const supportedMimeTypes = [
-                // Office documents
-                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-                'application/msword',
-                'application/vnd.ms-excel',
-                'application/vnd.ms-powerpoint',
-                'application/rtf',
-                // PDFs
-                'application/pdf',
-                // Images (though these are usually type='image', not 'file')
-                'image/jpeg',
-                'image/jpg',
-                'image/png',
-                'image/gif',
-                'image/bmp',
-                'image/tiff',
-                'image/webp'
-            ];
+            const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
            
            if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
                // Process OCR asynchronously to avoid blocking note creation
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@ -557,6 +557,58 @@ class OCRService {
        return null;
    }

+    /**
+     * Get the union of MIME types supported by the registered processors (static fallback list before initialization)
+     */
+    getAllSupportedMimeTypes(): string[] {
+        const supportedTypes = new Set<string>();
+
+        // Processors may not be registered yet; this method does not initialize them
+        if (!this.isInitialized) {
+            // Return a static list if not initialized to avoid async issues
+            // This covers all known supported types
+            return [
+                // Images
+                'image/jpeg',
+                'image/jpg',
+                'image/png',
+                'image/gif',
+                'image/bmp',
+                'image/tiff',
+                'image/tif',
+                'image/webp',
+                // Documents
+                'application/pdf',
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                'application/msword',
+                'application/vnd.ms-excel',
+                'application/vnd.ms-powerpoint',
+                'application/rtf'
+            ];
+        }
+
+        // Gather MIME types from all registered processors
+        for (const processor of this.processors.values()) {
+            const processorTypes = processor.getSupportedMimeTypes();
+            processorTypes.forEach(type => supportedTypes.add(type));
+        }
+
+        return Array.from(supportedTypes);
+    }
+
+    /**
+     * Check if a MIME type is supported by any processor
+     */
+    isSupportedByAnyProcessor(mimeType: string): boolean {
+        if (!mimeType) return false;
+
+        // Check if any processor can handle this MIME type
+        const processor = this.getProcessorForMimeType(mimeType);
+        return processor !== null;
+    }
+
    /**
     * Check if blob needs OCR re-processing due to content changes
     */
--- a/apps/server/src/services/ocr/processors/file_processor.ts
+++ b/apps/server/src/services/ocr/processors/file_processor.ts
@ -19,6 +19,11 @@ export abstract class FileProcessor {
     */
    abstract getProcessingType(): string;

+    /**
+     * Get list of MIME types supported by this processor
+     */
+    abstract getSupportedMimeTypes(): string[];
+
    /**
     * Clean up any resources
     */
--- a/apps/server/src/services/ocr/processors/image_processor.ts
+++ b/apps/server/src/services/ocr/processors/image_processor.ts
@ -9,18 +9,22 @@ import log from '../../log.js';
 export class ImageProcessor extends FileProcessor {
    private worker: Tesseract.Worker | null = null;
    private isInitialized = false;
+    private readonly supportedTypes = [
+        'image/jpeg',
+        'image/jpg', 
+        'image/png',
+        'image/gif',
+        'image/bmp',
+        'image/tiff',
+        'image/webp'
+    ];

    canProcess(mimeType: string): boolean {
-        const supportedTypes = [
-            'image/jpeg',
-            'image/jpg', 
-            'image/png',
-            'image/gif',
-            'image/bmp',
-            'image/tiff',
-            'image/webp'
-        ];
-        return supportedTypes.includes(mimeType.toLowerCase());
+        return this.supportedTypes.includes(mimeType.toLowerCase());
+    }
+
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@ -9,6 +9,15 @@ import log from '../../log.js';
 */
 export class OfficeProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = [
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
+        'application/msword', // DOC
+        'application/vnd.ms-excel', // XLS
+        'application/vnd.ms-powerpoint', // PPT
+        'application/rtf' // RTF
+    ];

    constructor() {
        super();
@ -16,16 +25,11 @@ export class OfficeProcessor extends FileProcessor {
    }

    canProcess(mimeType: string): boolean {
-        const supportedTypes = [
-            'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
-            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
-            'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
-            'application/msword', // DOC
-            'application/vnd.ms-excel', // XLS
-            'application/vnd.ms-powerpoint', // PPT
-            'application/rtf' // RTF
-        ];
-        return supportedTypes.includes(mimeType);
+        return this.supportedTypes.includes(mimeType);
+    }
+
+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
--- a/apps/server/src/services/ocr/processors/pdf_processor.ts
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@ -11,6 +11,7 @@ import sharp from 'sharp';
 */
 export class PDFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = ['application/pdf'];

    constructor() {
        super();
@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor {
        return mimeType.toLowerCase() === 'application/pdf';
    }

+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
+    }
+
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting PDF text extraction...');
--- a/apps/server/src/services/ocr/processors/tiff_processor.ts
+++ b/apps/server/src/services/ocr/processors/tiff_processor.ts
@ -9,6 +9,7 @@ import log from '../../log.js';
 */
 export class TIFFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;
+    private readonly supportedTypes = ['image/tiff', 'image/tif'];

    constructor() {
        super();
@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor {
        return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
    }

+    getSupportedMimeTypes(): string[] {
+        return [...this.supportedTypes];
+    }
+
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');