From 090b1751521f9798bf3f3abf126583bfdf76774b Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Sat, 26 Jul 2025 11:51:53 +0300 Subject: [PATCH] refactor(ocr): deduplicate mime types partially --- apps/server/src/services/handlers.ts | 21 +------- apps/server/src/services/ocr/ocr_service.ts | 52 +++++++++++++++++++ .../services/ocr/processors/file_processor.ts | 5 ++ .../ocr/processors/image_processor.ts | 24 +++++---- .../ocr/processors/office_processor.ts | 24 +++++---- .../services/ocr/processors/pdf_processor.ts | 5 ++ .../services/ocr/processors/tiff_processor.ts | 5 ++ 7 files changed, 96 insertions(+), 40 deletions(-) diff --git a/apps/server/src/services/handlers.ts b/apps/server/src/services/handlers.ts index b26fa1271..6d6108b42 100644 --- a/apps/server/src/services/handlers.ts +++ b/apps/server/src/services/handlers.ts @@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) => // Automatically process OCR for file notes if OCR is enabled if (entity.type === 'file' && ocrService.isOCREnabled()) { // Check if the file MIME type is supported by any OCR processor - const supportedMimeTypes = [ - // Office documents - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/msword', - 'application/vnd.ms-excel', - 'application/vnd.ms-powerpoint', - 'application/rtf', - // PDFs - 'application/pdf', - // Images (though these are usually type='image', not 'file') - 'image/jpeg', - 'image/jpg', - 'image/png', - 'image/gif', - 'image/bmp', - 'image/tiff', - 'image/webp' - ]; + const supportedMimeTypes = ocrService.getAllSupportedMimeTypes(); if (entity.mime && supportedMimeTypes.includes(entity.mime)) { // Process OCR asynchronously to avoid blocking note creation diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index 89b420428..a2b64beea 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -557,6 +557,58 @@ class OCRService { return null; } + /** + * Get all MIME types supported by all registered processors + */ + getAllSupportedMimeTypes(): string[] { + const supportedTypes = new Set(); + + // Initialize processors if not already done + if (!this.isInitialized) { + // Return a static list if not initialized to avoid async issues + // This covers all known supported types + return [ + // Images + 'image/jpeg', + 'image/jpg', + 'image/png', + 'image/gif', + 'image/bmp', + 'image/tiff', + 'image/tif', + 'image/webp', + // Documents + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/msword', + 'application/vnd.ms-excel', + 'application/vnd.ms-powerpoint', + 'application/rtf' + ]; + } + + // Gather MIME types from all registered processors + for (const processor of this.processors.values()) { + const processorTypes = processor.getSupportedMimeTypes(); + processorTypes.forEach(type => supportedTypes.add(type)); + } + + return Array.from(supportedTypes); + } + + /** + * Check if a MIME type is supported by any processor + */ + isSupportedByAnyProcessor(mimeType: string): boolean { + if (!mimeType) return false; + + // Check if any processor can handle this MIME type + const processor = this.getProcessorForMimeType(mimeType); + return processor !== null; + } + /** * Check if blob needs OCR re-processing due to content changes */ diff --git a/apps/server/src/services/ocr/processors/file_processor.ts b/apps/server/src/services/ocr/processors/file_processor.ts index 98dd3dfd9..d46b823ba 100644 --- a/apps/server/src/services/ocr/processors/file_processor.ts +++ b/apps/server/src/services/ocr/processors/file_processor.ts @@ -19,6 +19,11 @@ export abstract class FileProcessor { */ abstract getProcessingType(): string; + /** + * Get list of MIME types supported by this processor + */ + abstract getSupportedMimeTypes(): string[]; + /** * Clean up any resources */ diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts index 7ca86f50e..2666fe965 100644 --- a/apps/server/src/services/ocr/processors/image_processor.ts +++ b/apps/server/src/services/ocr/processors/image_processor.ts @@ -9,18 +9,22 @@ import log from '../../log.js'; export class ImageProcessor extends FileProcessor { private worker: Tesseract.Worker | null = null; private isInitialized = false; + private readonly supportedTypes = [ + 'image/jpeg', + 'image/jpg', + 'image/png', + 'image/gif', + 'image/bmp', + 'image/tiff', + 'image/webp' + ]; canProcess(mimeType: string): boolean { - const supportedTypes = [ - 'image/jpeg', - 'image/jpg', - 'image/png', - 'image/gif', - 'image/bmp', - 'image/tiff', - 'image/webp' - ]; - return supportedTypes.includes(mimeType.toLowerCase()); + return this.supportedTypes.includes(mimeType.toLowerCase()); + } + + getSupportedMimeTypes(): string[] { + return [...this.supportedTypes]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index 92cb1844f..8e99eea55 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -9,6 +9,15 @@ import log from '../../log.js'; */ export class OfficeProcessor extends FileProcessor { private imageProcessor: ImageProcessor; + private readonly supportedTypes = [ + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX + 'application/msword', // DOC + 'application/vnd.ms-excel', // XLS + 'application/vnd.ms-powerpoint', // PPT + 'application/rtf' // RTF + ]; constructor() { super(); @@ -16,16 +25,11 @@ export class OfficeProcessor extends FileProcessor { } canProcess(mimeType: string): boolean { - const supportedTypes = [ - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX - 'application/msword', // DOC - 'application/vnd.ms-excel', // XLS - 'application/vnd.ms-powerpoint', // PPT - 'application/rtf' // RTF - ]; - return supportedTypes.includes(mimeType); + return this.supportedTypes.includes(mimeType); + } + + getSupportedMimeTypes(): string[] { + return [...this.supportedTypes]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts b/apps/server/src/services/ocr/processors/pdf_processor.ts index 54ca2d4c6..902715900 100644 --- a/apps/server/src/services/ocr/processors/pdf_processor.ts +++ b/apps/server/src/services/ocr/processors/pdf_processor.ts @@ -11,6 +11,7 @@ import sharp from 'sharp'; */ export class PDFProcessor extends FileProcessor { private imageProcessor: ImageProcessor; + private readonly supportedTypes = ['application/pdf']; constructor() { super(); @@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor { return mimeType.toLowerCase() === 'application/pdf'; } + getSupportedMimeTypes(): string[] { + return [...this.supportedTypes]; + } + async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { try { log.info('Starting PDF text extraction...'); diff --git a/apps/server/src/services/ocr/processors/tiff_processor.ts b/apps/server/src/services/ocr/processors/tiff_processor.ts index 1755d77e2..2fba58ce9 100644 --- a/apps/server/src/services/ocr/processors/tiff_processor.ts +++ b/apps/server/src/services/ocr/processors/tiff_processor.ts @@ -9,6 +9,7 @@ import log from '../../log.js'; */ export class TIFFProcessor extends FileProcessor { private imageProcessor: ImageProcessor; + private readonly supportedTypes = ['image/tiff', 'image/tif']; constructor() { super(); @@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor { return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif'; } + getSupportedMimeTypes(): string[] { + return [...this.supportedTypes]; + } + async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { try { log.info('Starting TIFF text extraction...');