feat(ocr): add additional processors for OCR feature

2025-12-05 06:54:23 +01:00 · 2025-07-16 20:10:56 +00:00 · 2025-07-16 20:10:56 +00:00 · ca8cbf8ccf
commit ca8cbf8ccf
parent 6722d2d266
6 changed files with 812 additions and 211 deletions
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@ -3,23 +3,31 @@ import log from '../log.js';
 import sql from '../sql.js';
 import becca from '../../becca/becca.js';
 import options from '../options.js';
+import { ImageProcessor } from './processors/image_processor.js';
+import { PDFProcessor } from './processors/pdf_processor.js';
+import { TIFFProcessor } from './processors/tiff_processor.js';
+import { OfficeProcessor } from './processors/office_processor.js';
+import { FileProcessor } from './processors/file_processor.js';

 export interface OCRResult {
    text: string;
    confidence: number;
    extractedAt: string;
    language?: string;
+    pageCount?: number;
 }

 export interface OCRProcessingOptions {
    language?: string;
    forceReprocess?: boolean;
    confidence?: number;
+    enablePDFTextExtraction?: boolean;
 }

 interface OCRBlobRow {
    blobId: string;
    ocr_text: string;
+    ocr_last_processed?: string;
 }

 /**
@ -30,6 +38,7 @@ class OCRService {
    private isInitialized = false;
    private worker: Tesseract.Worker | null = null;
    private isProcessing = false;
+    private processors: Map<string, FileProcessor> = new Map();

    /**
     * Initialize the OCR service
@ -40,25 +49,14 @@ class OCRService {
        }

        try {
-            log.info('Initializing OCR service with Tesseract.js...');
+            log.info('Initializing OCR service with file processors...');
            
-            // Configure proper paths for Node.js environment
-            const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
-            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
-            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
+            // Initialize file processors
+            this.processors.set('image', new ImageProcessor());
+            this.processors.set('pdf', new PDFProcessor());
+            this.processors.set('tiff', new TIFFProcessor());
+            this.processors.set('office', new OfficeProcessor());
            
-            log.info(`Using worker path: ${workerPath}`);
-            log.info(`Using core path: ${corePath}`);
-            
-            this.worker = await Tesseract.createWorker('eng', 1, {
-                workerPath,
-                corePath,
-                logger: (m: { status: string; progress: number }) => {
-                    if (m.status === 'recognizing text') {
-                        log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
-                    }
-                }
-            });
            this.isInitialized = true;
            log.info('OCR service initialized successfully');
        } catch (error) {
@ -100,46 +98,27 @@ class OCRService {
    }

    /**
-     * Extract text from image buffer
+     * Extract text from file buffer using appropriate processor
     */
-    async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+    async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        if (!this.isInitialized) {
            await this.initialize();
        }

-        if (!this.worker) {
-            throw new Error('OCR worker not initialized');
-        }
-
        try {
-            log.info('Starting OCR text extraction...');
+            log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
            this.isProcessing = true;

-            // Set language if specified and different from current
-            const language = options.language || 'eng';
-            if (language !== 'eng') {
-                // For different languages, create a new worker
-                await this.worker.terminate();
-                this.worker = await Tesseract.createWorker(language, 1, {
-                    logger: (m: { status: string; progress: number }) => {
-                        if (m.status === 'recognizing text') {
-                            log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
-                        }
-                    }
-                });
+            // Find appropriate processor
+            const processor = this.getProcessorForMimeType(mimeType);
+            if (!processor) {
+                throw new Error(`No processor found for MIME type: ${mimeType}`);
            }

-            const result = await this.worker.recognize(imageBuffer);
+            const result = await processor.extractText(fileBuffer, options);
            
-            const ocrResult: OCRResult = {
-                text: result.data.text.trim(),
-                confidence: result.data.confidence / 100,  // Convert percentage to decimal
-                extractedAt: new Date().toISOString(),
-                language: options.language || 'eng'
-            };
-
-            log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
-            return ocrResult;
+            log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
+            return result;

        } catch (error) {
            log.error(`OCR text extraction failed: ${error}`);
@ -174,10 +153,10 @@ class OCRService {
            return null;
        }

-        // Check if OCR already exists in the blob and we're not forcing reprocessing
+        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(note.blobId);
-        if (existingOCR && !options.forceReprocess) {
-            log.info(`OCR already exists for note ${noteId}, returning cached result`);
+        if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
+            log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
            return existingOCR;
        }

@ -187,7 +166,7 @@ class OCRService {
                throw new Error(`Cannot get image content for note ${noteId}`);
            }

-            const ocrResult = await this.extractTextFromImage(content, options);
+            const ocrResult = await this.extractTextFromFile(content, note.mime, options);
            
            // Store OCR result in blob
            await this.storeOCRResult(note.blobId, ocrResult);
@ -224,10 +203,10 @@ class OCRService {
            return null;
        }

-        // Check if OCR already exists in the blob and we're not forcing reprocessing
+        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(attachment.blobId);
-        if (existingOCR && !options.forceReprocess) {
-            log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
+        if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
+            log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
            return existingOCR;
        }

@ -237,7 +216,7 @@ class OCRService {
                throw new Error(`Cannot get image content for attachment ${attachmentId}`);
            }

-            const ocrResult = await this.extractTextFromImage(content, options);
+            const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
            
            // Store OCR result in blob
            await this.storeOCRResult(attachment.blobId, ocrResult);
@ -259,11 +238,15 @@ class OCRService {
        }

        try {
-            // Store OCR text in blobs table
+            // Store OCR text and timestamp in blobs table
            sql.execute(`
-                UPDATE blobs SET ocr_text = ? WHERE blobId = ?
+                UPDATE blobs SET 
+                    ocr_text = ?, 
+                    ocr_last_processed = ?
+                WHERE blobId = ?
            `, [
                ocrResult.text,
+                new Date().toISOString(),
                blobId
            ]);
            
@ -353,80 +336,10 @@ class OCRService {
    }

    /**
-     * Process OCR for all images that don't have OCR results yet
+     * Process OCR for all files that don't have OCR results yet or need reprocessing
     */
    async processAllImages(): Promise<void> {
-        if (!this.isOCREnabled()) {
-            log.info('OCR is disabled, skipping batch processing');
-            return;
-        }
-
-        log.info('Starting batch OCR processing for all images...');
-
-        try {
-            // Process image notes
-            const imageNotes = sql.getRows<{
-                noteId: string;
-                mime: string;
-                blobId: string;
-            }>(`
-                SELECT n.noteId, n.mime, n.blobId
-                FROM notes n
-                LEFT JOIN blobs b ON n.blobId = b.blobId
-                WHERE n.type = 'image' 
-                AND n.isDeleted = 0
-                AND n.blobId IS NOT NULL
-                AND (b.ocr_text IS NULL OR b.ocr_text = '')
-            `);
-
-            log.info(`Found ${imageNotes.length} image notes to process`);
-
-            for (const noteRow of imageNotes) {
-                if (this.isSupportedMimeType(noteRow.mime)) {
-                    try {
-                        await this.processNoteOCR(noteRow.noteId);
-                        // Add small delay to prevent overwhelming the system
-                        await new Promise(resolve => setTimeout(resolve, 100));
-                    } catch (error) {
-                        log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
-                    }
-                }
-            }
-
-            // Process image attachments
-            const imageAttachments = sql.getRows<{
-                attachmentId: string;
-                mime: string;
-                blobId: string;
-            }>(`
-                SELECT a.attachmentId, a.mime, a.blobId
-                FROM attachments a
-                LEFT JOIN blobs b ON a.blobId = b.blobId
-                WHERE a.role = 'image'
-                AND a.isDeleted = 0
-                AND a.blobId IS NOT NULL
-                AND (b.ocr_text IS NULL OR b.ocr_text = '')
-            `);
-
-            log.info(`Found ${imageAttachments.length} image attachments to process`);
-
-            for (const attachmentRow of imageAttachments) {
-                if (this.isSupportedMimeType(attachmentRow.mime)) {
-                    try {
-                        await this.processAttachmentOCR(attachmentRow.attachmentId);
-                        // Add small delay to prevent overwhelming the system
-                        await new Promise(resolve => setTimeout(resolve, 100));
-                    } catch (error) {
-                        log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
-                    }
-                }
-            }
-
-            log.info('Batch OCR processing completed');
-        } catch (error) {
-            log.error(`Batch OCR processing failed: ${error}`);
-            throw error;
-        }
+        return this.processAllBlobsNeedingOCR();
    }

    /**
@ -521,28 +434,9 @@ class OCRService {
        }

        try {
-            // Count total images to process
-            const imageNotesCount = sql.getRow<{ count: number }>(`
-                SELECT COUNT(*) as count
-                FROM notes 
-                WHERE type = 'image' 
-                AND isDeleted = 0
-                AND noteId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
-                )
-            `)?.count || 0;
-
-            const imageAttachmentsCount = sql.getRow<{ count: number }>(`
-                SELECT COUNT(*) as count
-                FROM attachments 
-                WHERE role = 'image'
-                AND isDeleted = 0
-                AND attachmentId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
-                )
-            `)?.count || 0;
-
-            const totalCount = imageNotesCount + imageAttachmentsCount;
+            // Count total blobs needing OCR processing
+            const blobsNeedingOCR = this.getBlobsNeedingOCR();
+            const totalCount = blobsNeedingOCR.length;

            if (totalCount === 0) {
                return { success: false, message: 'No images found that need OCR processing' };
@ -557,7 +451,7 @@ class OCRService {
            };

            // Start processing in background
-            this.processBatchInBackground().catch(error => {
+            this.processBatchInBackground(blobsNeedingOCR).catch(error => {
                log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
                this.batchProcessingState.inProgress = false;
            });
@ -583,79 +477,33 @@ class OCRService {
    /**
     * Process batch OCR in background with progress tracking
     */
-    private async processBatchInBackground(): Promise<void> {
+    private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
        try {
            log.info('Starting batch OCR processing...');

-            // Process image notes
-            const imageNotes = sql.getRows<{
-                noteId: string;
-                mime: string;
-                blobId: string;
-            }>(`
-                SELECT n.noteId, n.mime, n.blobId
-                FROM notes n
-                LEFT JOIN blobs b ON n.blobId = b.blobId
-                WHERE n.type = 'image' 
-                AND n.isDeleted = 0
-                AND n.blobId IS NOT NULL
-                AND (b.ocr_text IS NULL OR b.ocr_text = '')
-            `);
-
-            for (const noteRow of imageNotes) {
+            for (const blobInfo of blobsToProcess) {
                if (!this.batchProcessingState.inProgress) {
                    break; // Stop if processing was cancelled
                }

-                if (this.isSupportedMimeType(noteRow.mime)) {
                try {
-                        await this.processNoteOCR(noteRow.noteId);
+                    if (blobInfo.entityType === 'note') {
+                        await this.processNoteOCR(blobInfo.entityId);
+                    } else {
+                        await this.processAttachmentOCR(blobInfo.entityId);
+                    }
                    this.batchProcessingState.processed++;
                    // Add small delay to prevent overwhelming the system
                    await new Promise(resolve => setTimeout(resolve, 500));
                } catch (error) {
-                        log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
+                    log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                    this.batchProcessingState.processed++; // Count as processed even if failed
                }
            }
-            }
-
-            // Process image attachments
-            const imageAttachments = sql.getRows<{
-                attachmentId: string;
-                mime: string;
-                blobId: string;
-            }>(`
-                SELECT a.attachmentId, a.mime, a.blobId
-                FROM attachments a
-                LEFT JOIN blobs b ON a.blobId = b.blobId
-                WHERE a.role = 'image'
-                AND a.isDeleted = 0
-                AND a.blobId IS NOT NULL
-                AND (b.ocr_text IS NULL OR b.ocr_text = '')
-            `);
-
-            for (const attachmentRow of imageAttachments) {
-                if (!this.batchProcessingState.inProgress) {
-                    break; // Stop if processing was cancelled
-                }
-
-                if (this.isSupportedMimeType(attachmentRow.mime)) {
-                    try {
-                        await this.processAttachmentOCR(attachmentRow.attachmentId);
-                        this.batchProcessingState.processed++;
-                        // Add small delay to prevent overwhelming the system
-                        await new Promise(resolve => setTimeout(resolve, 500));
-                    } catch (error) {
-                        log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
-                        this.batchProcessingState.processed++; // Count as processed even if failed
-                    }
-                }
-            }

            // Mark as completed
            this.batchProcessingState.inProgress = false;
-            log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`);
+            log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
        } catch (error) {
            log.error(`Batch OCR processing failed: ${error}`);
            this.batchProcessingState.inProgress = false;
@ -672,6 +520,170 @@ class OCRService {
            log.info('Batch OCR processing cancelled');
        }
    }
+
+    /**
+     * Look up the first registered processor that accepts the given MIME type.
+     *
+     * Processors are probed in the order they were registered.
+     *
+     * @param mimeType - MIME type of the file to be processed
+     * @returns the matching processor, or `null` when none accepts the type
+     */
+    private getProcessorForMimeType(mimeType: string): FileProcessor | null {
+        const registered = Array.from(this.processors.values());
+        const match = registered.find(candidate => candidate.canProcess(mimeType));
+        return match ?? null;
+    }
+
+    /**
+     * Decide whether the OCR data stored for a blob is missing or stale.
+     *
+     * @param blobId - identifier of the blob row to inspect
+     * @returns `true` when the blob has no OCR timestamp or was modified after its last
+     *          OCR run; `false` for an empty id, an unknown blob, or when the lookup fails
+     *          (the failure is logged, not thrown)
+     */
+    needsReprocessing(blobId: string): boolean {
+        if (!blobId) {
+            return false;
+        }
+
+        try {
+            const row = sql.getRow<{
+                utcDateModified: string;
+                ocr_last_processed: string | null;
+            }>(`
+                SELECT utcDateModified, ocr_last_processed
+                FROM blobs 
+                WHERE blobId = ?
+            `, [blobId]);
+
+            // Unknown blob: there is nothing to reprocess.
+            if (!row) return false;
+
+            // No OCR timestamp means OCR has never run for this blob.
+            if (!row.ocr_last_processed) return true;
+
+            // Stale when the content changed after the last OCR run.
+            const modifiedAt = new Date(row.utcDateModified).getTime();
+            const processedAt = new Date(row.ocr_last_processed).getTime();
+            return modifiedAt > processedAt;
+        } catch (error) {
+            log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
+            return false;
+        }
+    }
+
+    /**
+     * Discard the stored OCR data of a blob by setting both `ocr_text` and
+     * `ocr_last_processed` to NULL. An empty blob id is ignored.
+     *
+     * @param blobId - identifier of the blob whose OCR data should be cleared
+     * @throws rethrows any error raised by the database update after logging it
+     */
+    invalidateOCRResult(blobId: string): void {
+        if (!blobId) return;
+
+        try {
+            sql.execute(`
+                UPDATE blobs SET 
+                    ocr_text = NULL,
+                    ocr_last_processed = NULL
+                WHERE blobId = ?
+            `, [blobId]);
+
+            log.info(`Invalidated OCR result for blob ${blobId}`);
+        } catch (error) {
+            // Report the failure, then let the caller decide how to react.
+            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
+            throw error;
+        }
+    }
+
+    /**
+     * List image notes and image attachments whose blob has no OCR timestamp yet, or was
+     * modified after its last OCR run.
+     *
+     * Timestamps are compared as parsed instants (SQLite `julianday`) rather than as raw
+     * strings, so the result does not depend on both columns using the exact same text
+     * format. This mirrors needsReprocessing(), which also compares parsed dates.
+     * NOTE(review): ocr_last_processed is written with Date#toISOString() ('T' separator);
+     * if utcDateModified is stored with a space separator, a raw string comparison would
+     * miss same-day modifications - confirm the stored format.
+     * NOTE(review): only `type = 'image'` notes and `role = 'image'` attachments are
+     * selected, so blobs of other file types are never returned here - confirm intended.
+     *
+     * @returns one entry per note/attachment needing OCR, restricted to MIME types accepted
+     *          by isSupportedMimeType(); an empty array when a query fails (error is logged)
+     */
+    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
+        // Single staleness predicate shared by both queries so they cannot drift apart.
+        const needsOcrCondition = `(
+                    b.ocr_last_processed IS NULL
+                    OR julianday(b.utcDateModified) > julianday(b.ocr_last_processed)
+                )`;
+
+        try {
+            // Image notes whose blob needs OCR
+            const noteBlobs = sql.getRows<{
+                blobId: string;
+                mimeType: string;
+                entityId: string;
+            }>(`
+                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
+                FROM notes n
+                JOIN blobs b ON n.blobId = b.blobId
+                WHERE n.type = 'image'
+                AND n.isDeleted = 0
+                AND n.blobId IS NOT NULL
+                AND ${needsOcrCondition}
+            `);
+
+            // Image attachments whose blob needs OCR
+            const attachmentBlobs = sql.getRows<{
+                blobId: string;
+                mimeType: string;
+                entityId: string;
+            }>(`
+                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
+                FROM attachments a
+                JOIN blobs b ON a.blobId = b.blobId
+                WHERE a.role = 'image'
+                AND a.isDeleted = 0
+                AND a.blobId IS NOT NULL
+                AND ${needsOcrCondition}
+            `);
+
+            // Tag each row with its origin; notes come first, then attachments
+            const result = [
+                ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
+                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
+            ];
+
+            // Keep only MIME types the service declares as supported
+            return result.filter(blob => this.isSupportedMimeType(blob.mimeType));
+        } catch (error) {
+            // Best effort: callers treat an empty list as "nothing to do"
+            log.error(`Failed to get blobs needing OCR: ${error}`);
+            return [];
+        }
+    }
+
+    /**
+     * Run OCR for every note/attachment reported by getBlobsNeedingOCR().
+     *
+     * Does nothing when OCR is disabled or when no blob is pending. Entries are handled
+     * one after another; a failure on one entry is logged and the loop moves on.
+     */
+    async processAllBlobsNeedingOCR(): Promise<void> {
+        if (!this.isOCREnabled()) {
+            log.info('OCR is disabled, skipping auto-processing');
+            return;
+        }
+
+        const pending = this.getBlobsNeedingOCR();
+        if (!pending.length) {
+            log.info('No blobs need OCR processing');
+            return;
+        }
+
+        log.info(`Auto-processing OCR for ${pending.length} blobs...`);
+
+        for (const { entityType, entityId } of pending) {
+            try {
+                await (entityType === 'note'
+                    ? this.processNoteOCR(entityId)
+                    : this.processAttachmentOCR(entityId));
+
+                // Short pause between entries so the system is not overwhelmed
+                await new Promise(resolve => setTimeout(resolve, 100));
+            } catch (error) {
+                // Log and carry on with the remaining entries
+                log.error(`Failed to auto-process OCR for ${entityType} ${entityId}: ${error}`);
+            }
+        }
+
+        log.info('Auto-processing OCR completed');
+    }
 }

 export default new OCRService();
--- a/apps/server/src/services/ocr/processors/file_processor.ts
+++ b/apps/server/src/services/ocr/processors/file_processor.ts
@ -0,0 +1,28 @@
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+
+/**
+ * Contract shared by all text extractors. Each concrete processor declares which MIME
+ * types it accepts and turns a file buffer into an OCRResult.
+ */
+export abstract class FileProcessor {
+    /**
+     * @param mimeType - MIME type of the candidate file
+     * @returns `true` when this processor is able to handle that MIME type
+     */
+    abstract canProcess(mimeType: string): boolean;
+
+    /**
+     * @param buffer - raw file content
+     * @param options - per-call processing options
+     * @returns the text extracted from the buffer together with its metadata
+     */
+    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;
+
+    /**
+     * @returns a short identifier naming the kind of processing this class performs
+     */
+    abstract getProcessingType(): string;
+
+    /**
+     * Release any resources held by the processor. The base implementation holds none,
+     * so it resolves immediately; subclasses override it when they own resources.
+     */
+    async cleanup(): Promise<void> {
+        // Nothing to release in the base class.
+    }
+}
--- a/apps/server/src/services/ocr/processors/image_processor.ts
+++ b/apps/server/src/services/ocr/processors/image_processor.ts
@ -0,0 +1,162 @@
+import Tesseract from 'tesseract.js';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import log from '../../log.js';
+
+/**
+ * Extracts text from raster images with Tesseract.js.
+ *
+ * One Tesseract worker is kept alive between calls. It is created lazily for the
+ * language of the first request and rebuilt only when a different language string is
+ * requested.
+ */
+export class ImageProcessor extends FileProcessor {
+    /**
+     * Shape of a single Tesseract language code: 2-3 letters followed by optional
+     * '_'-separated suffixes of 2-4 letters (e.g. 'eng', 'chi_sim', 'srp_latn', 'uzb_cyrl').
+     */
+    private static readonly LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4})*$/;
+
+    private worker: Tesseract.Worker | null = null;
+    private isInitialized = false;
+    /** Language string the live worker was created with; null while there is no worker. */
+    private workerLanguage: string | null = null;
+
+    /**
+     * @param mimeType - MIME type to test (compared case-insensitively)
+     * @returns `true` for the raster image types listed below
+     */
+    canProcess(mimeType: string): boolean {
+        // NOTE(review): the OCR service also registers a dedicated TIFF processor, but it
+        // probes processors in registration order and registers this one first, so
+        // 'image/tiff' resolves here - confirm that is intended.
+        const supportedTypes = [
+            'image/jpeg',
+            'image/jpg',
+            'image/png',
+            'image/gif',
+            'image/bmp',
+            'image/tiff',
+            'image/webp'
+        ];
+        return supportedTypes.includes(mimeType.toLowerCase());
+    }
+
+    /**
+     * Run OCR on an image buffer.
+     *
+     * @param buffer - raw image content
+     * @param options - `language` overrides the configured default; it may be a single
+     *                  code ('eng') or several joined with '+' ('ron+eng')
+     * @returns trimmed text, confidence as a 0-1 fraction, extraction timestamp, the
+     *          language string used and a page count of 1
+     * @throws when the language is missing/malformed, the worker cannot be created, or
+     *         recognition fails; every failure is logged before being rethrown
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting image OCR text extraction...');
+
+            // The configured default is only consulted when the caller gave no language
+            const language = options.language || this.getDefaultOCRLanguage();
+
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            // Reuses the live worker when it already serves this language, otherwise rebuilds it
+            const worker = await this.initialize(language);
+            const result = await worker.recognize(buffer);
+
+            const ocrResult: OCRResult = {
+                text: result.data.text.trim(),
+                confidence: result.data.confidence / 100,  // Convert percentage to decimal
+                extractedAt: new Date().toISOString(),
+                language,
+                pageCount: 1
+            };
+
+            // Log the raw percentage: ocrResult.confidence is a 0-1 fraction
+            log.info(`Image OCR extraction completed. Confidence: ${result.data.confidence}%, Text length: ${ocrResult.text.length}`);
+            return ocrResult;
+
+        } catch (error) {
+            log.error(`Image OCR text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    getProcessingType(): string {
+        return 'image';
+    }
+
+    /**
+     * Make sure a worker for `language` exists and return it.
+     *
+     * A worker built for another language is terminated and replaced. Internal state is
+     * cleared before the old worker is torn down, so a failure while terminating or
+     * creating never leaves a dead worker behind for the next call.
+     */
+    private async initialize(language: string): Promise<Tesseract.Worker> {
+        if (this.isInitialized && this.worker && this.workerLanguage === language) {
+            return this.worker;
+        }
+
+        try {
+            const staleWorker = this.worker;
+            this.worker = null;
+            this.workerLanguage = null;
+            this.isInitialized = false;
+            if (staleWorker) {
+                await staleWorker.terminate();
+            }
+
+            log.info('Initializing image OCR processor with Tesseract.js...');
+            log.info(`Initializing Tesseract worker for language(s): ${language}`);
+
+            // Configure proper paths for Node.js environment
+            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
+            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
+
+            log.info(`Using worker path: ${workerPath}`);
+            log.info(`Using core path: ${corePath}`);
+
+            const worker = await Tesseract.createWorker(language, 1, {
+                workerPath,
+                corePath,
+                logger: (m: { status: string; progress: number }) => {
+                    if (m.status === 'recognizing text') {
+                        log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
+                    }
+                }
+            });
+
+            this.worker = worker;
+            this.workerLanguage = language;
+            this.isInitialized = true;
+            log.info('Image OCR processor initialized successfully');
+            return worker;
+        } catch (error) {
+            log.error(`Failed to initialize image OCR processor: ${error}`);
+            throw error;
+        }
+    }
+
+    /**
+     * Terminate the worker, if any, and reset the processor to its uninitialized state.
+     * State is reset before terminating so it stays consistent even if terminate rejects.
+     */
+    async cleanup(): Promise<void> {
+        const worker = this.worker;
+        this.worker = null;
+        this.workerLanguage = null;
+        this.isInitialized = false;
+
+        if (worker) {
+            await worker.terminate();
+        }
+        log.info('Image OCR processor cleaned up');
+    }
+
+    /**
+     * Read the default OCR language from the `ocrLanguage` option.
+     *
+     * @throws when the option is empty or cannot be read; the cause is logged
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const options = require('../../options.js').default;
+            const ocrLanguage = options.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate OCR language format
+     * Supports single language (eng) or multi-language (ron+eng); every '+'-separated
+     * part must match LANGUAGE_CODE_PATTERN after trimming.
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // An empty part (e.g. 'eng+') fails the pattern, so no separate length check is needed
+        return language
+            .split('+')
+            .every(lang => ImageProcessor.LANGUAGE_CODE_PATTERN.test(lang.trim()));
+    }
+}
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@ -0,0 +1,128 @@
+import * as officeParser from 'officeparser';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import { ImageProcessor } from './image_processor.js';
+import log from '../../log.js';
+
+/**
+ * Office document processor for extracting embedded text from DOCX/XLSX/PPTX files.
+ *
+ * Text is read directly from the document with `officeparser`; this class never
+ * runs OCR itself. The `ImageProcessor` it owns is only released in `cleanup()`.
+ */
+export class OfficeProcessor extends FileProcessor {
+    /**
+     * Lower-case MIME types accepted by `canProcess`.
+     *
+     * NOTE(review): the legacy binary formats (DOC/XLS/PPT) and RTF are listed here;
+     * confirm that `officeparser` can actually parse them.
+     */
+    private static readonly SUPPORTED_MIME_TYPES: ReadonlySet<string> = new Set([
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
+        'application/msword', // DOC
+        'application/vnd.ms-excel', // XLS
+        'application/vnd.ms-powerpoint', // PPT
+        'application/rtf' // RTF
+    ]);
+
+    private imageProcessor: ImageProcessor;
+
+    constructor() {
+        super();
+        this.imageProcessor = new ImageProcessor();
+    }
+
+    /**
+     * Report whether the given MIME type is one this processor claims.
+     *
+     * @param mimeType - MIME type to check; compared case-insensitively
+     */
+    canProcess(mimeType: string): boolean {
+        return OfficeProcessor.SUPPORTED_MIME_TYPES.has(mimeType.toLowerCase());
+    }
+
+    /**
+     * Extract the text content of an Office document.
+     *
+     * @param buffer - raw document bytes
+     * @param options - processing options; `language` falls back to the `ocrLanguage` option
+     * @returns the trimmed text, with confidence 0.99 when any text was found and 0 otherwise
+     * @throws Error when the language is missing or malformed, or when parsing fails
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting Office document text extraction...');
+
+            // The language is only recorded on the result, but it is validated up
+            // front so a bad configuration fails before any parsing work is done.
+            const language = options.language || this.getDefaultOCRLanguage();
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            const { data } = await this.parseOfficeDocument(buffer);
+            const text = data.trim();
+            const confidence = text.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
+
+            const result: OCRResult = {
+                text,
+                confidence,
+                extractedAt: new Date().toISOString(),
+                language,
+                pageCount: 1 // Office documents are treated as single logical document
+            };
+
+            // Confidence is a 0-1 fraction here, so scale it before printing a percent sign.
+            log.info(`Office document text extraction completed. Confidence: ${Math.round(confidence * 100)}%, Text length: ${result.text.length}`);
+            return result;
+        } catch (error) {
+            log.error(`Office document text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    /**
+     * Run `officeparser` over the buffer and wrap the resulting string.
+     *
+     * @returns an object whose `data` is the parsed text, or `''` when the parser returns a falsy value
+     * @throws Error wrapping any parser failure
+     */
+    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
+        try {
+            const data = await officeParser.parseOfficeAsync(buffer, {
+                outputErrorToConsole: false,
+                newlineDelimiter: '\n',
+                ignoreNotes: false,
+                putNotesAtLast: false
+            });
+
+            return {
+                data: data || ''
+            };
+        } catch (error) {
+            throw new Error(`Office document parsing failed: ${error}`);
+        }
+    }
+
+    getProcessingType(): string {
+        return 'office';
+    }
+
+    /**
+     * Release the resources held by the owned image processor.
+     */
+    async cleanup(): Promise<void> {
+        await this.imageProcessor.cleanup();
+    }
+
+    /**
+     * Read the default OCR language from the `ocrLanguage` option.
+     *
+     * Any failure (module load, lookup, or empty value) is logged and reported to
+     * the caller as one generic error.
+     *
+     * NOTE(review): this uses CommonJS `require` although the file uses ES `import`
+     * syntax - confirm `require` is available in the build target.
+     *
+     * @throws Error when no usable language can be obtained
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const options = require('../../options.js').default;
+            const ocrLanguage = options.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate an OCR language specification.
+     *
+     * Accepts a single code (`eng`) or several codes joined with `+` (`ron+eng`).
+     * Each code is a 2-3 letter base optionally followed by up to two
+     * underscore-separated qualifiers of 2-4 letters, so Tesseract codes such as
+     * `chi_sim`, `srp_latn` and `chi_sim_vert` are accepted.
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // Qualifiers may be up to four letters (e.g. `latn`, `cyrl`, `vert`).
+        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;
+
+        // An empty part (e.g. from `eng+`) fails the pattern on its own.
+        return language
+            .split('+')
+            .every(part => validLanguagePattern.test(part.trim()));
+    }
+}
--- a/apps/server/src/services/ocr/processors/pdf_processor.ts
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@ -0,0 +1,142 @@
+import * as pdfParse from 'pdf-parse';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import { ImageProcessor } from './image_processor.js';
+import log from '../../log.js';
+import sharp from 'sharp';
+
+/**
+ * PDF processor for extracting text from PDF files.
+ *
+ * It first tries to read the PDF's embedded text layer. When that yields nothing
+ * (or is disabled through `enablePDFTextExtraction: false`) it falls back to the
+ * OCR path, which is not implemented yet and therefore produces an empty result.
+ */
+export class PDFProcessor extends FileProcessor {
+    private imageProcessor: ImageProcessor;
+
+    constructor() {
+        super();
+        this.imageProcessor = new ImageProcessor();
+    }
+
+    /**
+     * Report whether the MIME type is `application/pdf` (case-insensitive).
+     */
+    canProcess(mimeType: string): boolean {
+        return mimeType.toLowerCase() === 'application/pdf';
+    }
+
+    /**
+     * Extract text from a PDF.
+     *
+     * @param buffer - raw PDF bytes
+     * @param options - processing options; `language` falls back to the `ocrLanguage`
+     *   option, and `enablePDFTextExtraction: false` skips the embedded-text attempt
+     * @returns the embedded text when present, otherwise the OCR fallback result
+     * @throws Error when the language is missing or malformed, or when PDF parsing fails
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting PDF text extraction...');
+
+            // Resolve the language once and hand it to the helpers, instead of
+            // repeating the option lookup in each of them.
+            const language = options.language || this.getDefaultOCRLanguage();
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            // First try to extract existing text from PDF
+            if (options.enablePDFTextExtraction !== false) {
+                const textResult = await this.extractTextFromPDF(buffer, language);
+                if (textResult.text.length > 0) {
+                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
+                    return textResult;
+                }
+            }
+
+            // Fall back to OCR if no text found or PDF text extraction is disabled
+            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
+            return await this.extractTextViaOCR(buffer, language);
+        } catch (error) {
+            log.error(`PDF text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    /**
+     * Read the embedded text layer of the PDF.
+     *
+     * Failures propagate to `extractText`, which logs them once.
+     *
+     * NOTE(review): `pdfParse` is imported with `import * as`; confirm the namespace
+     * object is callable under the project's module interop settings.
+     *
+     * @param buffer - raw PDF bytes
+     * @param language - already validated language recorded on the result
+     * @returns the trimmed text with a fixed confidence of 0.99 and the parser's page count
+     */
+    private async extractTextFromPDF(buffer: Buffer, language: string): Promise<OCRResult> {
+        const data = await pdfParse(buffer);
+
+        return {
+            text: data.text.trim(),
+            confidence: 0.99, // High confidence for direct text extraction
+            extractedAt: new Date().toISOString(),
+            language,
+            pageCount: data.numpages
+        };
+    }
+
+    /**
+     * OCR fallback for PDFs without an embedded text layer.
+     *
+     * Rendering PDF pages to images is not implemented yet. The result therefore
+     * carries empty text and zero confidence rather than a human-readable
+     * placeholder sentence, so callers never persist or index a notice as if it
+     * were the document's content.
+     *
+     * @param _buffer - raw PDF bytes (unused until page rendering exists)
+     * @param language - already validated language recorded on the result
+     */
+    private async extractTextViaOCR(_buffer: Buffer, language: string): Promise<OCRResult> {
+        log.info('PDF to image conversion not fully implemented, returning empty result');
+
+        return {
+            text: '',
+            confidence: 0.0,
+            extractedAt: new Date().toISOString(),
+            language,
+            pageCount: 1
+        };
+    }
+
+    getProcessingType(): string {
+        return 'pdf';
+    }
+
+    /**
+     * Release the resources held by the owned image processor.
+     */
+    async cleanup(): Promise<void> {
+        await this.imageProcessor.cleanup();
+    }
+
+    /**
+     * Read the default OCR language from the `ocrLanguage` option.
+     *
+     * Any failure (module load, lookup, or empty value) is logged and reported to
+     * the caller as one generic error.
+     *
+     * NOTE(review): this uses CommonJS `require` although the file uses ES `import`
+     * syntax - confirm `require` is available in the build target.
+     *
+     * @throws Error when no usable language can be obtained
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const options = require('../../options.js').default;
+            const ocrLanguage = options.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate an OCR language specification.
+     *
+     * Accepts a single code (`eng`) or several codes joined with `+` (`ron+eng`).
+     * Each code is a 2-3 letter base optionally followed by up to two
+     * underscore-separated qualifiers of 2-4 letters, so Tesseract codes such as
+     * `chi_sim`, `srp_latn` and `chi_sim_vert` are accepted.
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // Qualifiers may be up to four letters (e.g. `latn`, `cyrl`, `vert`).
+        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;
+
+        // An empty part (e.g. from `eng+`) fails the pattern on its own.
+        return language
+            .split('+')
+            .every(part => validLanguagePattern.test(part.trim()));
+    }
+}
--- a/apps/server/src/services/ocr/processors/tiff_processor.ts
+++ b/apps/server/src/services/ocr/processors/tiff_processor.ts
@ -0,0 +1,129 @@
+import sharp from 'sharp';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import { ImageProcessor } from './image_processor.js';
+import log from '../../log.js';
+
+/**
+ * TIFF processor for extracting text from (possibly multi-page) TIFF files.
+ *
+ * Each page is converted to PNG with `sharp` and handed to an `ImageProcessor`;
+ * the per-page texts are concatenated into a single result.
+ */
+export class TIFFProcessor extends FileProcessor {
+    private imageProcessor: ImageProcessor;
+
+    constructor() {
+        super();
+        this.imageProcessor = new ImageProcessor();
+    }
+
+    /**
+     * Report whether the MIME type is `image/tiff` or `image/tif` (case-insensitive).
+     */
+    canProcess(mimeType: string): boolean {
+        const normalized = mimeType.toLowerCase();
+        return normalized === 'image/tiff' || normalized === 'image/tif';
+    }
+
+    /**
+     * OCR every page of a TIFF and combine the text.
+     *
+     * A page that fails is logged and skipped so the remaining pages are still
+     * processed. The reported confidence is the mean over the pages that actually
+     * produced text; blank or failed pages do not dilute it. It is 0 when no page
+     * produced text.
+     *
+     * @param buffer - raw TIFF bytes
+     * @param options - processing options, forwarded unchanged to the image processor;
+     *   `language` falls back to the `ocrLanguage` option
+     * @returns combined text, mean confidence, and the total page count of the file
+     * @throws Error when the language is missing or malformed, or when the TIFF metadata cannot be read
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting TIFF text extraction...');
+
+            // Validate language format
+            const language = options.language || this.getDefaultOCRLanguage();
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            // `pages` is absent for single-page images, so default to one page.
+            const metadata = await sharp(buffer).metadata();
+            const pageCount = metadata.pages || 1;
+
+            let combinedText = '';
+            let totalConfidence = 0;
+            let pagesWithText = 0;
+
+            for (let page = 0; page < pageCount; page++) {
+                try {
+                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
+
+                    // Extract page as PNG buffer
+                    const pageBuffer = await sharp(buffer, { page })
+                        .png()
+                        .toBuffer();
+
+                    // OCR the page
+                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
+
+                    if (pageResult.text.trim().length > 0) {
+                        // A separator is only emitted between pages, never before the first text.
+                        if (combinedText.length > 0) {
+                            combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
+                        }
+                        combinedText += pageResult.text;
+                        totalConfidence += pageResult.confidence;
+                        pagesWithText++;
+                    }
+                } catch (error) {
+                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
+                    // Continue with other pages
+                }
+            }
+
+            // Divide by the number of pages that contributed a confidence value,
+            // not by the total page count.
+            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;
+
+            const result: OCRResult = {
+                text: combinedText.trim(),
+                confidence: averageConfidence,
+                extractedAt: new Date().toISOString(),
+                language,
+                pageCount: pageCount
+            };
+
+            // NOTE(review): the value is printed with a percent sign as-is; confirm the
+            // scale of the confidence returned by ImageProcessor.
+            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
+            return result;
+        } catch (error) {
+            log.error(`TIFF text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    getProcessingType(): string {
+        return 'tiff';
+    }
+
+    /**
+     * Release the resources held by the owned image processor.
+     */
+    async cleanup(): Promise<void> {
+        await this.imageProcessor.cleanup();
+    }
+
+    /**
+     * Read the default OCR language from the `ocrLanguage` option.
+     *
+     * Any failure (module load, lookup, or empty value) is logged and reported to
+     * the caller as one generic error.
+     *
+     * NOTE(review): this uses CommonJS `require` although the file uses ES `import`
+     * syntax - confirm `require` is available in the build target.
+     *
+     * @throws Error when no usable language can be obtained
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const options = require('../../options.js').default;
+            const ocrLanguage = options.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate an OCR language specification.
+     *
+     * Accepts a single code (`eng`) or several codes joined with `+` (`ron+eng`).
+     * Each code is a 2-3 letter base optionally followed by up to two
+     * underscore-separated qualifiers of 2-4 letters, so Tesseract codes such as
+     * `chi_sim`, `srp_latn` and `chi_sim_vert` are accepted.
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // Qualifiers may be up to four letters (e.g. `latn`, `cyrl`, `vert`).
+        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;
+
+        // An empty part (e.g. from `eng+`) fails the pattern on its own.
+        return language
+            .split('+')
+            .every(part => validLanguagePattern.test(part.trim()));
+    }
+}