feat(ocr): add additional processors for OCR feature

2025-12-05 06:54:23 +01:00 · 2025-07-16 20:10:56 +00:00 · 2025-07-16 20:10:56 +00:00 · ca8cbf8ccf
commit ca8cbf8ccf
parent 6722d2d266
6 changed files with 812 additions and 211 deletions
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@ -3,23 +3,31 @@ import log from '../log.js';
 import sql from '../sql.js';
 import becca from '../../becca/becca.js';
 import options from '../options.js';
 import { ImageProcessor } from './processors/image_processor.js';
 import { PDFProcessor } from './processors/pdf_processor.js';
 import { TIFFProcessor } from './processors/tiff_processor.js';
 import { OfficeProcessor } from './processors/office_processor.js';
 import { FileProcessor } from './processors/file_processor.js';
 /**
 * Outcome of one text-extraction run, as returned by a file processor's extractText().
 */
 export interface OCRResult {
    /** Extracted text. */
    text: string;
    /** Recognition confidence; the image processor reports it as a 0-1 fraction (Tesseract percentage / 100). */
    confidence: number;
    /** Time of extraction; the image processor fills it with `new Date().toISOString()`. */
    extractedAt: string;
    /** Language code(s) used for recognition, e.g. 'eng' or 'ron+eng'. */
    language?: string;
    /** Number of pages processed; the image processor always reports 1. */
    pageCount?: number;
 }
 /**
 * Caller-supplied options for an OCR run; every field is optional.
 */
 export interface OCRProcessingOptions {
    /** Language code(s) to recognize, e.g. 'eng' or 'ron+eng'; processors fall back to the configured default when omitted. */
    language?: string;
    /** When true, an already stored OCR result is not returned and the content is processed again. */
    forceReprocess?: boolean;
    /** NOTE(review): presumably a minimum-confidence threshold; no code visible in this change reads it - confirm. */
    confidence?: number;
    /** NOTE(review): presumably lets the PDF processor use embedded text instead of OCR - confirm against pdf_processor. */
    enablePDFTextExtraction?: boolean;
 }
 /**
 * Row shape for the OCR-related columns of the `blobs` table.
 */
 interface OCRBlobRow {
    /** Identifier of the blob row. */
    blobId: string;
    /** OCR text stored for the blob. */
    ocr_text: string;
    /** Time of the last OCR run; the store step writes it as `new Date().toISOString()`. */
    ocr_last_processed?: string;
 }
 /**
@ -30,6 +38,7 @@ class OCRService {
    private isInitialized = false;
    private worker: Tesseract.Worker | null = null;
    private isProcessing = false;
    private processors: Map<string, FileProcessor> = new Map();
    /**
     * Initialize the OCR service
@ -40,25 +49,14 @@ class OCRService {
        }
        try {
-            log.info('Initializing OCR service with Tesseract.js...');
+            log.info('Initializing OCR service with file processors...');
-            // Configure proper paths for Node.js environment
+            // Initialize file processors
-            const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
+            this.processors.set('image', new ImageProcessor());
-            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
+            this.processors.set('pdf', new PDFProcessor());
-            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
+            this.processors.set('tiff', new TIFFProcessor());
            this.processors.set('office', new OfficeProcessor());
            log.info(`Using worker path: ${workerPath}`);
            log.info(`Using core path: ${corePath}`);
            this.worker = await Tesseract.createWorker('eng', 1, {
                workerPath,
                corePath,
                logger: (m: { status: string; progress: number }) => {
                    if (m.status === 'recognizing text') {
                        log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
                    }
                }
            });
            this.isInitialized = true;
            log.info('OCR service initialized successfully');
        } catch (error) {
@ -100,46 +98,27 @@ class OCRService {
    }
    /**
-     * Extract text from image buffer
+     * Extract text from file buffer using appropriate processor
     */
-    async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+    async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        if (!this.isInitialized) {
            await this.initialize();
        }
        if (!this.worker) {
            throw new Error('OCR worker not initialized');
        }
        try {
-            log.info('Starting OCR text extraction...');
+            log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
            this.isProcessing = true;
-            // Set language if specified and different from current
+            // Find appropriate processor
-            const language = options.language || 'eng';
+            const processor = this.getProcessorForMimeType(mimeType);
-            if (language !== 'eng') {
+            if (!processor) {
-                // For different languages, create a new worker
+                throw new Error(`No processor found for MIME type: ${mimeType}`);
                await this.worker.terminate();
                this.worker = await Tesseract.createWorker(language, 1, {
                    logger: (m: { status: string; progress: number }) => {
                        if (m.status === 'recognizing text') {
                            log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
                        }
                    }
                });
            }
-            const result = await this.worker.recognize(imageBuffer);
+            const result = await processor.extractText(fileBuffer, options);
-            const ocrResult: OCRResult = {
+            log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
-                text: result.data.text.trim(),
+            return result;
                confidence: result.data.confidence / 100,  // Convert percentage to decimal
                extractedAt: new Date().toISOString(),
                language: options.language || 'eng'
            };
            log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
            return ocrResult;
        } catch (error) {
            log.error(`OCR text extraction failed: ${error}`);
@ -174,10 +153,10 @@ class OCRService {
            return null;
        }
-        // Check if OCR already exists in the blob and we're not forcing reprocessing
+        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(note.blobId);
-        if (existingOCR && !options.forceReprocess) {
+        if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
-            log.info(`OCR already exists for note ${noteId}, returning cached result`);
+            log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
            return existingOCR;
        }
@ -187,7 +166,7 @@ class OCRService {
                throw new Error(`Cannot get image content for note ${noteId}`);
            }
-            const ocrResult = await this.extractTextFromImage(content, options);
+            const ocrResult = await this.extractTextFromFile(content, note.mime, options);
            // Store OCR result in blob
            await this.storeOCRResult(note.blobId, ocrResult);
@ -224,10 +203,10 @@ class OCRService {
            return null;
        }
-        // Check if OCR already exists in the blob and we're not forcing reprocessing
+        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(attachment.blobId);
-        if (existingOCR && !options.forceReprocess) {
+        if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
-            log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
+            log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
            return existingOCR;
        }
@ -237,7 +216,7 @@ class OCRService {
                throw new Error(`Cannot get image content for attachment ${attachmentId}`);
            }
-            const ocrResult = await this.extractTextFromImage(content, options);
+            const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
            // Store OCR result in blob
            await this.storeOCRResult(attachment.blobId, ocrResult);
@ -259,11 +238,15 @@ class OCRService {
        }
        try {
-            // Store OCR text in blobs table
+            // Store OCR text and timestamp in blobs table
            sql.execute(`
-                UPDATE blobs SET ocr_text = ? WHERE blobId = ?
+                UPDATE blobs SET 
                    ocr_text = ?, 
                    ocr_last_processed = ?
                WHERE blobId = ?
            `, [
                ocrResult.text,
                new Date().toISOString(),
                blobId
            ]);
@ -353,80 +336,10 @@ class OCRService {
    }
    /**
-     * Process OCR for all images that don't have OCR results yet
+     * Process OCR for all files that don't have OCR results yet or need reprocessing
     */
    async processAllImages(): Promise<void> {
-        if (!this.isOCREnabled()) {
+        return this.processAllBlobsNeedingOCR();
            log.info('OCR is disabled, skipping batch processing');
            return;
        }
        log.info('Starting batch OCR processing for all images...');
        try {
            // Process image notes
            const imageNotes = sql.getRows<{
                noteId: string;
                mime: string;
                blobId: string;
            }>(`
                SELECT n.noteId, n.mime, n.blobId
                FROM notes n
                LEFT JOIN blobs b ON n.blobId = b.blobId
                WHERE n.type = 'image' 
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND (b.ocr_text IS NULL OR b.ocr_text = '')
            `);
            log.info(`Found ${imageNotes.length} image notes to process`);
            for (const noteRow of imageNotes) {
                if (this.isSupportedMimeType(noteRow.mime)) {
                    try {
                        await this.processNoteOCR(noteRow.noteId);
                        // Add small delay to prevent overwhelming the system
                        await new Promise(resolve => setTimeout(resolve, 100));
                    } catch (error) {
                        log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
                    }
                }
            }
            // Process image attachments
            const imageAttachments = sql.getRows<{
                attachmentId: string;
                mime: string;
                blobId: string;
            }>(`
                SELECT a.attachmentId, a.mime, a.blobId
                FROM attachments a
                LEFT JOIN blobs b ON a.blobId = b.blobId
                WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND (b.ocr_text IS NULL OR b.ocr_text = '')
            `);
            log.info(`Found ${imageAttachments.length} image attachments to process`);
            for (const attachmentRow of imageAttachments) {
                if (this.isSupportedMimeType(attachmentRow.mime)) {
                    try {
                        await this.processAttachmentOCR(attachmentRow.attachmentId);
                        // Add small delay to prevent overwhelming the system
                        await new Promise(resolve => setTimeout(resolve, 100));
                    } catch (error) {
                        log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
                    }
                }
            }
            log.info('Batch OCR processing completed');
        } catch (error) {
            log.error(`Batch OCR processing failed: ${error}`);
            throw error;
        }
    }
    /**
@ -521,28 +434,9 @@ class OCRService {
        }
        try {
-            // Count total images to process
+            // Count total blobs needing OCR processing
-            const imageNotesCount = sql.getRow<{ count: number }>(`
+            const blobsNeedingOCR = this.getBlobsNeedingOCR();
-                SELECT COUNT(*) as count
+            const totalCount = blobsNeedingOCR.length;
                FROM notes 
                WHERE type = 'image' 
                AND isDeleted = 0
                AND noteId NOT IN (
                    SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
                )
            `)?.count || 0;
            const imageAttachmentsCount = sql.getRow<{ count: number }>(`
                SELECT COUNT(*) as count
                FROM attachments 
                WHERE role = 'image'
                AND isDeleted = 0
                AND attachmentId NOT IN (
                    SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
                )
            `)?.count || 0;
            const totalCount = imageNotesCount + imageAttachmentsCount;
            if (totalCount === 0) {
                return { success: false, message: 'No images found that need OCR processing' };
@ -557,7 +451,7 @@ class OCRService {
            };
            // Start processing in background
-            this.processBatchInBackground().catch(error => {
+            this.processBatchInBackground(blobsNeedingOCR).catch(error => {
                log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
                this.batchProcessingState.inProgress = false;
            });
@ -583,79 +477,33 @@ class OCRService {
    /**
     * Process batch OCR in background with progress tracking
     */
-    private async processBatchInBackground(): Promise<void> {
+    private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
        try {
            log.info('Starting batch OCR processing...');
-            // Process image notes
+            for (const blobInfo of blobsToProcess) {
            const imageNotes = sql.getRows<{
                noteId: string;
                mime: string;
                blobId: string;
            }>(`
                SELECT n.noteId, n.mime, n.blobId
                FROM notes n
                LEFT JOIN blobs b ON n.blobId = b.blobId
                WHERE n.type = 'image' 
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND (b.ocr_text IS NULL OR b.ocr_text = '')
            `);
            for (const noteRow of imageNotes) {
                if (!this.batchProcessingState.inProgress) {
                    break; // Stop if processing was cancelled
                }
-                if (this.isSupportedMimeType(noteRow.mime)) {
+                try {
-                    try {
+                    if (blobInfo.entityType === 'note') {
-                        await this.processNoteOCR(noteRow.noteId);
+                        await this.processNoteOCR(blobInfo.entityId);
-                        this.batchProcessingState.processed++;
+                    } else {
-                        // Add small delay to prevent overwhelming the system
+                        await this.processAttachmentOCR(blobInfo.entityId);
                        await new Promise(resolve => setTimeout(resolve, 500));
                    } catch (error) {
                        log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
                        this.batchProcessingState.processed++; // Count as processed even if failed
                    }
                }
            }
            // Process image attachments
            const imageAttachments = sql.getRows<{
                attachmentId: string;
                mime: string;
                blobId: string;
            }>(`
                SELECT a.attachmentId, a.mime, a.blobId
                FROM attachments a
                LEFT JOIN blobs b ON a.blobId = b.blobId
                WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND (b.ocr_text IS NULL OR b.ocr_text = '')
            `);
            for (const attachmentRow of imageAttachments) {
                if (!this.batchProcessingState.inProgress) {
                    break; // Stop if processing was cancelled
                }
                if (this.isSupportedMimeType(attachmentRow.mime)) {
                    try {
                        await this.processAttachmentOCR(attachmentRow.attachmentId);
                        this.batchProcessingState.processed++;
                        // Add small delay to prevent overwhelming the system
                        await new Promise(resolve => setTimeout(resolve, 500));
                    } catch (error) {
                        log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
                        this.batchProcessingState.processed++; // Count as processed even if failed
                    }
                    this.batchProcessingState.processed++;
                    // Add small delay to prevent overwhelming the system
                    await new Promise(resolve => setTimeout(resolve, 500));
                } catch (error) {
                    log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                    this.batchProcessingState.processed++; // Count as processed even if failed
                }
            }
            // Mark as completed
            this.batchProcessingState.inProgress = false;
-            log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`);
+            log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
        } catch (error) {
            log.error(`Batch OCR processing failed: ${error}`);
            this.batchProcessingState.inProgress = false;
@ -672,6 +520,170 @@ class OCRService {
            log.info('Batch OCR processing cancelled');
        }
    }
    /**
     * Look up the first registered processor that accepts the given MIME type.
     *
     * Processors are tried in the order they were registered in `this.processors`,
     * so when several accept the same type the earliest registration wins.
     * NOTE(review): the image processor accepts 'image/tiff' and is registered before
     * the TIFF processor - confirm that this precedence is intended.
     *
     * @param mimeType - MIME type of the file to be processed
     * @returns the matching processor, or null when no processor accepts the type
     */
    private getProcessorForMimeType(mimeType: string): FileProcessor | null {
        const registered = Array.from(this.processors.values());
        const match = registered.find(candidate => candidate.canProcess(mimeType));

        return match ?? null;
    }
    /**
     * Decide whether the stored OCR result of a blob is missing or stale.
     *
     * @param blobId - id of the `blobs` row to inspect
     * @returns true when the blob was never OCR-processed or was modified after its
     *          last OCR run; false for an empty id, an unknown blob, or a failed lookup
     *          (the failure is logged, not thrown)
     */
    needsReprocessing(blobId: string): boolean {
        if (!blobId) {
            return false;
        }

        try {
            const row = sql.getRow<{
                utcDateModified: string;
                ocr_last_processed: string | null;
            }>(`
                SELECT utcDateModified, ocr_last_processed
                FROM blobs 
                WHERE blobId = ?
            `, [blobId]);

            if (!row) {
                return false;
            }

            const { utcDateModified, ocr_last_processed: lastProcessed } = row;

            // No recorded OCR run yet: the blob still has to be processed.
            if (!lastProcessed) {
                return true;
            }

            // Stale when the content changed after the last OCR run.
            const modifiedAt = new Date(utcDateModified).getTime();
            const processedAt = new Date(lastProcessed).getTime();
            return modifiedAt > processedAt;
        } catch (failure) {
            log.error(`Failed to check if blob ${blobId} needs reprocessing: ${failure}`);
            return false;
        }
    }
    /**
     * Drop the stored OCR data of a blob by setting both `ocr_text` and
     * `ocr_last_processed` to NULL.
     *
     * @param blobId - id of the `blobs` row to reset; an empty id is ignored
     * @throws rethrows whatever the SQL layer throws, after logging it
     */
    invalidateOCRResult(blobId: string): void {
        if (!blobId) {
            return;
        }

        const clearOcrColumns = `
                UPDATE blobs SET 
                    ocr_text = NULL,
                    ocr_last_processed = NULL
                WHERE blobId = ?
            `;

        try {
            sql.execute(clearOcrColumns, [blobId]);
            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (failure) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${failure}`);
            throw failure;
        }
    }
    /**
     * Collect image notes and image attachments whose blob was never OCR-processed
     * or was modified after its last OCR run.
     *
     * Only non-deleted notes of type 'image' and attachments with role 'image' are
     * considered, and the combined list is filtered through isSupportedMimeType().
     *
     * @returns one entry per matching note/attachment; an empty array when the lookup
     *          fails (the error is logged, not thrown)
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        // ocr_last_processed is written with Date#toISOString(). Comparing it with
        // utcDateModified as raw text is only correct when both columns use exactly the
        // same layout, so compare the parsed instants instead - the same way
        // needsReprocessing() does. julianday() accepts both the ' ' and the 'T'
        // date/time separator as well as a trailing 'Z'.
        const staleOrUnprocessed = `(
                    b.ocr_last_processed IS NULL
                    OR julianday(b.utcDateModified) > julianday(b.ocr_last_processed)
                )`;

        try {
            // Image notes whose blob needs OCR
            const noteBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
                entityId: string;
            }>(`
                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
                WHERE n.type = 'image'
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND ${staleOrUnprocessed}
            `);

            // Image attachments whose blob needs OCR
            const attachmentBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
                entityId: string;
            }>(`
                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
                WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND ${staleOrUnprocessed}
            `);

            // Tag each row with its origin so callers know which OCR entry point to use
            const candidates = [
                ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
            ];

            // Keep only entries some processor can actually handle
            return candidates.filter(blob => this.isSupportedMimeType(blob.mimeType));
        } catch (error) {
            log.error(`Failed to get blobs needing OCR: ${error}`);
            return [];
        }
    }
    /**
     * Run OCR, one entity at a time, over everything getBlobsNeedingOCR() reports.
     *
     * Returns early when OCR is disabled or nothing is pending. A failure on one
     * entity is logged and the run continues with the next one; after every
     * successful entity the loop waits 100 ms before moving on.
     */
    async processAllBlobsNeedingOCR(): Promise<void> {
        if (!this.isOCREnabled()) {
            log.info('OCR is disabled, skipping auto-processing');
            return;
        }

        const pending = this.getBlobsNeedingOCR();
        if (pending.length === 0) {
            log.info('No blobs need OCR processing');
            return;
        }

        log.info(`Auto-processing OCR for ${pending.length} blobs...`);

        // Short breather between entities so the batch does not monopolize the server.
        const pause = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms));

        for (const { entityType, entityId } of pending) {
            try {
                await (entityType === 'note'
                    ? this.processNoteOCR(entityId)
                    : this.processAttachmentOCR(entityId));
                await pause(100);
            } catch (failure) {
                // One bad entity must not abort the rest of the batch.
                log.error(`Failed to auto-process OCR for ${entityType} ${entityId}: ${failure}`);
            }
        }

        log.info('Auto-processing OCR completed');
    }
 }
 export default new OCRService();
--- a/apps/server/src/services/ocr/processors/file_processor.ts
+++ b/apps/server/src/services/ocr/processors/file_processor.ts
@ -0,0 +1,28 @@
 import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
 /**
 * Contract shared by all processors that extract text from a particular family of file types.
 */
 export abstract class FileProcessor {
    /**
     * Tell whether this processor accepts files of the given MIME type.
     */
    abstract canProcess(mimeType: string): boolean;

    /**
     * Extract the text contained in the given file buffer.
     */
    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;

    /**
     * Identifier naming the kind of processing this processor performs.
     */
    abstract getProcessingType(): string;

    /**
     * Release any resources held by the processor.
     * The base implementation holds none and resolves immediately.
     */
    async cleanup(): Promise<void> {
        // Nothing to release by default.
    }
 }
--- a/apps/server/src/services/ocr/processors/image_processor.ts
+++ b/apps/server/src/services/ocr/processors/image_processor.ts
@ -0,0 +1,162 @@
 import Tesseract from 'tesseract.js';
 import { FileProcessor } from './file_processor.js';
 import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
 import log from '../../log.js';
 /**
 * Image processor for extracting text from image files using Tesseract.
 *
 * One Tesseract worker is kept alive between calls. It is re-created only when the
 * requested language set differs from the one the current worker was created with.
 */
 export class ImageProcessor extends FileProcessor {
    private worker: Tesseract.Worker | null = null;
    /** True while a usable worker is available. */
    private isInitialized = false;
    /** Language(s) the current worker was created with; null while there is no worker. */
    private workerLanguage: string | null = null;

    /**
     * @param mimeType - MIME type to test (compared case-insensitively)
     * @returns true for the raster image types listed below
     */
    canProcess(mimeType: string): boolean {
        const supportedTypes = [
            'image/jpeg',
            'image/jpg',
            'image/png',
            'image/gif',
            'image/bmp',
            'image/tiff',
            'image/webp'
        ];
        return supportedTypes.includes(mimeType.toLowerCase());
    }

    /**
     * Run OCR on an image buffer.
     *
     * @param buffer - raw image bytes
     * @param options - `language` selects the recognition language(s) ('eng', 'ron+eng', ...);
     *                  the configured default language is used when it is omitted
     * @returns the recognized text with `confidence` as a 0-1 fraction and `pageCount` 1
     * @throws Error when the language is missing or malformed, when the worker cannot be
     *         created, or when recognition fails (every failure is logged, then rethrown)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting image OCR text extraction...');

            // Supports multi-language format like 'ron+eng'
            const requested = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(requested)) {
                throw new Error(`Invalid OCR language format: ${requested}. Use format like 'eng' or 'ron+eng'`);
            }
            const language = this.normalizeLanguage(requested);

            const worker = await this.ensureWorker(language);
            const result = await worker.recognize(buffer);

            const ocrResult: OCRResult = {
                text: result.data.text.trim(),
                confidence: result.data.confidence / 100,  // Convert percentage to decimal
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1
            };

            // confidence is stored as a fraction; scale it back up for the percent display
            log.info(`Image OCR extraction completed. Confidence: ${Math.round(ocrResult.confidence * 100)}%, Text length: ${ocrResult.text.length}`);
            return ocrResult;
        } catch (error) {
            log.error(`Image OCR text extraction failed: ${error}`);
            throw error;
        }
    }

    getProcessingType(): string {
        return 'image';
    }

    /**
     * Return a worker created for exactly `language`, re-creating the current one if it
     * was built for a different language set.
     */
    private async ensureWorker(language: string): Promise<Tesseract.Worker> {
        if (this.worker && this.workerLanguage === language) {
            return this.worker;
        }

        if (this.worker) {
            // Forget the old worker before terminating it, so that a failure below can
            // never leave a terminated worker registered as usable.
            const staleWorker = this.worker;
            this.worker = null;
            this.workerLanguage = null;
            this.isInitialized = false;
            await staleWorker.terminate();
        }

        try {
            log.info('Initializing image OCR processor with Tesseract.js...');
            log.info(`Initializing Tesseract worker for language(s): ${language}`);

            // Configure proper paths for Node.js environment
            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');

            log.info(`Using worker path: ${workerPath}`);
            log.info(`Using core path: ${corePath}`);

            const worker = await Tesseract.createWorker(language, 1, {
                workerPath,
                corePath,
                logger: (m: { status: string; progress: number }) => {
                    if (m.status === 'recognizing text') {
                        log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
                    }
                }
            });

            this.worker = worker;
            this.workerLanguage = language;
            this.isInitialized = true;
            log.info('Image OCR processor initialized successfully');
            return worker;
        } catch (error) {
            log.error(`Failed to initialize image OCR processor: ${error}`);
            throw error;
        }
    }

    /**
     * Terminate the worker, if any, and reset the processor to its uninitialized state.
     */
    async cleanup(): Promise<void> {
        if (this.worker) {
            await this.worker.terminate();
            this.worker = null;
        }
        this.workerLanguage = null;
        this.isInitialized = false;
        log.info('Image OCR processor cleaned up');
    }

    /**
     * Get default OCR language from options
     *
     * @throws Error when the 'ocrLanguage' option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Strip the whitespace that isValidLanguageFormat() tolerates around '+', so the
     * worker receives a canonical value such as 'ron+eng'.
     */
    private normalizeLanguage(language: string): string {
        return language
            .split('+')
            .map(part => part.trim())
            .join('+');
    }

    /**
     * Validate OCR language format
     * Supports single language (eng) or multi-language (ron+eng)
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // A 2-3 letter code followed by up to two '_' suffixes of 2-4 letters, which
        // covers names such as 'chi_sim', 'srp_latn', 'deu_frak' and 'chi_sim_vert'.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;

        // Split by '+' for multi-language format; an empty part fails the pattern.
        return language
            .split('+')
            .every(part => validLanguagePattern.test(part.trim()));
    }
 }
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@ -0,0 +1,128 @@
 import * as officeParser from 'officeparser';
 import { FileProcessor } from './file_processor.js';
 import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
 import { ImageProcessor } from './image_processor.js';
 import log from '../../log.js';
 /**
 * Office document processor for extracting text and images from DOCX/XLSX/PPTX files
 */
 export class OfficeProcessor extends FileProcessor {
    /**
     * MIME types this processor claims to handle (compared in lower case).
     * NOTE(review): the legacy binary formats (DOC/XLS/PPT) and RTF are listed,
     * but confirm that officeparser can really parse them; its documentation
     * lists OOXML/ODF formats.
     */
    private static readonly SUPPORTED_MIME_TYPES: ReadonlySet<string> = new Set([
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
        'application/msword', // DOC
        'application/vnd.ms-excel', // XLS
        'application/vnd.ms-powerpoint', // PPT
        'application/rtf' // RTF
    ]);

    /** Only referenced by cleanup(); text extraction below does not run OCR. */
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the file, matched case-insensitively.
     * @returns true when the type is one of the supported Office MIME types.
     */
    canProcess(mimeType: string): boolean {
        return OfficeProcessor.SUPPORTED_MIME_TYPES.has(mimeType.toLowerCase());
    }

    /**
     * Extract the textual content of an Office document.
     *
     * @param buffer Raw bytes of the document.
     * @param options Processing options; `language` overrides the configured default.
     * @returns The trimmed document text with confidence 0.99, or empty text with
     *          confidence 0 when the document yields no text.
     * @throws Error when no language is configured, the language format is
     *         invalid, or the document cannot be parsed.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting Office document text extraction...');

            // Resolve and validate the language; it is only reported in the result.
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Parse the document and normalise surrounding whitespace.
            const parsed = await this.parseOfficeDocument(buffer);
            const extractedText = (parsed.data ?? '').trim();

            // Text is read directly from the file, so confidence is high whenever
            // anything was found.
            const confidence = extractedText.length > 0 ? 0.99 : 0;

            const result: OCRResult = {
                text: extractedText,
                confidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1 // Office documents are treated as single logical document
            };

            // Confidence is a 0-1 fraction; convert it so the '%' suffix is accurate.
            log.info(`Office document text extraction completed. Confidence: ${Math.round(confidence * 100)}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`Office document text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Run officeparser on the buffer and wrap its output.
     *
     * @returns An object whose `data` is the parsed text ('' when the parser returns nothing).
     * @throws Error prefixed with "Office document parsing failed" on parser errors.
     */
    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
        try {
            const parsedText = await officeParser.parseOfficeAsync(buffer, {
                outputErrorToConsole: false,
                newlineDelimiter: '\n',
                ignoreNotes: false,
                putNotesAtLast: false
            });
            return { data: parsedText || '' };
        } catch (error) {
            throw new Error(`Office document parsing failed: ${error}`);
        }
    }

    /** @returns The identifier of this processor type: 'office'. */
    getProcessingType(): string {
        return 'office';
    }

    /** Release the resources of the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options
     *
     * @throws Error when the option is missing or cannot be read.
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): this file otherwise uses ES module imports; confirm
            // that `require` is available in the runtime/bundle target.
            const optionService = require('../../options.js').default;
            const configuredLanguage = optionService.getOption('ocrLanguage');
            if (!configuredLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return configuredLanguage;
        } catch (error) {
            // Log the underlying reason; callers get one uniform error.
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format
     * Supports single language (eng) or multi-language (ron+eng)
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }
        // A code is a 2-3 letter base optionally followed by up to two
        // underscore-separated qualifiers of 2-4 letters, so that traineddata
        // names such as chi_sim, srp_latn, deu_frak or chi_sim_vert are accepted.
        const languageCodePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;
        // Multi-language requests join codes with '+'; every part must be valid.
        return language
            .split('+')
            .every(code => languageCodePattern.test(code.trim()));
    }
 }
--- a/apps/server/src/services/ocr/processors/pdf_processor.ts
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@ -0,0 +1,142 @@
 import * as pdfParse from 'pdf-parse';
 import { FileProcessor } from './file_processor.js';
 import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
 import { ImageProcessor } from './image_processor.js';
 import log from '../../log.js';
 import sharp from 'sharp';
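 /**
 * PDF processor for extracting text from PDF files.
 * First tries to read the text already embedded in the PDF; when none is found
 * (or text extraction is disabled) it falls back to an OCR step, which is
 * currently a placeholder and does not yet OCR the page images.
 */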
 export class PDFProcessor extends FileProcessor {
    /** Only referenced by cleanup(); the OCR fallback below does not use it yet. */
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the file, matched case-insensitively.
     * @returns true only for 'application/pdf'.
     */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    /**
     * Extract text from a PDF.
     *
     * Embedded text is returned when present, unless
     * `options.enablePDFTextExtraction` is explicitly false; otherwise the OCR
     * fallback is used.
     *
     * @param buffer Raw bytes of the PDF.
     * @param options Processing options; `language` overrides the configured default.
     * @throws Error when no language is configured, the language format is
     *         invalid, or the PDF cannot be parsed.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting PDF text extraction...');

            // Resolve the language once; the helpers report exactly this validated value.
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // First try to extract existing text from PDF
            if (options.enablePDFTextExtraction !== false) {
                const textResult = await this.extractTextFromPDF(buffer, language);
                if (textResult.text.trim().length > 0) {
                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
                    return textResult;
                }
            }

            // Fall back to OCR if no text found or PDF text extraction is disabled
            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
            return await this.extractTextViaOCR(buffer, language);
        } catch (error) {
            // Single place where extraction failures are logged before propagating.
            log.error(`PDF text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Read the text layer of the PDF with pdf-parse.
     *
     * Errors propagate to extractText(), which logs them once.
     *
     * @param language Already validated language to record in the result.
     * @returns Trimmed text, the parser's page count, and confidence 0.99.
     */
    private async extractTextFromPDF(buffer: Buffer, language: string): Promise<OCRResult> {
        const parsed = await pdfParse(buffer);
        return {
            text: parsed.text.trim(),
            confidence: 0.99, // High confidence for direct text extraction
            extractedAt: new Date().toISOString(),
            language,
            pageCount: parsed.numpages
        };
    }

    /**
     * Placeholder for OCR of image-only PDFs.
     *
     * Converting PDF pages to images is not implemented, so no OCR is run and
     * the buffer is not inspected.
     *
     * NOTE(review): the placeholder sentence is returned as `text`; confirm
     * callers do not store or index it as real OCR output.
     *
     * @param language Already validated language to record in the result.
     * @returns A result with confidence 0 and a fixed placeholder text.
     */
    private async extractTextViaOCR(buffer: Buffer, language: string): Promise<OCRResult> {
        log.info('PDF to image conversion not fully implemented, returning placeholder');
        return {
            text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
            confidence: 0.0,
            extractedAt: new Date().toISOString(),
            language,
            pageCount: 1
        };
    }

    /** @returns The identifier of this processor type: 'pdf'. */
    getProcessingType(): string {
        return 'pdf';
    }

    /** Release the resources of the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options
     *
     * @throws Error when the option is missing or cannot be read.
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): this file otherwise uses ES module imports; confirm
            // that `require` is available in the runtime/bundle target.
            const optionService = require('../../options.js').default;
            const configuredLanguage = optionService.getOption('ocrLanguage');
            if (!configuredLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return configuredLanguage;
        } catch (error) {
            // Log the underlying reason; callers get one uniform error.
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format
     * Supports single language (eng) or multi-language (ron+eng)
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }
        // A code is a 2-3 letter base optionally followed by up to two
        // underscore-separated qualifiers of 2-4 letters, so that traineddata
        // names such as chi_sim, srp_latn, deu_frak or chi_sim_vert are accepted.
        const languageCodePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;
        // Multi-language requests join codes with '+'; every part must be valid.
        return language
            .split('+')
            .every(code => languageCodePattern.test(code.trim()));
    }
 }
--- a/apps/server/src/services/ocr/processors/tiff_processor.ts
+++ b/apps/server/src/services/ocr/processors/tiff_processor.ts
@ -0,0 +1,129 @@
 import sharp from 'sharp';
 import { FileProcessor } from './file_processor.js';
 import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
 import { ImageProcessor } from './image_processor.js';
 import log from '../../log.js';
 /**
 * TIFF processor for extracting text from multi-page TIFF files
 */
 export class TIFFProcessor extends FileProcessor {
    /** Performs the OCR of each page after it has been converted to PNG. */
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the file, matched case-insensitively.
     * @returns true for 'image/tiff' and 'image/tif'.
     */
    canProcess(mimeType: string): boolean {
        const normalizedType = mimeType.toLowerCase();
        return normalizedType === 'image/tiff' || normalizedType === 'image/tif';
    }

    /**
     * OCR every page of a (possibly multi-page) TIFF and concatenate the text.
     *
     * Pages are processed one after another. A page that fails is logged and
     * skipped. The reported confidence is the mean over the pages that produced
     * text, so blank or failed pages do not dilute it.
     *
     * @param buffer Raw bytes of the TIFF.
     * @param options Processing options, forwarded unchanged to the image processor.
     * @throws Error when no language is configured, the language format is
     *         invalid, or the image metadata cannot be read.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');

            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Metadata reports the page count; treat a missing value as one page.
            const metadata = await sharp(buffer).metadata();
            const pageCount = metadata.pages || 1;

            let combinedText = '';
            let totalConfidence = 0;
            let pagesWithText = 0;

            for (let page = 0; page < pageCount; page++) {
                try {
                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);

                    // Extract page as PNG buffer
                    const pageBuffer = await sharp(buffer, { page })
                        .png()
                        .toBuffer();

                    // OCR the page
                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
                    if (pageResult.text.trim().length === 0) {
                        continue;
                    }

                    // A separator is inserted before every page except the first
                    // one that contributes text.
                    if (combinedText.length > 0) {
                        combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
                    }
                    combinedText += pageResult.text;
                    totalConfidence += pageResult.confidence;
                    pagesWithText++;
                } catch (error) {
                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
                    // Continue with other pages
                }
            }

            // Average only over the pages whose confidence was actually summed.
            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;

            const result: OCRResult = {
                text: combinedText.trim(),
                confidence: averageConfidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount
            };

            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`TIFF text extraction failed: ${error}`);
            throw error;
        }
    }

    /** @returns The identifier of this processor type: 'tiff'. */
    getProcessingType(): string {
        return 'tiff';
    }

    /** Release the resources of the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options
     *
     * @throws Error when the option is missing or cannot be read.
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): this file otherwise uses ES module imports; confirm
            // that `require` is available in the runtime/bundle target.
            const optionService = require('../../options.js').default;
            const configuredLanguage = optionService.getOption('ocrLanguage');
            if (!configuredLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return configuredLanguage;
        } catch (error) {
            // Log the underlying reason; callers get one uniform error.
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format
     * Supports single language (eng) or multi-language (ron+eng)
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }
        // A code is a 2-3 letter base optionally followed by up to two
        // underscore-separated qualifiers of 2-4 letters, so that traineddata
        // names such as chi_sim, srp_latn, deu_frak or chi_sim_vert are accepted.
        const languageCodePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;
        // Multi-language requests join codes with '+'; every part must be valid.
        return language
            .split('+')
            .every(code => languageCodePattern.test(code.trim()));
    }
 }