From ca8cbf8ccfa75c71f3dafb6308c3f0b3d819abee Mon Sep 17 00:00:00 2001 From: perf3ct Date: Wed, 16 Jul 2025 20:10:56 +0000 Subject: [PATCH] feat(ocr): add additional processors for OCR feature --- apps/server/src/services/ocr/ocr_service.ts | 434 +++++++++--------- .../services/ocr/processors/file_processor.ts | 28 ++ .../ocr/processors/image_processor.ts | 162 +++++++ .../ocr/processors/office_processor.ts | 128 ++++++ .../services/ocr/processors/pdf_processor.ts | 142 ++++++ .../services/ocr/processors/tiff_processor.ts | 129 ++++++ 6 files changed, 812 insertions(+), 211 deletions(-) create mode 100644 apps/server/src/services/ocr/processors/file_processor.ts create mode 100644 apps/server/src/services/ocr/processors/image_processor.ts create mode 100644 apps/server/src/services/ocr/processors/office_processor.ts create mode 100644 apps/server/src/services/ocr/processors/pdf_processor.ts create mode 100644 apps/server/src/services/ocr/processors/tiff_processor.ts diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index 54361284b..3d5c4aea4 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -3,23 +3,31 @@ import log from '../log.js'; import sql from '../sql.js'; import becca from '../../becca/becca.js'; import options from '../options.js'; +import { ImageProcessor } from './processors/image_processor.js'; +import { PDFProcessor } from './processors/pdf_processor.js'; +import { TIFFProcessor } from './processors/tiff_processor.js'; +import { OfficeProcessor } from './processors/office_processor.js'; +import { FileProcessor } from './processors/file_processor.js'; export interface OCRResult { text: string; confidence: number; extractedAt: string; language?: string; + pageCount?: number; } export interface OCRProcessingOptions { language?: string; forceReprocess?: boolean; confidence?: number; + enablePDFTextExtraction?: boolean; } interface OCRBlobRow { 
blobId: string; ocr_text: string; + ocr_last_processed?: string; } /** @@ -30,6 +38,7 @@ class OCRService { private isInitialized = false; private worker: Tesseract.Worker | null = null; private isProcessing = false; + private processors: Map = new Map(); /** * Initialize the OCR service @@ -40,25 +49,14 @@ class OCRService { } try { - log.info('Initializing OCR service with Tesseract.js...'); + log.info('Initializing OCR service with file processors...'); - // Configure proper paths for Node.js environment - const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', ''); - const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js'); - const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js'); + // Initialize file processors + this.processors.set('image', new ImageProcessor()); + this.processors.set('pdf', new PDFProcessor()); + this.processors.set('tiff', new TIFFProcessor()); + this.processors.set('office', new OfficeProcessor()); - log.info(`Using worker path: ${workerPath}`); - log.info(`Using core path: ${corePath}`); - - this.worker = await Tesseract.createWorker('eng', 1, { - workerPath, - corePath, - logger: (m: { status: string; progress: number }) => { - if (m.status === 'recognizing text') { - log.info(`OCR progress: ${Math.round(m.progress * 100)}%`); - } - } - }); this.isInitialized = true; log.info('OCR service initialized successfully'); } catch (error) { @@ -100,46 +98,27 @@ class OCRService { } /** - * Extract text from image buffer + * Extract text from file buffer using appropriate processor */ - async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise { + async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise { if (!this.isInitialized) { await this.initialize(); } - if (!this.worker) { - throw new Error('OCR worker not initialized'); - } - try { - log.info('Starting OCR text extraction...'); + 
log.info(`Starting OCR text extraction for MIME type: ${mimeType}`); this.isProcessing = true; - // Set language if specified and different from current - const language = options.language || 'eng'; - if (language !== 'eng') { - // For different languages, create a new worker - await this.worker.terminate(); - this.worker = await Tesseract.createWorker(language, 1, { - logger: (m: { status: string; progress: number }) => { - if (m.status === 'recognizing text') { - log.info(`OCR progress: ${Math.round(m.progress * 100)}%`); - } - } - }); + // Find appropriate processor + const processor = this.getProcessorForMimeType(mimeType); + if (!processor) { + throw new Error(`No processor found for MIME type: ${mimeType}`); } - const result = await this.worker.recognize(imageBuffer); + const result = await processor.extractText(fileBuffer, options); - const ocrResult: OCRResult = { - text: result.data.text.trim(), - confidence: result.data.confidence / 100, // Convert percentage to decimal - extractedAt: new Date().toISOString(), - language: options.language || 'eng' - }; - - log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`); - return ocrResult; + log.info(`OCR extraction completed. 
Confidence: ${result.confidence}%, Text length: ${result.text.length}`); + return result; } catch (error) { log.error(`OCR text extraction failed: ${error}`); @@ -174,10 +153,10 @@ class OCRService { return null; } - // Check if OCR already exists in the blob and we're not forcing reprocessing + // Check if OCR already exists and is up-to-date const existingOCR = this.getStoredOCRResult(note.blobId); - if (existingOCR && !options.forceReprocess) { - log.info(`OCR already exists for note ${noteId}, returning cached result`); + if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) { + log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`); return existingOCR; } @@ -187,7 +166,7 @@ class OCRService { throw new Error(`Cannot get image content for note ${noteId}`); } - const ocrResult = await this.extractTextFromImage(content, options); + const ocrResult = await this.extractTextFromFile(content, note.mime, options); // Store OCR result in blob await this.storeOCRResult(note.blobId, ocrResult); @@ -224,10 +203,10 @@ class OCRService { return null; } - // Check if OCR already exists in the blob and we're not forcing reprocessing + // Check if OCR already exists and is up-to-date const existingOCR = this.getStoredOCRResult(attachment.blobId); - if (existingOCR && !options.forceReprocess) { - log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`); + if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) { + log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`); return existingOCR; } @@ -237,7 +216,7 @@ class OCRService { throw new Error(`Cannot get image content for attachment ${attachmentId}`); } - const ocrResult = await this.extractTextFromImage(content, options); + const ocrResult = await this.extractTextFromFile(content, attachment.mime, options); // 
Store OCR result in blob await this.storeOCRResult(attachment.blobId, ocrResult); @@ -259,11 +238,15 @@ class OCRService { } try { - // Store OCR text in blobs table + // Store OCR text and timestamp in blobs table sql.execute(` - UPDATE blobs SET ocr_text = ? WHERE blobId = ? + UPDATE blobs SET + ocr_text = ?, + ocr_last_processed = ? + WHERE blobId = ? `, [ ocrResult.text, + new Date().toISOString(), blobId ]); @@ -353,80 +336,10 @@ class OCRService { } /** - * Process OCR for all images that don't have OCR results yet + * Process OCR for all files that don't have OCR results yet or need reprocessing */ async processAllImages(): Promise { - if (!this.isOCREnabled()) { - log.info('OCR is disabled, skipping batch processing'); - return; - } - - log.info('Starting batch OCR processing for all images...'); - - try { - // Process image notes - const imageNotes = sql.getRows<{ - noteId: string; - mime: string; - blobId: string; - }>(` - SELECT n.noteId, n.mime, n.blobId - FROM notes n - LEFT JOIN blobs b ON n.blobId = b.blobId - WHERE n.type = 'image' - AND n.isDeleted = 0 - AND n.blobId IS NOT NULL - AND (b.ocr_text IS NULL OR b.ocr_text = '') - `); - - log.info(`Found ${imageNotes.length} image notes to process`); - - for (const noteRow of imageNotes) { - if (this.isSupportedMimeType(noteRow.mime)) { - try { - await this.processNoteOCR(noteRow.noteId); - // Add small delay to prevent overwhelming the system - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (error) { - log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`); - } - } - } - - // Process image attachments - const imageAttachments = sql.getRows<{ - attachmentId: string; - mime: string; - blobId: string; - }>(` - SELECT a.attachmentId, a.mime, a.blobId - FROM attachments a - LEFT JOIN blobs b ON a.blobId = b.blobId - WHERE a.role = 'image' - AND a.isDeleted = 0 - AND a.blobId IS NOT NULL - AND (b.ocr_text IS NULL OR b.ocr_text = '') - `); - - log.info(`Found 
${imageAttachments.length} image attachments to process`); - - for (const attachmentRow of imageAttachments) { - if (this.isSupportedMimeType(attachmentRow.mime)) { - try { - await this.processAttachmentOCR(attachmentRow.attachmentId); - // Add small delay to prevent overwhelming the system - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (error) { - log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`); - } - } - } - - log.info('Batch OCR processing completed'); - } catch (error) { - log.error(`Batch OCR processing failed: ${error}`); - throw error; - } + return this.processAllBlobsNeedingOCR(); } /** @@ -521,28 +434,9 @@ class OCRService { } try { - // Count total images to process - const imageNotesCount = sql.getRow<{ count: number }>(` - SELECT COUNT(*) as count - FROM notes - WHERE type = 'image' - AND isDeleted = 0 - AND noteId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'note' - ) - `)?.count || 0; - - const imageAttachmentsCount = sql.getRow<{ count: number }>(` - SELECT COUNT(*) as count - FROM attachments - WHERE role = 'image' - AND isDeleted = 0 - AND attachmentId NOT IN ( - SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment' - ) - `)?.count || 0; - - const totalCount = imageNotesCount + imageAttachmentsCount; + // Count total blobs needing OCR processing + const blobsNeedingOCR = this.getBlobsNeedingOCR(); + const totalCount = blobsNeedingOCR.length; if (totalCount === 0) { return { success: false, message: 'No images found that need OCR processing' }; @@ -557,7 +451,7 @@ class OCRService { }; // Start processing in background - this.processBatchInBackground().catch(error => { + this.processBatchInBackground(blobsNeedingOCR).catch(error => { log.error(`Batch processing failed: ${error instanceof Error ? 
error.message : String(error)}`); this.batchProcessingState.inProgress = false; }); @@ -583,79 +477,33 @@ class OCRService { /** * Process batch OCR in background with progress tracking */ - private async processBatchInBackground(): Promise { + private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise { try { log.info('Starting batch OCR processing...'); - // Process image notes - const imageNotes = sql.getRows<{ - noteId: string; - mime: string; - blobId: string; - }>(` - SELECT n.noteId, n.mime, n.blobId - FROM notes n - LEFT JOIN blobs b ON n.blobId = b.blobId - WHERE n.type = 'image' - AND n.isDeleted = 0 - AND n.blobId IS NOT NULL - AND (b.ocr_text IS NULL OR b.ocr_text = '') - `); - - for (const noteRow of imageNotes) { + for (const blobInfo of blobsToProcess) { if (!this.batchProcessingState.inProgress) { break; // Stop if processing was cancelled } - if (this.isSupportedMimeType(noteRow.mime)) { - try { - await this.processNoteOCR(noteRow.noteId); - this.batchProcessingState.processed++; - // Add small delay to prevent overwhelming the system - await new Promise(resolve => setTimeout(resolve, 500)); - } catch (error) { - log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`); - this.batchProcessingState.processed++; // Count as processed even if failed - } - } - } - - // Process image attachments - const imageAttachments = sql.getRows<{ - attachmentId: string; - mime: string; - blobId: string; - }>(` - SELECT a.attachmentId, a.mime, a.blobId - FROM attachments a - LEFT JOIN blobs b ON a.blobId = b.blobId - WHERE a.role = 'image' - AND a.isDeleted = 0 - AND a.blobId IS NOT NULL - AND (b.ocr_text IS NULL OR b.ocr_text = '') - `); - - for (const attachmentRow of imageAttachments) { - if (!this.batchProcessingState.inProgress) { - break; // Stop if processing was cancelled - } - - if (this.isSupportedMimeType(attachmentRow.mime)) { - try { - 
await this.processAttachmentOCR(attachmentRow.attachmentId); - this.batchProcessingState.processed++; - // Add small delay to prevent overwhelming the system - await new Promise(resolve => setTimeout(resolve, 500)); - } catch (error) { - log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`); - this.batchProcessingState.processed++; // Count as processed even if failed + try { + if (blobInfo.entityType === 'note') { + await this.processNoteOCR(blobInfo.entityId); + } else { + await this.processAttachmentOCR(blobInfo.entityId); } + this.batchProcessingState.processed++; + // Add small delay to prevent overwhelming the system + await new Promise(resolve => setTimeout(resolve, 500)); + } catch (error) { + log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`); + this.batchProcessingState.processed++; // Count as processed even if failed } } // Mark as completed this.batchProcessingState.inProgress = false; - log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`); + log.info(`Batch OCR processing completed. 
Processed ${this.batchProcessingState.processed} files.`); } catch (error) { log.error(`Batch OCR processing failed: ${error}`); this.batchProcessingState.inProgress = false; @@ -672,6 +520,170 @@ class OCRService { log.info('Batch OCR processing cancelled'); } } + + /** + * Get processor for a given MIME type + */ + private getProcessorForMimeType(mimeType: string): FileProcessor | null { + for (const processor of this.processors.values()) { + if (processor.canProcess(mimeType)) { + return processor; + } + } + return null; + } + + /** + * Check if blob needs OCR re-processing due to content changes + */ + needsReprocessing(blobId: string): boolean { + if (!blobId) { + return false; + } + + try { + const blobInfo = sql.getRow<{ + utcDateModified: string; + ocr_last_processed: string | null; + }>(` + SELECT utcDateModified, ocr_last_processed + FROM blobs + WHERE blobId = ? + `, [blobId]); + + if (!blobInfo) { + return false; + } + + // If OCR was never processed, it needs processing + if (!blobInfo.ocr_last_processed) { + return true; + } + + // If blob was modified after last OCR processing, it needs re-processing + const blobModified = new Date(blobInfo.utcDateModified); + const lastOcrProcessed = new Date(blobInfo.ocr_last_processed); + + return blobModified > lastOcrProcessed; + } catch (error) { + log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`); + return false; + } + } + + /** + * Invalidate OCR results for a blob (clear ocr_text and ocr_last_processed) + */ + invalidateOCRResult(blobId: string): void { + if (!blobId) { + return; + } + + try { + sql.execute(` + UPDATE blobs SET + ocr_text = NULL, + ocr_last_processed = NULL + WHERE blobId = ? 
+ `, [blobId]); + + log.info(`Invalidated OCR result for blob ${blobId}`); + } catch (error) { + log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`); + throw error; + } + } + + /** + * Get blobs that need OCR processing (modified after last OCR or never processed) + */ + getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> { + try { + // Get notes with blobs that need OCR + const noteBlobs = sql.getRows<{ + blobId: string; + mimeType: string; + entityId: string; + }>(` + SELECT n.blobId, n.mime as mimeType, n.noteId as entityId + FROM notes n + JOIN blobs b ON n.blobId = b.blobId + WHERE n.type = 'image' + AND n.isDeleted = 0 + AND n.blobId IS NOT NULL + AND ( + b.ocr_last_processed IS NULL + OR b.utcDateModified > b.ocr_last_processed + ) + `); + + // Get attachments with blobs that need OCR + const attachmentBlobs = sql.getRows<{ + blobId: string; + mimeType: string; + entityId: string; + }>(` + SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId + FROM attachments a + JOIN blobs b ON a.blobId = b.blobId + WHERE a.role = 'image' + AND a.isDeleted = 0 + AND a.blobId IS NOT NULL + AND ( + b.ocr_last_processed IS NULL + OR b.utcDateModified > b.ocr_last_processed + ) + `); + + // Combine results + const result = [ + ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })), + ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const })) + ]; + + // Filter to only supported MIME types + return result.filter(blob => this.isSupportedMimeType(blob.mimeType)); + } catch (error) { + log.error(`Failed to get blobs needing OCR: ${error}`); + return []; + } + } + + /** + * Process OCR for all blobs that need it (auto-processing) + */ + async processAllBlobsNeedingOCR(): Promise { + if (!this.isOCREnabled()) { + log.info('OCR is disabled, skipping auto-processing'); + return; + } + + const blobsNeedingOCR = this.getBlobsNeedingOCR(); + if 
(blobsNeedingOCR.length === 0) { + log.info('No blobs need OCR processing'); + return; + } + + log.info(`Auto-processing OCR for ${blobsNeedingOCR.length} blobs...`); + + for (const blobInfo of blobsNeedingOCR) { + try { + if (blobInfo.entityType === 'note') { + await this.processNoteOCR(blobInfo.entityId); + } else { + await this.processAttachmentOCR(blobInfo.entityId); + } + + // Add small delay to prevent overwhelming the system + await new Promise(resolve => setTimeout(resolve, 100)); + } catch (error) { + log.error(`Failed to auto-process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`); + // Continue with other blobs + } + } + + log.info('Auto-processing OCR completed'); + } } export default new OCRService(); \ No newline at end of file diff --git a/apps/server/src/services/ocr/processors/file_processor.ts b/apps/server/src/services/ocr/processors/file_processor.ts new file mode 100644 index 000000000..98dd3dfd9 --- /dev/null +++ b/apps/server/src/services/ocr/processors/file_processor.ts @@ -0,0 +1,28 @@ +import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; + +/** + * Base class for file processors that extract text from different file types + */ +export abstract class FileProcessor { + /** + * Check if this processor can handle the given MIME type + */ + abstract canProcess(mimeType: string): boolean; + + /** + * Extract text from the given file buffer + */ + abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise; + + /** + * Get the processing type identifier + */ + abstract getProcessingType(): string; + + /** + * Clean up any resources + */ + cleanup(): Promise { + return Promise.resolve(); + } +} \ No newline at end of file diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts new file mode 100644 index 000000000..7ca86f50e --- /dev/null +++ b/apps/server/src/services/ocr/processors/image_processor.ts @@ -0,0 +1,162 @@ 
+import Tesseract from 'tesseract.js'; +import { FileProcessor } from './file_processor.js'; +import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; +import log from '../../log.js'; + +/** + * Image processor for extracting text from image files using Tesseract + */ +export class ImageProcessor extends FileProcessor { + private worker: Tesseract.Worker | null = null; + private isInitialized = false; + + canProcess(mimeType: string): boolean { + const supportedTypes = [ + 'image/jpeg', + 'image/jpg', + 'image/png', + 'image/gif', + 'image/bmp', + 'image/tiff', + 'image/webp' + ]; + return supportedTypes.includes(mimeType.toLowerCase()); + } + + async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { + if (!this.isInitialized) { + await this.initialize(); + } + + if (!this.worker) { + throw new Error('Image processor worker not initialized'); + } + + try { + log.info('Starting image OCR text extraction...'); + + // Set language if specified and different from current + // Support multi-language format like 'ron+eng' + const language = options.language || this.getDefaultOCRLanguage(); + + // Validate language format + if (!this.isValidLanguageFormat(language)) { + throw new Error(`Invalid OCR language format: ${language}. 
Use format like 'eng' or 'ron+eng'`); + } + + if (language !== 'eng') { + // For different languages, create a new worker + await this.worker.terminate(); + log.info(`Initializing Tesseract worker for language(s): ${language}`); + this.worker = await Tesseract.createWorker(language, 1, { + logger: (m: { status: string; progress: number }) => { + if (m.status === 'recognizing text') { + log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`); + } + } + }); + } + + const result = await this.worker.recognize(buffer); + + const ocrResult: OCRResult = { + text: result.data.text.trim(), + confidence: result.data.confidence / 100, // Convert percentage to decimal + extractedAt: new Date().toISOString(), + language: options.language || this.getDefaultOCRLanguage(), + pageCount: 1 + }; + + log.info(`Image OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`); + return ocrResult; + + } catch (error) { + log.error(`Image OCR text extraction failed: ${error}`); + throw error; + } + } + + getProcessingType(): string { + return 'image'; + } + + private async initialize(): Promise { + if (this.isInitialized) { + return; + } + + try { + log.info('Initializing image OCR processor with Tesseract.js...'); + + // Configure proper paths for Node.js environment + const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', ''); + const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js'); + const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js'); + + log.info(`Using worker path: ${workerPath}`); + log.info(`Using core path: ${corePath}`); + + this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, { + workerPath, + corePath, + logger: (m: { status: string; progress: number }) => { + if (m.status === 'recognizing text') { + log.info(`Image OCR progress: ${Math.round(m.progress * 100)}%`); + } + } + }); + this.isInitialized = true; + 
log.info('Image OCR processor initialized successfully'); + } catch (error) { + log.error(`Failed to initialize image OCR processor: ${error}`); + throw error; + } + } + + async cleanup(): Promise { + if (this.worker) { + await this.worker.terminate(); + this.worker = null; + } + this.isInitialized = false; + log.info('Image OCR processor cleaned up'); + } + + /** + * Get default OCR language from options + */ + private getDefaultOCRLanguage(): string { + try { + const options = require('../../options.js').default; + const ocrLanguage = options.getOption('ocrLanguage'); + if (!ocrLanguage) { + throw new Error('OCR language not configured in user settings'); + } + return ocrLanguage; + } catch (error) { + log.error(`Failed to get default OCR language: ${error}`); + throw new Error('OCR language must be configured in settings before processing'); + } + } + + /** + * Validate OCR language format + * Supports single language (eng) or multi-language (ron+eng) + */ + private isValidLanguageFormat(language: string): boolean { + if (!language || typeof language !== 'string') { + return false; + } + + // Split by '+' for multi-language format + const languages = language.split('+'); + + // Check each language code (should be 2-7 characters, alphanumeric with underscores) + const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; + + return languages.every(lang => { + const trimmed = lang.trim(); + return trimmed.length > 0 && validLanguagePattern.test(trimmed); + }); + } +} \ No newline at end of file diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts new file mode 100644 index 000000000..794ec52e8 --- /dev/null +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -0,0 +1,128 @@ +import * as officeParser from 'officeparser'; +import { FileProcessor } from './file_processor.js'; +import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; +import { 
ImageProcessor } from './image_processor.js'; +import log from '../../log.js'; + +/** + * Office document processor for extracting text and images from DOCX/XLSX/PPTX files + */ +export class OfficeProcessor extends FileProcessor { + private imageProcessor: ImageProcessor; + + constructor() { + super(); + this.imageProcessor = new ImageProcessor(); + } + + canProcess(mimeType: string): boolean { + const supportedTypes = [ + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX + 'application/msword', // DOC + 'application/vnd.ms-excel', // XLS + 'application/vnd.ms-powerpoint', // PPT + 'application/rtf' // RTF + ]; + return supportedTypes.includes(mimeType.toLowerCase()); + } + + async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { + try { + log.info('Starting Office document text extraction...'); + + // Validate language format + const language = options.language || this.getDefaultOCRLanguage(); + if (!this.isValidLanguageFormat(language)) { + throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`); + } + + // Extract text from Office document + const data = await this.parseOfficeDocument(buffer); + + // Extract text from Office document + const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : ''; + const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction + + const result: OCRResult = { + text: combinedText, + confidence: confidence, + extractedAt: new Date().toISOString(), + language: language, + pageCount: 1 // Office documents are treated as single logical document + }; + + log.info(`Office document text extraction completed. 
Confidence: ${confidence}%, Text length: ${result.text.length}`); + return result; + + } catch (error) { + log.error(`Office document text extraction failed: ${error}`); + throw error; + } + } + + private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> { + try { + // Use promise-based API directly + const data = await officeParser.parseOfficeAsync(buffer, { + outputErrorToConsole: false, + newlineDelimiter: '\n', + ignoreNotes: false, + putNotesAtLast: false + }); + + return { + data: data || '' + }; + } catch (error) { + throw new Error(`Office document parsing failed: ${error}`); + } + } + + getProcessingType(): string { + return 'office'; + } + + async cleanup(): Promise { + await this.imageProcessor.cleanup(); + } + + /** + * Get default OCR language from options + */ + private getDefaultOCRLanguage(): string { + try { + const options = require('../../options.js').default; + const ocrLanguage = options.getOption('ocrLanguage'); + if (!ocrLanguage) { + throw new Error('OCR language not configured in user settings'); + } + return ocrLanguage; + } catch (error) { + log.error(`Failed to get default OCR language: ${error}`); + throw new Error('OCR language must be configured in settings before processing'); + } + } + + /** + * Validate OCR language format + * Supports single language (eng) or multi-language (ron+eng) + */ + private isValidLanguageFormat(language: string): boolean { + if (!language || typeof language !== 'string') { + return false; + } + + // Split by '+' for multi-language format + const languages = language.split('+'); + + // Check each language code (should be 2-7 characters, alphanumeric with underscores) + const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; + + return languages.every(lang => { + const trimmed = lang.trim(); + return trimmed.length > 0 && validLanguagePattern.test(trimmed); + }); + } +} \ No newline at end of file diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts 
b/apps/server/src/services/ocr/processors/pdf_processor.ts
new file mode 100644
index 000000000..54ca2d4c6
--- /dev/null
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@@ -0,0 +1,148 @@
+import pdfParse from 'pdf-parse';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import { ImageProcessor } from './image_processor.js';
+import log from '../../log.js';
+import optionService from '../../options.js';
+import sharp from 'sharp';
+
+/**
+ * Shape of one Tesseract language name: a 2-3 letter base code followed by up
+ * to two `_suffix` parts of 2-4 letters (e.g. `eng`, `chi_sim`, `aze_cyrl`,
+ * `chi_sim_vert`).
+ */
+const LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;
+
+/**
+ * PDF processor for extracting text from PDF files.
+ *
+ * Reads the text already embedded in the PDF first (via pdf-parse). When that
+ * yields no text, or text extraction is disabled, it falls back to the OCR
+ * path, which currently only returns a placeholder.
+ */
+export class PDFProcessor extends FileProcessor {
+    private readonly imageProcessor: ImageProcessor;
+
+    constructor() {
+        super();
+        this.imageProcessor = new ImageProcessor();
+    }
+
+    /** Returns true only for the `application/pdf` MIME type (case-insensitive). */
+    canProcess(mimeType: string): boolean {
+        return mimeType.toLowerCase() === 'application/pdf';
+    }
+
+    /**
+     * Extract text from a PDF buffer.
+     *
+     * @param buffer - raw PDF file contents
+     * @param options - `language` overrides the configured OCR language;
+     *   `enablePDFTextExtraction: false` skips the embedded-text attempt
+     * @returns the embedded text when there is any, otherwise the OCR fallback result
+     * @throws if the language is missing or malformed, or pdf-parse rejects the file
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting PDF text extraction...');
+
+            // Resolve and validate the language once; the helpers below reuse it
+            const language = options.language || this.getDefaultOCRLanguage();
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            // First try to extract existing text from PDF
+            if (options.enablePDFTextExtraction !== false) {
+                const textResult = await this.extractTextFromPDF(buffer, language);
+                if (textResult.text.length > 0) {
+                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
+                    return textResult;
+                }
+            }
+
+            // Fall back to OCR if no text found or PDF text extraction is disabled
+            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
+            return await this.extractTextViaOCR(buffer, language);
+        } catch (error) {
+            log.error(`PDF text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    /**
+     * Read the PDF's embedded text with pdf-parse.
+     * Failures propagate to extractText(), which logs them once.
+     */
+    private async extractTextFromPDF(buffer: Buffer, language: string): Promise<OCRResult> {
+        const data = await pdfParse(buffer);
+
+        return {
+            text: data.text.trim(),
+            confidence: 0.99, // High confidence for direct text extraction
+            extractedAt: new Date().toISOString(),
+            language,
+            pageCount: data.numpages
+        };
+    }
+
+    /**
+     * OCR fallback for PDFs that yielded no embedded text.
+     *
+     * Converting PDF pages to images is not implemented yet, so this returns a
+     * zero-confidence placeholder rather than real OCR output.
+     * NOTE(review): the placeholder string is returned as `text` - confirm that
+     * callers discard zero-confidence results so it is never stored or indexed.
+     */
+    private async extractTextViaOCR(buffer: Buffer, language: string): Promise<OCRResult> {
+        log.info('PDF to image conversion not fully implemented, returning placeholder');
+
+        return {
+            text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
+            confidence: 0.0,
+            extractedAt: new Date().toISOString(),
+            language,
+            pageCount: 1
+        };
+    }
+
+    getProcessingType(): string {
+        return 'pdf';
+    }
+
+    /** Delegates to the internal image processor's cleanup(). */
+    async cleanup(): Promise<void> {
+        await this.imageProcessor.cleanup();
+    }
+
+    /**
+     * Get default OCR language from options
+     *
+     * @throws if the `ocrLanguage` option is unset or cannot be read
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const ocrLanguage = optionService.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate OCR language format
+     * Supports single language (eng) or multi-language (ron+eng)
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // Every '+'-separated part must be a well-formed language name
+        return language.split('+').every(part => LANGUAGE_CODE_PATTERN.test(part.trim()));
+    }
+}
\ No newline at end of file
diff --git a/apps/server/src/services/ocr/processors/tiff_processor.ts b/apps/server/src/services/ocr/processors/tiff_processor.ts
new file mode 100644
index 000000000..1755d77e2
--- /dev/null
+++ b/apps/server/src/services/ocr/processors/tiff_processor.ts
@@ -0,0 +1,148 @@
+import sharp from 'sharp';
+import { FileProcessor } from './file_processor.js';
+import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
+import { ImageProcessor } from './image_processor.js';
+import log from '../../log.js';
+import optionService from '../../options.js';
+
+/**
+ * Shape of one Tesseract language name: a 2-3 letter base code followed by up
+ * to two `_suffix` parts of 2-4 letters (e.g. `eng`, `chi_sim`, `aze_cyrl`,
+ * `chi_sim_vert`).
+ */
+const LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;
+
+/**
+ * TIFF processor for extracting text from multi-page TIFF files.
+ *
+ * Each page is converted to PNG with sharp and handed to ImageProcessor.
+ */
+export class TIFFProcessor extends FileProcessor {
+    private readonly imageProcessor: ImageProcessor;
+
+    constructor() {
+        super();
+        this.imageProcessor = new ImageProcessor();
+    }
+
+    /** Returns true for `image/tiff` and `image/tif` (case-insensitive). */
+    canProcess(mimeType: string): boolean {
+        const normalized = mimeType.toLowerCase();
+        return normalized === 'image/tiff' || normalized === 'image/tif';
+    }
+
+    /**
+     * OCR every page of the TIFF and join the page texts.
+     *
+     * Pages that fail are logged and skipped. The reported confidence is the
+     * mean over the pages that produced text, so blank or failed pages do not
+     * drag it towards zero.
+     *
+     * @param buffer - raw TIFF file contents
+     * @param options - `language` overrides the configured OCR language
+     * @returns combined text, mean confidence and the page count
+     * @throws if the language is missing or malformed, or the metadata cannot be read
+     */
+    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
+        try {
+            log.info('Starting TIFF text extraction...');
+
+            // Resolve and validate the language once; it is reused for the result
+            const language = options.language || this.getDefaultOCRLanguage();
+            if (!this.isValidLanguageFormat(language)) {
+                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
+            }
+
+            // Fall back to a single page when the metadata carries no page count
+            const metadata = await sharp(buffer).metadata();
+            const pageCount = metadata.pages || 1;
+
+            let combinedText = '';
+            let totalConfidence = 0;
+            let pagesWithText = 0;
+
+            for (let page = 0; page < pageCount; page++) {
+                try {
+                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
+
+                    // Extract page as PNG buffer
+                    const pageBuffer = await sharp(buffer, { page })
+                        .png()
+                        .toBuffer();
+
+                    // OCR the page
+                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
+
+                    if (pageResult.text.trim().length > 0) {
+                        if (combinedText.length > 0) {
+                            combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
+                        }
+                        combinedText += pageResult.text;
+                        totalConfidence += pageResult.confidence;
+                        pagesWithText++;
+                    }
+                } catch (error) {
+                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
+                    // Continue with other pages
+                }
+            }
+
+            // Average only over the pages whose confidence was added to the sum
+            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;
+
+            const result: OCRResult = {
+                text: combinedText.trim(),
+                confidence: averageConfidence,
+                extractedAt: new Date().toISOString(),
+                language,
+                pageCount
+            };
+
+            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
+            return result;
+        } catch (error) {
+            log.error(`TIFF text extraction failed: ${error}`);
+            throw error;
+        }
+    }
+
+    getProcessingType(): string {
+        return 'tiff';
+    }
+
+    /** Delegates to the internal image processor's cleanup(). */
+    async cleanup(): Promise<void> {
+        await this.imageProcessor.cleanup();
+    }
+
+    /**
+     * Get default OCR language from options
+     *
+     * @throws if the `ocrLanguage` option is unset or cannot be read
+     */
+    private getDefaultOCRLanguage(): string {
+        try {
+            const ocrLanguage = optionService.getOption('ocrLanguage');
+            if (!ocrLanguage) {
+                throw new Error('OCR language not configured in user settings');
+            }
+            return ocrLanguage;
+        } catch (error) {
+            log.error(`Failed to get default OCR language: ${error}`);
+            throw new Error('OCR language must be configured in settings before processing');
+        }
+    }
+
+    /**
+     * Validate OCR language format
+     * Supports single language (eng) or multi-language (ron+eng)
+     */
+    private isValidLanguageFormat(language: string): boolean {
+        if (!language || typeof language !== 'string') {
+            return false;
+        }
+
+        // Every '+'-separated part must be a well-formed language name
+        return language.split('+').every(part => LANGUAGE_CODE_PATTERN.test(part.trim()));
+    }
+}
\ No newline at end of file