feat(ocr): add additional processors for OCR feature

This commit is contained in:
perf3ct 2025-07-16 20:10:56 +00:00
parent 6722d2d266
commit ca8cbf8ccf
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
6 changed files with 812 additions and 211 deletions

View File

@ -3,23 +3,31 @@ import log from '../log.js';
import sql from '../sql.js';
import becca from '../../becca/becca.js';
import options from '../options.js';
import { ImageProcessor } from './processors/image_processor.js';
import { PDFProcessor } from './processors/pdf_processor.js';
import { TIFFProcessor } from './processors/tiff_processor.js';
import { OfficeProcessor } from './processors/office_processor.js';
import { FileProcessor } from './processors/file_processor.js';
/**
 * Outcome of a single text-extraction run, as returned by every file processor.
 */
export interface OCRResult {
/** Extracted text; the processors trim surrounding whitespace before returning it. */
text: string;
/** Confidence as a fraction, not a percentage: the image processor divides Tesseract's value by 100 and direct text extraction reports 0.99. */
confidence: number;
/** When the extraction ran, as produced by `new Date().toISOString()`. */
extractedAt: string;
/** Language string used for the run, e.g. 'eng' or 'ron+eng'. */
language?: string;
/** Number of pages the processor reports for the source file. */
pageCount?: number;
}
/**
 * Per-call settings accepted by the OCR service and handed on to the file processors.
 */
export interface OCRProcessingOptions {
/** Tesseract language string such as 'eng' or 'ron+eng'; processors fall back to the 'ocrLanguage' option when omitted. */
language?: string;
/** When true, a stored OCR result is ignored and the file is processed again. */
forceReprocess?: boolean;
/** NOTE(review): not read by any code visible in this change; presumably a minimum-confidence threshold. Confirm. */
confidence?: number;
/** For PDFs: embedded text is extracted first unless this is explicitly set to false. */
enablePDFTextExtraction?: boolean;
}
/**
 * OCR-related columns of a row in the `blobs` table.
 */
interface OCRBlobRow {
/** Identifier of the blob row. */
blobId: string;
/** OCR text stored for the blob. */
ocr_text: string;
/** ISO timestamp written alongside the OCR text when it is stored. */
ocr_last_processed?: string;
}
/**
@ -30,6 +38,7 @@ class OCRService {
private isInitialized = false;
private worker: Tesseract.Worker | null = null;
private isProcessing = false;
private processors: Map<string, FileProcessor> = new Map();
/**
* Initialize the OCR service
@ -40,25 +49,14 @@ class OCRService {
}
try {
log.info('Initializing OCR service with Tesseract.js...');
log.info('Initializing OCR service with file processors...');
// Configure proper paths for Node.js environment
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
// Initialize file processors
this.processors.set('image', new ImageProcessor());
this.processors.set('pdf', new PDFProcessor());
this.processors.set('tiff', new TIFFProcessor());
this.processors.set('office', new OfficeProcessor());
log.info(`Using worker path: ${workerPath}`);
log.info(`Using core path: ${corePath}`);
this.worker = await Tesseract.createWorker('eng', 1, {
workerPath,
corePath,
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
}
}
});
this.isInitialized = true;
log.info('OCR service initialized successfully');
} catch (error) {
@ -100,46 +98,27 @@ class OCRService {
}
/**
* Extract text from image buffer
* Extract text from file buffer using appropriate processor
*/
async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
if (!this.isInitialized) {
await this.initialize();
}
if (!this.worker) {
throw new Error('OCR worker not initialized');
}
try {
log.info('Starting OCR text extraction...');
log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
this.isProcessing = true;
// Set language if specified and different from current
const language = options.language || 'eng';
if (language !== 'eng') {
// For different languages, create a new worker
await this.worker.terminate();
this.worker = await Tesseract.createWorker(language, 1, {
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
}
}
});
// Find appropriate processor
const processor = this.getProcessorForMimeType(mimeType);
if (!processor) {
throw new Error(`No processor found for MIME type: ${mimeType}`);
}
const result = await this.worker.recognize(imageBuffer);
const result = await processor.extractText(fileBuffer, options);
const ocrResult: OCRResult = {
text: result.data.text.trim(),
confidence: result.data.confidence / 100, // Convert percentage to decimal
extractedAt: new Date().toISOString(),
language: options.language || 'eng'
};
log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
return ocrResult;
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
return result;
} catch (error) {
log.error(`OCR text extraction failed: ${error}`);
@ -174,10 +153,10 @@ class OCRService {
return null;
}
// Check if OCR already exists in the blob and we're not forcing reprocessing
// Check if OCR already exists and is up-to-date
const existingOCR = this.getStoredOCRResult(note.blobId);
if (existingOCR && !options.forceReprocess) {
log.info(`OCR already exists for note ${noteId}, returning cached result`);
if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
return existingOCR;
}
@ -187,7 +166,7 @@ class OCRService {
throw new Error(`Cannot get image content for note ${noteId}`);
}
const ocrResult = await this.extractTextFromImage(content, options);
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
// Store OCR result in blob
await this.storeOCRResult(note.blobId, ocrResult);
@ -224,10 +203,10 @@ class OCRService {
return null;
}
// Check if OCR already exists in the blob and we're not forcing reprocessing
// Check if OCR already exists and is up-to-date
const existingOCR = this.getStoredOCRResult(attachment.blobId);
if (existingOCR && !options.forceReprocess) {
log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
return existingOCR;
}
@ -237,7 +216,7 @@ class OCRService {
throw new Error(`Cannot get image content for attachment ${attachmentId}`);
}
const ocrResult = await this.extractTextFromImage(content, options);
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
// Store OCR result in blob
await this.storeOCRResult(attachment.blobId, ocrResult);
@ -259,11 +238,15 @@ class OCRService {
}
try {
// Store OCR text in blobs table
// Store OCR text and timestamp in blobs table
sql.execute(`
UPDATE blobs SET ocr_text = ? WHERE blobId = ?
UPDATE blobs SET
ocr_text = ?,
ocr_last_processed = ?
WHERE blobId = ?
`, [
ocrResult.text,
new Date().toISOString(),
blobId
]);
@ -353,80 +336,10 @@ class OCRService {
}
/**
* Process OCR for all images that don't have OCR results yet
* Process OCR for all files that don't have OCR results yet or need reprocessing
*/
async processAllImages(): Promise<void> {
if (!this.isOCREnabled()) {
log.info('OCR is disabled, skipping batch processing');
return;
}
log.info('Starting batch OCR processing for all images...');
try {
// Process image notes
const imageNotes = sql.getRows<{
noteId: string;
mime: string;
blobId: string;
}>(`
SELECT n.noteId, n.mime, n.blobId
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
log.info(`Found ${imageNotes.length} image notes to process`);
for (const noteRow of imageNotes) {
if (this.isSupportedMimeType(noteRow.mime)) {
try {
await this.processNoteOCR(noteRow.noteId);
// Add small delay to prevent overwhelming the system
await new Promise(resolve => setTimeout(resolve, 100));
} catch (error) {
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
}
}
}
// Process image attachments
const imageAttachments = sql.getRows<{
attachmentId: string;
mime: string;
blobId: string;
}>(`
SELECT a.attachmentId, a.mime, a.blobId
FROM attachments a
LEFT JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
log.info(`Found ${imageAttachments.length} image attachments to process`);
for (const attachmentRow of imageAttachments) {
if (this.isSupportedMimeType(attachmentRow.mime)) {
try {
await this.processAttachmentOCR(attachmentRow.attachmentId);
// Add small delay to prevent overwhelming the system
await new Promise(resolve => setTimeout(resolve, 100));
} catch (error) {
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
}
}
}
log.info('Batch OCR processing completed');
} catch (error) {
log.error(`Batch OCR processing failed: ${error}`);
throw error;
}
return this.processAllBlobsNeedingOCR();
}
/**
@ -521,28 +434,9 @@ class OCRService {
}
try {
// Count total images to process
const imageNotesCount = sql.getRow<{ count: number }>(`
SELECT COUNT(*) as count
FROM notes
WHERE type = 'image'
AND isDeleted = 0
AND noteId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
)
`)?.count || 0;
const imageAttachmentsCount = sql.getRow<{ count: number }>(`
SELECT COUNT(*) as count
FROM attachments
WHERE role = 'image'
AND isDeleted = 0
AND attachmentId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
)
`)?.count || 0;
const totalCount = imageNotesCount + imageAttachmentsCount;
// Count total blobs needing OCR processing
const blobsNeedingOCR = this.getBlobsNeedingOCR();
const totalCount = blobsNeedingOCR.length;
if (totalCount === 0) {
return { success: false, message: 'No images found that need OCR processing' };
@ -557,7 +451,7 @@ class OCRService {
};
// Start processing in background
this.processBatchInBackground().catch(error => {
this.processBatchInBackground(blobsNeedingOCR).catch(error => {
log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
this.batchProcessingState.inProgress = false;
});
@ -583,79 +477,33 @@ class OCRService {
/**
* Process batch OCR in background with progress tracking
*/
private async processBatchInBackground(): Promise<void> {
private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
try {
log.info('Starting batch OCR processing...');
// Process image notes
const imageNotes = sql.getRows<{
noteId: string;
mime: string;
blobId: string;
}>(`
SELECT n.noteId, n.mime, n.blobId
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
for (const noteRow of imageNotes) {
for (const blobInfo of blobsToProcess) {
if (!this.batchProcessingState.inProgress) {
break; // Stop if processing was cancelled
}
if (this.isSupportedMimeType(noteRow.mime)) {
try {
await this.processNoteOCR(noteRow.noteId);
if (blobInfo.entityType === 'note') {
await this.processNoteOCR(blobInfo.entityId);
} else {
await this.processAttachmentOCR(blobInfo.entityId);
}
this.batchProcessingState.processed++;
// Add small delay to prevent overwhelming the system
await new Promise(resolve => setTimeout(resolve, 500));
} catch (error) {
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
this.batchProcessingState.processed++; // Count as processed even if failed
}
}
}
// Process image attachments
const imageAttachments = sql.getRows<{
attachmentId: string;
mime: string;
blobId: string;
}>(`
SELECT a.attachmentId, a.mime, a.blobId
FROM attachments a
LEFT JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
for (const attachmentRow of imageAttachments) {
if (!this.batchProcessingState.inProgress) {
break; // Stop if processing was cancelled
}
if (this.isSupportedMimeType(attachmentRow.mime)) {
try {
await this.processAttachmentOCR(attachmentRow.attachmentId);
this.batchProcessingState.processed++;
// Add small delay to prevent overwhelming the system
await new Promise(resolve => setTimeout(resolve, 500));
} catch (error) {
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
this.batchProcessingState.processed++; // Count as processed even if failed
}
}
}
// Mark as completed
this.batchProcessingState.inProgress = false;
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`);
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
} catch (error) {
log.error(`Batch OCR processing failed: ${error}`);
this.batchProcessingState.inProgress = false;
@ -672,6 +520,170 @@ class OCRService {
log.info('Batch OCR processing cancelled');
}
}
/**
 * Look up the processor responsible for a MIME type.
 *
 * Processors are probed in the order they were registered (Map iteration
 * order); the first one whose canProcess() accepts the type is returned.
 *
 * @param mimeType MIME type of the file about to be processed
 * @returns the first matching processor, or null when none accepts the type
 */
private getProcessorForMimeType(mimeType: string): FileProcessor | null {
    const registered = Array.from(this.processors.values());
    const match = registered.find(candidate => candidate.canProcess(mimeType));
    return match ?? null;
}
/**
 * Decide whether the OCR text stored for a blob is missing or stale.
 *
 * @param blobId identifier of the blob to check
 * @returns true when the blob has never been OCR-processed or was modified
 *          after its last OCR run; false for an empty id, an unknown blob,
 *          or when the lookup itself fails (the failure is logged)
 */
needsReprocessing(blobId: string): boolean {
    if (!blobId) {
        return false;
    }

    try {
        const row = sql.getRow<{
            utcDateModified: string;
            ocr_last_processed: string | null;
        }>(`
SELECT utcDateModified, ocr_last_processed
FROM blobs
WHERE blobId = ?
`, [blobId]);

        if (!row) {
            return false;
        }

        const { utcDateModified, ocr_last_processed: lastProcessed } = row;

        // No timestamp means OCR has never run for this blob.
        if (!lastProcessed) {
            return true;
        }

        // Stale when the blob changed after the last OCR run.
        return new Date(utcDateModified) > new Date(lastProcessed);
    } catch (failure) {
        log.error(`Failed to check if blob ${blobId} needs reprocessing: ${failure}`);
        return false;
    }
}
/**
 * Drop the stored OCR data of a blob by resetting both `ocr_text` and
 * `ocr_last_processed` to NULL.
 *
 * @param blobId identifier of the blob; an empty id is a no-op
 * @throws rethrows whatever the SQL layer throws, after logging it
 */
invalidateOCRResult(blobId: string): void {
    if (blobId) {
        try {
            sql.execute(`
UPDATE blobs SET
ocr_text = NULL,
ocr_last_processed = NULL
WHERE blobId = ?
`, [blobId]);
            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (failure) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${failure}`);
            throw failure;
        }
    }
}
/**
 * Collect every image note and image attachment whose blob has never been
 * OCR-processed or was modified after its last OCR run.
 *
 * Notes are listed before attachments, and entries whose MIME type is not
 * supported are dropped.
 *
 * NOTE(review): the staleness test in SQL compares `utcDateModified` with
 * `ocr_last_processed` as stored; `ocr_last_processed` is written as an ISO
 * string, so this is only reliable if `utcDateModified` uses a comparable
 * format. Confirm.
 *
 * @returns the pending entries, or an empty array when the query fails
 *          (the failure is logged)
 */
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
    type PendingRow = { blobId: string; mimeType: string; entityId: string };
    type PendingBlob = PendingRow & { entityType: 'note' | 'attachment' };

    try {
        // Image notes with a missing or outdated OCR timestamp.
        const noteRows = sql.getRows<PendingRow>(`
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
FROM notes n
JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND (
b.ocr_last_processed IS NULL
OR b.utcDateModified > b.ocr_last_processed
)
`);

        // Image attachments with a missing or outdated OCR timestamp.
        const attachmentRows = sql.getRows<PendingRow>(`
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
FROM attachments a
JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND (
b.ocr_last_processed IS NULL
OR b.utcDateModified > b.ocr_last_processed
)
`);

        const pending: PendingBlob[] = [];

        for (const row of noteRows) {
            const candidate: PendingBlob = { ...row, entityType: 'note' };
            if (this.isSupportedMimeType(candidate.mimeType)) {
                pending.push(candidate);
            }
        }

        for (const row of attachmentRows) {
            const candidate: PendingBlob = { ...row, entityType: 'attachment' };
            if (this.isSupportedMimeType(candidate.mimeType)) {
                pending.push(candidate);
            }
        }

        return pending;
    } catch (failure) {
        log.error(`Failed to get blobs needing OCR: ${failure}`);
        return [];
    }
}
/**
 * Run OCR, one entry at a time, for everything getBlobsNeedingOCR() reports.
 *
 * Does nothing when OCR is disabled or nothing is pending. A failure on one
 * entry is logged and does not stop the remaining entries.
 */
async processAllBlobsNeedingOCR(): Promise<void> {
    if (!this.isOCREnabled()) {
        log.info('OCR is disabled, skipping auto-processing');
        return;
    }

    const pending = this.getBlobsNeedingOCR();
    const total = pending.length;
    if (total === 0) {
        log.info('No blobs need OCR processing');
        return;
    }

    log.info(`Auto-processing OCR for ${total} blobs...`);

    for (const { entityType, entityId } of pending) {
        try {
            await (entityType === 'note'
                ? this.processNoteOCR(entityId)
                : this.processAttachmentOCR(entityId));

            // Short pause between entries so the system is not overwhelmed.
            await new Promise(resolve => setTimeout(resolve, 100));
        } catch (failure) {
            // Log and move on to the next entry.
            log.error(`Failed to auto-process OCR for ${entityType} ${entityId}: ${failure}`);
        }
    }

    log.info('Auto-processing OCR completed');
}
}
export default new OCRService();

View File

@ -0,0 +1,28 @@
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
/**
 * Contract shared by all text extractors: each concrete processor declares
 * which MIME types it accepts and turns a file buffer into an OCRResult.
 */
export abstract class FileProcessor {
    /**
     * @param mimeType MIME type of the candidate file
     * @returns true when this processor accepts files of that type
     */
    abstract canProcess(mimeType: string): boolean;

    /**
     * Produce the text content of a file.
     *
     * @param buffer raw file content
     * @param options per-call extraction settings
     * @returns the extraction result
     */
    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;

    /**
     * @returns a short identifier naming the kind of processing performed
     */
    abstract getProcessingType(): string;

    /**
     * Release any resources held by the processor. The base implementation
     * holds none and resolves immediately; subclasses override as needed.
     */
    async cleanup(): Promise<void> {
        // Nothing to release by default.
    }
}

View File

@ -0,0 +1,162 @@
import Tesseract from 'tesseract.js';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import log from '../../log.js';
/**
 * Image processor that extracts text from raster images with Tesseract.js.
 *
 * One Tesseract worker is created lazily and reused across calls. It is
 * replaced only when a call asks for a different language string than the one
 * the current worker was created with.
 */
export class ImageProcessor extends FileProcessor {
    private worker: Tesseract.Worker | null = null;
    /** Language string the current worker was created with; null while no worker exists. */
    private workerLanguage: string | null = null;

    /**
     * @param mimeType MIME type of the candidate file (compared case-insensitively)
     * @returns true for the image types listed below
     */
    canProcess(mimeType: string): boolean {
        // NOTE(review): 'image/tiff' is also accepted by TIFFProcessor; which of the
        // two the OCR service ends up using depends on registration order. Confirm.
        const supportedTypes = [
            'image/jpeg',
            'image/jpg',
            'image/png',
            'image/gif',
            'image/bmp',
            'image/tiff',
            'image/webp'
        ];
        return supportedTypes.includes(mimeType.toLowerCase());
    }

    /**
     * Run OCR on a single image.
     *
     * @param buffer encoded image data
     * @param options `language` selects the Tesseract language(s), e.g. 'eng' or
     *                'ron+eng'; when omitted the configured 'ocrLanguage' option is used
     * @returns recognised text (trimmed), confidence as a 0..1 fraction, and a page count of 1
     * @throws when no language is available, the language string is malformed,
     *         or worker creation / recognition fails (all failures are logged first)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting image OCR text extraction...');

            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Reuses the existing worker when it already matches the requested language.
            const worker = await this.ensureWorker(language);
            const result = await worker.recognize(buffer);

            const ocrResult: OCRResult = {
                text: result.data.text.trim(),
                confidence: result.data.confidence / 100, // Tesseract reports a percentage
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1
            };

            // confidence is a fraction; scale it back up so the "%" in the log is truthful.
            log.info(`Image OCR extraction completed. Confidence: ${Math.round(ocrResult.confidence * 100)}%, Text length: ${ocrResult.text.length}`);
            return ocrResult;
        } catch (error) {
            log.error(`Image OCR text extraction failed: ${error}`);
            throw error;
        }
    }

    getProcessingType(): string {
        return 'image';
    }

    /**
     * Return a worker loaded with exactly `language`, creating or replacing the
     * current one when necessary.
     */
    private async ensureWorker(language: string): Promise<Tesseract.Worker> {
        if (this.worker && this.workerLanguage === language) {
            return this.worker;
        }

        if (this.worker) {
            // Forget the old worker before terminating it so that a failure below
            // can never leave a terminated worker behind for the next call.
            const stale = this.worker;
            this.worker = null;
            this.workerLanguage = null;
            await stale.terminate();
        }

        log.info(`Initializing Tesseract worker for language(s): ${language}`);

        // Configure proper paths for Node.js environment
        const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
        const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
        log.info(`Using worker path: ${workerPath}`);
        log.info(`Using core path: ${corePath}`);

        const worker = await Tesseract.createWorker(language, 1, {
            workerPath,
            corePath,
            logger: (m: { status: string; progress: number }) => {
                if (m.status === 'recognizing text') {
                    log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
                }
            }
        });

        this.worker = worker;
        this.workerLanguage = language;
        log.info('Image OCR processor initialized successfully');
        return worker;
    }

    /**
     * Terminate the Tesseract worker, if any. A later extractText() call
     * creates a fresh one.
     */
    async cleanup(): Promise<void> {
        if (this.worker) {
            const stale = this.worker;
            this.worker = null;
            this.workerLanguage = null;
            await stale.terminate();
        }
        log.info('Image OCR processor cleaned up');
    }

    /**
     * Read the 'ocrLanguage' option.
     *
     * @throws when the option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate an OCR language string: one code ('eng') or several joined by '+'
     * ('ron+eng'). Each code is 2-3 letters, optionally followed by an
     * underscore and a 2-4 letter variant (e.g. 'chi_sim', 'srp_latn').
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The variant suffix may be 4 letters long: 'aze_cyrl', 'srp_latn', 'uzb_cyrl'.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4})?$/;
        return language.split('+').every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
}

View File

@ -0,0 +1,128 @@
import * as officeParser from 'officeparser';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
/**
 * Office document processor: pulls the text content out of office files via
 * `officeparser`.
 *
 * An ImageProcessor instance is held and cleaned up with this processor, but
 * this version does not use it during extraction (no OCR is run on embedded
 * images).
 */
export class OfficeProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the candidate file (compared case-insensitively)
     * @returns true for the office MIME types listed below
     */
    canProcess(mimeType: string): boolean {
        // NOTE(review): verify that officeparser can actually read the legacy binary
        // formats (DOC/XLS/PPT) and RTF; if not, those files fail in parseOfficeDocument().
        const supportedTypes = [
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
            'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
            'application/msword', // DOC
            'application/vnd.ms-excel', // XLS
            'application/vnd.ms-powerpoint', // PPT
            'application/rtf' // RTF
        ];
        return supportedTypes.includes(mimeType.toLowerCase());
    }

    /**
     * Extract the text of an office document.
     *
     * @param buffer raw document content
     * @param options `language` is validated and echoed in the result; when omitted
     *                the configured 'ocrLanguage' option is used
     * @returns trimmed document text; confidence is 0.99 when any text was found and
     *          0 otherwise; pageCount is always 1
     * @throws when no language is available, the language string is malformed,
     *         or parsing fails (all failures are logged first)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting Office document text extraction...');

            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            const parsed = await this.parseOfficeDocument(buffer);
            const combinedText = parsed.data.trim();

            // Text read straight from the document is trusted; "no text" scores 0.
            const confidence = combinedText.length > 0 ? 0.99 : 0;

            const result: OCRResult = {
                text: combinedText,
                confidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1 // Office documents are treated as single logical document
            };

            // confidence is a fraction; scale it so the "%" in the log is truthful.
            log.info(`Office document text extraction completed. Confidence: ${Math.round(confidence * 100)}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`Office document text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Run officeparser on the buffer.
     *
     * @returns the parsed text wrapped as `{ data }`; an empty string when the parser yields nothing
     * @throws Error prefixed with 'Office document parsing failed' when the parser rejects
     */
    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
        try {
            const data = await officeParser.parseOfficeAsync(buffer, {
                outputErrorToConsole: false,
                newlineDelimiter: '\n',
                ignoreNotes: false,
                putNotesAtLast: false
            });
            return {
                data: data || ''
            };
        } catch (error) {
            throw new Error(`Office document parsing failed: ${error}`);
        }
    }

    getProcessingType(): string {
        return 'office';
    }

    /** Release the resources of the held ImageProcessor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Read the 'ocrLanguage' option.
     *
     * @throws when the option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate an OCR language string: one code ('eng') or several joined by '+'
     * ('ron+eng'). Each code is 2-3 letters, optionally followed by an
     * underscore and a 2-4 letter variant (e.g. 'chi_sim', 'srp_latn').
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The variant suffix may be 4 letters long: 'aze_cyrl', 'srp_latn', 'uzb_cyrl'.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4})?$/;
        return language.split('+').every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
}

View File

@ -0,0 +1,142 @@
import * as pdfParse from 'pdf-parse';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
import sharp from 'sharp';
/**
 * PDF processor for extracting text from PDF files.
 *
 * Embedded text is read with pdf-parse. OCR of scanned pages is not
 * implemented yet: when no embedded text is available (or text extraction is
 * disabled) an empty, zero-confidence result is returned.
 */
export class PDFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the candidate file (compared case-insensitively)
     * @returns true only for 'application/pdf'
     */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    /**
     * Extract the text of a PDF.
     *
     * @param buffer raw PDF content
     * @param options `language` is validated and echoed in the result (default: the
     *                'ocrLanguage' option); `enablePDFTextExtraction: false` skips the
     *                embedded-text pass
     * @returns the embedded text when there is any, otherwise the OCR fallback result
     * @throws when no language is available, the language string is malformed,
     *         or pdf-parse fails (all failures are logged first)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting PDF text extraction...');

            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // First try to extract existing text from PDF
            if (options.enablePDFTextExtraction !== false) {
                const textResult = await this.extractTextFromPDF(buffer, language);
                if (textResult.text.length > 0) {
                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
                    return textResult;
                }
            }

            // Fall back to OCR if no text found or PDF text extraction is disabled
            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
            return await this.extractTextViaOCR(buffer, language);
        } catch (error) {
            log.error(`PDF text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Read the text layer of the PDF with pdf-parse. Errors propagate to
     * extractText(), which logs them.
     */
    private async extractTextFromPDF(buffer: Buffer, language: string): Promise<OCRResult> {
        const data = await pdfParse(buffer);
        return {
            text: data.text.trim(),
            confidence: 0.99, // High confidence for direct text extraction
            extractedAt: new Date().toISOString(),
            language,
            pageCount: data.numpages
        };
    }

    /**
     * OCR fallback for PDFs without a text layer.
     *
     * Not implemented yet: rendering PDF pages to images (e.g. with a library such
     * as pdf2pic) and running them through the image processor is still to be done.
     * Until then an empty result is returned. The text must stay empty, because
     * the OCR service stores whatever is returned here as the document's text and
     * a placeholder sentence would be saved as if it came from the PDF.
     */
    private async extractTextViaOCR(_buffer: Buffer, language: string): Promise<OCRResult> {
        log.info('PDF to image conversion not fully implemented, returning placeholder');
        return {
            text: '',
            confidence: 0.0,
            extractedAt: new Date().toISOString(),
            language,
            pageCount: 1
        };
    }

    getProcessingType(): string {
        return 'pdf';
    }

    /** Release the resources of the held ImageProcessor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Read the 'ocrLanguage' option.
     *
     * @throws when the option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate an OCR language string: one code ('eng') or several joined by '+'
     * ('ron+eng'). Each code is 2-3 letters, optionally followed by an
     * underscore and a 2-4 letter variant (e.g. 'chi_sim', 'srp_latn').
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The variant suffix may be 4 letters long: 'aze_cyrl', 'srp_latn', 'uzb_cyrl'.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4})?$/;
        return language.split('+').every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
}

View File

@ -0,0 +1,129 @@
import sharp from 'sharp';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
/**
 * TIFF processor for extracting text from multi-page TIFF files.
 *
 * Each page is converted to PNG with sharp and handed to an ImageProcessor;
 * the per-page texts are concatenated.
 */
export class TIFFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType MIME type of the candidate file (compared case-insensitively)
     * @returns true for 'image/tiff' and 'image/tif'
     */
    canProcess(mimeType: string): boolean {
        const normalized = mimeType.toLowerCase();
        return normalized === 'image/tiff' || normalized === 'image/tif';
    }

    /**
     * OCR every page of a TIFF.
     *
     * A page that fails to convert or recognise is logged and skipped. Pages
     * after the first one that yielded text are introduced by a
     * '--- Page N ---' separator.
     *
     * @param buffer raw TIFF content
     * @param options forwarded to the image processor; `language` defaults to the
     *                'ocrLanguage' option
     * @returns combined text, the mean confidence of the pages that produced text
     *          (0 when none did), and the page count reported by sharp
     * @throws when no language is available, the language string is malformed,
     *         or the TIFF metadata cannot be read (all failures are logged first)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');

            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // sharp reports `pages` for multi-page files; treat a missing value as one page.
            const metadata = await sharp(buffer).metadata();
            const pageCount = metadata.pages || 1;

            let combinedText = '';
            let totalConfidence = 0;
            let pagesWithText = 0;

            for (let page = 0; page < pageCount; page++) {
                try {
                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);

                    // Extract page as PNG buffer
                    const pageBuffer = await sharp(buffer, { page })
                        .png()
                        .toBuffer();

                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);

                    if (pageResult.text.trim().length > 0) {
                        if (combinedText.length > 0) {
                            combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
                        }
                        combinedText += pageResult.text;
                        totalConfidence += pageResult.confidence;
                        pagesWithText++;
                    }
                } catch (error) {
                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
                    // Continue with other pages
                }
            }

            // Average over the pages that actually contributed a confidence value;
            // blank or failed pages add nothing to the sum and must not dilute it.
            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;

            const result: OCRResult = {
                text: combinedText.trim(),
                confidence: averageConfidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount
            };

            // confidence is a fraction; scale it so the "%" in the log is truthful.
            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${Math.round(averageConfidence * 100)}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`TIFF text extraction failed: ${error}`);
            throw error;
        }
    }

    getProcessingType(): string {
        return 'tiff';
    }

    /** Release the resources of the held ImageProcessor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Read the 'ocrLanguage' option.
     *
     * @throws when the option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate an OCR language string: one code ('eng') or several joined by '+'
     * ('ron+eng'). Each code is 2-3 letters, optionally followed by an
     * underscore and a 2-4 letter variant (e.g. 'chi_sim', 'srp_latn').
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The variant suffix may be 4 letters long: 'aze_cyrl', 'srp_latn', 'uzb_cyrl'.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4})?$/;
        return language.split('+').every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
}