mirror of
https://github.com/zadam/trilium.git
synced 2025-12-04 22:44:25 +01:00
feat(ocr): add additional processors for OCR feature
This commit is contained in:
parent
6722d2d266
commit
ca8cbf8ccf
@ -3,23 +3,31 @@ import log from '../log.js';
|
||||
import sql from '../sql.js';
|
||||
import becca from '../../becca/becca.js';
|
||||
import options from '../options.js';
|
||||
import { ImageProcessor } from './processors/image_processor.js';
|
||||
import { PDFProcessor } from './processors/pdf_processor.js';
|
||||
import { TIFFProcessor } from './processors/tiff_processor.js';
|
||||
import { OfficeProcessor } from './processors/office_processor.js';
|
||||
import { FileProcessor } from './processors/file_processor.js';
|
||||
|
||||
/**
 * Outcome of a single text-extraction run, as returned by every file processor.
 */
export interface OCRResult {
    /** Extracted text; the processors in this module trim surrounding whitespace. */
    text: string;
    /** Confidence as a fraction (the image processor divides Tesseract's percentage by 100). */
    confidence: number;
    /** ISO-8601 timestamp (`Date#toISOString`) of when the extraction finished. */
    extractedAt: string;
    /** Language specification used for the run, e.g. `eng` or `ron+eng`. */
    language?: string;
    /** Number of pages covered by the result; single images report 1. */
    pageCount?: number;
}
|
||||
|
||||
/**
 * Per-call options accepted by the OCR service and forwarded to the file processors.
 */
export interface OCRProcessingOptions {
    /** Language specification such as `eng` or `ron+eng`; processors fall back to the configured default. */
    language?: string;
    /** When true, a cached OCR result is ignored and the file is processed again. */
    forceReprocess?: boolean;
    /** NOTE(review): not read by any code visible here; presumably a minimum-confidence threshold. TODO confirm. */
    confidence?: number;
    /** The PDF processor attempts direct text extraction first unless this is explicitly `false`. */
    enablePDFTextExtraction?: boolean;
}
|
||||
|
||||
/**
 * Shape of the OCR-related columns read from a row of the `blobs` table.
 */
interface OCRBlobRow {
    /** Primary key of the blob row. */
    blobId: string;
    /** Text previously extracted for this blob. */
    ocr_text: string;
    /** ISO timestamp written alongside `ocr_text` when an OCR result is stored. */
    ocr_last_processed?: string;
}
|
||||
|
||||
/**
|
||||
@ -30,6 +38,7 @@ class OCRService {
|
||||
private isInitialized = false;
|
||||
private worker: Tesseract.Worker | null = null;
|
||||
private isProcessing = false;
|
||||
private processors: Map<string, FileProcessor> = new Map();
|
||||
|
||||
/**
|
||||
* Initialize the OCR service
|
||||
@ -40,25 +49,14 @@ class OCRService {
|
||||
}
|
||||
|
||||
try {
|
||||
log.info('Initializing OCR service with Tesseract.js...');
|
||||
log.info('Initializing OCR service with file processors...');
|
||||
|
||||
// Configure proper paths for Node.js environment
|
||||
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
|
||||
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
|
||||
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
|
||||
// Initialize file processors
|
||||
this.processors.set('image', new ImageProcessor());
|
||||
this.processors.set('pdf', new PDFProcessor());
|
||||
this.processors.set('tiff', new TIFFProcessor());
|
||||
this.processors.set('office', new OfficeProcessor());
|
||||
|
||||
log.info(`Using worker path: ${workerPath}`);
|
||||
log.info(`Using core path: ${corePath}`);
|
||||
|
||||
this.worker = await Tesseract.createWorker('eng', 1, {
|
||||
workerPath,
|
||||
corePath,
|
||||
logger: (m: { status: string; progress: number }) => {
|
||||
if (m.status === 'recognizing text') {
|
||||
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
|
||||
}
|
||||
}
|
||||
});
|
||||
this.isInitialized = true;
|
||||
log.info('OCR service initialized successfully');
|
||||
} catch (error) {
|
||||
@ -100,46 +98,27 @@ class OCRService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from image buffer
|
||||
* Extract text from file buffer using appropriate processor
|
||||
*/
|
||||
async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
if (!this.isInitialized) {
|
||||
await this.initialize();
|
||||
}
|
||||
|
||||
if (!this.worker) {
|
||||
throw new Error('OCR worker not initialized');
|
||||
}
|
||||
|
||||
try {
|
||||
log.info('Starting OCR text extraction...');
|
||||
log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
|
||||
this.isProcessing = true;
|
||||
|
||||
// Set language if specified and different from current
|
||||
const language = options.language || 'eng';
|
||||
if (language !== 'eng') {
|
||||
// For different languages, create a new worker
|
||||
await this.worker.terminate();
|
||||
this.worker = await Tesseract.createWorker(language, 1, {
|
||||
logger: (m: { status: string; progress: number }) => {
|
||||
if (m.status === 'recognizing text') {
|
||||
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
|
||||
}
|
||||
}
|
||||
});
|
||||
// Find appropriate processor
|
||||
const processor = this.getProcessorForMimeType(mimeType);
|
||||
if (!processor) {
|
||||
throw new Error(`No processor found for MIME type: ${mimeType}`);
|
||||
}
|
||||
|
||||
const result = await this.worker.recognize(imageBuffer);
|
||||
const result = await processor.extractText(fileBuffer, options);
|
||||
|
||||
const ocrResult: OCRResult = {
|
||||
text: result.data.text.trim(),
|
||||
confidence: result.data.confidence / 100, // Convert percentage to decimal
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || 'eng'
|
||||
};
|
||||
|
||||
log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
|
||||
return ocrResult;
|
||||
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`OCR text extraction failed: ${error}`);
|
||||
@ -174,10 +153,10 @@ class OCRService {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if OCR already exists in the blob and we're not forcing reprocessing
|
||||
// Check if OCR already exists and is up-to-date
|
||||
const existingOCR = this.getStoredOCRResult(note.blobId);
|
||||
if (existingOCR && !options.forceReprocess) {
|
||||
log.info(`OCR already exists for note ${noteId}, returning cached result`);
|
||||
if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
|
||||
log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
|
||||
return existingOCR;
|
||||
}
|
||||
|
||||
@ -187,7 +166,7 @@ class OCRService {
|
||||
throw new Error(`Cannot get image content for note ${noteId}`);
|
||||
}
|
||||
|
||||
const ocrResult = await this.extractTextFromImage(content, options);
|
||||
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
|
||||
|
||||
// Store OCR result in blob
|
||||
await this.storeOCRResult(note.blobId, ocrResult);
|
||||
@ -224,10 +203,10 @@ class OCRService {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if OCR already exists in the blob and we're not forcing reprocessing
|
||||
// Check if OCR already exists and is up-to-date
|
||||
const existingOCR = this.getStoredOCRResult(attachment.blobId);
|
||||
if (existingOCR && !options.forceReprocess) {
|
||||
log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
|
||||
if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
|
||||
log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
|
||||
return existingOCR;
|
||||
}
|
||||
|
||||
@ -237,7 +216,7 @@ class OCRService {
|
||||
throw new Error(`Cannot get image content for attachment ${attachmentId}`);
|
||||
}
|
||||
|
||||
const ocrResult = await this.extractTextFromImage(content, options);
|
||||
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
|
||||
|
||||
// Store OCR result in blob
|
||||
await this.storeOCRResult(attachment.blobId, ocrResult);
|
||||
@ -259,11 +238,15 @@ class OCRService {
|
||||
}
|
||||
|
||||
try {
|
||||
// Store OCR text in blobs table
|
||||
// Store OCR text and timestamp in blobs table
|
||||
sql.execute(`
|
||||
UPDATE blobs SET ocr_text = ? WHERE blobId = ?
|
||||
UPDATE blobs SET
|
||||
ocr_text = ?,
|
||||
ocr_last_processed = ?
|
||||
WHERE blobId = ?
|
||||
`, [
|
||||
ocrResult.text,
|
||||
new Date().toISOString(),
|
||||
blobId
|
||||
]);
|
||||
|
||||
@ -353,80 +336,10 @@ class OCRService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Process OCR for all images that don't have OCR results yet
|
||||
* Process OCR for all files that don't have OCR results yet or need reprocessing
|
||||
*/
|
||||
async processAllImages(): Promise<void> {
|
||||
if (!this.isOCREnabled()) {
|
||||
log.info('OCR is disabled, skipping batch processing');
|
||||
return;
|
||||
}
|
||||
|
||||
log.info('Starting batch OCR processing for all images...');
|
||||
|
||||
try {
|
||||
// Process image notes
|
||||
const imageNotes = sql.getRows<{
|
||||
noteId: string;
|
||||
mime: string;
|
||||
blobId: string;
|
||||
}>(`
|
||||
SELECT n.noteId, n.mime, n.blobId
|
||||
FROM notes n
|
||||
LEFT JOIN blobs b ON n.blobId = b.blobId
|
||||
WHERE n.type = 'image'
|
||||
AND n.isDeleted = 0
|
||||
AND n.blobId IS NOT NULL
|
||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
||||
`);
|
||||
|
||||
log.info(`Found ${imageNotes.length} image notes to process`);
|
||||
|
||||
for (const noteRow of imageNotes) {
|
||||
if (this.isSupportedMimeType(noteRow.mime)) {
|
||||
try {
|
||||
await this.processNoteOCR(noteRow.noteId);
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 100));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process image attachments
|
||||
const imageAttachments = sql.getRows<{
|
||||
attachmentId: string;
|
||||
mime: string;
|
||||
blobId: string;
|
||||
}>(`
|
||||
SELECT a.attachmentId, a.mime, a.blobId
|
||||
FROM attachments a
|
||||
LEFT JOIN blobs b ON a.blobId = b.blobId
|
||||
WHERE a.role = 'image'
|
||||
AND a.isDeleted = 0
|
||||
AND a.blobId IS NOT NULL
|
||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
||||
`);
|
||||
|
||||
log.info(`Found ${imageAttachments.length} image attachments to process`);
|
||||
|
||||
for (const attachmentRow of imageAttachments) {
|
||||
if (this.isSupportedMimeType(attachmentRow.mime)) {
|
||||
try {
|
||||
await this.processAttachmentOCR(attachmentRow.attachmentId);
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 100));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.info('Batch OCR processing completed');
|
||||
} catch (error) {
|
||||
log.error(`Batch OCR processing failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
return this.processAllBlobsNeedingOCR();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -521,28 +434,9 @@ class OCRService {
|
||||
}
|
||||
|
||||
try {
|
||||
// Count total images to process
|
||||
const imageNotesCount = sql.getRow<{ count: number }>(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM notes
|
||||
WHERE type = 'image'
|
||||
AND isDeleted = 0
|
||||
AND noteId NOT IN (
|
||||
SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
|
||||
)
|
||||
`)?.count || 0;
|
||||
|
||||
const imageAttachmentsCount = sql.getRow<{ count: number }>(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM attachments
|
||||
WHERE role = 'image'
|
||||
AND isDeleted = 0
|
||||
AND attachmentId NOT IN (
|
||||
SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
|
||||
)
|
||||
`)?.count || 0;
|
||||
|
||||
const totalCount = imageNotesCount + imageAttachmentsCount;
|
||||
// Count total blobs needing OCR processing
|
||||
const blobsNeedingOCR = this.getBlobsNeedingOCR();
|
||||
const totalCount = blobsNeedingOCR.length;
|
||||
|
||||
if (totalCount === 0) {
|
||||
return { success: false, message: 'No images found that need OCR processing' };
|
||||
@ -557,7 +451,7 @@ class OCRService {
|
||||
};
|
||||
|
||||
// Start processing in background
|
||||
this.processBatchInBackground().catch(error => {
|
||||
this.processBatchInBackground(blobsNeedingOCR).catch(error => {
|
||||
log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
|
||||
this.batchProcessingState.inProgress = false;
|
||||
});
|
||||
@ -583,79 +477,33 @@ class OCRService {
|
||||
/**
|
||||
* Process batch OCR in background with progress tracking
|
||||
*/
|
||||
private async processBatchInBackground(): Promise<void> {
|
||||
private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
|
||||
try {
|
||||
log.info('Starting batch OCR processing...');
|
||||
|
||||
// Process image notes
|
||||
const imageNotes = sql.getRows<{
|
||||
noteId: string;
|
||||
mime: string;
|
||||
blobId: string;
|
||||
}>(`
|
||||
SELECT n.noteId, n.mime, n.blobId
|
||||
FROM notes n
|
||||
LEFT JOIN blobs b ON n.blobId = b.blobId
|
||||
WHERE n.type = 'image'
|
||||
AND n.isDeleted = 0
|
||||
AND n.blobId IS NOT NULL
|
||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
||||
`);
|
||||
|
||||
for (const noteRow of imageNotes) {
|
||||
for (const blobInfo of blobsToProcess) {
|
||||
if (!this.batchProcessingState.inProgress) {
|
||||
break; // Stop if processing was cancelled
|
||||
}
|
||||
|
||||
if (this.isSupportedMimeType(noteRow.mime)) {
|
||||
try {
|
||||
await this.processNoteOCR(noteRow.noteId);
|
||||
this.batchProcessingState.processed++;
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
|
||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process image attachments
|
||||
const imageAttachments = sql.getRows<{
|
||||
attachmentId: string;
|
||||
mime: string;
|
||||
blobId: string;
|
||||
}>(`
|
||||
SELECT a.attachmentId, a.mime, a.blobId
|
||||
FROM attachments a
|
||||
LEFT JOIN blobs b ON a.blobId = b.blobId
|
||||
WHERE a.role = 'image'
|
||||
AND a.isDeleted = 0
|
||||
AND a.blobId IS NOT NULL
|
||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
||||
`);
|
||||
|
||||
for (const attachmentRow of imageAttachments) {
|
||||
if (!this.batchProcessingState.inProgress) {
|
||||
break; // Stop if processing was cancelled
|
||||
}
|
||||
|
||||
if (this.isSupportedMimeType(attachmentRow.mime)) {
|
||||
try {
|
||||
await this.processAttachmentOCR(attachmentRow.attachmentId);
|
||||
this.batchProcessingState.processed++;
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
|
||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
||||
try {
|
||||
if (blobInfo.entityType === 'note') {
|
||||
await this.processNoteOCR(blobInfo.entityId);
|
||||
} else {
|
||||
await this.processAttachmentOCR(blobInfo.entityId);
|
||||
}
|
||||
this.batchProcessingState.processed++;
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
|
||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
||||
}
|
||||
}
|
||||
|
||||
// Mark as completed
|
||||
this.batchProcessingState.inProgress = false;
|
||||
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`);
|
||||
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
|
||||
} catch (error) {
|
||||
log.error(`Batch OCR processing failed: ${error}`);
|
||||
this.batchProcessingState.inProgress = false;
|
||||
@ -672,6 +520,170 @@ class OCRService {
|
||||
log.info('Batch OCR processing cancelled');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get processor for a given MIME type
|
||||
*/
|
||||
private getProcessorForMimeType(mimeType: string): FileProcessor | null {
    // A missing MIME type cannot match anything; bail out before a processor
    // tries to call string methods on it.
    if (!mimeType) {
        return null;
    }

    // Processors are consulted in registration (Map insertion) order and the
    // first one that claims the type wins.
    // NOTE(review): the image processor also claims 'image/tiff', so a TIFF
    // processor registered after it is never chosen for that type - confirm intent.
    for (const processor of this.processors.values()) {
        if (processor.canProcess(mimeType)) {
            return processor;
        }
    }

    return null;
}
|
||||
|
||||
/**
|
||||
* Check if blob needs OCR re-processing due to content changes
|
||||
*/
|
||||
needsReprocessing(blobId: string): boolean {
    // Without a blob id there is nothing that could be stale.
    if (!blobId) {
        return false;
    }

    try {
        const blobInfo = sql.getRow<{
            utcDateModified: string;
            ocr_last_processed: string | null;
        }>(`
            SELECT utcDateModified, ocr_last_processed
            FROM blobs
            WHERE blobId = ?
        `, [blobId]);

        // Unknown blob: nothing to reprocess.
        if (!blobInfo) {
            return false;
        }

        // OCR never ran for this blob.
        if (!blobInfo.ocr_last_processed) {
            return true;
        }

        // Compare as instants rather than as strings so that differing
        // timestamp formats in the two columns cannot skew the result.
        const modifiedAt = new Date(blobInfo.utcDateModified).getTime();
        const processedAt = new Date(blobInfo.ocr_last_processed).getTime();

        // A marker that cannot be parsed gives no evidence the OCR text is
        // current, so treat it like "never processed".
        if (Number.isNaN(processedAt)) {
            return true;
        }

        // If the modification time is unparseable this comparison is false,
        // i.e. the cached result is kept.
        return modifiedAt > processedAt;
    } catch (error) {
        // Best effort: a failed lookup must not trigger reprocessing.
        log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
        return false;
    }
}
|
||||
|
||||
/**
|
||||
* Invalidate OCR results for a blob (clear ocr_text and ocr_last_processed)
|
||||
*/
|
||||
invalidateOCRResult(blobId: string): void {
    // Nothing to invalidate without a blob id.
    if (!blobId) {
        return;
    }

    try {
        // Clearing both columns makes the blob look as if OCR never ran,
        // so the next processing pass picks it up again.
        sql.execute(`
            UPDATE blobs SET
                ocr_text = NULL,
                ocr_last_processed = NULL
            WHERE blobId = ?
        `, [blobId]);

        log.info(`Invalidated OCR result for blob ${blobId}`);
    } catch (error) {
        // Log for context, then let the caller decide how to react.
        log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
        throw error;
    }
}
|
||||
|
||||
/**
|
||||
* Get blobs that need OCR processing (modified after last OCR or never processed)
|
||||
*/
|
||||
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
    type CandidateRow = { blobId: string; mimeType: string; entityId: string };

    // A blob needs OCR when it was never processed or was modified after the
    // last run. julianday() converts both columns to instants, so the check
    // stays correct even if the two columns use different textual timestamp
    // formats; a plain string comparison would not be.
    // NOTE(review): ocr_last_processed is written with Date#toISOString();
    // the format of utcDateModified is not visible here - confirm.
    const needsOcrCondition = `(
                    b.ocr_last_processed IS NULL
                    OR julianday(b.utcDateModified) > julianday(b.ocr_last_processed)
                )`;

    try {
        // Image notes whose blob is missing or outdated OCR text.
        const noteBlobs = sql.getRows<CandidateRow>(`
            SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
            FROM notes n
            JOIN blobs b ON n.blobId = b.blobId
            WHERE n.type = 'image'
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND ${needsOcrCondition}
        `);

        // Image attachments whose blob is missing or outdated OCR text.
        const attachmentBlobs = sql.getRows<CandidateRow>(`
            SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
            FROM attachments a
            JOIN blobs b ON a.blobId = b.blobId
            WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND ${needsOcrCondition}
        `);

        // Tag each row with its origin so callers know which OCR entry point to use.
        const candidates = [
            ...noteBlobs.map(row => ({ ...row, entityType: 'note' as const })),
            ...attachmentBlobs.map(row => ({ ...row, entityType: 'attachment' as const }))
        ];

        // Drop anything no processor can handle.
        return candidates.filter(candidate => this.isSupportedMimeType(candidate.mimeType));
    } catch (error) {
        // Best effort: a failed query yields an empty work list rather than an exception.
        log.error(`Failed to get blobs needing OCR: ${error}`);
        return [];
    }
}
|
||||
|
||||
/**
|
||||
* Process OCR for all blobs that need it (auto-processing)
|
||||
*/
|
||||
async processAllBlobsNeedingOCR(): Promise<void> {
    if (!this.isOCREnabled()) {
        log.info('OCR is disabled, skipping auto-processing');
        return;
    }

    const blobsNeedingOCR = this.getBlobsNeedingOCR();
    if (blobsNeedingOCR.length === 0) {
        log.info('No blobs need OCR processing');
        return;
    }

    log.info(`Auto-processing OCR for ${blobsNeedingOCR.length} blobs...`);

    // Items are handled strictly one after another; a failure is logged and
    // the loop moves on to the next blob.
    for (const blobInfo of blobsNeedingOCR) {
        try {
            if (blobInfo.entityType === 'note') {
                await this.processNoteOCR(blobInfo.entityId);
            } else {
                await this.processAttachmentOCR(blobInfo.entityId);
            }

            // Short pause between items to avoid overwhelming the system.
            await new Promise(resolve => setTimeout(resolve, 100));
        } catch (error) {
            log.error(`Failed to auto-process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
        }
    }

    log.info('Auto-processing OCR completed');
}
|
||||
}
|
||||
|
||||
export default new OCRService();
|
||||
28
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
28
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
@ -0,0 +1,28 @@
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
|
||||
/**
|
||||
* Base class for file processors that extract text from different file types
|
||||
*/
|
||||
export abstract class FileProcessor {
    /**
     * Check if this processor can handle the given MIME type.
     *
     * @param mimeType MIME type of the file to be processed
     * @returns true when this processor accepts the type
     */
    abstract canProcess(mimeType: string): boolean;

    /**
     * Extract text from the given file buffer.
     *
     * @param buffer raw file content
     * @param options per-call processing options
     * @returns the extracted text together with its metadata
     */
    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;

    /**
     * Get the processing type identifier of this processor.
     */
    abstract getProcessingType(): string;

    /**
     * Release any resources held by the processor.
     * The base implementation holds none and therefore does nothing;
     * subclasses override it when they own resources.
     */
    async cleanup(): Promise<void> {
        // Nothing to release in the base class.
    }
}
|
||||
162
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
162
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
@ -0,0 +1,162 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import log from '../../log.js';
|
||||
|
||||
/**
|
||||
* Image processor for extracting text from image files using Tesseract
|
||||
*/
|
||||
export class ImageProcessor extends FileProcessor {
    /** MIME types accepted by this processor (matched case-insensitively). */
    private static readonly SUPPORTED_MIME_TYPES: readonly string[] = [
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/gif',
        'image/bmp',
        'image/tiff',
        'image/webp'
    ];

    /**
     * Shape of a single Tesseract language code: a 2-4 letter base followed by
     * up to two 2-4 letter suffixes, e.g. `eng`, `chi_sim`, `srp_latn`, `chi_sim_vert`.
     */
    private static readonly LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,4}(_[a-zA-Z]{2,4}){0,2}$/;

    private worker: Tesseract.Worker | null = null;
    /** Language specification the current worker was created with. */
    private workerLanguage: string | null = null;
    private isInitialized = false;

    canProcess(mimeType: string): boolean {
        return ImageProcessor.SUPPORTED_MIME_TYPES.includes(mimeType.toLowerCase());
    }

    /**
     * Run Tesseract OCR over an image buffer.
     *
     * @param buffer raw image bytes
     * @param options `language` overrides the configured default (multi-language
     *                specifications such as `ron+eng` are supported)
     * @returns trimmed text, confidence as a 0..1 fraction, timestamp and language
     * @throws when no language is configured, the language format is invalid,
     *         the worker cannot be created or recognition fails
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting image OCR text extraction...');

            const language = options.language || this.getDefaultOCRLanguage();

            // Reject malformed specifications before any worker is created for them.
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            const worker = await this.ensureWorker(language);
            const result = await worker.recognize(buffer);

            const ocrResult: OCRResult = {
                text: result.data.text.trim(),
                confidence: result.data.confidence / 100, // Convert percentage to decimal
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1
            };

            // The stored confidence is a fraction; scale it back up for the "%" in the log line.
            log.info(`Image OCR extraction completed. Confidence: ${Math.round(ocrResult.confidence * 100)}%, Text length: ${ocrResult.text.length}`);
            return ocrResult;
        } catch (error) {
            log.error(`Image OCR text extraction failed: ${error}`);
            throw error;
        }
    }

    getProcessingType(): string {
        return 'image';
    }

    /**
     * Return a worker for `language`, reusing the current one when it was
     * created for the same specification and replacing it otherwise.
     */
    private async ensureWorker(language: string): Promise<Tesseract.Worker> {
        if (this.worker && this.isInitialized && this.workerLanguage === language) {
            return this.worker;
        }

        // Forget the old worker before awaiting anything, so that a failure
        // below can never leave a terminated worker marked as usable.
        const staleWorker = this.worker;
        this.worker = null;
        this.workerLanguage = null;
        this.isInitialized = false;

        if (staleWorker) {
            await staleWorker.terminate();
        }

        return this.initialize(language);
    }

    /**
     * Create the Tesseract worker for `language` and record it as the active one.
     */
    private async initialize(language: string): Promise<Tesseract.Worker> {
        try {
            log.info('Initializing image OCR processor with Tesseract.js...');
            log.info(`Initializing Tesseract worker for language(s): ${language}`);

            // Configure proper paths for Node.js environment
            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');

            log.info(`Using worker path: ${workerPath}`);
            log.info(`Using core path: ${corePath}`);

            const worker = await Tesseract.createWorker(language, 1, {
                workerPath,
                corePath,
                logger: (m: { status: string; progress: number }) => {
                    if (m.status === 'recognizing text') {
                        log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
                    }
                }
            });

            this.worker = worker;
            this.workerLanguage = language;
            this.isInitialized = true;
            log.info('Image OCR processor initialized successfully');
            return worker;
        } catch (error) {
            log.error(`Failed to initialize image OCR processor: ${error}`);
            throw error;
        }
    }

    /**
     * Terminate the worker, if any, and reset the processor to its
     * uninitialized state.
     */
    async cleanup(): Promise<void> {
        const activeWorker = this.worker;
        this.worker = null;
        this.workerLanguage = null;
        this.isInitialized = false;

        if (activeWorker) {
            await activeWorker.terminate();
        }
        log.info('Image OCR processor cleaned up');
    }

    /**
     * Get default OCR language from options.
     *
     * @throws when the `ocrLanguage` option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): runtime require() in a file that otherwise uses ESM
            // imports - confirm it is available in the build target.
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     * Supports single language (eng) or multi-language (ron+eng); every
     * '+'-separated part must look like a Tesseract language code.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        return language
            .split('+')
            .map(part => part.trim())
            .every(code => code.length > 0 && ImageProcessor.LANGUAGE_CODE_PATTERN.test(code));
    }
}
|
||||
128
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
128
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
@ -0,0 +1,128 @@
|
||||
import * as officeParser from 'officeparser';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
import log from '../../log.js';
|
||||
|
||||
/**
|
||||
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
|
||||
*/
|
||||
export class OfficeProcessor extends FileProcessor {
    /** MIME types accepted by this processor (matched case-insensitively). */
    private static readonly SUPPORTED_MIME_TYPES: readonly string[] = [
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
        // NOTE(review): confirm the parser library really handles the legacy
        // binary formats and RTF listed below.
        'application/msword', // DOC
        'application/vnd.ms-excel', // XLS
        'application/vnd.ms-powerpoint', // PPT
        'application/rtf' // RTF
    ];

    /**
     * Shape of a single Tesseract language code: a 2-4 letter base followed by
     * up to two 2-4 letter suffixes, e.g. `eng`, `chi_sim`, `srp_latn`, `chi_sim_vert`.
     */
    private static readonly LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,4}(_[a-zA-Z]{2,4}){0,2}$/;

    // Only touched by cleanup() in this class; extraction below never runs OCR on embedded images.
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    canProcess(mimeType: string): boolean {
        return OfficeProcessor.SUPPORTED_MIME_TYPES.includes(mimeType.toLowerCase());
    }

    /**
     * Extract the textual content of an Office document.
     *
     * @param buffer raw document bytes
     * @param options `language` overrides the configured default; it is
     *                validated and echoed back in the result
     * @returns trimmed document text; confidence is 0.99 when any text was
     *          found and 0 otherwise
     * @throws when no language is configured, the language format is invalid
     *         or the document cannot be parsed
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting Office document text extraction...');

            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Extract text from Office document
            const parsed = await this.parseOfficeDocument(buffer);
            const combinedText = parsed.data.trim();

            // High confidence for direct text extraction, none when nothing was found.
            const confidence = combinedText.length > 0 ? 0.99 : 0;

            const result: OCRResult = {
                text: combinedText,
                confidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1 // Office documents are treated as single logical document
            };

            // The stored confidence is a fraction; scale it for the "%" in the log line.
            log.info(`Office document text extraction completed. Confidence: ${Math.round(confidence * 100)}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`Office document text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Run the office parser over the buffer and normalise a missing result to
     * an empty string.
     */
    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
        try {
            // Use promise-based API directly
            const data = await officeParser.parseOfficeAsync(buffer, {
                outputErrorToConsole: false,
                newlineDelimiter: '\n',
                ignoreNotes: false,
                putNotesAtLast: false
            });

            return {
                data: data || ''
            };
        } catch (error) {
            throw new Error(`Office document parsing failed: ${error}`);
        }
    }

    getProcessingType(): string {
        return 'office';
    }

    /** Release the resources of the nested image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * @throws when the `ocrLanguage` option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): runtime require() in a file that otherwise uses ESM
            // imports - confirm it is available in the build target.
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     * Supports single language (eng) or multi-language (ron+eng); every
     * '+'-separated part must look like a Tesseract language code.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        return language
            .split('+')
            .map(part => part.trim())
            .every(code => code.length > 0 && OfficeProcessor.LANGUAGE_CODE_PATTERN.test(code));
    }
}
|
||||
142
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
142
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
@ -0,0 +1,142 @@
|
||||
import * as pdfParse from 'pdf-parse';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
import log from '../../log.js';
|
||||
import sharp from 'sharp';
|
||||
|
||||
/**
 * PDF processor for extracting text from PDF files.
 *
 * Extraction first reads the PDF's embedded text via pdf-parse. When that
 * yields no text, or is disabled with `enablePDFTextExtraction: false`, the
 * processor takes the OCR fallback path. Rendering PDF pages to images is not
 * implemented yet, so that path currently produces an empty, zero-confidence
 * result.
 */
export class PDFProcessor extends FileProcessor {
    // Presumably intended for the OCR fallback; at present only its cleanup() is invoked.
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType - MIME type of the candidate file
     * @returns `true` only for `application/pdf` (case-insensitive)
     */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    /**
     * Extract text from a PDF.
     *
     * @param buffer - raw PDF bytes
     * @param options - processing options; `language` falls back to the
     *        configured default, `enablePDFTextExtraction: false` skips the
     *        embedded-text step
     * @returns the extraction result
     * @throws Error when the language is missing or malformed, or when PDF parsing fails
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting PDF text extraction...');

            // Resolve and validate the language once; the helpers reuse this value.
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // First try to extract existing text from PDF
            if (options.enablePDFTextExtraction !== false) {
                const textResult = await this.extractTextFromPDF(buffer, language);
                if (textResult.text.trim().length > 0) {
                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
                    return textResult;
                }
            }

            // Fall back to OCR if no text found or PDF text extraction is disabled
            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
            return await this.extractTextViaOCR(buffer, language);
        } catch (error) {
            // Single logging point: the helpers do not log-and-rethrow themselves,
            // so each failure is reported once.
            log.error(`PDF text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Read the text embedded in the PDF using pdf-parse.
     *
     * @param buffer - raw PDF bytes
     * @param language - already validated language, recorded on the result
     * @returns trimmed text with a fixed 0.99 confidence and the parser's page count
     */
    private async extractTextFromPDF(buffer: Buffer, language: string): Promise<OCRResult> {
        const data = await pdfParse(buffer);

        return {
            text: data.text.trim(),
            confidence: 0.99, // High confidence for direct text extraction
            extractedAt: new Date().toISOString(),
            language,
            pageCount: data.numpages
        };
    }

    /**
     * OCR fallback for PDFs without embedded text.
     *
     * Converting PDF pages to images is not implemented, so no OCR is run.
     * The result carries empty text and zero confidence instead of a
     * placeholder sentence, so callers never receive a notice that looks like
     * extracted document content.
     *
     * @param buffer - raw PDF bytes (currently unused)
     * @param language - already validated language, recorded on the result
     * @returns an empty, zero-confidence result
     */
    private async extractTextViaOCR(buffer: Buffer, language: string): Promise<OCRResult> {
        // A full implementation would render every page to an image (for example
        // with a library such as pdf2pic) and OCR each one.
        log.info('PDF to image conversion is not implemented; returning empty OCR result');

        return {
            text: '',
            confidence: 0.0,
            extractedAt: new Date().toISOString(),
            language,
            pageCount: 1
        };
    }

    /**
     * @returns the constant string `'pdf'`
     */
    getProcessingType(): string {
        return 'pdf';
    }

    /**
     * Release resources by delegating to the wrapped image processor's `cleanup()`.
     */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * @returns the value of the `ocrLanguage` option
     * @throws Error when the option is empty or cannot be read; the underlying
     *         reason is logged and a generic "must be configured" error is raised
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     *
     * Supports a single language (`eng`) or several joined with `+` (`ron+eng`).
     * Each code is a 2-3 letter base optionally followed by up to two
     * `_`-separated qualifiers of 2-4 letters, which covers Tesseract codes
     * such as `chi_sim`, `srp_latn`, `aze_cyrl`, `jpn_vert` and `chi_sim_vert`.
     *
     * @param language - language specification to check
     * @returns `true` when every `+`-separated part is a well-formed code
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // Qualifiers may be 4 letters long (script names like "latn"/"cyrl", or "vert"),
        // and a code may carry two of them (e.g. "chi_sim_vert").
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;

        // Split by '+' for multi-language format; an empty part never matches the pattern.
        return language
            .split('+')
            .every(lang => validLanguagePattern.test(lang.trim()));
    }
}
|
||||
129
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
129
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
@ -0,0 +1,129 @@
|
||||
import sharp from 'sharp';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
import log from '../../log.js';
|
||||
|
||||
/**
 * TIFF processor for extracting text from multi-page TIFF files.
 *
 * Each page is re-encoded to PNG with sharp and handed to the image processor;
 * the per-page texts are then concatenated into one result.
 */
export class TIFFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * @param mimeType - MIME type of the candidate file
     * @returns `true` for `image/tiff` or `image/tif` (case-insensitive)
     */
    canProcess(mimeType: string): boolean {
        const normalized = mimeType.toLowerCase();
        return normalized === 'image/tiff' || normalized === 'image/tif';
    }

    /**
     * Extract text from every page of a TIFF.
     *
     * Pages that fail are logged and skipped so the remaining pages are still
     * processed. Pages after the first that produced text are introduced by a
     * `--- Page N ---` separator. The reported confidence is the mean over the
     * pages that actually contributed text, so blank or failed pages do not
     * drag the score towards zero.
     *
     * @param buffer - raw TIFF bytes
     * @param options - processing options; `language` falls back to the configured default
     * @returns combined text, mean confidence, and the total page count
     * @throws Error when the language is missing or malformed, or when the
     *         TIFF metadata cannot be read
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');

            // Resolve and validate the language once; reused for the result below.
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Check if this is a multi-page TIFF
            const metadata = await sharp(buffer).metadata();
            const pageCount = metadata.pages || 1;

            let combinedText = '';
            let totalConfidence = 0;
            // Number of pages whose confidence was added to totalConfidence.
            let pagesWithText = 0;

            // Process each page sequentially.
            for (let page = 0; page < pageCount; page++) {
                try {
                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);

                    // Extract page as PNG buffer
                    const pageBuffer = await sharp(buffer, { page })
                        .png()
                        .toBuffer();

                    // OCR the page
                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);

                    if (pageResult.text.trim().length > 0) {
                        if (combinedText.length > 0) {
                            combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
                        }
                        combinedText += pageResult.text;
                        totalConfidence += pageResult.confidence;
                        pagesWithText++;
                    }
                } catch (error) {
                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
                    // Continue with other pages
                }
            }

            // Divide by the pages that contributed to the sum, not by the total page count.
            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;

            const result: OCRResult = {
                text: combinedText.trim(),
                confidence: averageConfidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: pageCount
            };

            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`TIFF text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * @returns the constant string `'tiff'`
     */
    getProcessingType(): string {
        return 'tiff';
    }

    /**
     * Release resources by delegating to the wrapped image processor's `cleanup()`.
     */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * @returns the value of the `ocrLanguage` option
     * @throws Error when the option is empty or cannot be read; the underlying
     *         reason is logged and a generic "must be configured" error is raised
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     *
     * Supports a single language (`eng`) or several joined with `+` (`ron+eng`).
     * Each code is a 2-3 letter base optionally followed by up to two
     * `_`-separated qualifiers of 2-4 letters, which covers Tesseract codes
     * such as `chi_sim`, `srp_latn`, `aze_cyrl`, `jpn_vert` and `chi_sim_vert`.
     *
     * @param language - language specification to check
     * @returns `true` when every `+`-separated part is a well-formed code
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // Qualifiers may be 4 letters long (script names like "latn"/"cyrl", or "vert"),
        // and a code may carry two of them (e.g. "chi_sim_vert").
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;

        // Split by '+' for multi-language format; an empty part never matches the pattern.
        return language
            .split('+')
            .every(lang => validLanguagePattern.test(lang.trim()));
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user