mirror of
https://github.com/zadam/trilium.git
synced 2025-12-05 06:54:23 +01:00
feat(ocr): add additional processors for OCR feature
This commit is contained in:
parent
6722d2d266
commit
ca8cbf8ccf
@ -3,23 +3,31 @@ import log from '../log.js';
|
|||||||
import sql from '../sql.js';
|
import sql from '../sql.js';
|
||||||
import becca from '../../becca/becca.js';
|
import becca from '../../becca/becca.js';
|
||||||
import options from '../options.js';
|
import options from '../options.js';
|
||||||
|
import { ImageProcessor } from './processors/image_processor.js';
|
||||||
|
import { PDFProcessor } from './processors/pdf_processor.js';
|
||||||
|
import { TIFFProcessor } from './processors/tiff_processor.js';
|
||||||
|
import { OfficeProcessor } from './processors/office_processor.js';
|
||||||
|
import { FileProcessor } from './processors/file_processor.js';
|
||||||
|
|
||||||
export interface OCRResult {
|
export interface OCRResult {
|
||||||
text: string;
|
text: string;
|
||||||
confidence: number;
|
confidence: number;
|
||||||
extractedAt: string;
|
extractedAt: string;
|
||||||
language?: string;
|
language?: string;
|
||||||
|
pageCount?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface OCRProcessingOptions {
|
export interface OCRProcessingOptions {
|
||||||
language?: string;
|
language?: string;
|
||||||
forceReprocess?: boolean;
|
forceReprocess?: boolean;
|
||||||
confidence?: number;
|
confidence?: number;
|
||||||
|
enablePDFTextExtraction?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface OCRBlobRow {
|
interface OCRBlobRow {
|
||||||
blobId: string;
|
blobId: string;
|
||||||
ocr_text: string;
|
ocr_text: string;
|
||||||
|
ocr_last_processed?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -30,6 +38,7 @@ class OCRService {
|
|||||||
private isInitialized = false;
|
private isInitialized = false;
|
||||||
private worker: Tesseract.Worker | null = null;
|
private worker: Tesseract.Worker | null = null;
|
||||||
private isProcessing = false;
|
private isProcessing = false;
|
||||||
|
private processors: Map<string, FileProcessor> = new Map();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the OCR service
|
* Initialize the OCR service
|
||||||
@ -40,25 +49,14 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
log.info('Initializing OCR service with Tesseract.js...');
|
log.info('Initializing OCR service with file processors...');
|
||||||
|
|
||||||
// Configure proper paths for Node.js environment
|
// Initialize file processors
|
||||||
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
|
this.processors.set('image', new ImageProcessor());
|
||||||
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
|
this.processors.set('pdf', new PDFProcessor());
|
||||||
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
|
this.processors.set('tiff', new TIFFProcessor());
|
||||||
|
this.processors.set('office', new OfficeProcessor());
|
||||||
|
|
||||||
log.info(`Using worker path: ${workerPath}`);
|
|
||||||
log.info(`Using core path: ${corePath}`);
|
|
||||||
|
|
||||||
this.worker = await Tesseract.createWorker('eng', 1, {
|
|
||||||
workerPath,
|
|
||||||
corePath,
|
|
||||||
logger: (m: { status: string; progress: number }) => {
|
|
||||||
if (m.status === 'recognizing text') {
|
|
||||||
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
this.isInitialized = true;
|
this.isInitialized = true;
|
||||||
log.info('OCR service initialized successfully');
|
log.info('OCR service initialized successfully');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -100,46 +98,27 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract text from image buffer
|
* Extract text from file buffer using appropriate processor
|
||||||
*/
|
*/
|
||||||
async extractTextFromImage(imageBuffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
if (!this.isInitialized) {
|
if (!this.isInitialized) {
|
||||||
await this.initialize();
|
await this.initialize();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.worker) {
|
|
||||||
throw new Error('OCR worker not initialized');
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
log.info('Starting OCR text extraction...');
|
log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
|
||||||
this.isProcessing = true;
|
this.isProcessing = true;
|
||||||
|
|
||||||
// Set language if specified and different from current
|
// Find appropriate processor
|
||||||
const language = options.language || 'eng';
|
const processor = this.getProcessorForMimeType(mimeType);
|
||||||
if (language !== 'eng') {
|
if (!processor) {
|
||||||
// For different languages, create a new worker
|
throw new Error(`No processor found for MIME type: ${mimeType}`);
|
||||||
await this.worker.terminate();
|
|
||||||
this.worker = await Tesseract.createWorker(language, 1, {
|
|
||||||
logger: (m: { status: string; progress: number }) => {
|
|
||||||
if (m.status === 'recognizing text') {
|
|
||||||
log.info(`OCR progress: ${Math.round(m.progress * 100)}%`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const result = await this.worker.recognize(imageBuffer);
|
const result = await processor.extractText(fileBuffer, options);
|
||||||
|
|
||||||
const ocrResult: OCRResult = {
|
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
|
||||||
text: result.data.text.trim(),
|
return result;
|
||||||
confidence: result.data.confidence / 100, // Convert percentage to decimal
|
|
||||||
extractedAt: new Date().toISOString(),
|
|
||||||
language: options.language || 'eng'
|
|
||||||
};
|
|
||||||
|
|
||||||
log.info(`OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
|
|
||||||
return ocrResult;
|
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`OCR text extraction failed: ${error}`);
|
log.error(`OCR text extraction failed: ${error}`);
|
||||||
@ -174,10 +153,10 @@ class OCRService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if OCR already exists in the blob and we're not forcing reprocessing
|
// Check if OCR already exists and is up-to-date
|
||||||
const existingOCR = this.getStoredOCRResult(note.blobId);
|
const existingOCR = this.getStoredOCRResult(note.blobId);
|
||||||
if (existingOCR && !options.forceReprocess) {
|
if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
|
||||||
log.info(`OCR already exists for note ${noteId}, returning cached result`);
|
log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
|
||||||
return existingOCR;
|
return existingOCR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -187,7 +166,7 @@ class OCRService {
|
|||||||
throw new Error(`Cannot get image content for note ${noteId}`);
|
throw new Error(`Cannot get image content for note ${noteId}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const ocrResult = await this.extractTextFromImage(content, options);
|
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
|
||||||
|
|
||||||
// Store OCR result in blob
|
// Store OCR result in blob
|
||||||
await this.storeOCRResult(note.blobId, ocrResult);
|
await this.storeOCRResult(note.blobId, ocrResult);
|
||||||
@ -224,10 +203,10 @@ class OCRService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if OCR already exists in the blob and we're not forcing reprocessing
|
// Check if OCR already exists and is up-to-date
|
||||||
const existingOCR = this.getStoredOCRResult(attachment.blobId);
|
const existingOCR = this.getStoredOCRResult(attachment.blobId);
|
||||||
if (existingOCR && !options.forceReprocess) {
|
if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
|
||||||
log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
|
log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
|
||||||
return existingOCR;
|
return existingOCR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,7 +216,7 @@ class OCRService {
|
|||||||
throw new Error(`Cannot get image content for attachment ${attachmentId}`);
|
throw new Error(`Cannot get image content for attachment ${attachmentId}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const ocrResult = await this.extractTextFromImage(content, options);
|
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
|
||||||
|
|
||||||
// Store OCR result in blob
|
// Store OCR result in blob
|
||||||
await this.storeOCRResult(attachment.blobId, ocrResult);
|
await this.storeOCRResult(attachment.blobId, ocrResult);
|
||||||
@ -259,11 +238,15 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Store OCR text in blobs table
|
// Store OCR text and timestamp in blobs table
|
||||||
sql.execute(`
|
sql.execute(`
|
||||||
UPDATE blobs SET ocr_text = ? WHERE blobId = ?
|
UPDATE blobs SET
|
||||||
|
ocr_text = ?,
|
||||||
|
ocr_last_processed = ?
|
||||||
|
WHERE blobId = ?
|
||||||
`, [
|
`, [
|
||||||
ocrResult.text,
|
ocrResult.text,
|
||||||
|
new Date().toISOString(),
|
||||||
blobId
|
blobId
|
||||||
]);
|
]);
|
||||||
|
|
||||||
@ -353,80 +336,10 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process OCR for all images that don't have OCR results yet
|
* Process OCR for all files that don't have OCR results yet or need reprocessing
|
||||||
*/
|
*/
|
||||||
async processAllImages(): Promise<void> {
|
async processAllImages(): Promise<void> {
|
||||||
if (!this.isOCREnabled()) {
|
return this.processAllBlobsNeedingOCR();
|
||||||
log.info('OCR is disabled, skipping batch processing');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info('Starting batch OCR processing for all images...');
|
|
||||||
|
|
||||||
try {
|
|
||||||
// Process image notes
|
|
||||||
const imageNotes = sql.getRows<{
|
|
||||||
noteId: string;
|
|
||||||
mime: string;
|
|
||||||
blobId: string;
|
|
||||||
}>(`
|
|
||||||
SELECT n.noteId, n.mime, n.blobId
|
|
||||||
FROM notes n
|
|
||||||
LEFT JOIN blobs b ON n.blobId = b.blobId
|
|
||||||
WHERE n.type = 'image'
|
|
||||||
AND n.isDeleted = 0
|
|
||||||
AND n.blobId IS NOT NULL
|
|
||||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
|
||||||
`);
|
|
||||||
|
|
||||||
log.info(`Found ${imageNotes.length} image notes to process`);
|
|
||||||
|
|
||||||
for (const noteRow of imageNotes) {
|
|
||||||
if (this.isSupportedMimeType(noteRow.mime)) {
|
|
||||||
try {
|
|
||||||
await this.processNoteOCR(noteRow.noteId);
|
|
||||||
// Add small delay to prevent overwhelming the system
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 100));
|
|
||||||
} catch (error) {
|
|
||||||
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process image attachments
|
|
||||||
const imageAttachments = sql.getRows<{
|
|
||||||
attachmentId: string;
|
|
||||||
mime: string;
|
|
||||||
blobId: string;
|
|
||||||
}>(`
|
|
||||||
SELECT a.attachmentId, a.mime, a.blobId
|
|
||||||
FROM attachments a
|
|
||||||
LEFT JOIN blobs b ON a.blobId = b.blobId
|
|
||||||
WHERE a.role = 'image'
|
|
||||||
AND a.isDeleted = 0
|
|
||||||
AND a.blobId IS NOT NULL
|
|
||||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
|
||||||
`);
|
|
||||||
|
|
||||||
log.info(`Found ${imageAttachments.length} image attachments to process`);
|
|
||||||
|
|
||||||
for (const attachmentRow of imageAttachments) {
|
|
||||||
if (this.isSupportedMimeType(attachmentRow.mime)) {
|
|
||||||
try {
|
|
||||||
await this.processAttachmentOCR(attachmentRow.attachmentId);
|
|
||||||
// Add small delay to prevent overwhelming the system
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 100));
|
|
||||||
} catch (error) {
|
|
||||||
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info('Batch OCR processing completed');
|
|
||||||
} catch (error) {
|
|
||||||
log.error(`Batch OCR processing failed: ${error}`);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -521,28 +434,9 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Count total images to process
|
// Count total blobs needing OCR processing
|
||||||
const imageNotesCount = sql.getRow<{ count: number }>(`
|
const blobsNeedingOCR = this.getBlobsNeedingOCR();
|
||||||
SELECT COUNT(*) as count
|
const totalCount = blobsNeedingOCR.length;
|
||||||
FROM notes
|
|
||||||
WHERE type = 'image'
|
|
||||||
AND isDeleted = 0
|
|
||||||
AND noteId NOT IN (
|
|
||||||
SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
|
|
||||||
)
|
|
||||||
`)?.count || 0;
|
|
||||||
|
|
||||||
const imageAttachmentsCount = sql.getRow<{ count: number }>(`
|
|
||||||
SELECT COUNT(*) as count
|
|
||||||
FROM attachments
|
|
||||||
WHERE role = 'image'
|
|
||||||
AND isDeleted = 0
|
|
||||||
AND attachmentId NOT IN (
|
|
||||||
SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
|
|
||||||
)
|
|
||||||
`)?.count || 0;
|
|
||||||
|
|
||||||
const totalCount = imageNotesCount + imageAttachmentsCount;
|
|
||||||
|
|
||||||
if (totalCount === 0) {
|
if (totalCount === 0) {
|
||||||
return { success: false, message: 'No images found that need OCR processing' };
|
return { success: false, message: 'No images found that need OCR processing' };
|
||||||
@ -557,7 +451,7 @@ class OCRService {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Start processing in background
|
// Start processing in background
|
||||||
this.processBatchInBackground().catch(error => {
|
this.processBatchInBackground(blobsNeedingOCR).catch(error => {
|
||||||
log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
|
log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
|
||||||
this.batchProcessingState.inProgress = false;
|
this.batchProcessingState.inProgress = false;
|
||||||
});
|
});
|
||||||
@ -583,79 +477,33 @@ class OCRService {
|
|||||||
/**
|
/**
|
||||||
* Process batch OCR in background with progress tracking
|
* Process batch OCR in background with progress tracking
|
||||||
*/
|
*/
|
||||||
private async processBatchInBackground(): Promise<void> {
|
private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
|
||||||
try {
|
try {
|
||||||
log.info('Starting batch OCR processing...');
|
log.info('Starting batch OCR processing...');
|
||||||
|
|
||||||
// Process image notes
|
for (const blobInfo of blobsToProcess) {
|
||||||
const imageNotes = sql.getRows<{
|
|
||||||
noteId: string;
|
|
||||||
mime: string;
|
|
||||||
blobId: string;
|
|
||||||
}>(`
|
|
||||||
SELECT n.noteId, n.mime, n.blobId
|
|
||||||
FROM notes n
|
|
||||||
LEFT JOIN blobs b ON n.blobId = b.blobId
|
|
||||||
WHERE n.type = 'image'
|
|
||||||
AND n.isDeleted = 0
|
|
||||||
AND n.blobId IS NOT NULL
|
|
||||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
|
||||||
`);
|
|
||||||
|
|
||||||
for (const noteRow of imageNotes) {
|
|
||||||
if (!this.batchProcessingState.inProgress) {
|
if (!this.batchProcessingState.inProgress) {
|
||||||
break; // Stop if processing was cancelled
|
break; // Stop if processing was cancelled
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.isSupportedMimeType(noteRow.mime)) {
|
try {
|
||||||
try {
|
if (blobInfo.entityType === 'note') {
|
||||||
await this.processNoteOCR(noteRow.noteId);
|
await this.processNoteOCR(blobInfo.entityId);
|
||||||
this.batchProcessingState.processed++;
|
} else {
|
||||||
// Add small delay to prevent overwhelming the system
|
await this.processAttachmentOCR(blobInfo.entityId);
|
||||||
await new Promise(resolve => setTimeout(resolve, 500));
|
|
||||||
} catch (error) {
|
|
||||||
log.error(`Failed to process OCR for note ${noteRow.noteId}: ${error}`);
|
|
||||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process image attachments
|
|
||||||
const imageAttachments = sql.getRows<{
|
|
||||||
attachmentId: string;
|
|
||||||
mime: string;
|
|
||||||
blobId: string;
|
|
||||||
}>(`
|
|
||||||
SELECT a.attachmentId, a.mime, a.blobId
|
|
||||||
FROM attachments a
|
|
||||||
LEFT JOIN blobs b ON a.blobId = b.blobId
|
|
||||||
WHERE a.role = 'image'
|
|
||||||
AND a.isDeleted = 0
|
|
||||||
AND a.blobId IS NOT NULL
|
|
||||||
AND (b.ocr_text IS NULL OR b.ocr_text = '')
|
|
||||||
`);
|
|
||||||
|
|
||||||
for (const attachmentRow of imageAttachments) {
|
|
||||||
if (!this.batchProcessingState.inProgress) {
|
|
||||||
break; // Stop if processing was cancelled
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.isSupportedMimeType(attachmentRow.mime)) {
|
|
||||||
try {
|
|
||||||
await this.processAttachmentOCR(attachmentRow.attachmentId);
|
|
||||||
this.batchProcessingState.processed++;
|
|
||||||
// Add small delay to prevent overwhelming the system
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 500));
|
|
||||||
} catch (error) {
|
|
||||||
log.error(`Failed to process OCR for attachment ${attachmentRow.attachmentId}: ${error}`);
|
|
||||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
|
||||||
}
|
}
|
||||||
|
this.batchProcessingState.processed++;
|
||||||
|
// Add small delay to prevent overwhelming the system
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 500));
|
||||||
|
} catch (error) {
|
||||||
|
log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
|
||||||
|
this.batchProcessingState.processed++; // Count as processed even if failed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mark as completed
|
// Mark as completed
|
||||||
this.batchProcessingState.inProgress = false;
|
this.batchProcessingState.inProgress = false;
|
||||||
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} images.`);
|
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Batch OCR processing failed: ${error}`);
|
log.error(`Batch OCR processing failed: ${error}`);
|
||||||
this.batchProcessingState.inProgress = false;
|
this.batchProcessingState.inProgress = false;
|
||||||
@ -672,6 +520,170 @@ class OCRService {
|
|||||||
log.info('Batch OCR processing cancelled');
|
log.info('Batch OCR processing cancelled');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Look up the processor responsible for a MIME type.
 *
 * Registered processors are probed in the iteration order of
 * `this.processors` (a Map, i.e. insertion order); the first one whose
 * `canProcess()` accepts the type wins.
 *
 * @param mimeType MIME type of the file that should be processed
 * @returns the first matching processor, or `null` when none accepts the type
 */
private getProcessorForMimeType(mimeType: string): FileProcessor | null {
    const candidates = Array.from(this.processors.values());
    const match = candidates.find(candidate => candidate.canProcess(mimeType));

    return match ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Decide whether the OCR data stored for a blob is missing or stale.
 *
 * @param blobId id of the blob to inspect
 * @returns `true` when the blob has no `ocr_last_processed` timestamp or its
 *          `utcDateModified` is later than that timestamp; `false` for an
 *          empty id, an unknown blob, or when the lookup fails (the failure
 *          is logged, not rethrown)
 */
needsReprocessing(blobId: string): boolean {
    if (!blobId) {
        return false;
    }

    try {
        const row = sql.getRow<{
            utcDateModified: string;
            ocr_last_processed: string | null;
        }>(`
            SELECT utcDateModified, ocr_last_processed
            FROM blobs
            WHERE blobId = ?
        `, [blobId]);

        // Unknown blob: nothing to reprocess
        if (!row) {
            return false;
        }

        const { utcDateModified, ocr_last_processed: lastProcessed } = row;

        // No timestamp recorded means OCR has never run for this blob
        if (!lastProcessed) {
            return true;
        }

        // Stale when the content changed after the last OCR run.
        // An unparsable date yields NaN, which makes the comparison false.
        const modifiedAt = new Date(utcDateModified).getTime();
        const processedAt = new Date(lastProcessed).getTime();

        return modifiedAt > processedAt;
    } catch (error) {
        log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
        return false;
    }
}
|
||||||
|
|
||||||
|
/**
 * Drop the stored OCR data of a blob by resetting both `ocr_text` and
 * `ocr_last_processed` to NULL.
 *
 * @param blobId id of the blob to reset; an empty id is ignored
 * @throws rethrows any error raised by the UPDATE after logging it
 */
invalidateOCRResult(blobId: string): void {
    if (blobId) {
        try {
            sql.execute(`
                UPDATE blobs SET
                ocr_text = NULL,
                ocr_last_processed = NULL
                WHERE blobId = ?
            `, [blobId]);

            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Collect the blobs of image notes and image attachments whose OCR data is
 * missing or older than the blob content.
 *
 * A blob qualifies when `ocr_last_processed` is NULL or when
 * `utcDateModified` denotes a later instant than `ocr_last_processed`.
 * The two timestamps are compared via SQLite's `julianday()` rather than as
 * raw strings: `ocr_last_processed` is written with `Date#toISOString()`
 * ('T' separator), and a lexicographic comparison would only be correct if
 * `utcDateModified` used exactly the same textual format. Comparing instants
 * also matches what `needsReprocessing()` does with parsed `Date`s.
 * NOTE(review): assumes both columns hold timestamps `julianday()` can
 * parse; an unparsable value yields NULL and the row is not selected.
 *
 * Only `notes.type = 'image'` and `attachments.role = 'image'` are queried,
 * and the combined list is filtered through `isSupportedMimeType()`.
 *
 * @returns descriptors (blob id, MIME type, owning entity) of everything
 *          that should be OCR-processed; an empty array when the query
 *          fails (the failure is logged, not rethrown)
 */
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
    type BlobRow = { blobId: string; mimeType: string; entityId: string };

    try {
        // Notes with blobs that need OCR
        const noteBlobs = sql.getRows<BlobRow>(`
            SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
            FROM notes n
            JOIN blobs b ON n.blobId = b.blobId
            WHERE n.type = 'image'
            AND n.isDeleted = 0
            AND n.blobId IS NOT NULL
            AND (
                b.ocr_last_processed IS NULL
                OR julianday(b.utcDateModified) > julianday(b.ocr_last_processed)
            )
        `);

        // Attachments with blobs that need OCR
        const attachmentBlobs = sql.getRows<BlobRow>(`
            SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
            FROM attachments a
            JOIN blobs b ON a.blobId = b.blobId
            WHERE a.role = 'image'
            AND a.isDeleted = 0
            AND a.blobId IS NOT NULL
            AND (
                b.ocr_last_processed IS NULL
                OR julianday(b.utcDateModified) > julianday(b.ocr_last_processed)
            )
        `);

        // Tag each row with its owning entity kind, notes first
        const combined = [
            ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
            ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
        ];

        // Keep only MIME types the service can actually process
        return combined.filter(blob => this.isSupportedMimeType(blob.mimeType));
    } catch (error) {
        log.error(`Failed to get blobs needing OCR: ${error}`);
        return [];
    }
}
|
||||||
|
|
||||||
|
/**
 * Run OCR sequentially over every blob reported by `getBlobsNeedingOCR()`.
 *
 * Does nothing when OCR is disabled or when no blob is pending. A failure
 * on one entity is logged and does not stop the remaining ones. After each
 * successfully processed entity the loop waits 100 ms before continuing.
 */
async processAllBlobsNeedingOCR(): Promise<void> {
    if (!this.isOCREnabled()) {
        log.info('OCR is disabled, skipping auto-processing');
        return;
    }

    const pending = this.getBlobsNeedingOCR();
    if (pending.length === 0) {
        log.info('No blobs need OCR processing');
        return;
    }

    log.info(`Auto-processing OCR for ${pending.length} blobs...`);

    // Short pause between items to prevent overwhelming the system
    const pause = () => new Promise(resolve => setTimeout(resolve, 100));

    for (const { entityType, entityId } of pending) {
        try {
            await (entityType === 'note'
                ? this.processNoteOCR(entityId)
                : this.processAttachmentOCR(entityId));

            await pause();
        } catch (error) {
            // Log and move on to the next blob
            log.error(`Failed to auto-process OCR for ${entityType} ${entityId}: ${error}`);
        }
    }

    log.info('Auto-processing OCR completed');
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export default new OCRService();
|
export default new OCRService();
|
||||||
28
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
28
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||||
|
|
||||||
|
/**
 * Contract shared by all text-extraction backends. Each concrete processor
 * declares which MIME types it accepts and how it turns a file buffer into
 * an OCR result.
 */
export abstract class FileProcessor {
    /**
     * @param mimeType MIME type of the candidate file
     * @returns whether this processor is able to handle that MIME type
     */
    abstract canProcess(mimeType: string): boolean;

    /**
     * Extract text from a file.
     *
     * @param buffer raw file content
     * @param options processing options supplied by the caller
     * @returns the extraction result
     */
    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;

    /**
     * @returns the identifier of the processing type this processor implements
     */
    abstract getProcessingType(): string;

    /**
     * Release any resources held by the processor. The base implementation
     * holds none and resolves immediately; subclasses override as needed.
     */
    async cleanup(): Promise<void> {
        // nothing to release by default
    }
}
|
||||||
162
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
162
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
import Tesseract from 'tesseract.js';
|
||||||
|
import { FileProcessor } from './file_processor.js';
|
||||||
|
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||||
|
import log from '../../log.js';
|
||||||
|
|
||||||
|
/**
 * Image processor for extracting text from image files using Tesseract.
 *
 * A single Tesseract worker is kept alive between calls and is only replaced
 * when a call asks for a different language string than the one the current
 * worker was created with.
 */
export class ImageProcessor extends FileProcessor {
    /** MIME types accepted by canProcess(). */
    private static readonly SUPPORTED_TYPES: readonly string[] = [
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/gif',
        'image/bmp',
        // NOTE(review): a dedicated TIFF processor also appears to exist; if both
        // accept 'image/tiff', whichever the caller probes first wins - confirm intent.
        'image/tiff',
        'image/webp'
    ];

    /**
     * One Tesseract language code: 2-3 letters, optionally followed by up to
     * two '_'-separated suffixes of 2-4 letters (e.g. 'eng', 'chi_sim',
     * 'aze_cyrl', 'chi_sim_vert').
     */
    private static readonly LANGUAGE_CODE_PATTERN = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,4}){0,2}$/;

    private worker: Tesseract.Worker | null = null;
    private isInitialized = false;
    /** Language string the current worker was created with, if any. */
    private currentLanguage: string | null = null;

    /**
     * @param mimeType MIME type to test (compared case-insensitively)
     * @returns whether the type is one of the supported image types
     */
    canProcess(mimeType: string): boolean {
        return ImageProcessor.SUPPORTED_TYPES.includes(mimeType.toLowerCase());
    }

    /**
     * Run OCR on an image buffer.
     *
     * @param buffer raw image content
     * @param options `language` selects the Tesseract language(s), e.g. 'eng'
     *                or 'ron+eng'; when absent the configured default is used
     * @returns recognized text (trimmed), confidence as a 0-1 fraction, the
     *          extraction timestamp, the language used and a page count of 1
     * @throws when the language is missing/invalid, the worker cannot be
     *         created, or recognition fails (the error is logged and rethrown)
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting image OCR text extraction...');

            // Support multi-language format like 'ron+eng'
            const requested = options.language || this.getDefaultOCRLanguage();

            // Validate before spending time on worker start-up
            if (!this.isValidLanguageFormat(requested)) {
                throw new Error(`Invalid OCR language format: ${requested}. Use format like 'eng' or 'ron+eng'`);
            }

            // Validation tolerates whitespace around each code; Tesseract should
            // receive the cleaned-up form.
            const language = requested
                .split('+')
                .map(code => code.trim())
                .join('+');

            const worker = await this.ensureWorker(language);
            const result = await worker.recognize(buffer);

            const ocrResult: OCRResult = {
                text: result.data.text.trim(),
                confidence: result.data.confidence / 100, // Convert percentage to decimal
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1
            };

            // Log the percentage value, not the 0-1 fraction stored in the result
            log.info(`Image OCR extraction completed. Confidence: ${result.data.confidence}%, Text length: ${ocrResult.text.length}`);
            return ocrResult;
        } catch (error) {
            log.error(`Image OCR text extraction failed: ${error}`);
            throw error;
        }
    }

    getProcessingType(): string {
        return 'image';
    }

    /**
     * Return a worker for `language`, reusing the current one when it was
     * created for the same language string and replacing it otherwise.
     * On failure no worker is retained, so the next call starts from scratch.
     */
    private async ensureWorker(language: string): Promise<Tesseract.Worker> {
        if (this.worker && this.isInitialized && this.currentLanguage === language) {
            return this.worker;
        }

        try {
            if (this.worker) {
                // Forget the old worker before terminating it so a failure
                // below cannot leave a dead worker marked as initialized.
                const stale = this.worker;
                this.worker = null;
                this.isInitialized = false;
                this.currentLanguage = null;
                await stale.terminate();
            }

            log.info(`Initializing Tesseract worker for language(s): ${language}`);

            // Configure proper paths for Node.js environment
            const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
            const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');

            log.info(`Using worker path: ${workerPath}`);
            log.info(`Using core path: ${corePath}`);

            const worker = await Tesseract.createWorker(language, 1, {
                workerPath,
                corePath,
                logger: (m: { status: string; progress: number }) => {
                    if (m.status === 'recognizing text') {
                        log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
                    }
                }
            });

            this.worker = worker;
            this.currentLanguage = language;
            this.isInitialized = true;
            log.info('Image OCR processor initialized successfully');

            return worker;
        } catch (error) {
            log.error(`Failed to initialize image OCR processor: ${error}`);
            throw error;
        }
    }

    /**
     * Terminate the worker (if any) and reset the processor to its
     * uninitialized state, even when termination itself fails.
     */
    async cleanup(): Promise<void> {
        const worker = this.worker;
        this.worker = null;
        this.isInitialized = false;
        this.currentLanguage = null;

        if (worker) {
            await worker.terminate();
        }
        log.info('Image OCR processor cleaned up');
    }

    /**
     * Get default OCR language from options
     *
     * @throws when the 'ocrLanguage' option is empty or cannot be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format
     * Supports single language (eng) or multi-language (ron+eng); every
     * '+'-separated part, after trimming, must match LANGUAGE_CODE_PATTERN.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        return language
            .split('+')
            .every(code => ImageProcessor.LANGUAGE_CODE_PATTERN.test(code.trim()));
    }
}
|
||||||
128
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
128
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
import * as officeParser from 'officeparser';
|
||||||
|
import { FileProcessor } from './file_processor.js';
|
||||||
|
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||||
|
import { ImageProcessor } from './image_processor.js';
|
||||||
|
import log from '../../log.js';
|
||||||
|
|
||||||
|
/**
 * Office document processor that reads embedded text out of office files
 * through `officeparser`.
 *
 * No OCR is run by this class: text comes straight from the document, so a
 * non-empty result is reported with a fixed confidence of 0.99 and an empty
 * result with a confidence of 0.
 */
export class OfficeProcessor extends FileProcessor {
    // Only touched by the constructor and cleanup() in this class.
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /**
     * Reports whether the MIME type is in this processor's supported list.
     * The comparison is case-insensitive.
     */
    canProcess(mimeType: string): boolean {
        // NOTE(review): legacy DOC/XLS/PPT and RTF are listed here - confirm
        // that officeparser can actually parse those formats.
        const supportedTypes = [
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
            'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
            'application/msword', // DOC
            'application/vnd.ms-excel', // XLS
            'application/vnd.ms-powerpoint', // PPT
            'application/rtf' // RTF
        ];
        return supportedTypes.includes(mimeType.toLowerCase());
    }

    /**
     * Extracts the text content of an office document.
     *
     * @param buffer raw bytes of the document
     * @param options optional settings; `language` overrides the configured default
     * @returns the trimmed text, a confidence of 0.99 (text found) or 0 (no text),
     *          the language used and a page count of 1
     * @throws Error when the language is missing/invalid or parsing fails
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting Office document text extraction...');

            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Extract text from Office document
            const parsed = await this.parseOfficeDocument(buffer);
            const combinedText = parsed.data.trim();

            // High confidence for direct text extraction, none when nothing was found
            const confidence = combinedText.length > 0 ? 0.99 : 0;

            const result: OCRResult = {
                text: combinedText,
                confidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1 // Office documents are treated as single logical document
            };

            // Confidence is a 0-1 fraction; scale it so the "%" in the log is accurate.
            log.info(`Office document text extraction completed. Confidence: ${Math.round(confidence * 100)}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`Office document text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Runs officeparser over the buffer and wraps the resulting text.
     *
     * @returns an object whose `data` is the parsed text ('' when the parser yields nothing)
     * @throws Error with an "Office document parsing failed" prefix when the parser rejects
     */
    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
        try {
            // Use promise-based API directly
            const data = await officeParser.parseOfficeAsync(buffer, {
                outputErrorToConsole: false,
                newlineDelimiter: '\n',
                ignoreNotes: false,
                putNotesAtLast: false
            });

            return {
                data: data || ''
            };
        } catch (error) {
            throw new Error(`Office document parsing failed: ${error}`);
        }
    }

    /** Identifier for the kind of processing this class performs. */
    getProcessingType(): string {
        return 'office';
    }

    /** Releases the resources held by the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * Any failure (options module not loadable, lookup error, empty value) is
     * logged and surfaced as one generic "must be configured" error.
     *
     * @throws Error when no usable OCR language can be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): `require` only exists in CommonJS output while this
            // file uses ES imports - confirm it is available at runtime.
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     *
     * Accepts a single Tesseract language code (`eng`) or several codes joined
     * with `+` (`ron+eng`). Each code is a 2-3 letter base optionally followed
     * by up to two `_`-separated qualifiers of 2-4 letters, which covers codes
     * such as `chi_sim`, `srp_latn`, `uzb_cyrl` and `chi_sim_vert`.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The previous pattern allowed only one 2-3 letter qualifier and so
        // rejected valid codes like `aze_cyrl` or `srp_latn`.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;

        // Whitespace around each part is tolerated; empty parts fail the pattern.
        return language
            .split('+')
            .every(lang => validLanguagePattern.test(lang.trim()));
    }
}
|
||||||
142
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
142
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
import * as pdfParse from 'pdf-parse';
|
||||||
|
import { FileProcessor } from './file_processor.js';
|
||||||
|
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||||
|
import { ImageProcessor } from './image_processor.js';
|
||||||
|
import log from '../../log.js';
|
||||||
|
import sharp from 'sharp';
|
||||||
|
|
||||||
|
/**
 * PDF processor for extracting text from PDF files.
 *
 * The embedded text layer is read first (unless disabled through
 * `enablePDFTextExtraction: false`). When that yields nothing, the OCR
 * fallback is used - which currently only returns a placeholder result.
 */
export class PDFProcessor extends FileProcessor {
    // Only touched by the constructor and cleanup() in this class.
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /** Accepts only `application/pdf` (case-insensitive). */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    /**
     * Extracts text from a PDF.
     *
     * @param buffer raw bytes of the PDF
     * @param options optional settings; `language` overrides the configured default and
     *                `enablePDFTextExtraction: false` skips the embedded-text attempt
     * @returns the embedded text when present, otherwise the OCR fallback result
     * @throws Error when the language is missing/invalid or extraction fails
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting PDF text extraction...');

            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // First try to extract existing text from PDF
            if (options.enablePDFTextExtraction !== false) {
                const textResult = await this.extractTextFromPDF(buffer, options);
                if (textResult.text.trim().length > 0) {
                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
                    return textResult;
                }
            }

            // Fall back to OCR if no text found or PDF text extraction is disabled
            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
            return await this.extractTextViaOCR(buffer, options);
        } catch (error) {
            log.error(`PDF text extraction failed: ${error}`);
            throw error;
        }
    }

    /**
     * Reads the text layer embedded in the PDF via pdf-parse.
     *
     * Errors are not logged here: extractText() already logs the same
     * "PDF text extraction failed" message, so logging in both places
     * produced duplicate log entries for a single failure.
     *
     * @returns trimmed text with a fixed 0.99 confidence and the parser's page count
     */
    private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
        const data = await pdfParse(buffer);

        return {
            text: data.text.trim(),
            confidence: 0.99, // High confidence for direct text extraction
            extractedAt: new Date().toISOString(),
            language: options.language || this.getDefaultOCRLanguage(),
            pageCount: data.numpages
        };
    }

    /**
     * OCR fallback for PDFs without a text layer.
     *
     * Not implemented yet: `buffer` is ignored and a placeholder result with
     * confidence 0 is returned. A full implementation would render each page
     * to an image (e.g. with pdf2pic or a similar library) and OCR it.
     *
     * NOTE(review): the placeholder string is returned as `text` - confirm
     * callers do not store or index it as real extracted content.
     */
    private async extractTextViaOCR(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
        try {
            log.info('PDF to image conversion not fully implemented, returning placeholder');

            return {
                text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
                confidence: 0.0,
                extractedAt: new Date().toISOString(),
                language: options.language || this.getDefaultOCRLanguage(),
                pageCount: 1
            };
        } catch (error) {
            log.error(`PDF OCR extraction failed: ${error}`);
            throw error;
        }
    }

    /** Identifier for the kind of processing this class performs. */
    getProcessingType(): string {
        return 'pdf';
    }

    /** Releases the resources held by the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * Any failure (options module not loadable, lookup error, empty value) is
     * logged and surfaced as one generic "must be configured" error.
     *
     * @throws Error when no usable OCR language can be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): `require` only exists in CommonJS output while this
            // file uses ES imports - confirm it is available at runtime.
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     *
     * Accepts a single Tesseract language code (`eng`) or several codes joined
     * with `+` (`ron+eng`). Each code is a 2-3 letter base optionally followed
     * by up to two `_`-separated qualifiers of 2-4 letters, which covers codes
     * such as `chi_sim`, `srp_latn`, `uzb_cyrl` and `chi_sim_vert`.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The previous pattern allowed only one 2-3 letter qualifier and so
        // rejected valid codes like `aze_cyrl` or `srp_latn`.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;

        // Whitespace around each part is tolerated; empty parts fail the pattern.
        return language
            .split('+')
            .every(lang => validLanguagePattern.test(lang.trim()));
    }
}
|
||||||
129
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
129
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
import sharp from 'sharp';
|
||||||
|
import { FileProcessor } from './file_processor.js';
|
||||||
|
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||||
|
import { ImageProcessor } from './image_processor.js';
|
||||||
|
import log from '../../log.js';
|
||||||
|
|
||||||
|
/**
 * TIFF processor for extracting text from (possibly multi-page) TIFF files.
 *
 * Each page is converted to PNG with sharp and handed to the image
 * processor; the per-page texts are joined with page separator lines.
 */
export class TIFFProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    /** Accepts `image/tiff` and `image/tif` (case-insensitive). */
    canProcess(mimeType: string): boolean {
        const normalized = mimeType.toLowerCase();
        return normalized === 'image/tiff' || normalized === 'image/tif';
    }

    /**
     * OCRs every page of the TIFF and merges the results.
     *
     * A page that fails to convert or OCR is logged and skipped so the
     * remaining pages are still processed.
     *
     * @param buffer raw bytes of the TIFF
     * @param options optional settings, passed through to the image processor;
     *                `language` overrides the configured default
     * @returns combined text, the mean confidence of the pages that yielded
     *          text (0 when none did), the language and the page count
     * @throws Error when the language is missing/invalid or metadata cannot be read
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');

            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            // Check if this is a multi-page TIFF
            const metadata = await sharp(buffer).metadata();
            const pageCount = metadata.pages || 1;

            let combinedText = '';
            let totalConfidence = 0;
            let pagesWithText = 0;

            // Process each page
            for (let page = 0; page < pageCount; page++) {
                try {
                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);

                    // Extract page as PNG buffer
                    const pageBuffer = await sharp(buffer, { page })
                        .png()
                        .toBuffer();

                    // OCR the page
                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);

                    if (pageResult.text.trim().length > 0) {
                        // A separator is only inserted between pages, never before the first text.
                        if (combinedText.length > 0) {
                            combinedText += `\n\n--- Page ${page + 1} ---\n`;
                        }
                        combinedText += pageResult.text;
                        totalConfidence += pageResult.confidence;
                        pagesWithText++;
                    }
                } catch (error) {
                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
                    // Continue with other pages
                }
            }

            // Average over the pages that actually contributed text. Dividing by
            // the total page count deflated the confidence whenever some pages
            // were blank or failed, although they add nothing to the sum.
            const averageConfidence = pagesWithText > 0 ? totalConfidence / pagesWithText : 0;

            const result: OCRResult = {
                text: combinedText.trim(),
                confidence: averageConfidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount
            };

            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`TIFF text extraction failed: ${error}`);
            throw error;
        }
    }

    /** Identifier for the kind of processing this class performs. */
    getProcessingType(): string {
        return 'tiff';
    }

    /** Releases the resources held by the internal image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     *
     * Any failure (options module not loadable, lookup error, empty value) is
     * logged and surfaced as one generic "must be configured" error.
     *
     * @throws Error when no usable OCR language can be read
     */
    private getDefaultOCRLanguage(): string {
        try {
            // NOTE(review): `require` only exists in CommonJS output while this
            // file uses ES imports - confirm it is available at runtime.
            const options = require('../../options.js').default;
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     *
     * Accepts a single Tesseract language code (`eng`) or several codes joined
     * with `+` (`ron+eng`). Each code is a 2-3 letter base optionally followed
     * by up to two `_`-separated qualifiers of 2-4 letters, which covers codes
     * such as `chi_sim`, `srp_latn`, `uzb_cyrl` and `chi_sim_vert`.
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }

        // The previous pattern allowed only one 2-3 letter qualifier and so
        // rejected valid codes like `aze_cyrl` or `srp_latn`.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(?:_[a-zA-Z]{2,4}){0,2}$/;

        // Whitespace around each part is tolerated; empty parts fail the pattern.
        return language
            .split('+')
            .every(lang => validLanguagePattern.test(lang.trim()));
    }
}
|
||||||
Loading…
x
Reference in New Issue
Block a user