diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts index 2666fe965..52145cac3 100644 --- a/apps/server/src/services/ocr/processors/image_processor.ts +++ b/apps/server/src/services/ocr/processors/image_processor.ts @@ -2,6 +2,7 @@ import Tesseract from 'tesseract.js'; import { FileProcessor } from './file_processor.js'; import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; import log from '../../log.js'; +import options from '../../options.js'; /** * Image processor for extracting text from image files using Tesseract @@ -11,7 +12,7 @@ export class ImageProcessor extends FileProcessor { private isInitialized = false; private readonly supportedTypes = [ 'image/jpeg', - 'image/jpg', + 'image/jpg', 'image/png', 'image/gif', 'image/bmp', @@ -42,12 +43,12 @@ export class ImageProcessor extends FileProcessor { // Set language if specified and different from current // Support multi-language format like 'ron+eng' const language = options.language || this.getDefaultOCRLanguage(); - + // Validate language format if (!this.isValidLanguageFormat(language)) { throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`); } - + if (language !== 'eng') { // For different languages, create a new worker await this.worker.terminate(); @@ -62,10 +63,13 @@ export class ImageProcessor extends FileProcessor { } const result = await this.worker.recognize(buffer); - + + // Filter text based on minimum confidence threshold + const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options); + const ocrResult: OCRResult = { - text: result.data.text.trim(), - confidence: result.data.confidence / 100, // Convert percentage to decimal + text: filteredText, + confidence: overallConfidence, extractedAt: new Date().toISOString(), language: options.language || this.getDefaultOCRLanguage(), pageCount: 1 @@ -91,15 +95,15 @@ export class ImageProcessor extends FileProcessor { try { log.info('Initializing image OCR processor with Tesseract.js...'); - + // Configure proper paths for Node.js environment const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', ''); const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js'); const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js'); - + log.info(`Using worker path: ${workerPath}`); log.info(`Using core path: ${corePath}`); - + this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, { workerPath, corePath, @@ -143,6 +147,73 @@ export class ImageProcessor extends FileProcessor { } } + /** + * Filter text based on minimum confidence threshold + */ + private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } { + const minConfidence = this.getMinConfidenceThreshold(); + + // If no minimum confidence set, return original text + if (minConfidence <= 0) { + return { + filteredText: data.text.trim(), + overallConfidence: data.confidence / 100 + }; + } + + let filteredWords: string[] = []; + let validConfidences: number[] = []; + + // Tesseract provides word-level data + if (data.words && Array.isArray(data.words)) { + for (const word of data.words) { + const wordConfidence = word.confidence / 100; // Convert to decimal + + if (wordConfidence >= minConfidence) { + filteredWords.push(word.text); + validConfidences.push(wordConfidence); + } + } + } else { + // Fallback: if word-level data not available, use overall confidence + const overallConfidence = data.confidence / 100; + if (overallConfidence >= minConfidence) { + return { + filteredText: data.text.trim(), + overallConfidence + }; + } else { + log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`); + return { + filteredText: '', + overallConfidence + }; + } + } + + // Calculate average confidence of accepted words + const averageConfidence = validConfidences.length > 0 + ? validConfidences.reduce((sum, conf) => sum + conf, 0) / validConfidences.length + : 0; + + const filteredText = filteredWords.join(' ').trim(); + + log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`); + + return { + filteredText, + overallConfidence: averageConfidence + }; + } + + /** + * Get minimum confidence threshold from options + */ + private getMinConfidenceThreshold(): number { + const minConfidence = options.getOption('ocrMinConfidence') ?? 0; + return parseFloat(minConfidence); + } + /** * Validate OCR language format * Supports single language (eng) or multi-language (ron+eng) @@ -151,16 +222,16 @@ export class ImageProcessor extends FileProcessor { if (!language || typeof language !== 'string') { return false; } - + // Split by '+' for multi-language format const languages = language.split('+'); - + // Check each language code (should be 2-7 characters, alphanumeric with underscores) const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; - + return languages.every(lang => { const trimmed = lang.trim(); return trimmed.length > 0 && validLanguagePattern.test(trimmed); }); } -} \ No newline at end of file +} diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index c96a9d17c..f91d3ca97 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -216,7 +216,7 @@ const defaultOptions: DefaultOption[] = [ { name: "ocrEnabled", value: "false", isSynced: true }, { name: "ocrLanguage", value: "eng", isSynced: true }, { name: "ocrAutoProcessImages", value: "true", isSynced: true }, - { name: "ocrMinConfidence", value: "0.2", isSynced: true }, + { name: "ocrMinConfidence", value: "0.55", isSynced: true }, ]; /**