feat(ocr): filter out text based on confidence

This commit is contained in:
Elian Doran 2025-07-26 14:57:12 +03:00
parent 55ac1e01f2
commit 5ec6141369
No known key found for this signature in database
2 changed files with 85 additions and 14 deletions

View File

@ -2,6 +2,7 @@ import Tesseract from 'tesseract.js';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import log from '../../log.js';
import options from '../../options.js';
/**
* Image processor for extracting text from image files using Tesseract
@ -11,7 +12,7 @@ export class ImageProcessor extends FileProcessor {
private isInitialized = false;
private readonly supportedTypes = [
'image/jpeg',
'image/jpg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
@ -42,12 +43,12 @@ export class ImageProcessor extends FileProcessor {
// Set language if specified and different from current
// Support multi-language format like 'ron+eng'
const language = options.language || this.getDefaultOCRLanguage();
// Validate language format
if (!this.isValidLanguageFormat(language)) {
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
}
if (language !== 'eng') {
// For different languages, create a new worker
await this.worker.terminate();
@ -62,10 +63,13 @@ export class ImageProcessor extends FileProcessor {
}
const result = await this.worker.recognize(buffer);
// Filter text based on minimum confidence threshold
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
const ocrResult: OCRResult = {
text: result.data.text.trim(),
confidence: result.data.confidence / 100, // Convert percentage to decimal
text: filteredText,
confidence: overallConfidence,
extractedAt: new Date().toISOString(),
language: options.language || this.getDefaultOCRLanguage(),
pageCount: 1
@ -91,15 +95,15 @@ export class ImageProcessor extends FileProcessor {
try {
log.info('Initializing image OCR processor with Tesseract.js...');
// Configure proper paths for Node.js environment
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
log.info(`Using worker path: ${workerPath}`);
log.info(`Using core path: ${corePath}`);
this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, {
workerPath,
corePath,
@ -143,6 +147,73 @@ export class ImageProcessor extends FileProcessor {
}
}
/**
* Filter text based on minimum confidence threshold
*/
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
const minConfidence = this.getMinConfidenceThreshold();
// If no minimum confidence set, return original text
if (minConfidence <= 0) {
return {
filteredText: data.text.trim(),
overallConfidence: data.confidence / 100
};
}
let filteredWords: string[] = [];
let validConfidences: number[] = [];
// Tesseract provides word-level data
if (data.words && Array.isArray(data.words)) {
for (const word of data.words) {
const wordConfidence = word.confidence / 100; // Convert to decimal
if (wordConfidence >= minConfidence) {
filteredWords.push(word.text);
validConfidences.push(wordConfidence);
}
}
} else {
// Fallback: if word-level data not available, use overall confidence
const overallConfidence = data.confidence / 100;
if (overallConfidence >= minConfidence) {
return {
filteredText: data.text.trim(),
overallConfidence
};
} else {
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
return {
filteredText: '',
overallConfidence
};
}
}
// Calculate average confidence of accepted words
const averageConfidence = validConfidences.length > 0
? validConfidences.reduce((sum, conf) => sum + conf, 0) / validConfidences.length
: 0;
const filteredText = filteredWords.join(' ').trim();
log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`);
return {
filteredText,
overallConfidence: averageConfidence
};
}
/**
* Get minimum confidence threshold from options
*/
private getMinConfidenceThreshold(): number {
const minConfidence = options.getOption('ocrMinConfidence') ?? 0;
return parseFloat(minConfidence);
}
/**
* Validate OCR language format
* Supports single language (eng) or multi-language (ron+eng)
@ -151,16 +222,16 @@ export class ImageProcessor extends FileProcessor {
if (!language || typeof language !== 'string') {
return false;
}
// Split by '+' for multi-language format
const languages = language.split('+');
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
return languages.every(lang => {
const trimmed = lang.trim();
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
});
}
}
}

View File

@ -216,7 +216,7 @@ const defaultOptions: DefaultOption[] = [
{ name: "ocrEnabled", value: "false", isSynced: true },
{ name: "ocrLanguage", value: "eng", isSynced: true },
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
{ name: "ocrMinConfidence", value: "0.2", isSynced: true },
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
];
/**