feat(ocr): filter out text based on confidence

This commit is contained in:
Elian Doran 2025-07-26 14:57:12 +03:00
parent 55ac1e01f2
commit 5ec6141369
No known key found for this signature in database
2 changed files with 85 additions and 14 deletions

View File

@ -2,6 +2,7 @@ import Tesseract from 'tesseract.js';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import log from '../../log.js';
import options from '../../options.js';
/**
* Image processor for extracting text from image files using Tesseract
@ -63,9 +64,12 @@ export class ImageProcessor extends FileProcessor {
const result = await this.worker.recognize(buffer);
// Filter text based on minimum confidence threshold
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
const ocrResult: OCRResult = {
text: result.data.text.trim(),
confidence: result.data.confidence / 100, // Convert percentage to decimal
text: filteredText,
confidence: overallConfidence,
extractedAt: new Date().toISOString(),
language: options.language || this.getDefaultOCRLanguage(),
pageCount: 1
@ -143,6 +147,73 @@ export class ImageProcessor extends FileProcessor {
}
}
/**
 * Remove low-confidence text from a Tesseract recognition result.
 *
 * Confidence values on `data` and on each word are divided by 100 so that
 * they can be compared against the threshold returned by
 * `getMinConfidenceThreshold()` and reported as a fraction.
 *
 * Behaviour:
 * - threshold not greater than zero (including NaN): the trimmed text is
 *   returned unfiltered together with the page-level confidence;
 * - `data.words` is an array: only words whose confidence reaches the
 *   threshold are kept, joined with single spaces, and the reported
 *   confidence is the mean confidence of the kept words (0 if none);
 * - otherwise: the whole text is kept or dropped based on the page-level
 *   confidence alone.
 *
 * @param data recognition result exposing `text`, `confidence` and
 *             optionally `words` (each with `text` and `confidence`)
 * @param options per-call OCR options; not read by this method. Note that
 *                this parameter shadows the imported `options` module here.
 * @returns the retained text and its confidence
 */
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
    const minConfidence = this.getMinConfidenceThreshold();
    const pageConfidence = data.confidence / 100; // Convert percentage to decimal
    // Tolerate a result without text instead of throwing on `.trim()`.
    const fullText: string = (data.text ?? '').trim();

    // Written as a negated `>` so that NaN (unparsable option value) is
    // treated as "no threshold" rather than rejecting every comparison
    // below and silently discarding all recognized text.
    if (!(minConfidence > 0)) {
        return {
            filteredText: fullText,
            overallConfidence: pageConfidence
        };
    }

    // Fallback: without word-level data, accept or reject the text as a whole.
    if (!Array.isArray(data.words)) {
        if (pageConfidence >= minConfidence) {
            return {
                filteredText: fullText,
                overallConfidence: pageConfidence
            };
        }

        log.info(`Entire text filtered out due to low confidence ${pageConfidence} (below threshold ${minConfidence})`);
        return {
            filteredText: '',
            overallConfidence: pageConfidence
        };
    }

    const filteredWords: string[] = [];
    let confidenceSum = 0;
    for (const word of data.words) {
        const wordConfidence = word.confidence / 100; // Convert percentage to decimal
        if (wordConfidence >= minConfidence) {
            filteredWords.push(word.text);
            confidenceSum += wordConfidence;
        }
    }

    // Mean confidence of the accepted words; 0 when every word was rejected.
    const averageConfidence = filteredWords.length > 0
        ? confidenceSum / filteredWords.length
        : 0;
    const filteredText = filteredWords.join(' ').trim();

    log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`);

    return {
        filteredText,
        overallConfidence: averageConfidence
    };
}
/**
 * Read the minimum OCR confidence threshold from the `ocrMinConfidence` option.
 *
 * @returns the parsed threshold, or 0 when the option value is nullish or
 *          does not parse to a finite number
 */
private getMinConfidenceThreshold(): number {
    const rawValue = options.getOption('ocrMinConfidence') ?? 0;
    // parseFloat expects a string; the numeric fallback above is stringified first.
    const threshold = parseFloat(String(rawValue));
    // A non-numeric option value parses to NaN, and every `>=` comparison
    // against NaN is false, which would make callers drop all text.
    return Number.isFinite(threshold) ? threshold : 0;
}
/**
* Validate OCR language format
* Supports single language (eng) or multi-language (ron+eng)

View File

@ -216,7 +216,7 @@ const defaultOptions: DefaultOption[] = [
{ name: "ocrEnabled", value: "false", isSynced: true },
{ name: "ocrLanguage", value: "eng", isSynced: true },
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
{ name: "ocrMinConfidence", value: "0.2", isSynced: true },
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
];
/**