mirror of
https://github.com/zadam/trilium.git
synced 2025-12-07 16:04:23 +01:00
feat(ocr): filter out text based on confidence
This commit is contained in:
parent
55ac1e01f2
commit
5ec6141369
@ -2,6 +2,7 @@ import Tesseract from 'tesseract.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
|
||||
/**
|
||||
* Image processor for extracting text from image files using Tesseract
|
||||
@ -63,9 +64,12 @@ export class ImageProcessor extends FileProcessor {
|
||||
|
||||
const result = await this.worker.recognize(buffer);
|
||||
|
||||
// Filter text based on minimum confidence threshold
|
||||
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
|
||||
|
||||
const ocrResult: OCRResult = {
|
||||
text: result.data.text.trim(),
|
||||
confidence: result.data.confidence / 100, // Convert percentage to decimal
|
||||
text: filteredText,
|
||||
confidence: overallConfidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: 1
|
||||
@ -143,6 +147,73 @@ export class ImageProcessor extends FileProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drop recognised words whose confidence falls below the configured minimum.
 *
 * The threshold is read through `getMinConfidenceThreshold()` and compared against
 * confidences converted from the recogniser's percentage scale to a 0..1 decimal.
 *
 * @param data - recognition payload (`result.data`); `text`, `confidence` and the
 *               optional `words` array (`text` + `confidence` per entry) are read.
 * @param options - per-call OCR options; not used by the filter, kept so the
 *                  signature stays stable for existing callers.
 * @returns `filteredText` (kept words joined by single spaces, or the trimmed full
 *          text when no filtering applies) and `overallConfidence` as a decimal:
 *          the mean of the kept words, or the page confidence when filtering is
 *          disabled or word-level data is unavailable.
 */
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
    const minConfidence = this.getMinConfidenceThreshold();

    // Read the page-level values defensively so a payload without text/confidence
    // yields an empty result instead of a TypeError or a NaN confidence.
    const fullText = typeof data.text === 'string' ? data.text.trim() : '';
    const pageConfidence = Number.isFinite(data.confidence) ? data.confidence / 100 : 0;

    // `!(x > 0)` instead of `x <= 0`: a NaN threshold (unparsable option value) must
    // disable filtering rather than fail every `>=` comparison and discard all words.
    if (!(minConfidence > 0)) {
        return {
            filteredText: fullText,
            overallConfidence: pageConfidence
        };
    }

    // Fallback: without word-level data the whole text is accepted or rejected
    // based on the page confidence.
    if (!Array.isArray(data.words)) {
        if (pageConfidence >= minConfidence) {
            return {
                filteredText: fullText,
                overallConfidence: pageConfidence
            };
        }

        log.info(`Entire text filtered out due to low confidence ${pageConfidence} (below threshold ${minConfidence})`);
        return {
            filteredText: '',
            overallConfidence: pageConfidence
        };
    }

    const keptWords: string[] = [];
    let confidenceSum = 0;

    for (const word of data.words) {
        const wordConfidence = word.confidence / 100; // Convert to decimal

        if (wordConfidence >= minConfidence) {
            keptWords.push(word.text);
            confidenceSum += wordConfidence;
        }
    }

    // Average confidence of the accepted words only; 0 when nothing survived.
    const averageConfidence = keptWords.length > 0 ? confidenceSum / keptWords.length : 0;
    const filteredText = keptWords.join(' ').trim();

    log.info(`Filtered OCR text: ${keptWords.length} words kept out of ${data.words.length} total words (min confidence: ${minConfidence})`);

    return {
        filteredText,
        overallConfidence: averageConfidence
    };
}
|
||||
|
||||
/**
 * Read the `ocrMinConfidence` option and return it as a number.
 *
 * @returns the parsed threshold, or 0 when the option is nullish or does not
 *          parse to a finite number.
 */
private getMinConfidenceThreshold(): number {
    const rawValue = options.getOption('ocrMinConfidence') ?? 0;

    // String() keeps parseFloat type-correct when the numeric fallback is used.
    const parsed = parseFloat(String(rawValue));

    // An empty or non-numeric option value parses to NaN; returning it would make
    // every `confidence >= threshold` comparison false downstream. Fall back to 0.
    return Number.isFinite(parsed) ? parsed : 0;
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
|
||||
@ -216,7 +216,7 @@ const defaultOptions: DefaultOption[] = [
|
||||
{ name: "ocrEnabled", value: "false", isSynced: true },
|
||||
{ name: "ocrLanguage", value: "eng", isSynced: true },
|
||||
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
|
||||
{ name: "ocrMinConfidence", value: "0.2", isSynced: true },
|
||||
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
|
||||
];
|
||||
|
||||
/**
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user