mirror of
https://github.com/zadam/trilium.git
synced 2025-12-06 23:44:25 +01:00
feat(ocr): filter out text based on confidence
This commit is contained in:
parent
55ac1e01f2
commit
5ec6141369
@ -2,6 +2,7 @@ import Tesseract from 'tesseract.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
|
||||
/**
|
||||
* Image processor for extracting text from image files using Tesseract
|
||||
@ -11,7 +12,7 @@ export class ImageProcessor extends FileProcessor {
|
||||
private isInitialized = false;
|
||||
private readonly supportedTypes = [
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
@ -42,12 +43,12 @@ export class ImageProcessor extends FileProcessor {
|
||||
// Set language if specified and different from current
|
||||
// Support multi-language format like 'ron+eng'
|
||||
const language = options.language || this.getDefaultOCRLanguage();
|
||||
|
||||
|
||||
// Validate language format
|
||||
if (!this.isValidLanguageFormat(language)) {
|
||||
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
|
||||
}
|
||||
|
||||
|
||||
if (language !== 'eng') {
|
||||
// For different languages, create a new worker
|
||||
await this.worker.terminate();
|
||||
@ -62,10 +63,13 @@ export class ImageProcessor extends FileProcessor {
|
||||
}
|
||||
|
||||
const result = await this.worker.recognize(buffer);
|
||||
|
||||
|
||||
// Filter text based on minimum confidence threshold
|
||||
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
|
||||
|
||||
const ocrResult: OCRResult = {
|
||||
text: result.data.text.trim(),
|
||||
confidence: result.data.confidence / 100, // Convert percentage to decimal
|
||||
text: filteredText,
|
||||
confidence: overallConfidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: 1
|
||||
@ -91,15 +95,15 @@ export class ImageProcessor extends FileProcessor {
|
||||
|
||||
try {
|
||||
log.info('Initializing image OCR processor with Tesseract.js...');
|
||||
|
||||
|
||||
// Configure proper paths for Node.js environment
|
||||
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
|
||||
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
|
||||
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
|
||||
|
||||
|
||||
log.info(`Using worker path: ${workerPath}`);
|
||||
log.info(`Using core path: ${corePath}`);
|
||||
|
||||
|
||||
this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, {
|
||||
workerPath,
|
||||
corePath,
|
||||
@ -143,6 +147,73 @@ export class ImageProcessor extends FileProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter text based on minimum confidence threshold
|
||||
*/
|
||||
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
|
||||
const minConfidence = this.getMinConfidenceThreshold();
|
||||
|
||||
// If no minimum confidence set, return original text
|
||||
if (minConfidence <= 0) {
|
||||
return {
|
||||
filteredText: data.text.trim(),
|
||||
overallConfidence: data.confidence / 100
|
||||
};
|
||||
}
|
||||
|
||||
let filteredWords: string[] = [];
|
||||
let validConfidences: number[] = [];
|
||||
|
||||
// Tesseract provides word-level data
|
||||
if (data.words && Array.isArray(data.words)) {
|
||||
for (const word of data.words) {
|
||||
const wordConfidence = word.confidence / 100; // Convert to decimal
|
||||
|
||||
if (wordConfidence >= minConfidence) {
|
||||
filteredWords.push(word.text);
|
||||
validConfidences.push(wordConfidence);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Fallback: if word-level data not available, use overall confidence
|
||||
const overallConfidence = data.confidence / 100;
|
||||
if (overallConfidence >= minConfidence) {
|
||||
return {
|
||||
filteredText: data.text.trim(),
|
||||
overallConfidence
|
||||
};
|
||||
} else {
|
||||
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
|
||||
return {
|
||||
filteredText: '',
|
||||
overallConfidence
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate average confidence of accepted words
|
||||
const averageConfidence = validConfidences.length > 0
|
||||
? validConfidences.reduce((sum, conf) => sum + conf, 0) / validConfidences.length
|
||||
: 0;
|
||||
|
||||
const filteredText = filteredWords.join(' ').trim();
|
||||
|
||||
log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`);
|
||||
|
||||
return {
|
||||
filteredText,
|
||||
overallConfidence: averageConfidence
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get minimum confidence threshold from options
|
||||
*/
|
||||
private getMinConfidenceThreshold(): number {
|
||||
const minConfidence = options.getOption('ocrMinConfidence') ?? 0;
|
||||
return parseFloat(minConfidence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
@ -151,16 +222,16 @@ export class ImageProcessor extends FileProcessor {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
|
||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -216,7 +216,7 @@ const defaultOptions: DefaultOption[] = [
|
||||
{ name: "ocrEnabled", value: "false", isSynced: true },
|
||||
{ name: "ocrLanguage", value: "eng", isSynced: true },
|
||||
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
|
||||
{ name: "ocrMinConfidence", value: "0.2", isSynced: true },
|
||||
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
|
||||
];
|
||||
|
||||
/**
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user