mirror of
https://github.com/zadam/trilium.git
synced 2025-12-06 23:44:25 +01:00
refactor(ocr): deduplicate mime types partially
This commit is contained in:
parent
11e9b097a2
commit
090b175152
@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
|
|||||||
// Automatically process OCR for file notes if OCR is enabled
|
// Automatically process OCR for file notes if OCR is enabled
|
||||||
if (entity.type === 'file' && ocrService.isOCREnabled()) {
|
if (entity.type === 'file' && ocrService.isOCREnabled()) {
|
||||||
// Check if the file MIME type is supported by any OCR processor
|
// Check if the file MIME type is supported by any OCR processor
|
||||||
const supportedMimeTypes = [
|
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
|
||||||
// Office documents
|
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
||||||
'application/msword',
|
|
||||||
'application/vnd.ms-excel',
|
|
||||||
'application/vnd.ms-powerpoint',
|
|
||||||
'application/rtf',
|
|
||||||
// PDFs
|
|
||||||
'application/pdf',
|
|
||||||
// Images (though these are usually type='image', not 'file')
|
|
||||||
'image/jpeg',
|
|
||||||
'image/jpg',
|
|
||||||
'image/png',
|
|
||||||
'image/gif',
|
|
||||||
'image/bmp',
|
|
||||||
'image/tiff',
|
|
||||||
'image/webp'
|
|
||||||
];
|
|
||||||
|
|
||||||
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
|
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
|
||||||
// Process OCR asynchronously to avoid blocking note creation
|
// Process OCR asynchronously to avoid blocking note creation
|
||||||
|
|||||||
@ -557,6 +557,58 @@ class OCRService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get all MIME types supported by all registered processors
|
||||||
|
*/
|
||||||
|
getAllSupportedMimeTypes(): string[] {
|
||||||
|
const supportedTypes = new Set<string>();
|
||||||
|
|
||||||
|
// Initialize processors if not already done
|
||||||
|
if (!this.isInitialized) {
|
||||||
|
// Return a static list if not initialized to avoid async issues
|
||||||
|
// This covers all known supported types
|
||||||
|
return [
|
||||||
|
// Images
|
||||||
|
'image/jpeg',
|
||||||
|
'image/jpg',
|
||||||
|
'image/png',
|
||||||
|
'image/gif',
|
||||||
|
'image/bmp',
|
||||||
|
'image/tiff',
|
||||||
|
'image/tif',
|
||||||
|
'image/webp',
|
||||||
|
// Documents
|
||||||
|
'application/pdf',
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||||
|
'application/msword',
|
||||||
|
'application/vnd.ms-excel',
|
||||||
|
'application/vnd.ms-powerpoint',
|
||||||
|
'application/rtf'
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gather MIME types from all registered processors
|
||||||
|
for (const processor of this.processors.values()) {
|
||||||
|
const processorTypes = processor.getSupportedMimeTypes();
|
||||||
|
processorTypes.forEach(type => supportedTypes.add(type));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Array.from(supportedTypes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a MIME type is supported by any processor
|
||||||
|
*/
|
||||||
|
isSupportedByAnyProcessor(mimeType: string): boolean {
|
||||||
|
if (!mimeType) return false;
|
||||||
|
|
||||||
|
// Check if any processor can handle this MIME type
|
||||||
|
const processor = this.getProcessorForMimeType(mimeType);
|
||||||
|
return processor !== null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if blob needs OCR re-processing due to content changes
|
* Check if blob needs OCR re-processing due to content changes
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -19,6 +19,11 @@ export abstract class FileProcessor {
|
|||||||
*/
|
*/
|
||||||
abstract getProcessingType(): string;
|
abstract getProcessingType(): string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get list of MIME types supported by this processor
|
||||||
|
*/
|
||||||
|
abstract getSupportedMimeTypes(): string[];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean up any resources
|
* Clean up any resources
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -9,9 +9,7 @@ import log from '../../log.js';
|
|||||||
export class ImageProcessor extends FileProcessor {
|
export class ImageProcessor extends FileProcessor {
|
||||||
private worker: Tesseract.Worker | null = null;
|
private worker: Tesseract.Worker | null = null;
|
||||||
private isInitialized = false;
|
private isInitialized = false;
|
||||||
|
private readonly supportedTypes = [
|
||||||
canProcess(mimeType: string): boolean {
|
|
||||||
const supportedTypes = [
|
|
||||||
'image/jpeg',
|
'image/jpeg',
|
||||||
'image/jpg',
|
'image/jpg',
|
||||||
'image/png',
|
'image/png',
|
||||||
@ -20,7 +18,13 @@ export class ImageProcessor extends FileProcessor {
|
|||||||
'image/tiff',
|
'image/tiff',
|
||||||
'image/webp'
|
'image/webp'
|
||||||
];
|
];
|
||||||
return supportedTypes.includes(mimeType.toLowerCase());
|
|
||||||
|
canProcess(mimeType: string): boolean {
|
||||||
|
return this.supportedTypes.includes(mimeType.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
getSupportedMimeTypes(): string[] {
|
||||||
|
return [...this.supportedTypes];
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
|
|||||||
@ -9,14 +9,7 @@ import log from '../../log.js';
|
|||||||
*/
|
*/
|
||||||
export class OfficeProcessor extends FileProcessor {
|
export class OfficeProcessor extends FileProcessor {
|
||||||
private imageProcessor: ImageProcessor;
|
private imageProcessor: ImageProcessor;
|
||||||
|
private readonly supportedTypes = [
|
||||||
constructor() {
|
|
||||||
super();
|
|
||||||
this.imageProcessor = new ImageProcessor();
|
|
||||||
}
|
|
||||||
|
|
||||||
canProcess(mimeType: string): boolean {
|
|
||||||
const supportedTypes = [
|
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
||||||
@ -25,7 +18,18 @@ export class OfficeProcessor extends FileProcessor {
|
|||||||
'application/vnd.ms-powerpoint', // PPT
|
'application/vnd.ms-powerpoint', // PPT
|
||||||
'application/rtf' // RTF
|
'application/rtf' // RTF
|
||||||
];
|
];
|
||||||
return supportedTypes.includes(mimeType);
|
|
||||||
|
constructor() {
|
||||||
|
super();
|
||||||
|
this.imageProcessor = new ImageProcessor();
|
||||||
|
}
|
||||||
|
|
||||||
|
canProcess(mimeType: string): boolean {
|
||||||
|
return this.supportedTypes.includes(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
getSupportedMimeTypes(): string[] {
|
||||||
|
return [...this.supportedTypes];
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import sharp from 'sharp';
|
|||||||
*/
|
*/
|
||||||
export class PDFProcessor extends FileProcessor {
|
export class PDFProcessor extends FileProcessor {
|
||||||
private imageProcessor: ImageProcessor;
|
private imageProcessor: ImageProcessor;
|
||||||
|
private readonly supportedTypes = ['application/pdf'];
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super();
|
super();
|
||||||
@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor {
|
|||||||
return mimeType.toLowerCase() === 'application/pdf';
|
return mimeType.toLowerCase() === 'application/pdf';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
getSupportedMimeTypes(): string[] {
|
||||||
|
return [...this.supportedTypes];
|
||||||
|
}
|
||||||
|
|
||||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
try {
|
try {
|
||||||
log.info('Starting PDF text extraction...');
|
log.info('Starting PDF text extraction...');
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import log from '../../log.js';
|
|||||||
*/
|
*/
|
||||||
export class TIFFProcessor extends FileProcessor {
|
export class TIFFProcessor extends FileProcessor {
|
||||||
private imageProcessor: ImageProcessor;
|
private imageProcessor: ImageProcessor;
|
||||||
|
private readonly supportedTypes = ['image/tiff', 'image/tif'];
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super();
|
super();
|
||||||
@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor {
|
|||||||
return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
|
return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
getSupportedMimeTypes(): string[] {
|
||||||
|
return [...this.supportedTypes];
|
||||||
|
}
|
||||||
|
|
||||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
try {
|
try {
|
||||||
log.info('Starting TIFF text extraction...');
|
log.info('Starting TIFF text extraction...');
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user