mirror of
https://github.com/zadam/trilium.git
synced 2025-12-04 22:44:25 +01:00
refactor(ocr): deduplicate mime types partially
This commit is contained in:
parent
11e9b097a2
commit
090b175152
@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
|
||||
// Automatically process OCR for file notes if OCR is enabled
|
||||
if (entity.type === 'file' && ocrService.isOCREnabled()) {
|
||||
// Check if the file MIME type is supported by any OCR processor
|
||||
const supportedMimeTypes = [
|
||||
// Office documents
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/msword',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/rtf',
|
||||
// PDFs
|
||||
'application/pdf',
|
||||
// Images (though these are usually type='image', not 'file')
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
];
|
||||
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
|
||||
|
||||
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
|
||||
// Process OCR asynchronously to avoid blocking note creation
|
||||
|
||||
@ -557,6 +557,58 @@ class OCRService {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all MIME types supported by all registered processors
|
||||
*/
|
||||
getAllSupportedMimeTypes(): string[] {
    // This accessor is synchronous and does not trigger initialization itself.
    // While the service is not yet initialized it answers with a hard-coded
    // snapshot of the known types instead of querying the processors.
    if (!this.isInitialized) {
        const imageTypes = [
            'image/jpeg',
            'image/jpg',
            'image/png',
            'image/gif',
            'image/bmp',
            'image/tiff',
            'image/tif',
            'image/webp'
        ];
        const documentTypes = [
            'application/pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            'application/msword',
            'application/vnd.ms-excel',
            'application/vnd.ms-powerpoint',
            'application/rtf'
        ];
        return [...imageTypes, ...documentTypes];
    }

    // Several processors may report the same MIME type; collecting into a Set
    // drops the duplicates while keeping first-seen order.
    const merged = new Set<string>();
    for (const processor of this.processors.values()) {
        for (const mime of processor.getSupportedMimeTypes()) {
            merged.add(mime);
        }
    }

    return [...merged];
}
|
||||
|
||||
/**
|
||||
* Check if a MIME type is supported by any processor
|
||||
*/
|
||||
isSupportedByAnyProcessor(mimeType: string): boolean {
    // An empty MIME type is never supported. Otherwise the type counts as
    // supported exactly when the processor lookup returns something non-null.
    return mimeType ? this.getProcessorForMimeType(mimeType) !== null : false;
}
|
||||
|
||||
/**
|
||||
* Check if blob needs OCR re-processing due to content changes
|
||||
*/
|
||||
|
||||
@ -19,6 +19,11 @@ export abstract class FileProcessor {
|
||||
*/
|
||||
abstract getProcessingType(): string;
|
||||
|
||||
/**
|
||||
* Get list of MIME types supported by this processor
|
||||
*/
|
||||
abstract getSupportedMimeTypes(): string[];
|
||||
|
||||
/**
|
||||
* Clean up any resources
|
||||
*/
|
||||
|
||||
@ -9,18 +9,22 @@ import log from '../../log.js';
|
||||
export class ImageProcessor extends FileProcessor {
|
||||
private worker: Tesseract.Worker | null = null;
|
||||
private isInitialized = false;
|
||||
private readonly supportedTypes = [
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
];
|
||||
|
||||
canProcess(mimeType: string): boolean {
    // Match case-insensitively against the class-level list. A method-local
    // copy of the same list (and the unreachable second return that followed
    // it) is dropped so this check and getSupportedMimeTypes() share one
    // source of truth.
    return this.supportedTypes.includes(mimeType.toLowerCase());
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
    // Return a shallow copy so callers cannot mutate the processor's own list.
    return this.supportedTypes.slice();
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
|
||||
@ -9,6 +9,15 @@ import log from '../../log.js';
|
||||
*/
|
||||
export class OfficeProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = [
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
||||
'application/msword', // DOC
|
||||
'application/vnd.ms-excel', // XLS
|
||||
'application/vnd.ms-powerpoint', // PPT
|
||||
'application/rtf' // RTF
|
||||
];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
@ -16,16 +25,11 @@ export class OfficeProcessor extends FileProcessor {
|
||||
}
|
||||
|
||||
canProcess(mimeType: string): boolean {
    // Use the class-level list rather than a method-local duplicate, so this
    // check cannot drift from getSupportedMimeTypes().
    // MIME type names are case-insensitive, and the other canProcess checks
    // visible in this change lowercase their input, so normalise case here too.
    return this.supportedTypes.includes(mimeType.toLowerCase());
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
    // Copy the list before returning it; the internal array stays private.
    return Array.from(this.supportedTypes);
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
|
||||
@ -11,6 +11,7 @@ import sharp from 'sharp';
|
||||
*/
|
||||
export class PDFProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = ['application/pdf'];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor {
|
||||
return mimeType.toLowerCase() === 'application/pdf';
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
    // Return a fresh array on each call so the caller gets its own copy.
    return this.supportedTypes.slice();
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting PDF text extraction...');
|
||||
|
||||
@ -9,6 +9,7 @@ import log from '../../log.js';
|
||||
*/
|
||||
export class TIFFProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = ['image/tiff', 'image/tif'];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor {
|
||||
return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
    // Return a shallow copy of the supported TIFF MIME types.
    return Array.from(this.supportedTypes);
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting TIFF text extraction...');
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user