refactor(ocr): deduplicate mime types partially

This commit is contained in:
Elian Doran 2025-07-26 11:51:53 +03:00
parent 11e9b097a2
commit 090b175152
No known key found for this signature in database
7 changed files with 96 additions and 40 deletions

View File

@ -143,26 +143,7 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
// Automatically process OCR for file notes if OCR is enabled
if (entity.type === 'file' && ocrService.isOCREnabled()) {
// Check if the file MIME type is supported by any OCR processor
const supportedMimeTypes = [
// Office documents
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/msword',
'application/vnd.ms-excel',
'application/vnd.ms-powerpoint',
'application/rtf',
// PDFs
'application/pdf',
// Images (though these are usually type='image', not 'file')
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
];
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
// Process OCR asynchronously to avoid blocking note creation

View File

@ -557,6 +557,58 @@ class OCRService {
return null;
}
/**
 * Collect every MIME type that OCR can handle.
 *
 * Once the service is initialized, the result is the de-duplicated union of
 * what each registered processor reports. Before that, a static list is
 * returned instead (no initialization is triggered here) to avoid async issues.
 *
 * @returns a newly created array of MIME type strings
 */
getAllSupportedMimeTypes(): string[] {
    if (this.isInitialized) {
        // Union of the types reported by each registered processor.
        const collected = new Set<string>();
        for (const processor of this.processors.values()) {
            for (const mimeType of processor.getSupportedMimeTypes()) {
                collected.add(mimeType);
            }
        }
        return [...collected];
    }

    // Not initialized yet: fall back to the static list of known supported types.
    return [
        // Images
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/gif',
        'image/bmp',
        'image/tiff',
        'image/tif',
        'image/webp',
        // Documents
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/msword',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/rtf'
    ];
}
/**
 * Tell whether some processor accepts the given MIME type.
 *
 * @param mimeType MIME type to look up; a falsy value (e.g. an empty string) yields `false`
 * @returns `true` when `getProcessorForMimeType` returns something other than `null`
 */
isSupportedByAnyProcessor(mimeType: string): boolean {
    if (mimeType) {
        return this.getProcessorForMimeType(mimeType) !== null;
    }
    return false;
}
/**
* Check if blob needs OCR re-processing due to content changes
*/

View File

@ -19,6 +19,11 @@ export abstract class FileProcessor {
*/
abstract getProcessingType(): string;
/**
 * Get the list of MIME types supported by this processor.
 *
 * @returns the MIME type strings this processor reports as supported
 */
abstract getSupportedMimeTypes(): string[];
/**
* Clean up any resources
*/

View File

@ -9,18 +9,22 @@ import log from '../../log.js';
export class ImageProcessor extends FileProcessor {
private worker: Tesseract.Worker | null = null;
private isInitialized = false;
/**
 * MIME types handled by this processor; looked up by canProcess() and
 * returned as a copy by getSupportedMimeTypes().
 */
private readonly supportedTypes = [
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
];
/**
 * Check whether this processor handles the given MIME type.
 *
 * The lookup uses the class-level `supportedTypes` list, so this check and
 * getSupportedMimeTypes() share a single source of truth.
 *
 * @param mimeType MIME type to test; lowercased before the comparison
 * @returns `true` when the lowercased value is in `supportedTypes`
 */
canProcess(mimeType: string): boolean {
    return this.supportedTypes.includes(mimeType.toLowerCase());
}
/**
 * List the MIME types this processor handles.
 *
 * @returns a shallow copy of `supportedTypes`, so callers cannot mutate the internal list
 */
getSupportedMimeTypes(): string[] {
    return this.supportedTypes.slice();
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {

View File

@ -9,6 +9,15 @@ import log from '../../log.js';
*/
export class OfficeProcessor extends FileProcessor {
private imageProcessor: ImageProcessor;
/**
 * MIME types handled by this processor; looked up by canProcess() and
 * returned as a copy by getSupportedMimeTypes().
 */
private readonly supportedTypes = [
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
'application/msword', // DOC
'application/vnd.ms-excel', // XLS
'application/vnd.ms-powerpoint', // PPT
'application/rtf' // RTF
];
constructor() {
super();
@ -16,16 +25,11 @@ export class OfficeProcessor extends FileProcessor {
}
/**
 * Check whether this processor handles the given MIME type.
 *
 * The lookup uses the class-level `supportedTypes` list, so this check and
 * getSupportedMimeTypes() share a single source of truth. The input is
 * lowercased first because media type names are case-insensitive; a value
 * such as `Application/RTF` is therefore accepted.
 *
 * @param mimeType MIME type to test
 * @returns `true` when the lowercased value is in `supportedTypes`
 */
canProcess(mimeType: string): boolean {
    return this.supportedTypes.includes(mimeType.toLowerCase());
}
/**
 * List the MIME types this processor handles.
 *
 * @returns a shallow copy of `supportedTypes`, so callers cannot mutate the internal list
 */
getSupportedMimeTypes(): string[] {
    return this.supportedTypes.slice();
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {

View File

@ -11,6 +11,7 @@ import sharp from 'sharp';
*/
export class PDFProcessor extends FileProcessor {
private imageProcessor: ImageProcessor;
// MIME types returned (as a copy) by getSupportedMimeTypes().
// NOTE(review): canProcess() compares against the literal 'application/pdf'
// rather than reading this list — keep the two in sync.
private readonly supportedTypes = ['application/pdf'];
constructor() {
super();
@ -21,6 +22,10 @@ export class PDFProcessor extends FileProcessor {
return mimeType.toLowerCase() === 'application/pdf';
}
/**
 * List the MIME types this processor handles.
 *
 * @returns a shallow copy of `supportedTypes`, so callers cannot mutate the internal list
 */
getSupportedMimeTypes(): string[] {
    return this.supportedTypes.slice();
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
try {
log.info('Starting PDF text extraction...');

View File

@ -9,6 +9,7 @@ import log from '../../log.js';
*/
export class TIFFProcessor extends FileProcessor {
private imageProcessor: ImageProcessor;
// MIME types returned (as a copy) by getSupportedMimeTypes().
// NOTE(review): canProcess() compares against the literals 'image/tiff' and
// 'image/tif' rather than reading this list — keep the two in sync.
private readonly supportedTypes = ['image/tiff', 'image/tif'];
constructor() {
super();
@ -19,6 +20,10 @@ export class TIFFProcessor extends FileProcessor {
return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
}
/**
 * List the MIME types this processor handles.
 *
 * @returns a shallow copy of `supportedTypes`, so callers cannot mutate the internal list
 */
getSupportedMimeTypes(): string[] {
    return this.supportedTypes.slice();
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
try {
log.info('Starting TIFF text extraction...');