feat(ocr): basic processing of new files

This commit is contained in:
Elian Doran 2025-07-26 11:46:28 +03:00
parent 2adfc1d32b
commit 11e9b097a2
No known key found for this signature in database
3 changed files with 157 additions and 51 deletions

View File

@ -6,6 +6,8 @@ import becca from "../becca/becca.js";
import BAttribute from "../becca/entities/battribute.js"; import BAttribute from "../becca/entities/battribute.js";
import hiddenSubtreeService from "./hidden_subtree.js"; import hiddenSubtreeService from "./hidden_subtree.js";
import oneTimeTimer from "./one_time_timer.js"; import oneTimeTimer from "./one_time_timer.js";
import ocrService from "./ocr/ocr_service.js";
import log from "./log.js";
import type BNote from "../becca/entities/bnote.js"; import type BNote from "../becca/entities/bnote.js";
import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js"; import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
import type { DefinitionObject } from "./promoted_attribute_definition_interface.js"; import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
@ -137,6 +139,42 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
} }
} else if (entityName === "notes") { } else if (entityName === "notes") {
runAttachedRelations(entity, "runOnNoteCreation", entity); runAttachedRelations(entity, "runOnNoteCreation", entity);
// Newly created file notes are sent through OCR automatically, provided OCR is enabled.
if (entity.type === 'file' && ocrService.isOCREnabled()) {
    // MIME types accepted here; the note is only handed to the OCR service when its MIME type is listed.
    const ocrCapableMimes = new Set<string>([
        // Office documents
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/msword',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/rtf',
        // PDFs
        'application/pdf',
        // Images (these normally arrive as type='image' rather than 'file')
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/gif',
        'image/bmp',
        'image/tiff',
        'image/webp'
    ]);

    if (entity.mime && ocrCapableMimes.has(entity.mime)) {
        // Fire and forget: note creation must not wait for the OCR run to finish.
        ocrService
            .processNoteOCR(entity.noteId)
            .then((ocrResult) => {
                if (!ocrResult) {
                    // A falsy result is not logged here.
                    return;
                }
                log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
            })
            .catch((err) => {
                log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${err}`);
            });
    }
}
} }
}); });

View File

@ -50,13 +50,13 @@ class OCRService {
try { try {
log.info('Initializing OCR service with file processors...'); log.info('Initializing OCR service with file processors...');
// Initialize file processors // Initialize file processors
this.processors.set('image', new ImageProcessor()); this.processors.set('image', new ImageProcessor());
this.processors.set('pdf', new PDFProcessor()); this.processors.set('pdf', new PDFProcessor());
this.processors.set('tiff', new TIFFProcessor()); this.processors.set('tiff', new TIFFProcessor());
this.processors.set('office', new OfficeProcessor()); this.processors.set('office', new OfficeProcessor());
this.isInitialized = true; this.isInitialized = true;
log.info('OCR service initialized successfully'); log.info('OCR service initialized successfully');
} catch (error) { } catch (error) {
@ -84,10 +84,10 @@ class OCRService {
if (!mimeType || typeof mimeType !== 'string') { if (!mimeType || typeof mimeType !== 'string') {
return false; return false;
} }
const supportedTypes = [ const supportedTypes = [
'image/jpeg', 'image/jpeg',
'image/jpg', 'image/jpg',
'image/png', 'image/png',
'image/gif', 'image/gif',
'image/bmp', 'image/bmp',
@ -116,7 +116,7 @@ class OCRService {
} }
const result = await processor.extractText(fileBuffer, options); const result = await processor.extractText(fileBuffer, options);
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`); log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
return result; return result;
@ -143,13 +143,25 @@ class OCRService {
return null; return null;
} }
if (note.type !== 'image') { if (!this.isInitialized) {
log.info(`Note ${noteId} is not an image note, skipping OCR`); await this.initialize();
return null;
} }
if (!this.isSupportedMimeType(note.mime)) { // Check if note type and MIME type are supported for OCR
log.info(`Note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`); if (note.type === 'image') {
if (!this.isSupportedMimeType(note.mime)) {
log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
return null;
}
} else if (note.type === 'file') {
// Check if file MIME type is supported by any processor
const processor = this.getProcessorForMimeType(note.mime);
if (!processor) {
log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
return null;
}
} else {
log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
return null; return null;
} }
@ -167,10 +179,10 @@ class OCRService {
} }
const ocrResult = await this.extractTextFromFile(content, note.mime, options); const ocrResult = await this.extractTextFromFile(content, note.mime, options);
// Store OCR result in blob // Store OCR result in blob
await this.storeOCRResult(note.blobId, ocrResult); await this.storeOCRResult(note.blobId, ocrResult);
return ocrResult; return ocrResult;
} catch (error) { } catch (error) {
log.error(`Failed to process OCR for note ${noteId}: ${error}`); log.error(`Failed to process OCR for note ${noteId}: ${error}`);
@ -193,13 +205,25 @@ class OCRService {
return null; return null;
} }
if (attachment.role !== 'image') { if (!this.isInitialized) {
log.info(`Attachment ${attachmentId} is not an image, skipping OCR`); await this.initialize();
return null;
} }
if (!this.isSupportedMimeType(attachment.mime)) { // Check if attachment role and MIME type are supported for OCR
log.info(`Attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`); if (attachment.role === 'image') {
if (!this.isSupportedMimeType(attachment.mime)) {
log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
return null;
}
} else if (attachment.role === 'file') {
// Check if file MIME type is supported by any processor
const processor = this.getProcessorForMimeType(attachment.mime);
if (!processor) {
log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
return null;
}
} else {
log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
return null; return null;
} }
@ -217,10 +241,10 @@ class OCRService {
} }
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options); const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
// Store OCR result in blob // Store OCR result in blob
await this.storeOCRResult(attachment.blobId, ocrResult); await this.storeOCRResult(attachment.blobId, ocrResult);
return ocrResult; return ocrResult;
} catch (error) { } catch (error) {
log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`); log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
@ -240,8 +264,8 @@ class OCRService {
try { try {
// Store OCR text and timestamp in blobs table // Store OCR text and timestamp in blobs table
sql.execute(` sql.execute(`
UPDATE blobs SET UPDATE blobs SET
ocr_text = ?, ocr_text = ?,
ocr_last_processed = ? ocr_last_processed = ?
WHERE blobId = ? WHERE blobId = ?
`, [ `, [
@ -249,7 +273,7 @@ class OCRService {
new Date().toISOString(), new Date().toISOString(),
blobId blobId
]); ]);
log.info(`Stored OCR result for blob ${blobId}`); log.info(`Stored OCR result for blob ${blobId}`);
} catch (error) { } catch (error) {
log.error(`Failed to store OCR result for blob ${blobId}: ${error}`); log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
@ -270,14 +294,14 @@ class OCRService {
ocr_text: string | null; ocr_text: string | null;
}>(` }>(`
SELECT ocr_text SELECT ocr_text
FROM blobs FROM blobs
WHERE blobId = ? WHERE blobId = ?
`, [blobId]); `, [blobId]);
if (!row || !row.ocr_text) { if (!row || !row.ocr_text) {
return null; return null;
} }
// Return basic OCR result from stored text // Return basic OCR result from stored text
// Note: we lose confidence, language, and extractedAt metadata // Note: we lose confidence, language, and extractedAt metadata
// but gain simplicity by storing directly in blob // but gain simplicity by storing directly in blob
@ -300,14 +324,14 @@ class OCRService {
try { try {
const query = ` const query = `
SELECT blobId, ocr_text SELECT blobId, ocr_text
FROM blobs FROM blobs
WHERE ocr_text LIKE ? WHERE ocr_text LIKE ?
AND ocr_text IS NOT NULL AND ocr_text IS NOT NULL
`; `;
const params = [`%${searchText}%`]; const params = [`%${searchText}%`];
const rows = sql.getRows<OCRBlobRow>(query, params); const rows = sql.getRows<OCRBlobRow>(query, params);
return rows.map(row => ({ return rows.map(row => ({
blobId: row.blobId, blobId: row.blobId,
text: row.ocr_text text: row.ocr_text
@ -324,10 +348,10 @@ class OCRService {
deleteOCRResult(blobId: string): void { deleteOCRResult(blobId: string): void {
try { try {
sql.execute(` sql.execute(`
UPDATE blobs SET ocr_text = NULL UPDATE blobs SET ocr_text = NULL
WHERE blobId = ? WHERE blobId = ?
`, [blobId]); `, [blobId]);
log.info(`Deleted OCR result for blob ${blobId}`); log.info(`Deleted OCR result for blob ${blobId}`);
} catch (error) { } catch (error) {
log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`); log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
@ -547,7 +571,7 @@ class OCRService {
ocr_last_processed: string | null; ocr_last_processed: string | null;
}>(` }>(`
SELECT utcDateModified, ocr_last_processed SELECT utcDateModified, ocr_last_processed
FROM blobs FROM blobs
WHERE blobId = ? WHERE blobId = ?
`, [blobId]); `, [blobId]);
@ -563,7 +587,7 @@ class OCRService {
// If blob was modified after last OCR processing, it needs re-processing // If blob was modified after last OCR processing, it needs re-processing
const blobModified = new Date(blobInfo.utcDateModified); const blobModified = new Date(blobInfo.utcDateModified);
const lastOcrProcessed = new Date(blobInfo.ocr_last_processed); const lastOcrProcessed = new Date(blobInfo.ocr_last_processed);
return blobModified > lastOcrProcessed; return blobModified > lastOcrProcessed;
} catch (error) { } catch (error) {
log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`); log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
@ -581,12 +605,12 @@ class OCRService {
try { try {
sql.execute(` sql.execute(`
UPDATE blobs SET UPDATE blobs SET
ocr_text = NULL, ocr_text = NULL,
ocr_last_processed = NULL ocr_last_processed = NULL
WHERE blobId = ? WHERE blobId = ?
`, [blobId]); `, [blobId]);
log.info(`Invalidated OCR result for blob ${blobId}`); log.info(`Invalidated OCR result for blob ${blobId}`);
} catch (error) { } catch (error) {
log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`); log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
@ -599,7 +623,7 @@ class OCRService {
*/ */
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> { getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
try { try {
// Get notes with blobs that need OCR // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
const noteBlobs = sql.getRows<{ const noteBlobs = sql.getRows<{
blobId: string; blobId: string;
mimeType: string; mimeType: string;
@ -608,16 +632,38 @@ class OCRService {
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
FROM notes n FROM notes n
JOIN blobs b ON n.blobId = b.blobId JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image' WHERE (
n.type = 'image'
OR (
n.type = 'file'
AND n.mime IN (
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/msword',
'application/vnd.ms-excel',
'application/vnd.ms-powerpoint',
'application/rtf',
'application/pdf',
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
)
)
)
AND n.isDeleted = 0 AND n.isDeleted = 0
AND n.blobId IS NOT NULL AND n.blobId IS NOT NULL
AND ( AND (
b.ocr_last_processed IS NULL b.ocr_last_processed IS NULL
OR b.utcDateModified > b.ocr_last_processed OR b.utcDateModified > b.ocr_last_processed
) )
`); `);
// Get attachments with blobs that need OCR // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
const attachmentBlobs = sql.getRows<{ const attachmentBlobs = sql.getRows<{
blobId: string; blobId: string;
mimeType: string; mimeType: string;
@ -626,11 +672,33 @@ class OCRService {
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
FROM attachments a FROM attachments a
JOIN blobs b ON a.blobId = b.blobId JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image' WHERE (
a.role = 'image'
OR (
a.role = 'file'
AND a.mime IN (
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/msword',
'application/vnd.ms-excel',
'application/vnd.ms-powerpoint',
'application/rtf',
'application/pdf',
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
)
)
)
AND a.isDeleted = 0 AND a.isDeleted = 0
AND a.blobId IS NOT NULL AND a.blobId IS NOT NULL
AND ( AND (
b.ocr_last_processed IS NULL b.ocr_last_processed IS NULL
OR b.utcDateModified > b.ocr_last_processed OR b.utcDateModified > b.ocr_last_processed
) )
`); `);
@ -641,8 +709,8 @@ class OCRService {
...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const })) ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
]; ];
// Filter to only supported MIME types // Return all results (no need to filter by MIME type as we already did in the query)
return result.filter(blob => this.isSupportedMimeType(blob.mimeType)); return result;
} catch (error) { } catch (error) {
log.error(`Failed to get blobs needing OCR: ${error}`); log.error(`Failed to get blobs needing OCR: ${error}`);
return []; return [];
@ -673,7 +741,7 @@ class OCRService {
} else { } else {
await this.processAttachmentOCR(blobInfo.entityId); await this.processAttachmentOCR(blobInfo.entityId);
} }
// Add small delay to prevent overwhelming the system // Add small delay to prevent overwhelming the system
await new Promise(resolve => setTimeout(resolve, 100)); await new Promise(resolve => setTimeout(resolve, 100));
} catch (error) { } catch (error) {
@ -686,4 +754,4 @@ class OCRService {
} }
} }
export default new OCRService(); export default new OCRService();

View File

@ -25,7 +25,7 @@ export class OfficeProcessor extends FileProcessor {
'application/vnd.ms-powerpoint', // PPT 'application/vnd.ms-powerpoint', // PPT
'application/rtf' // RTF 'application/rtf' // RTF
]; ];
return supportedTypes.includes(mimeType.toLowerCase()); return supportedTypes.includes(mimeType);
} }
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> { async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
@ -40,7 +40,7 @@ export class OfficeProcessor extends FileProcessor {
// Extract text from Office document // Extract text from Office document
const data = await this.parseOfficeDocument(buffer); const data = await this.parseOfficeDocument(buffer);
// Extract text from Office document // Extract text from Office document
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : ''; const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
@ -71,7 +71,7 @@ export class OfficeProcessor extends FileProcessor {
ignoreNotes: false, ignoreNotes: false,
putNotesAtLast: false putNotesAtLast: false
}); });
return { return {
data: data || '' data: data || ''
}; };
@ -113,16 +113,16 @@ export class OfficeProcessor extends FileProcessor {
if (!language || typeof language !== 'string') { if (!language || typeof language !== 'string') {
return false; return false;
} }
// Split by '+' for multi-language format // Split by '+' for multi-language format
const languages = language.split('+'); const languages = language.split('+');
// Check each language code (should be 2-7 characters, alphanumeric with underscores) // Check each language code (should be 2-7 characters, alphanumeric with underscores)
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
return languages.every(lang => { return languages.every(lang => {
const trimmed = lang.trim(); const trimmed = lang.trim();
return trimmed.length > 0 && validLanguagePattern.test(trimmed); return trimmed.length > 0 && validLanguagePattern.test(trimmed);
}); });
} }
} }