mirror of
https://github.com/zadam/trilium.git
synced 2025-12-06 23:44:25 +01:00
feat(ocr): basic processing of new files
This commit is contained in:
parent
2adfc1d32b
commit
11e9b097a2
@ -6,6 +6,8 @@ import becca from "../becca/becca.js";
|
|||||||
import BAttribute from "../becca/entities/battribute.js";
|
import BAttribute from "../becca/entities/battribute.js";
|
||||||
import hiddenSubtreeService from "./hidden_subtree.js";
|
import hiddenSubtreeService from "./hidden_subtree.js";
|
||||||
import oneTimeTimer from "./one_time_timer.js";
|
import oneTimeTimer from "./one_time_timer.js";
|
||||||
|
import ocrService from "./ocr/ocr_service.js";
|
||||||
|
import log from "./log.js";
|
||||||
import type BNote from "../becca/entities/bnote.js";
|
import type BNote from "../becca/entities/bnote.js";
|
||||||
import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
|
import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
|
||||||
import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
|
import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
|
||||||
@ -137,6 +139,42 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
|
|||||||
}
|
}
|
||||||
} else if (entityName === "notes") {
|
} else if (entityName === "notes") {
|
||||||
runAttachedRelations(entity, "runOnNoteCreation", entity);
|
runAttachedRelations(entity, "runOnNoteCreation", entity);
|
||||||
|
|
||||||
|
// Automatically process OCR for file notes if OCR is enabled
|
||||||
|
if (entity.type === 'file' && ocrService.isOCREnabled()) {
|
||||||
|
// Check if the file MIME type is supported by any OCR processor
|
||||||
|
const supportedMimeTypes = [
|
||||||
|
// Office documents
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||||
|
'application/msword',
|
||||||
|
'application/vnd.ms-excel',
|
||||||
|
'application/vnd.ms-powerpoint',
|
||||||
|
'application/rtf',
|
||||||
|
// PDFs
|
||||||
|
'application/pdf',
|
||||||
|
// Images (though these are usually type='image', not 'file')
|
||||||
|
'image/jpeg',
|
||||||
|
'image/jpg',
|
||||||
|
'image/png',
|
||||||
|
'image/gif',
|
||||||
|
'image/bmp',
|
||||||
|
'image/tiff',
|
||||||
|
'image/webp'
|
||||||
|
];
|
||||||
|
|
||||||
|
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
|
||||||
|
// Process OCR asynchronously to avoid blocking note creation
|
||||||
|
ocrService.processNoteOCR(entity.noteId).then(result => {
|
||||||
|
if (result) {
|
||||||
|
log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
|
||||||
|
}
|
||||||
|
}).catch(error => {
|
||||||
|
log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@ -50,13 +50,13 @@ class OCRService {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
log.info('Initializing OCR service with file processors...');
|
log.info('Initializing OCR service with file processors...');
|
||||||
|
|
||||||
// Initialize file processors
|
// Initialize file processors
|
||||||
this.processors.set('image', new ImageProcessor());
|
this.processors.set('image', new ImageProcessor());
|
||||||
this.processors.set('pdf', new PDFProcessor());
|
this.processors.set('pdf', new PDFProcessor());
|
||||||
this.processors.set('tiff', new TIFFProcessor());
|
this.processors.set('tiff', new TIFFProcessor());
|
||||||
this.processors.set('office', new OfficeProcessor());
|
this.processors.set('office', new OfficeProcessor());
|
||||||
|
|
||||||
this.isInitialized = true;
|
this.isInitialized = true;
|
||||||
log.info('OCR service initialized successfully');
|
log.info('OCR service initialized successfully');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -84,10 +84,10 @@ class OCRService {
|
|||||||
if (!mimeType || typeof mimeType !== 'string') {
|
if (!mimeType || typeof mimeType !== 'string') {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportedTypes = [
|
const supportedTypes = [
|
||||||
'image/jpeg',
|
'image/jpeg',
|
||||||
'image/jpg',
|
'image/jpg',
|
||||||
'image/png',
|
'image/png',
|
||||||
'image/gif',
|
'image/gif',
|
||||||
'image/bmp',
|
'image/bmp',
|
||||||
@ -116,7 +116,7 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const result = await processor.extractText(fileBuffer, options);
|
const result = await processor.extractText(fileBuffer, options);
|
||||||
|
|
||||||
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
|
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
@ -143,13 +143,25 @@ class OCRService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (note.type !== 'image') {
|
if (!this.isInitialized) {
|
||||||
log.info(`Note ${noteId} is not an image note, skipping OCR`);
|
await this.initialize();
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.isSupportedMimeType(note.mime)) {
|
// Check if note type and MIME type are supported for OCR
|
||||||
log.info(`Note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
|
if (note.type === 'image') {
|
||||||
|
if (!this.isSupportedMimeType(note.mime)) {
|
||||||
|
log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
} else if (note.type === 'file') {
|
||||||
|
// Check if file MIME type is supported by any processor
|
||||||
|
const processor = this.getProcessorForMimeType(note.mime);
|
||||||
|
if (!processor) {
|
||||||
|
log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,10 +179,10 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
|
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
|
||||||
|
|
||||||
// Store OCR result in blob
|
// Store OCR result in blob
|
||||||
await this.storeOCRResult(note.blobId, ocrResult);
|
await this.storeOCRResult(note.blobId, ocrResult);
|
||||||
|
|
||||||
return ocrResult;
|
return ocrResult;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to process OCR for note ${noteId}: ${error}`);
|
log.error(`Failed to process OCR for note ${noteId}: ${error}`);
|
||||||
@ -193,13 +205,25 @@ class OCRService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (attachment.role !== 'image') {
|
if (!this.isInitialized) {
|
||||||
log.info(`Attachment ${attachmentId} is not an image, skipping OCR`);
|
await this.initialize();
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.isSupportedMimeType(attachment.mime)) {
|
// Check if attachment role and MIME type are supported for OCR
|
||||||
log.info(`Attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
|
if (attachment.role === 'image') {
|
||||||
|
if (!this.isSupportedMimeType(attachment.mime)) {
|
||||||
|
log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
} else if (attachment.role === 'file') {
|
||||||
|
// Check if file MIME type is supported by any processor
|
||||||
|
const processor = this.getProcessorForMimeType(attachment.mime);
|
||||||
|
if (!processor) {
|
||||||
|
log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,10 +241,10 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
|
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
|
||||||
|
|
||||||
// Store OCR result in blob
|
// Store OCR result in blob
|
||||||
await this.storeOCRResult(attachment.blobId, ocrResult);
|
await this.storeOCRResult(attachment.blobId, ocrResult);
|
||||||
|
|
||||||
return ocrResult;
|
return ocrResult;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
|
log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
|
||||||
@ -240,8 +264,8 @@ class OCRService {
|
|||||||
try {
|
try {
|
||||||
// Store OCR text and timestamp in blobs table
|
// Store OCR text and timestamp in blobs table
|
||||||
sql.execute(`
|
sql.execute(`
|
||||||
UPDATE blobs SET
|
UPDATE blobs SET
|
||||||
ocr_text = ?,
|
ocr_text = ?,
|
||||||
ocr_last_processed = ?
|
ocr_last_processed = ?
|
||||||
WHERE blobId = ?
|
WHERE blobId = ?
|
||||||
`, [
|
`, [
|
||||||
@ -249,7 +273,7 @@ class OCRService {
|
|||||||
new Date().toISOString(),
|
new Date().toISOString(),
|
||||||
blobId
|
blobId
|
||||||
]);
|
]);
|
||||||
|
|
||||||
log.info(`Stored OCR result for blob ${blobId}`);
|
log.info(`Stored OCR result for blob ${blobId}`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
|
log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
|
||||||
@ -270,14 +294,14 @@ class OCRService {
|
|||||||
ocr_text: string | null;
|
ocr_text: string | null;
|
||||||
}>(`
|
}>(`
|
||||||
SELECT ocr_text
|
SELECT ocr_text
|
||||||
FROM blobs
|
FROM blobs
|
||||||
WHERE blobId = ?
|
WHERE blobId = ?
|
||||||
`, [blobId]);
|
`, [blobId]);
|
||||||
|
|
||||||
if (!row || !row.ocr_text) {
|
if (!row || !row.ocr_text) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return basic OCR result from stored text
|
// Return basic OCR result from stored text
|
||||||
// Note: we lose confidence, language, and extractedAt metadata
|
// Note: we lose confidence, language, and extractedAt metadata
|
||||||
// but gain simplicity by storing directly in blob
|
// but gain simplicity by storing directly in blob
|
||||||
@ -300,14 +324,14 @@ class OCRService {
|
|||||||
try {
|
try {
|
||||||
const query = `
|
const query = `
|
||||||
SELECT blobId, ocr_text
|
SELECT blobId, ocr_text
|
||||||
FROM blobs
|
FROM blobs
|
||||||
WHERE ocr_text LIKE ?
|
WHERE ocr_text LIKE ?
|
||||||
AND ocr_text IS NOT NULL
|
AND ocr_text IS NOT NULL
|
||||||
`;
|
`;
|
||||||
const params = [`%${searchText}%`];
|
const params = [`%${searchText}%`];
|
||||||
|
|
||||||
const rows = sql.getRows<OCRBlobRow>(query, params);
|
const rows = sql.getRows<OCRBlobRow>(query, params);
|
||||||
|
|
||||||
return rows.map(row => ({
|
return rows.map(row => ({
|
||||||
blobId: row.blobId,
|
blobId: row.blobId,
|
||||||
text: row.ocr_text
|
text: row.ocr_text
|
||||||
@ -324,10 +348,10 @@ class OCRService {
|
|||||||
deleteOCRResult(blobId: string): void {
|
deleteOCRResult(blobId: string): void {
|
||||||
try {
|
try {
|
||||||
sql.execute(`
|
sql.execute(`
|
||||||
UPDATE blobs SET ocr_text = NULL
|
UPDATE blobs SET ocr_text = NULL
|
||||||
WHERE blobId = ?
|
WHERE blobId = ?
|
||||||
`, [blobId]);
|
`, [blobId]);
|
||||||
|
|
||||||
log.info(`Deleted OCR result for blob ${blobId}`);
|
log.info(`Deleted OCR result for blob ${blobId}`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
|
log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
|
||||||
@ -547,7 +571,7 @@ class OCRService {
|
|||||||
ocr_last_processed: string | null;
|
ocr_last_processed: string | null;
|
||||||
}>(`
|
}>(`
|
||||||
SELECT utcDateModified, ocr_last_processed
|
SELECT utcDateModified, ocr_last_processed
|
||||||
FROM blobs
|
FROM blobs
|
||||||
WHERE blobId = ?
|
WHERE blobId = ?
|
||||||
`, [blobId]);
|
`, [blobId]);
|
||||||
|
|
||||||
@ -563,7 +587,7 @@ class OCRService {
|
|||||||
// If blob was modified after last OCR processing, it needs re-processing
|
// If blob was modified after last OCR processing, it needs re-processing
|
||||||
const blobModified = new Date(blobInfo.utcDateModified);
|
const blobModified = new Date(blobInfo.utcDateModified);
|
||||||
const lastOcrProcessed = new Date(blobInfo.ocr_last_processed);
|
const lastOcrProcessed = new Date(blobInfo.ocr_last_processed);
|
||||||
|
|
||||||
return blobModified > lastOcrProcessed;
|
return blobModified > lastOcrProcessed;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
|
log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
|
||||||
@ -581,12 +605,12 @@ class OCRService {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
sql.execute(`
|
sql.execute(`
|
||||||
UPDATE blobs SET
|
UPDATE blobs SET
|
||||||
ocr_text = NULL,
|
ocr_text = NULL,
|
||||||
ocr_last_processed = NULL
|
ocr_last_processed = NULL
|
||||||
WHERE blobId = ?
|
WHERE blobId = ?
|
||||||
`, [blobId]);
|
`, [blobId]);
|
||||||
|
|
||||||
log.info(`Invalidated OCR result for blob ${blobId}`);
|
log.info(`Invalidated OCR result for blob ${blobId}`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
|
log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
|
||||||
@ -599,7 +623,7 @@ class OCRService {
|
|||||||
*/
|
*/
|
||||||
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
|
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
|
||||||
try {
|
try {
|
||||||
// Get notes with blobs that need OCR
|
// Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
|
||||||
const noteBlobs = sql.getRows<{
|
const noteBlobs = sql.getRows<{
|
||||||
blobId: string;
|
blobId: string;
|
||||||
mimeType: string;
|
mimeType: string;
|
||||||
@ -608,16 +632,38 @@ class OCRService {
|
|||||||
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
|
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
|
||||||
FROM notes n
|
FROM notes n
|
||||||
JOIN blobs b ON n.blobId = b.blobId
|
JOIN blobs b ON n.blobId = b.blobId
|
||||||
WHERE n.type = 'image'
|
WHERE (
|
||||||
|
n.type = 'image'
|
||||||
|
OR (
|
||||||
|
n.type = 'file'
|
||||||
|
AND n.mime IN (
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||||
|
'application/msword',
|
||||||
|
'application/vnd.ms-excel',
|
||||||
|
'application/vnd.ms-powerpoint',
|
||||||
|
'application/rtf',
|
||||||
|
'application/pdf',
|
||||||
|
'image/jpeg',
|
||||||
|
'image/jpg',
|
||||||
|
'image/png',
|
||||||
|
'image/gif',
|
||||||
|
'image/bmp',
|
||||||
|
'image/tiff',
|
||||||
|
'image/webp'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
AND n.isDeleted = 0
|
AND n.isDeleted = 0
|
||||||
AND n.blobId IS NOT NULL
|
AND n.blobId IS NOT NULL
|
||||||
AND (
|
AND (
|
||||||
b.ocr_last_processed IS NULL
|
b.ocr_last_processed IS NULL
|
||||||
OR b.utcDateModified > b.ocr_last_processed
|
OR b.utcDateModified > b.ocr_last_processed
|
||||||
)
|
)
|
||||||
`);
|
`);
|
||||||
|
|
||||||
// Get attachments with blobs that need OCR
|
// Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
|
||||||
const attachmentBlobs = sql.getRows<{
|
const attachmentBlobs = sql.getRows<{
|
||||||
blobId: string;
|
blobId: string;
|
||||||
mimeType: string;
|
mimeType: string;
|
||||||
@ -626,11 +672,33 @@ class OCRService {
|
|||||||
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
|
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
|
||||||
FROM attachments a
|
FROM attachments a
|
||||||
JOIN blobs b ON a.blobId = b.blobId
|
JOIN blobs b ON a.blobId = b.blobId
|
||||||
WHERE a.role = 'image'
|
WHERE (
|
||||||
|
a.role = 'image'
|
||||||
|
OR (
|
||||||
|
a.role = 'file'
|
||||||
|
AND a.mime IN (
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||||
|
'application/msword',
|
||||||
|
'application/vnd.ms-excel',
|
||||||
|
'application/vnd.ms-powerpoint',
|
||||||
|
'application/rtf',
|
||||||
|
'application/pdf',
|
||||||
|
'image/jpeg',
|
||||||
|
'image/jpg',
|
||||||
|
'image/png',
|
||||||
|
'image/gif',
|
||||||
|
'image/bmp',
|
||||||
|
'image/tiff',
|
||||||
|
'image/webp'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
AND a.isDeleted = 0
|
AND a.isDeleted = 0
|
||||||
AND a.blobId IS NOT NULL
|
AND a.blobId IS NOT NULL
|
||||||
AND (
|
AND (
|
||||||
b.ocr_last_processed IS NULL
|
b.ocr_last_processed IS NULL
|
||||||
OR b.utcDateModified > b.ocr_last_processed
|
OR b.utcDateModified > b.ocr_last_processed
|
||||||
)
|
)
|
||||||
`);
|
`);
|
||||||
@ -641,8 +709,8 @@ class OCRService {
|
|||||||
...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
|
...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
|
||||||
];
|
];
|
||||||
|
|
||||||
// Filter to only supported MIME types
|
// Return all results (no need to filter by MIME type as we already did in the query)
|
||||||
return result.filter(blob => this.isSupportedMimeType(blob.mimeType));
|
return result;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.error(`Failed to get blobs needing OCR: ${error}`);
|
log.error(`Failed to get blobs needing OCR: ${error}`);
|
||||||
return [];
|
return [];
|
||||||
@ -673,7 +741,7 @@ class OCRService {
|
|||||||
} else {
|
} else {
|
||||||
await this.processAttachmentOCR(blobInfo.entityId);
|
await this.processAttachmentOCR(blobInfo.entityId);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add small delay to prevent overwhelming the system
|
// Add small delay to prevent overwhelming the system
|
||||||
await new Promise(resolve => setTimeout(resolve, 100));
|
await new Promise(resolve => setTimeout(resolve, 100));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -686,4 +754,4 @@ class OCRService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export default new OCRService();
|
export default new OCRService();
|
||||||
|
|||||||
@ -25,7 +25,7 @@ export class OfficeProcessor extends FileProcessor {
|
|||||||
'application/vnd.ms-powerpoint', // PPT
|
'application/vnd.ms-powerpoint', // PPT
|
||||||
'application/rtf' // RTF
|
'application/rtf' // RTF
|
||||||
];
|
];
|
||||||
return supportedTypes.includes(mimeType.toLowerCase());
|
return supportedTypes.includes(mimeType);
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||||
@ -40,7 +40,7 @@ export class OfficeProcessor extends FileProcessor {
|
|||||||
|
|
||||||
// Extract text from Office document
|
// Extract text from Office document
|
||||||
const data = await this.parseOfficeDocument(buffer);
|
const data = await this.parseOfficeDocument(buffer);
|
||||||
|
|
||||||
// Extract text from Office document
|
// Extract text from Office document
|
||||||
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
|
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
|
||||||
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
|
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
|
||||||
@ -71,7 +71,7 @@ export class OfficeProcessor extends FileProcessor {
|
|||||||
ignoreNotes: false,
|
ignoreNotes: false,
|
||||||
putNotesAtLast: false
|
putNotesAtLast: false
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
data: data || ''
|
data: data || ''
|
||||||
};
|
};
|
||||||
@ -113,16 +113,16 @@ export class OfficeProcessor extends FileProcessor {
|
|||||||
if (!language || typeof language !== 'string') {
|
if (!language || typeof language !== 'string') {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split by '+' for multi-language format
|
// Split by '+' for multi-language format
|
||||||
const languages = language.split('+');
|
const languages = language.split('+');
|
||||||
|
|
||||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||||
|
|
||||||
return languages.every(lang => {
|
return languages.every(lang => {
|
||||||
const trimmed = lang.trim();
|
const trimmed = lang.trim();
|
||||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user