mirror of https://github.com/zadam/trilium.git
synced 2025-12-04 22:44:25 +01:00

feat(ocr): swap from custom table to using the blobs table, with a new column

This commit is contained in:
parent 4b5e8d33a6
commit 9029f59410
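
Summary of the storage change: the dedicated ocr_results table (keyed by entity id and type) is dropped from the version 233 migration, and the extracted text moves onto the blobs table as a nullable ocr_text column. Sketched as TypeScript shapes, with field lists taken from the diff below (the interfaces themselves are illustrative, not code from the repository):

    // Before: one row per OCRed note/attachment in a separate table.
    interface OcrResultRowBefore {
        entity_id: string;
        entity_type: "note" | "attachment";
        extracted_text: string;
        confidence: number;
        language: string;
        extracted_at: string;
    }

    // After: the existing blob row simply gains one nullable column (v233).
    interface BlobRowAfter {
        blobId: string;
        content: string | Buffer;
        contentLength: number;
        ocr_text?: string | null;
        dateModified: string;
        utcDateModified: string;
    }

Confidence, language, and extraction time are no longer persisted; only the text survives, and the service re-fills the other fields with defaults when reading back, as the hunks below show.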
@@ -10,11 +10,12 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
         return "blobId";
     }

     static get hashedProperties() {
-        return ["blobId", "content"];
+        return ["blobId", "content", "ocr_text"];
     }

     content!: string | Buffer;
     contentLength!: number;
+    ocr_text?: string | null;

     constructor(row: BlobRow) {
         super();
@@ -25,6 +26,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
         this.blobId = row.blobId;
         this.content = row.content;
         this.contentLength = row.contentLength;
+        this.ocr_text = row.ocr_text;
         this.dateModified = row.dateModified;
         this.utcDateModified = row.utcDateModified;
     }
@@ -34,6 +36,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
             blobId: this.blobId,
             content: this.content || null,
             contentLength: this.contentLength,
+            ocr_text: this.ocr_text || null,
             dateModified: this.dateModified,
             utcDateModified: this.utcDateModified
         };
@@ -6,64 +6,16 @@

 // Migrations should be kept in descending order, so the latest migration is first.
 const MIGRATIONS: (SqlMigration | JsMigration)[] = [
-    // Add OCR results table for storing extracted text from images
+    // Add OCR text column to blobs table for storing extracted text from images
     {
         version: 233,
         sql: /*sql*/`\
-            -- Create OCR results table to store extracted text from images
-            CREATE TABLE IF NOT EXISTS ocr_results (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                entity_id TEXT NOT NULL,
-                entity_type TEXT NOT NULL DEFAULT 'note',
-                extracted_text TEXT NOT NULL,
-                confidence REAL NOT NULL,
-                language TEXT NOT NULL DEFAULT 'eng',
-                extracted_at TEXT NOT NULL,
-                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
-                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
-                UNIQUE(entity_id, entity_type)
-            );
-
-            -- Create indexes for better search performance
-            CREATE INDEX IF NOT EXISTS idx_ocr_results_entity
-            ON ocr_results (entity_id, entity_type);
-
-            CREATE INDEX IF NOT EXISTS idx_ocr_results_text
-            ON ocr_results (extracted_text);
-
-            CREATE INDEX IF NOT EXISTS idx_ocr_results_confidence
-            ON ocr_results (confidence);
-
-            -- Create full-text search index for extracted text
-            CREATE VIRTUAL TABLE IF NOT EXISTS ocr_results_fts USING fts5(
-                entity_id UNINDEXED,
-                entity_type UNINDEXED,
-                extracted_text,
-                content='ocr_results',
-                content_rowid='id'
-            );
-
-            -- Create triggers to keep FTS table in sync
-            CREATE TRIGGER IF NOT EXISTS ocr_results_fts_insert
-            AFTER INSERT ON ocr_results
-            BEGIN
-                INSERT INTO ocr_results_fts(rowid, entity_id, entity_type, extracted_text)
-                VALUES (new.id, new.entity_id, new.entity_type, new.extracted_text);
-            END;
-
-            CREATE TRIGGER IF NOT EXISTS ocr_results_fts_update
-            AFTER UPDATE ON ocr_results
-            BEGIN
-                UPDATE ocr_results_fts
-                SET extracted_text = new.extracted_text
-                WHERE rowid = new.id;
-            END;
-
-            CREATE TRIGGER IF NOT EXISTS ocr_results_fts_delete
-            AFTER DELETE ON ocr_results
-            BEGIN
-                DELETE FROM ocr_results_fts WHERE rowid = old.id;
-            END;
+            -- Add OCR text column to blobs table
+            ALTER TABLE blobs ADD COLUMN ocr_text TEXT DEFAULT NULL;
+
+            -- Create index for OCR text searches
+            CREATE INDEX IF NOT EXISTS idx_blobs_ocr_text
+            ON blobs (ocr_text);
         `
     },
     // Remove embedding tables since LLM embedding functionality has been removed
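
To see what the rewritten version 233 migration does in isolation, here is a minimal standalone sketch against an SQLite file using better-sqlite3; in practice Trilium's own migration runner executes this SQL, so the snippet is purely illustrative (file path and blobId are made up):

    import Database from "better-sqlite3";

    const db = new Database("document.db"); // path is illustrative

    // v233 now only extends the existing blobs table instead of creating ocr_results.
    db.exec(`ALTER TABLE blobs ADD COLUMN ocr_text TEXT DEFAULT NULL;`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_blobs_ocr_text ON blobs (ocr_text);`);

    // Any blob row can now carry its extracted text next to the binary content.
    db.prepare(`UPDATE blobs SET ocr_text = ? WHERE blobId = ?`).run("hello world", "someBlobId");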
@@ -246,13 +246,6 @@ async function processAttachmentOCR(req: Request, res: Response) {
  *         schema:
  *           type: string
  *         description: Search query text
- *       - name: entityType
- *         in: query
- *         required: false
- *         schema:
- *           type: string
- *           enum: [note, attachment]
- *         description: Filter by entity type
  *     responses:
  *       '200':
  *         description: Search results
@@ -268,14 +261,10 @@ async function processAttachmentOCR(req: Request, res: Response) {
  *                   items:
  *                     type: object
  *                     properties:
- *                       entityId:
- *                         type: string
- *                       entityType:
+ *                       blobId:
  *                         type: string
  *                       text:
  *                         type: string
- *                       confidence:
- *                         type: number
  *       '400':
  *         description: Bad request - missing search query
  *       '500':
@@ -286,7 +275,7 @@ async function processAttachmentOCR(req: Request, res: Response) {
  */
 async function searchOCR(req: Request, res: Response) {
     try {
-        const { q: searchText, entityType } = req.query;
+        const { q: searchText } = req.query;

         if (!searchText || typeof searchText !== 'string') {
             res.status(400).json({
@@ -297,10 +286,7 @@ async function searchOCR(req: Request, res: Response) {
             return;
         }

-        const results = ocrService.searchOCRResults(
-            searchText,
-            entityType as 'note' | 'attachment' | undefined
-        );
+        const results = ocrService.searchOCRResults(searchText);

         res.json({
             success: true,
@@ -431,10 +417,10 @@ async function getBatchProgress(req: Request, res: Response) {
  *               properties:
  *                 totalProcessed:
  *                   type: number
- *                 averageConfidence:
+ *                 imageNotes:
  *                   type: number
- *                 byEntityType:
- *                   type: object
+ *                 imageAttachments:
+ *                   type: number
  *       '500':
  *         description: Internal server error
  *     security:
@@ -463,24 +449,17 @@ async function getOCRStats(req: Request, res: Response) {

 /**
  * @swagger
- * /api/ocr/delete/{entityType}/{entityId}:
+ * /api/ocr/delete/{blobId}:
  *   delete:
- *     summary: Delete OCR results for a specific entity
+ *     summary: Delete OCR results for a specific blob
  *     operationId: ocr-delete-results
  *     parameters:
- *       - name: entityType
+ *       - name: blobId
  *         in: path
  *         required: true
  *         schema:
  *           type: string
- *           enum: [note, attachment]
- *         description: Type of entity
- *       - name: entityId
- *         in: path
- *         required: true
- *         schema:
- *           type: string
- *         description: ID of the entity
+ *         description: ID of the blob
  *     responses:
  *       '200':
  *         description: OCR results deleted successfully
@@ -503,31 +482,22 @@ async function getOCRStats(req: Request, res: Response) {
  */
 async function deleteOCRResults(req: Request, res: Response) {
     try {
-        const { entityType, entityId } = req.params;
+        const { blobId } = req.params;

-        if (!entityType || !entityId) {
+        if (!blobId) {
             res.status(400).json({
                 success: false,
-                error: 'Entity type and ID are required'
+                error: 'Blob ID is required'
             });
             (res as any).triliumResponseHandled = true;
             return;
         }

-        if (!['note', 'attachment'].includes(entityType)) {
-            res.status(400).json({
-                success: false,
-                error: 'Entity type must be either "note" or "attachment"'
-            });
-            (res as any).triliumResponseHandled = true;
-            return;
-        }
-
-        ocrService.deleteOCRResult(entityId, entityType as 'note' | 'attachment');
+        ocrService.deleteOCRResult(blobId);

         res.json({
             success: true,
-            message: `OCR results deleted for ${entityType} ${entityId}`
+            message: `OCR results deleted for blob ${blobId}`
         });
         (res as any).triliumResponseHandled = true;
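
With the route parameters reduced to a single blobId, a client call against the reworked delete endpoint looks roughly like the sketch below; the path and method come from the swagger comment above, while the base URL, auth handling, and helper name are assumptions:

    // Hypothetical client helper for the documented DELETE /api/ocr/delete/{blobId} route.
    async function deleteOcrForBlob(blobId: string): Promise<void> {
        const res = await fetch(`/api/ocr/delete/${encodeURIComponent(blobId)}`, {
            method: "DELETE"
        });
        const body = await res.json();
        if (!body.success) {
            // The handler responds with { success: false, error: "Blob ID is required" } on bad input.
            throw new Error(body.error ?? "OCR deletion failed");
        }
    }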
@@ -240,7 +240,7 @@ describe('OCRService', () => {
         });

     describe('storeOCRResult', () => {
-        it('should store OCR result in database successfully', async () => {
+        it('should store OCR result in blob successfully', async () => {
             const ocrResult = {
                 text: 'Sample text',
                 confidence: 0.95,
@@ -248,15 +248,29 @@ describe('OCRService', () => {
                 language: 'eng'
             };

-            await ocrService.storeOCRResult('note123', ocrResult, 'note');
+            await ocrService.storeOCRResult('blob123', ocrResult);

             expect(mockSql.execute).toHaveBeenCalledWith(
-                expect.stringContaining('INSERT OR REPLACE INTO ocr_results'),
-                expect.arrayContaining(['note123', 'note', 'Sample text', 0.95, 'eng', expect.any(String)])
+                expect.stringContaining('UPDATE blobs SET ocr_text = ?'),
+                ['Sample text', 'blob123']
             );
         });

-        it('should handle database insertion errors', async () => {
+        it('should handle undefined blobId gracefully', async () => {
+            const ocrResult = {
+                text: 'Sample text',
+                confidence: 0.95,
+                extractedAt: '2025-06-10T10:00:00.000Z',
+                language: 'eng'
+            };
+
+            await ocrService.storeOCRResult(undefined, ocrResult);
+
+            expect(mockSql.execute).not.toHaveBeenCalled();
+            expect(mockLog.error).toHaveBeenCalledWith('Cannot store OCR result: blobId is undefined');
+        });
+
+        it('should handle database update errors', async () => {
             const error = new Error('Database error');
             mockSql.execute.mockImplementation(() => {
                 throw error;
@@ -269,8 +283,8 @@ describe('OCRService', () => {
                 language: 'eng'
             };

-            await expect(ocrService.storeOCRResult('note123', ocrResult, 'note')).rejects.toThrow('Database error');
-            expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for note note123: Error: Database error');
+            await expect(ocrService.storeOCRResult('blob123', ocrResult)).rejects.toThrow('Database error');
+            expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for blob blob123: Error: Database error');
         });
     });

@@ -279,6 +293,7 @@ describe('OCRService', () => {
             noteId: 'note123',
             type: 'image',
             mime: 'image/jpeg',
+            blobId: 'blob123',
             getContent: vi.fn()
         };

@@ -316,10 +331,7 @@ describe('OCRService', () => {

         it('should return existing OCR result if forceReprocess is false', async () => {
             const existingResult = {
-                extracted_text: 'Existing text',
-                confidence: 0.85,
-                language: 'eng',
-                extracted_at: '2025-06-10T09:00:00.000Z'
+                ocr_text: 'Existing text'
             };
             mockSql.getRow.mockReturnValue(existingResult);

@@ -327,19 +339,16 @@ describe('OCRService', () => {

             expect(result).toEqual({
                 text: 'Existing text',
-                confidence: 0.85,
+                confidence: 0.95,
                 language: 'eng',
-                extractedAt: '2025-06-10T09:00:00.000Z'
+                extractedAt: expect.any(String)
             });
             expect(mockNote.getContent).not.toHaveBeenCalled();
         });

         it('should reprocess if forceReprocess is true', async () => {
             const existingResult = {
-                extracted_text: 'Existing text',
-                confidence: 0.85,
-                language: 'eng',
-                extracted_at: '2025-06-10T09:00:00.000Z'
+                ocr_text: 'Existing text'
             };
             mockSql.getRow.mockResolvedValue(existingResult);

@@ -385,6 +394,7 @@ describe('OCRService', () => {
             attachmentId: 'attach123',
             role: 'image',
             mime: 'image/png',
+            blobId: 'blob456',
             getContent: vi.fn()
         };

@@ -434,10 +444,8 @@ describe('OCRService', () => {
         it('should search OCR results successfully', () => {
             const mockResults = [
                 {
-                    entity_id: 'note1',
-                    entity_type: 'note',
-                    extracted_text: 'Sample search text',
-                    confidence: 0.95
+                    blobId: 'blob1',
+                    ocr_text: 'Sample search text'
                 }
             ];
             mockSql.getRows.mockReturnValue(mockResults);
@@ -445,36 +453,15 @@ describe('OCRService', () => {
             const results = ocrService.searchOCRResults('search');

             expect(results).toEqual([{
-                entityId: 'note1',
-                entityType: 'note',
-                text: 'Sample search text',
-                confidence: 0.95
+                blobId: 'blob1',
+                text: 'Sample search text'
             }]);
             expect(mockSql.getRows).toHaveBeenCalledWith(
-                expect.stringContaining('WHERE extracted_text LIKE ?'),
+                expect.stringContaining('WHERE ocr_text LIKE ?'),
                 ['%search%']
             );
         });

-        it('should filter by entity type', () => {
-            const mockResults = [
-                {
-                    entity_id: 'note1',
-                    entity_type: 'note',
-                    extracted_text: 'Note text',
-                    confidence: 0.95
-                }
-            ];
-            mockSql.getRows.mockReturnValue(mockResults);
-
-            ocrService.searchOCRResults('text', 'note');
-
-            expect(mockSql.getRows).toHaveBeenCalledWith(
-                expect.stringContaining('AND entity_type = ?'),
-                ['%text%', 'note']
-            );
-        });
-
         it('should handle search errors gracefully', () => {
             mockSql.getRows.mockImplementation(() => {
                 throw new Error('Database error');
@@ -490,39 +477,37 @@ describe('OCRService', () => {
     describe('getOCRStats', () => {
         it('should return OCR statistics successfully', () => {
             const mockStats = {
-                total_processed: 150,
-                avg_confidence: 0.87
+                total_processed: 150
             };
-            const mockByEntityType = [
-                { entity_type: 'note', count: 100 },
-                { entity_type: 'attachment', count: 50 }
-            ];
+            const mockNoteStats = {
+                count: 100
+            };
+            const mockAttachmentStats = {
+                count: 50
+            };

-            mockSql.getRow.mockReturnValue(mockStats);
-            mockSql.getRows.mockReturnValue(mockByEntityType);
+            mockSql.getRow.mockReturnValueOnce(mockStats);
+            mockSql.getRow.mockReturnValueOnce(mockNoteStats);
+            mockSql.getRow.mockReturnValueOnce(mockAttachmentStats);

             const stats = ocrService.getOCRStats();

             expect(stats).toEqual({
                 totalProcessed: 150,
-                averageConfidence: 0.87,
-                byEntityType: {
-                    note: 100,
-                    attachment: 50
-                }
+                imageNotes: 100,
+                imageAttachments: 50
             });
         });

         it('should handle missing statistics gracefully', () => {
             mockSql.getRow.mockReturnValue(null);
-            mockSql.getRows.mockReturnValue([]);

             const stats = ocrService.getOCRStats();

             expect(stats).toEqual({
                 totalProcessed: 0,
-                averageConfidence: 0,
-                byEntityType: {}
+                imageNotes: 0,
+                imageAttachments: 0
             });
         });
     });
@@ -698,11 +683,11 @@ describe('OCRService', () => {

             // Mock data for batch processing
             const imageNotes = [
-                { noteId: 'note1', mime: 'image/jpeg' },
-                { noteId: 'note2', mime: 'image/png' }
+                { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
+                { noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
             ];
             const imageAttachments = [
-                { attachmentId: 'attach1', mime: 'image/gif' }
+                { attachmentId: 'attach1', mime: 'image/gif', blobId: 'blob3' }
             ];

             // Setup mocks for startBatchProcessing
@@ -723,18 +708,21 @@ describe('OCRService', () => {
                 noteId: 'note1',
                 type: 'image',
                 mime: 'image/jpeg',
+                blobId: 'blob1',
                 getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
             };
             const mockNote2 = {
                 noteId: 'note2',
                 type: 'image',
                 mime: 'image/png',
+                blobId: 'blob2',
                 getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
             };
             const mockAttachment = {
                 attachmentId: 'attach1',
                 role: 'image',
                 mime: 'image/gif',
+                blobId: 'blob3',
                 getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
             };

@@ -761,7 +749,7 @@ describe('OCRService', () => {

         it('should handle processing errors gracefully', async () => {
             const imageNotes = [
-                { noteId: 'note1', mime: 'image/jpeg' }
+                { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }
             ];

             // Setup mocks for startBatchProcessing
@@ -777,6 +765,7 @@ describe('OCRService', () => {
                 noteId: 'note1',
                 type: 'image',
                 mime: 'image/jpeg',
+                blobId: 'blob1',
                 getContent: vi.fn().mockImplementation(() => { throw new Error('Failed to get content'); })
             };
             mockBecca.getNote.mockReturnValue(mockNote);
@@ -796,8 +785,8 @@ describe('OCRService', () => {

         it('should stop processing when cancelled', async () => {
             const imageNotes = [
-                { noteId: 'note1', mime: 'image/jpeg' },
-                { noteId: 'note2', mime: 'image/png' }
+                { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
+                { noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
             ];

             // Setup mocks
@@ -821,8 +810,8 @@ describe('OCRService', () => {

         it('should skip unsupported MIME types', async () => {
             const imageNotes = [
-                { noteId: 'note1', mime: 'text/plain' }, // unsupported
-                { noteId: 'note2', mime: 'image/jpeg' } // supported
+                { noteId: 'note1', mime: 'text/plain', blobId: 'blob1' }, // unsupported
+                { noteId: 'note2', mime: 'image/jpeg', blobId: 'blob2' } // supported
             ];

             // Setup mocks
@@ -835,6 +824,7 @@ describe('OCRService', () => {
                 noteId: 'note2',
                 type: 'image',
                 mime: 'image/jpeg',
+                blobId: 'blob2',
                 getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
             };
             mockBecca.getNote.mockReturnValue(mockNote);
@@ -858,13 +848,13 @@ describe('OCRService', () => {

     describe('deleteOCRResult', () => {
         it('should delete OCR result successfully', () => {
-            ocrService.deleteOCRResult('note123', 'note');
+            ocrService.deleteOCRResult('blob123');

             expect(mockSql.execute).toHaveBeenCalledWith(
-                expect.stringContaining('DELETE FROM ocr_results'),
-                ['note123', 'note']
+                expect.stringContaining('UPDATE blobs SET ocr_text = NULL'),
+                ['blob123']
             );
-            expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for note note123');
+            expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for blob blob123');
         });

         it('should handle deletion errors', () => {
@@ -872,8 +862,8 @@ describe('OCRService', () => {
                 throw new Error('Database error');
             });

-            expect(() => ocrService.deleteOCRResult('note123', 'note')).toThrow('Database error');
-            expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for note note123: Error: Database error');
+            expect(() => ocrService.deleteOCRResult('blob123')).toThrow('Database error');
+            expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for blob blob123: Error: Database error');
         });
     });

@@ -886,6 +876,7 @@ describe('OCRService', () => {
             mockBecca.getNote.mockReturnValue({
                 noteId: 'note123',
                 mime: 'image/jpeg',
+                blobId: 'blob123',
                 getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
             });
             mockSql.getRow.mockResolvedValue(null);
@@ -17,11 +17,9 @@ export interface OCRProcessingOptions {
     confidence?: number;
 }

-interface OCRResultRow {
-    entity_id: string;
-    entity_type: string;
-    extracted_text: string;
-    confidence: number;
+interface OCRBlobRow {
+    blobId: string;
+    ocr_text: string;
 }

 /**
@@ -176,8 +174,8 @@ class OCRService {
                 return null;
             }

-            // Check if OCR already exists and we're not forcing reprocessing
-            const existingOCR = this.getStoredOCRResult(noteId);
+            // Check if OCR already exists in the blob and we're not forcing reprocessing
+            const existingOCR = this.getStoredOCRResult(note.blobId);
             if (existingOCR && !options.forceReprocess) {
                 log.info(`OCR already exists for note ${noteId}, returning cached result`);
                 return existingOCR;
@@ -191,8 +189,8 @@ class OCRService {

             const ocrResult = await this.extractTextFromImage(content, options);

-            // Store OCR result
-            await this.storeOCRResult(noteId, ocrResult);
+            // Store OCR result in blob
+            await this.storeOCRResult(note.blobId, ocrResult);

             return ocrResult;
         } catch (error) {
@@ -226,8 +224,8 @@ class OCRService {
                 return null;
             }

-            // Check if OCR already exists and we're not forcing reprocessing
-            const existingOCR = this.getStoredOCRResult(attachmentId, 'attachment');
+            // Check if OCR already exists in the blob and we're not forcing reprocessing
+            const existingOCR = this.getStoredOCRResult(attachment.blobId);
             if (existingOCR && !options.forceReprocess) {
                 log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
                 return existingOCR;
@@ -241,8 +239,8 @@ class OCRService {

             const ocrResult = await this.extractTextFromImage(content, options);

-            // Store OCR result
-            await this.storeOCRResult(attachmentId, ocrResult, 'attachment');
+            // Store OCR result in blob
+            await this.storeOCRResult(attachment.blobId, ocrResult);

             return ocrResult;
         } catch (error) {
@@ -252,57 +250,62 @@ class OCRService {
         }
     }

     /**
-     * Store OCR result in database
+     * Store OCR result in blob
      */
-    async storeOCRResult(entityId: string, ocrResult: OCRResult, entityType: 'note' | 'attachment' = 'note'): Promise<void> {
+    async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
+        if (!blobId) {
+            log.error('Cannot store OCR result: blobId is undefined');
+            return;
+        }
+
         try {
+            // Store OCR text in blobs table
             sql.execute(`
-                INSERT OR REPLACE INTO ocr_results (entity_id, entity_type, extracted_text, confidence, language, extracted_at)
-                VALUES (?, ?, ?, ?, ?, ?)
+                UPDATE blobs SET ocr_text = ? WHERE blobId = ?
             `, [
-                entityId,
-                entityType,
                 ocrResult.text,
-                ocrResult.confidence,
-                ocrResult.language || 'eng',
-                ocrResult.extractedAt
+                blobId
             ]);

-            log.info(`Stored OCR result for ${entityType} ${entityId}`);
+            log.info(`Stored OCR result for blob ${blobId}`);
         } catch (error) {
-            log.error(`Failed to store OCR result for ${entityType} ${entityId}: ${error}`);
+            log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
             throw error;
         }
     }

     /**
-     * Get stored OCR result from database
+     * Get stored OCR result from blob
      */
-    private getStoredOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): OCRResult | null {
+    private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
+        if (!blobId) {
+            return null;
+        }
+
         try {
             const row = sql.getRow<{
-                extracted_text: string;
-                confidence: number;
-                language?: string;
-                extracted_at: string;
+                ocr_text: string | null;
             }>(`
-                SELECT extracted_text, confidence, language, extracted_at
-                FROM ocr_results
-                WHERE entity_id = ? AND entity_type = ?
-            `, [entityId, entityType]);
+                SELECT ocr_text
+                FROM blobs
+                WHERE blobId = ?
+            `, [blobId]);

-            if (!row) {
+            if (!row || !row.ocr_text) {
                 return null;
             }

+            // Return basic OCR result from stored text
+            // Note: we lose confidence, language, and extractedAt metadata
+            // but gain simplicity by storing directly in blob
             return {
-                text: row.extracted_text,
-                confidence: row.confidence,
-                language: row.language,
-                extractedAt: row.extracted_at
+                text: row.ocr_text,
+                confidence: 0.95, // Default high confidence for existing OCR
+                extractedAt: new Date().toISOString(),
+                language: 'eng'
             };
         } catch (error) {
-            log.error(`Failed to get OCR result for ${entityType} ${entityId}: ${error}`);
+            log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
             return null;
         }
     }
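
One consequence of the new getStoredOCRResult is that a cached read-back no longer carries real metadata: only the text is persisted, and the other OCRResult fields are re-filled with defaults. A small sketch of that round trip (the OCRResult shape is inferred from its usage in this diff; the helper itself is illustrative):

    interface OCRResult {
        text: string;
        confidence: number;
        language?: string;
        extractedAt: string;
    }

    // What a read from blobs.ocr_text turns back into, per the hunk above.
    function ocrResultFromBlobText(ocrText: string): OCRResult {
        return {
            text: ocrText,
            confidence: 0.95,                      // default; the real score is no longer stored
            language: "eng",                       // default; the detected language is no longer stored
            extractedAt: new Date().toISOString()  // time of the read, not of the original extraction
        };
    }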
@@ -310,29 +313,21 @@ class OCRService {
     /**
      * Search for text in OCR results
      */
-    searchOCRResults(searchText: string, entityType?: 'note' | 'attachment'): Array<{ entityId: string; entityType: string; text: string; confidence: number }> {
+    searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
         try {
-            let query = `
-                SELECT entity_id, entity_type, extracted_text, confidence
-                FROM ocr_results
-                WHERE extracted_text LIKE ?
+            const query = `
+                SELECT blobId, ocr_text
+                FROM blobs
+                WHERE ocr_text LIKE ?
+                AND ocr_text IS NOT NULL
             `;
             const params = [`%${searchText}%`];

-            if (entityType) {
-                query += ' AND entity_type = ?';
-                params.push(entityType);
-            }
-
-            query += ' ORDER BY confidence DESC';
-
-            const rows = sql.getRows<OCRResultRow>(query, params);
+            const rows = sql.getRows<OCRBlobRow>(query, params);

             return rows.map(row => ({
-                entityId: row.entity_id,
-                entityType: row.entity_type,
-                text: row.extracted_text,
-                confidence: row.confidence
+                blobId: row.blobId,
+                text: row.ocr_text
             }));
         } catch (error) {
             log.error(`Failed to search OCR results: ${error}`);
@@ -341,18 +336,18 @@ class OCRService {
         }
     }

     /**
-     * Delete OCR results for an entity
+     * Delete OCR results for a blob
      */
-    deleteOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): void {
+    deleteOCRResult(blobId: string): void {
         try {
             sql.execute(`
-                DELETE FROM ocr_results
-                WHERE entity_id = ? AND entity_type = ?
-            `, [entityId, entityType]);
+                UPDATE blobs SET ocr_text = NULL
+                WHERE blobId = ?
+            `, [blobId]);

-            log.info(`Deleted OCR result for ${entityType} ${entityId}`);
+            log.info(`Deleted OCR result for blob ${blobId}`);
         } catch (error) {
-            log.error(`Failed to delete OCR result for ${entityType} ${entityId}: ${error}`);
+            log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
             throw error;
         }
     }
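
Since the entityType filter is gone, callers of searchOCRResults now receive one flat, blob-keyed list and resolve the owning notes or attachments themselves (the search expression changes further down do exactly that). A rough usage sketch, assuming the service's default export (import path illustrative):

    import ocrService from "./ocr_service.js"; // illustrative path

    const hits = ocrService.searchOCRResults("invoice");
    for (const { blobId, text } of hits) {
        console.log(`blob ${blobId}: ${text.slice(0, 80)}`);
    }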
@@ -373,14 +368,15 @@ class OCRService {
             const imageNotes = sql.getRows<{
                 noteId: string;
                 mime: string;
+                blobId: string;
             }>(`
-                SELECT noteId, mime
-                FROM notes
-                WHERE type = 'image'
-                AND isDeleted = 0
-                AND noteId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
-                )
+                SELECT n.noteId, n.mime, n.blobId
+                FROM notes n
+                LEFT JOIN blobs b ON n.blobId = b.blobId
+                WHERE n.type = 'image'
+                AND n.isDeleted = 0
+                AND n.blobId IS NOT NULL
+                AND (b.ocr_text IS NULL OR b.ocr_text = '')
             `);

             log.info(`Found ${imageNotes.length} image notes to process`);
@@ -401,14 +397,15 @@ class OCRService {
             const imageAttachments = sql.getRows<{
                 attachmentId: string;
                 mime: string;
+                blobId: string;
             }>(`
-                SELECT attachmentId, mime
-                FROM attachments
-                WHERE role = 'image'
-                AND isDeleted = 0
-                AND attachmentId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
-                )
+                SELECT a.attachmentId, a.mime, a.blobId
+                FROM attachments a
+                LEFT JOIN blobs b ON a.blobId = b.blobId
+                WHERE a.role = 'image'
+                AND a.isDeleted = 0
+                AND a.blobId IS NOT NULL
+                AND (b.ocr_text IS NULL OR b.ocr_text = '')
             `);

             log.info(`Found ${imageAttachments.length} image attachments to process`);
@@ -435,38 +432,48 @@ class OCRService {
     /**
      * Get OCR statistics
      */
-    getOCRStats(): { totalProcessed: number; averageConfidence: number; byEntityType: Record<string, number> } {
+    getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
         try {
             const stats = sql.getRow<{
                 total_processed: number;
-                avg_confidence: number;
             }>(`
-                SELECT
-                    COUNT(*) as total_processed,
-                    AVG(confidence) as avg_confidence
-                FROM ocr_results
+                SELECT COUNT(*) as total_processed
+                FROM blobs
+                WHERE ocr_text IS NOT NULL AND ocr_text != ''
             `);

-            const byEntityType = sql.getRows<{
-                entity_type: string;
+            // Count image notes with OCR
+            const noteStats = sql.getRow<{
                 count: number;
             }>(`
-                SELECT entity_type, COUNT(*) as count
-                FROM ocr_results
-                GROUP BY entity_type
+                SELECT COUNT(*) as count
+                FROM notes n
+                JOIN blobs b ON n.blobId = b.blobId
+                WHERE n.type = 'image'
+                AND n.isDeleted = 0
+                AND b.ocr_text IS NOT NULL AND b.ocr_text != ''
             `);

+            // Count image attachments with OCR
+            const attachmentStats = sql.getRow<{
+                count: number;
+            }>(`
+                SELECT COUNT(*) as count
+                FROM attachments a
+                JOIN blobs b ON a.blobId = b.blobId
+                WHERE a.role = 'image'
+                AND a.isDeleted = 0
+                AND b.ocr_text IS NOT NULL AND b.ocr_text != ''
+            `);
+
             return {
                 totalProcessed: stats?.total_processed || 0,
-                averageConfidence: stats?.avg_confidence || 0,
-                byEntityType: byEntityType.reduce((acc, row) => {
-                    acc[row.entity_type] = row.count;
-                    return acc;
-                }, {} as Record<string, number>)
+                imageNotes: noteStats?.count || 0,
+                imageAttachments: attachmentStats?.count || 0
             };
         } catch (error) {
             log.error(`Failed to get OCR stats: ${error}`);
-            return { totalProcessed: 0, averageConfidence: 0, byEntityType: {} };
+            return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
         }
     }
@@ -584,14 +591,15 @@ class OCRService {
             const imageNotes = sql.getRows<{
                 noteId: string;
                 mime: string;
+                blobId: string;
             }>(`
-                SELECT noteId, mime
-                FROM notes
-                WHERE type = 'image'
-                AND isDeleted = 0
-                AND noteId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
-                )
+                SELECT n.noteId, n.mime, n.blobId
+                FROM notes n
+                LEFT JOIN blobs b ON n.blobId = b.blobId
+                WHERE n.type = 'image'
+                AND n.isDeleted = 0
+                AND n.blobId IS NOT NULL
+                AND (b.ocr_text IS NULL OR b.ocr_text = '')
             `);

             for (const noteRow of imageNotes) {
@@ -616,14 +624,15 @@ class OCRService {
             const imageAttachments = sql.getRows<{
                 attachmentId: string;
                 mime: string;
+                blobId: string;
             }>(`
-                SELECT attachmentId, mime
-                FROM attachments
-                WHERE role = 'image'
-                AND isDeleted = 0
-                AND attachmentId NOT IN (
-                    SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
-                )
+                SELECT a.attachmentId, a.mime, a.blobId
+                FROM attachments a
+                LEFT JOIN blobs b ON a.blobId = b.blobId
+                WHERE a.role = 'image'
+                AND a.isDeleted = 0
+                AND a.blobId IS NOT NULL
+                AND (b.ocr_text IS NULL OR b.ocr_text = '')
             `);

             for (const attachmentRow of imageAttachments) {
@@ -25,21 +25,30 @@ export default class OCRContentExpression extends Expression {
         const ocrResults = this.searchOCRContent(this.searchText);

         for (const ocrResult of ocrResults) {
-            let note: import('../../../becca/entities/bnote.js').default | null = null;
-
-            if (ocrResult.entity_type === 'note') {
-                note = becca.getNote(ocrResult.entity_id);
-            } else if (ocrResult.entity_type === 'attachment') {
-                // For attachments, find the parent note
-                const attachment = becca.getAttachment(ocrResult.entity_id);
-                if (attachment) {
-                    note = becca.getNote(attachment.ownerId);
+            // Find notes that use this blob
+            const notes = sql.getRows<{noteId: string}>(`
+                SELECT noteId FROM notes
+                WHERE blobId = ? AND isDeleted = 0
+            `, [ocrResult.blobId]);
+
+            for (const noteRow of notes) {
+                const note = becca.getNote(noteRow.noteId);
+                if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
+                    resultNoteSet.add(note);
                 }
             }

-            // Only add notes that are in the input note set and not deleted
-            if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
-                resultNoteSet.add(note);
+            // Find attachments that use this blob and their parent notes
+            const attachments = sql.getRows<{ownerId: string}>(`
+                SELECT ownerId FROM attachments
+                WHERE blobId = ? AND isDeleted = 0
+            `, [ocrResult.blobId]);
+
+            for (const attachmentRow of attachments) {
+                const note = becca.getNote(attachmentRow.ownerId);
+                if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
+                    resultNoteSet.add(note);
+                }
             }
         }

@@ -62,44 +71,24 @@ export default class OCRContentExpression extends Expression {
     }

     private searchOCRContent(searchText: string): Array<{
-        entity_id: string;
-        entity_type: string;
-        extracted_text: string;
-        confidence: number;
+        blobId: string;
+        ocr_text: string;
     }> {
         try {
-            // Use FTS search if available, otherwise fall back to LIKE
-            let query: string;
-            let params: unknown[];
-
-            try {
-                // Try FTS first
-                query = `
-                    SELECT ocr.entity_id, ocr.entity_type, ocr.extracted_text, ocr.confidence
-                    FROM ocr_results_fts fts
-                    JOIN ocr_results ocr ON fts.rowid = ocr.id
-                    WHERE ocr_results_fts MATCH ?
-                    ORDER BY ocr.confidence DESC, rank
-                    LIMIT 50
-                `;
-                params = [searchText];
-            } catch {
-                // Fallback to LIKE search
-                query = `
-                    SELECT entity_id, entity_type, extracted_text, confidence
-                    FROM ocr_results
-                    WHERE extracted_text LIKE ?
-                    ORDER BY confidence DESC
-                    LIMIT 50
-                `;
-                params = [`%${searchText}%`];
-            }
+            // Search in blobs table for OCR text
+            const query = `
+                SELECT blobId, ocr_text
+                FROM blobs
+                WHERE ocr_text LIKE ?
+                AND ocr_text IS NOT NULL
+                AND ocr_text != ''
+                LIMIT 50
+            `;
+            const params = [`%${searchText}%`];

             return sql.getRows<{
-                entity_id: string;
-                entity_type: string;
-                extracted_text: string;
-                confidence: number;
+                blobId: string;
+                ocr_text: string;
             }>(query, params);
         } catch (error) {
             console.error('Error searching OCR content:', error);
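
The expression above resolves a matching blob back to notes in two steps: notes that reference the blob directly, and notes that own an attachment referencing it. Condensed into one hypothetical helper (sql import path illustrative; the queries mirror the ones in the hunk):

    import sql from "../../services/sql.js"; // illustrative path

    function noteIdsForBlob(blobId: string): string[] {
        const direct = sql.getRows<{ noteId: string }>(
            `SELECT noteId FROM notes WHERE blobId = ? AND isDeleted = 0`, [blobId]);
        const viaAttachments = sql.getRows<{ ownerId: string }>(
            `SELECT ownerId FROM attachments WHERE blobId = ? AND isDeleted = 0`, [blobId]);
        return [...direct.map(r => r.noteId), ...viaAttachments.map(r => r.ownerId)];
    }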
@@ -70,6 +70,7 @@ export interface BlobRow {
     blobId: string;
     content: string | Buffer;
     contentLength: number;
+    ocr_text?: string | null;
     dateModified: string;
     utcDateModified: string;
 }