feat(ocr): swap from custom table to using the blobs table, with a new column

This commit is contained in:
perf3ct 2025-07-14 16:15:15 +00:00
parent 4b5e8d33a6
commit 9029f59410
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
7 changed files with 246 additions and 331 deletions

View File

@ -10,11 +10,12 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
return "blobId";
}
/**
* Property names this entity exposes as its hashed properties.
* NOTE(review): presumably these feed the entity hash computed by AbstractBeccaEntity; if so,
* including ocr_text makes the hash change whenever OCR text is written or cleared - confirm
* that this is intended.
*/
static get hashedProperties() {
return ["blobId", "content"];
return ["blobId", "content", "ocr_text"];
}
content!: string | Buffer;
contentLength!: number;
ocr_text?: string | null;
constructor(row: BlobRow) {
super();
@ -25,6 +26,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
this.blobId = row.blobId;
this.content = row.content;
this.contentLength = row.contentLength;
this.ocr_text = row.ocr_text;
this.dateModified = row.dateModified;
this.utcDateModified = row.utcDateModified;
}
@ -34,6 +36,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
blobId: this.blobId,
content: this.content || null,
contentLength: this.contentLength,
ocr_text: this.ocr_text || null,
dateModified: this.dateModified,
utcDateModified: this.utcDateModified
};

View File

@ -6,64 +6,16 @@
// Migrations should be kept in descending order, so the latest migration is first.
const MIGRATIONS: (SqlMigration | JsMigration)[] = [
// Add OCR results table for storing extracted text from images
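// Add OCR text column to blobs table for storing extracted text from images.
// NOTE(review): this reuses version 233, so a database that already applied the earlier
// version-233 migration would presumably never receive the ocr_text column - confirm.
// NOTE(review): the index created below is a plain index on ocr_text; the OCR searches use
// LIKE with a leading wildcard, which such an index cannot serve - confirm it is worth keeping.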
{
version: 233,
sql: /*sql*/`\
-- Create OCR results table to store extracted text from images
CREATE TABLE IF NOT EXISTS ocr_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
entity_id TEXT NOT NULL,
entity_type TEXT NOT NULL DEFAULT 'note',
extracted_text TEXT NOT NULL,
confidence REAL NOT NULL,
language TEXT NOT NULL DEFAULT 'eng',
extracted_at TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(entity_id, entity_type)
);
-- Create indexes for better search performance
CREATE INDEX IF NOT EXISTS idx_ocr_results_entity
ON ocr_results (entity_id, entity_type);
CREATE INDEX IF NOT EXISTS idx_ocr_results_text
ON ocr_results (extracted_text);
CREATE INDEX IF NOT EXISTS idx_ocr_results_confidence
ON ocr_results (confidence);
-- Create full-text search index for extracted text
CREATE VIRTUAL TABLE IF NOT EXISTS ocr_results_fts USING fts5(
entity_id UNINDEXED,
entity_type UNINDEXED,
extracted_text,
content='ocr_results',
content_rowid='id'
);
-- Create triggers to keep FTS table in sync
CREATE TRIGGER IF NOT EXISTS ocr_results_fts_insert
AFTER INSERT ON ocr_results
BEGIN
INSERT INTO ocr_results_fts(rowid, entity_id, entity_type, extracted_text)
VALUES (new.id, new.entity_id, new.entity_type, new.extracted_text);
END;
CREATE TRIGGER IF NOT EXISTS ocr_results_fts_update
AFTER UPDATE ON ocr_results
BEGIN
UPDATE ocr_results_fts
SET extracted_text = new.extracted_text
WHERE rowid = new.id;
END;
CREATE TRIGGER IF NOT EXISTS ocr_results_fts_delete
AFTER DELETE ON ocr_results
BEGIN
DELETE FROM ocr_results_fts WHERE rowid = old.id;
END;
-- Add OCR text column to blobs table
ALTER TABLE blobs ADD COLUMN ocr_text TEXT DEFAULT NULL;
-- Create index for OCR text searches
CREATE INDEX IF NOT EXISTS idx_blobs_ocr_text
ON blobs (ocr_text);
`
},
// Remove embedding tables since LLM embedding functionality has been removed

View File

@ -246,13 +246,6 @@ async function processAttachmentOCR(req: Request, res: Response) {
* schema:
* type: string
* description: Search query text
* - name: entityType
* in: query
* required: false
* schema:
* type: string
* enum: [note, attachment]
* description: Filter by entity type
* responses:
* '200':
* description: Search results
@ -268,14 +261,10 @@ async function processAttachmentOCR(req: Request, res: Response) {
* items:
* type: object
* properties:
* entityId:
* type: string
* entityType:
* blobId:
* type: string
* text:
* type: string
* confidence:
* type: number
* '400':
* description: Bad request - missing search query
* '500':
@ -286,7 +275,7 @@ async function processAttachmentOCR(req: Request, res: Response) {
*/
async function searchOCR(req: Request, res: Response) {
try {
const { q: searchText, entityType } = req.query;
const { q: searchText } = req.query;
if (!searchText || typeof searchText !== 'string') {
res.status(400).json({
@ -297,10 +286,7 @@ async function searchOCR(req: Request, res: Response) {
return;
}
const results = ocrService.searchOCRResults(
searchText,
entityType as 'note' | 'attachment' | undefined
);
const results = ocrService.searchOCRResults(searchText);
res.json({
success: true,
@ -431,10 +417,10 @@ async function getBatchProgress(req: Request, res: Response) {
* properties:
* totalProcessed:
* type: number
* averageConfidence:
* imageNotes:
* type: number
* imageAttachments:
* type: number
* byEntityType:
* type: object
* '500':
* description: Internal server error
* security:
@ -463,24 +449,17 @@ async function getOCRStats(req: Request, res: Response) {
/**
* @swagger
* /api/ocr/delete/{entityType}/{entityId}:
* /api/ocr/delete/{blobId}:
* delete:
* summary: Delete OCR results for a specific entity
* summary: Delete OCR results for a specific blob
* operationId: ocr-delete-results
* parameters:
* - name: entityType
* - name: blobId
* in: path
* required: true
* schema:
* type: string
* enum: [note, attachment]
* description: Type of entity
* - name: entityId
* in: path
* required: true
* schema:
* type: string
* description: ID of the entity
* description: ID of the blob
* responses:
* '200':
* description: OCR results deleted successfully
@ -503,31 +482,22 @@ async function getOCRStats(req: Request, res: Response) {
*/
async function deleteOCRResults(req: Request, res: Response) {
try {
const { entityType, entityId } = req.params;
const { blobId } = req.params;
if (!entityType || !entityId) {
if (!blobId) {
res.status(400).json({
success: false,
error: 'Entity type and ID are required'
error: 'Blob ID is required'
});
(res as any).triliumResponseHandled = true;
return;
}
if (!['note', 'attachment'].includes(entityType)) {
res.status(400).json({
success: false,
error: 'Entity type must be either "note" or "attachment"'
});
(res as any).triliumResponseHandled = true;
return;
}
ocrService.deleteOCRResult(entityId, entityType as 'note' | 'attachment');
ocrService.deleteOCRResult(blobId);
res.json({
success: true,
message: `OCR results deleted for ${entityType} ${entityId}`
message: `OCR results deleted for blob ${blobId}`
});
(res as any).triliumResponseHandled = true;

View File

@ -240,7 +240,7 @@ describe('OCRService', () => {
});
describe('storeOCRResult', () => {
it('should store OCR result in database successfully', async () => {
it('should store OCR result in blob successfully', async () => {
const ocrResult = {
text: 'Sample text',
confidence: 0.95,
@ -248,15 +248,29 @@ describe('OCRService', () => {
language: 'eng'
};
await ocrService.storeOCRResult('note123', ocrResult, 'note');
await ocrService.storeOCRResult('blob123', ocrResult);
expect(mockSql.execute).toHaveBeenCalledWith(
expect.stringContaining('INSERT OR REPLACE INTO ocr_results'),
expect.arrayContaining(['note123', 'note', 'Sample text', 0.95, 'eng', expect.any(String)])
expect.stringContaining('UPDATE blobs SET ocr_text = ?'),
['Sample text', 'blob123']
);
});
it('should handle database insertion errors', async () => {
it('should handle undefined blobId gracefully', async () => {
const ocrResult = {
text: 'Sample text',
confidence: 0.95,
extractedAt: '2025-06-10T10:00:00.000Z',
language: 'eng'
};
await ocrService.storeOCRResult(undefined, ocrResult);
expect(mockSql.execute).not.toHaveBeenCalled();
expect(mockLog.error).toHaveBeenCalledWith('Cannot store OCR result: blobId is undefined');
});
it('should handle database update errors', async () => {
const error = new Error('Database error');
mockSql.execute.mockImplementation(() => {
throw error;
@ -269,8 +283,8 @@ describe('OCRService', () => {
language: 'eng'
};
await expect(ocrService.storeOCRResult('note123', ocrResult, 'note')).rejects.toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for note note123: Error: Database error');
await expect(ocrService.storeOCRResult('blob123', ocrResult)).rejects.toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for blob blob123: Error: Database error');
});
});
@ -279,6 +293,7 @@ describe('OCRService', () => {
noteId: 'note123',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob123',
getContent: vi.fn()
};
@ -316,10 +331,7 @@ describe('OCRService', () => {
it('should return existing OCR result if forceReprocess is false', async () => {
const existingResult = {
extracted_text: 'Existing text',
confidence: 0.85,
language: 'eng',
extracted_at: '2025-06-10T09:00:00.000Z'
ocr_text: 'Existing text'
};
mockSql.getRow.mockReturnValue(existingResult);
@ -327,19 +339,16 @@ describe('OCRService', () => {
expect(result).toEqual({
text: 'Existing text',
confidence: 0.85,
confidence: 0.95,
language: 'eng',
extractedAt: '2025-06-10T09:00:00.000Z'
extractedAt: expect.any(String)
});
expect(mockNote.getContent).not.toHaveBeenCalled();
});
it('should reprocess if forceReprocess is true', async () => {
const existingResult = {
extracted_text: 'Existing text',
confidence: 0.85,
language: 'eng',
extracted_at: '2025-06-10T09:00:00.000Z'
ocr_text: 'Existing text'
};
mockSql.getRow.mockResolvedValue(existingResult);
@ -385,6 +394,7 @@ describe('OCRService', () => {
attachmentId: 'attach123',
role: 'image',
mime: 'image/png',
blobId: 'blob456',
getContent: vi.fn()
};
@ -434,10 +444,8 @@ describe('OCRService', () => {
it('should search OCR results successfully', () => {
const mockResults = [
{
entity_id: 'note1',
entity_type: 'note',
extracted_text: 'Sample search text',
confidence: 0.95
blobId: 'blob1',
ocr_text: 'Sample search text'
}
];
mockSql.getRows.mockReturnValue(mockResults);
@ -445,36 +453,15 @@ describe('OCRService', () => {
const results = ocrService.searchOCRResults('search');
expect(results).toEqual([{
entityId: 'note1',
entityType: 'note',
text: 'Sample search text',
confidence: 0.95
blobId: 'blob1',
text: 'Sample search text'
}]);
expect(mockSql.getRows).toHaveBeenCalledWith(
expect.stringContaining('WHERE extracted_text LIKE ?'),
expect.stringContaining('WHERE ocr_text LIKE ?'),
['%search%']
);
});
it('should filter by entity type', () => {
const mockResults = [
{
entity_id: 'note1',
entity_type: 'note',
extracted_text: 'Note text',
confidence: 0.95
}
];
mockSql.getRows.mockReturnValue(mockResults);
ocrService.searchOCRResults('text', 'note');
expect(mockSql.getRows).toHaveBeenCalledWith(
expect.stringContaining('AND entity_type = ?'),
['%text%', 'note']
);
});
it('should handle search errors gracefully', () => {
mockSql.getRows.mockImplementation(() => {
throw new Error('Database error');
@ -490,39 +477,37 @@ describe('OCRService', () => {
describe('getOCRStats', () => {
it('should return OCR statistics successfully', () => {
const mockStats = {
total_processed: 150,
avg_confidence: 0.87
total_processed: 150
};
const mockNoteStats = {
count: 100
};
const mockAttachmentStats = {
count: 50
};
const mockByEntityType = [
{ entity_type: 'note', count: 100 },
{ entity_type: 'attachment', count: 50 }
];
mockSql.getRow.mockReturnValue(mockStats);
mockSql.getRows.mockReturnValue(mockByEntityType);
mockSql.getRow.mockReturnValueOnce(mockStats);
mockSql.getRow.mockReturnValueOnce(mockNoteStats);
mockSql.getRow.mockReturnValueOnce(mockAttachmentStats);
const stats = ocrService.getOCRStats();
expect(stats).toEqual({
totalProcessed: 150,
averageConfidence: 0.87,
byEntityType: {
note: 100,
attachment: 50
}
imageNotes: 100,
imageAttachments: 50
});
});
it('should handle missing statistics gracefully', () => {
mockSql.getRow.mockReturnValue(null);
mockSql.getRows.mockReturnValue([]);
const stats = ocrService.getOCRStats();
expect(stats).toEqual({
totalProcessed: 0,
averageConfidence: 0,
byEntityType: {}
imageNotes: 0,
imageAttachments: 0
});
});
});
@ -698,11 +683,11 @@ describe('OCRService', () => {
// Mock data for batch processing
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg' },
{ noteId: 'note2', mime: 'image/png' }
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
{ noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
];
const imageAttachments = [
{ attachmentId: 'attach1', mime: 'image/gif' }
{ attachmentId: 'attach1', mime: 'image/gif', blobId: 'blob3' }
];
// Setup mocks for startBatchProcessing
@ -723,18 +708,21 @@ describe('OCRService', () => {
noteId: 'note1',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob1',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
const mockNote2 = {
noteId: 'note2',
type: 'image',
mime: 'image/png',
blobId: 'blob2',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
const mockAttachment = {
attachmentId: 'attach1',
role: 'image',
mime: 'image/gif',
blobId: 'blob3',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
@ -761,7 +749,7 @@ describe('OCRService', () => {
it('should handle processing errors gracefully', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg' }
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }
];
// Setup mocks for startBatchProcessing
@ -777,6 +765,7 @@ describe('OCRService', () => {
noteId: 'note1',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob1',
getContent: vi.fn().mockImplementation(() => { throw new Error('Failed to get content'); })
};
mockBecca.getNote.mockReturnValue(mockNote);
@ -796,8 +785,8 @@ describe('OCRService', () => {
it('should stop processing when cancelled', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg' },
{ noteId: 'note2', mime: 'image/png' }
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
{ noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
];
// Setup mocks
@ -821,8 +810,8 @@ describe('OCRService', () => {
it('should skip unsupported MIME types', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'text/plain' }, // unsupported
{ noteId: 'note2', mime: 'image/jpeg' } // supported
{ noteId: 'note1', mime: 'text/plain', blobId: 'blob1' }, // unsupported
{ noteId: 'note2', mime: 'image/jpeg', blobId: 'blob2' } // supported
];
// Setup mocks
@ -835,6 +824,7 @@ describe('OCRService', () => {
noteId: 'note2',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob2',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
mockBecca.getNote.mockReturnValue(mockNote);
@ -858,13 +848,13 @@ describe('OCRService', () => {
describe('deleteOCRResult', () => {
it('should delete OCR result successfully', () => {
ocrService.deleteOCRResult('note123', 'note');
ocrService.deleteOCRResult('blob123');
expect(mockSql.execute).toHaveBeenCalledWith(
expect.stringContaining('DELETE FROM ocr_results'),
['note123', 'note']
expect.stringContaining('UPDATE blobs SET ocr_text = NULL'),
['blob123']
);
expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for note note123');
expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for blob blob123');
});
it('should handle deletion errors', () => {
@ -872,8 +862,8 @@ describe('OCRService', () => {
throw new Error('Database error');
});
expect(() => ocrService.deleteOCRResult('note123', 'note')).toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for note note123: Error: Database error');
expect(() => ocrService.deleteOCRResult('blob123')).toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for blob blob123: Error: Database error');
});
});
@ -886,6 +876,7 @@ describe('OCRService', () => {
mockBecca.getNote.mockReturnValue({
noteId: 'note123',
mime: 'image/jpeg',
blobId: 'blob123',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
});
mockSql.getRow.mockResolvedValue(null);

View File

@ -17,11 +17,9 @@ export interface OCRProcessingOptions {
confidence?: number;
}
interface OCRResultRow {
entity_id: string;
entity_type: string;
extracted_text: string;
confidence: number;
interface OCRBlobRow {
blobId: string;
ocr_text: string;
}
/**
@ -176,8 +174,8 @@ class OCRService {
return null;
}
// Check if OCR already exists and we're not forcing reprocessing
const existingOCR = this.getStoredOCRResult(noteId);
// Check if OCR already exists in the blob and we're not forcing reprocessing
const existingOCR = this.getStoredOCRResult(note.blobId);
if (existingOCR && !options.forceReprocess) {
log.info(`OCR already exists for note ${noteId}, returning cached result`);
return existingOCR;
@ -191,8 +189,8 @@ class OCRService {
const ocrResult = await this.extractTextFromImage(content, options);
// Store OCR result
await this.storeOCRResult(noteId, ocrResult);
// Store OCR result in blob
await this.storeOCRResult(note.blobId, ocrResult);
return ocrResult;
} catch (error) {
@ -226,8 +224,8 @@ class OCRService {
return null;
}
// Check if OCR already exists and we're not forcing reprocessing
const existingOCR = this.getStoredOCRResult(attachmentId, 'attachment');
// Check if OCR already exists in the blob and we're not forcing reprocessing
const existingOCR = this.getStoredOCRResult(attachment.blobId);
if (existingOCR && !options.forceReprocess) {
log.info(`OCR already exists for attachment ${attachmentId}, returning cached result`);
return existingOCR;
@ -241,8 +239,8 @@ class OCRService {
const ocrResult = await this.extractTextFromImage(content, options);
// Store OCR result
await this.storeOCRResult(attachmentId, ocrResult, 'attachment');
// Store OCR result in blob
await this.storeOCRResult(attachment.blobId, ocrResult);
return ocrResult;
} catch (error) {
@ -252,57 +250,62 @@ class OCRService {
}
/**
* Store OCR result in database
* Store OCR result in blob
*
* Writes the extracted text into the ocr_text column of the blobs row identified by blobId.
* The blob-based UPDATE binds only ocrResult.text and blobId; no other column is set by it.
*
* If blobId is falsy, an error is logged and the method returns without touching the database.
* Any error raised by sql.execute is logged and then rethrown to the caller.
*
* NOTE(review): an UPDATE that matches no row would presumably succeed silently, so a blobId
* that does not exist in blobs is not reported here - confirm whether callers need that signal.
*
* @param blobId - ID of the blob whose ocr_text should be set; may be undefined.
* @param ocrResult - OCR output; its text field is the value stored.
* @returns A promise that resolves once the update has been issued.
*/
async storeOCRResult(entityId: string, ocrResult: OCRResult, entityType: 'note' | 'attachment' = 'note'): Promise<void> {
async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
if (!blobId) {
log.error('Cannot store OCR result: blobId is undefined');
return;
}
try {
// Store OCR text in blobs table
sql.execute(`
INSERT OR REPLACE INTO ocr_results (entity_id, entity_type, extracted_text, confidence, language, extracted_at)
VALUES (?, ?, ?, ?, ?, ?)
UPDATE blobs SET ocr_text = ? WHERE blobId = ?
`, [
entityId,
entityType,
ocrResult.text,
ocrResult.confidence,
ocrResult.language || 'eng',
ocrResult.extractedAt
blobId
]);
log.info(`Stored OCR result for ${entityType} ${entityId}`);
log.info(`Stored OCR result for blob ${blobId}`);
} catch (error) {
log.error(`Failed to store OCR result for ${entityType} ${entityId}: ${error}`);
log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
throw error;
}
}
/**
* Get stored OCR result from database
* Get stored OCR result from blob
*
* Reads the ocr_text column of the blobs row identified by blobId and wraps it in an OCRResult.
*
* Returns null when blobId is falsy, when no row is found, when ocr_text is null or empty,
* or when the query throws (the error is logged rather than propagated).
*
* Only the text comes from the database. The confidence (0.95), language (eng) and
* extractedAt (the time of this call) fields of the returned object are fixed placeholders,
* not values recorded when OCR actually ran, so callers should not treat them as measurements.
*
* @param blobId - ID of the blob to look up; may be undefined.
* @returns The stored OCR text with placeholder metadata, or null if none is available.
*/
private getStoredOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): OCRResult | null {
private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
if (!blobId) {
return null;
}
try {
const row = sql.getRow<{
extracted_text: string;
confidence: number;
language?: string;
extracted_at: string;
ocr_text: string | null;
}>(`
SELECT extracted_text, confidence, language, extracted_at
FROM ocr_results
WHERE entity_id = ? AND entity_type = ?
`, [entityId, entityType]);
SELECT ocr_text
FROM blobs
WHERE blobId = ?
`, [blobId]);
if (!row) {
if (!row || !row.ocr_text) {
return null;
}
// Return basic OCR result from stored text
// Note: we lose confidence, language, and extractedAt metadata
// but gain simplicity by storing directly in blob
return {
text: row.extracted_text,
confidence: row.confidence,
language: row.language,
extractedAt: row.extracted_at
text: row.ocr_text,
confidence: 0.95, // Default high confidence for existing OCR
extractedAt: new Date().toISOString(),
language: 'eng'
};
} catch (error) {
log.error(`Failed to get OCR result for ${entityType} ${entityId}: ${error}`);
log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
return null;
}
}
@ -310,29 +313,21 @@ class OCRService {
/**
* Search for text in OCR results
*/
searchOCRResults(searchText: string, entityType?: 'note' | 'attachment'): Array<{ entityId: string; entityType: string; text: string; confidence: number }> {
searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
try {
let query = `
SELECT entity_id, entity_type, extracted_text, confidence
FROM ocr_results
WHERE extracted_text LIKE ?
const query = `
SELECT blobId, ocr_text
FROM blobs
WHERE ocr_text LIKE ?
AND ocr_text IS NOT NULL
`;
const params = [`%${searchText}%`];
if (entityType) {
query += ' AND entity_type = ?';
params.push(entityType);
}
query += ' ORDER BY confidence DESC';
const rows = sql.getRows<OCRResultRow>(query, params);
const rows = sql.getRows<OCRBlobRow>(query, params);
return rows.map(row => ({
entityId: row.entity_id,
entityType: row.entity_type,
text: row.extracted_text,
confidence: row.confidence
blobId: row.blobId,
text: row.ocr_text
}));
} catch (error) {
log.error(`Failed to search OCR results: ${error}`);
@ -341,18 +336,18 @@ class OCRService {
}
/**
* Delete OCR results for an entity
* Delete OCR results for a blob
*
* Clears the stored OCR text by setting ocr_text to NULL on the blobs row identified by
* blobId; the blob row itself is not deleted. An info message is logged afterwards.
* Any error raised by sql.execute is logged and then rethrown to the caller.
*
* NOTE(review): the text lives on the blob, so every note or attachment referencing the same
* blobId presumably loses its OCR text as well - confirm this is the intended scope.
*
* @param blobId - ID of the blob whose OCR text should be cleared.
*/
deleteOCRResult(entityId: string, entityType: 'note' | 'attachment' = 'note'): void {
deleteOCRResult(blobId: string): void {
try {
sql.execute(`
DELETE FROM ocr_results
WHERE entity_id = ? AND entity_type = ?
`, [entityId, entityType]);
UPDATE blobs SET ocr_text = NULL
WHERE blobId = ?
`, [blobId]);
log.info(`Deleted OCR result for ${entityType} ${entityId}`);
log.info(`Deleted OCR result for blob ${blobId}`);
} catch (error) {
log.error(`Failed to delete OCR result for ${entityType} ${entityId}: ${error}`);
log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
throw error;
}
}
@ -373,14 +368,15 @@ class OCRService {
const imageNotes = sql.getRows<{
noteId: string;
mime: string;
blobId: string;
}>(`
SELECT noteId, mime
FROM notes
WHERE type = 'image'
AND isDeleted = 0
AND noteId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
)
SELECT n.noteId, n.mime, n.blobId
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
log.info(`Found ${imageNotes.length} image notes to process`);
@ -401,14 +397,15 @@ class OCRService {
const imageAttachments = sql.getRows<{
attachmentId: string;
mime: string;
blobId: string;
}>(`
SELECT attachmentId, mime
FROM attachments
WHERE role = 'image'
AND isDeleted = 0
AND attachmentId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
)
SELECT a.attachmentId, a.mime, a.blobId
FROM attachments a
LEFT JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
log.info(`Found ${imageAttachments.length} image attachments to process`);
@ -435,38 +432,48 @@ class OCRService {
/**
* Get OCR statistics
*
* Runs three COUNT queries against the blob-based storage:
*  - totalProcessed: blobs rows whose ocr_text is neither NULL nor empty;
*  - imageNotes: non-deleted notes of type image joined to such a blob;
*  - imageAttachments: non-deleted attachments with role image joined to such a blob.
*
* A missing row or count falls back to 0. If any query throws, the error is logged and all
* three values are returned as 0 instead of propagating the failure.
*
* NOTE(review): totalProcessed counts blobs while the other two count entities, and the first
* query has no filter on entity type or deletion state, so totalProcessed need not equal
* imageNotes + imageAttachments.
*
* @returns Counts of blobs, image notes and image attachments that currently have OCR text.
*/
getOCRStats(): { totalProcessed: number; averageConfidence: number; byEntityType: Record<string, number> } {
getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
try {
const stats = sql.getRow<{
total_processed: number;
avg_confidence: number;
}>(`
SELECT
COUNT(*) as total_processed,
AVG(confidence) as avg_confidence
FROM ocr_results
SELECT COUNT(*) as total_processed
FROM blobs
WHERE ocr_text IS NOT NULL AND ocr_text != ''
`);
const byEntityType = sql.getRows<{
entity_type: string;
// Count image notes with OCR
const noteStats = sql.getRow<{
count: number;
}>(`
SELECT entity_type, COUNT(*) as count
FROM ocr_results
GROUP BY entity_type
SELECT COUNT(*) as count
FROM notes n
JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND b.ocr_text IS NOT NULL AND b.ocr_text != ''
`);
// Count image attachments with OCR
const attachmentStats = sql.getRow<{
count: number;
}>(`
SELECT COUNT(*) as count
FROM attachments a
JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND b.ocr_text IS NOT NULL AND b.ocr_text != ''
`);
return {
totalProcessed: stats?.total_processed || 0,
averageConfidence: stats?.avg_confidence || 0,
byEntityType: byEntityType.reduce((acc, row) => {
acc[row.entity_type] = row.count;
return acc;
}, {} as Record<string, number>)
imageNotes: noteStats?.count || 0,
imageAttachments: attachmentStats?.count || 0
};
} catch (error) {
log.error(`Failed to get OCR stats: ${error}`);
return { totalProcessed: 0, averageConfidence: 0, byEntityType: {} };
return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
}
}
@ -584,14 +591,15 @@ class OCRService {
const imageNotes = sql.getRows<{
noteId: string;
mime: string;
blobId: string;
}>(`
SELECT noteId, mime
FROM notes
WHERE type = 'image'
AND isDeleted = 0
AND noteId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'note'
)
SELECT n.noteId, n.mime, n.blobId
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type = 'image'
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
for (const noteRow of imageNotes) {
@ -616,14 +624,15 @@ class OCRService {
const imageAttachments = sql.getRows<{
attachmentId: string;
mime: string;
blobId: string;
}>(`
SELECT attachmentId, mime
FROM attachments
WHERE role = 'image'
AND isDeleted = 0
AND attachmentId NOT IN (
SELECT entity_id FROM ocr_results WHERE entity_type = 'attachment'
)
SELECT a.attachmentId, a.mime, a.blobId
FROM attachments a
LEFT JOIN blobs b ON a.blobId = b.blobId
WHERE a.role = 'image'
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND (b.ocr_text IS NULL OR b.ocr_text = '')
`);
for (const attachmentRow of imageAttachments) {

View File

@ -25,21 +25,30 @@ export default class OCRContentExpression extends Expression {
const ocrResults = this.searchOCRContent(this.searchText);
for (const ocrResult of ocrResults) {
let note: import('../../../becca/entities/bnote.js').default | null = null;
if (ocrResult.entity_type === 'note') {
note = becca.getNote(ocrResult.entity_id);
} else if (ocrResult.entity_type === 'attachment') {
// For attachments, find the parent note
const attachment = becca.getAttachment(ocrResult.entity_id);
if (attachment) {
note = becca.getNote(attachment.ownerId);
// Find notes that use this blob
const notes = sql.getRows<{noteId: string}>(`
SELECT noteId FROM notes
WHERE blobId = ? AND isDeleted = 0
`, [ocrResult.blobId]);
for (const noteRow of notes) {
const note = becca.getNote(noteRow.noteId);
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
resultNoteSet.add(note);
}
}
// Only add notes that are in the input note set and not deleted
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
resultNoteSet.add(note);
// Find attachments that use this blob and their parent notes
const attachments = sql.getRows<{ownerId: string}>(`
SELECT ownerId FROM attachments
WHERE blobId = ? AND isDeleted = 0
`, [ocrResult.blobId]);
for (const attachmentRow of attachments) {
const note = becca.getNote(attachmentRow.ownerId);
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
resultNoteSet.add(note);
}
}
}
@ -62,44 +71,24 @@ export default class OCRContentExpression extends Expression {
}
private searchOCRContent(searchText: string): Array<{
entity_id: string;
entity_type: string;
extracted_text: string;
confidence: number;
blobId: string;
ocr_text: string;
}> {
try {
// Use FTS search if available, otherwise fall back to LIKE
let query: string;
let params: unknown[];
try {
// Try FTS first
query = `
SELECT ocr.entity_id, ocr.entity_type, ocr.extracted_text, ocr.confidence
FROM ocr_results_fts fts
JOIN ocr_results ocr ON fts.rowid = ocr.id
WHERE ocr_results_fts MATCH ?
ORDER BY ocr.confidence DESC, rank
LIMIT 50
`;
params = [searchText];
} catch {
// Fallback to LIKE search
query = `
SELECT entity_id, entity_type, extracted_text, confidence
FROM ocr_results
WHERE extracted_text LIKE ?
ORDER BY confidence DESC
LIMIT 50
`;
params = [`%${searchText}%`];
}
// Search in blobs table for OCR text
const query = `
SELECT blobId, ocr_text
FROM blobs
WHERE ocr_text LIKE ?
AND ocr_text IS NOT NULL
AND ocr_text != ''
LIMIT 50
`;
const params = [`%${searchText}%`];
return sql.getRows<{
entity_id: string;
entity_type: string;
extracted_text: string;
confidence: number;
blobId: string;
ocr_text: string;
}>(query, params);
} catch (error) {
console.error('Error searching OCR content:', error);

View File

@ -70,6 +70,7 @@ export interface BlobRow {
blobId: string;
content: string | Buffer;
contentLength: number;
ocr_text?: string | null;
dateModified: string;
utcDateModified: string;
}