From 63c62df787fb7626cb2744c1ef235187e84fdcbc Mon Sep 17 00:00:00 2001 From: zadam Date: Thu, 26 Jan 2023 09:42:11 +0100 Subject: [PATCH] ocr --- src/becca/entities/bnote_attachment.js | 45 ++++---- src/services/image.js | 34 ++++-- src/services/options_init.js | 1 + .../expressions/note_content_fulltext.js | 104 ++++++++++-------- 4 files changed, 104 insertions(+), 80 deletions(-) diff --git a/src/becca/entities/bnote_attachment.js b/src/becca/entities/bnote_attachment.js index 0fd8bc246..d9c41d51b 100644 --- a/src/becca/entities/bnote_attachment.js +++ b/src/becca/entities/bnote_attachment.js @@ -90,33 +90,34 @@ class BNoteAttachment extends AbstractBeccaEntity { } setContent(content) { - this.contentCheckSum = this.calculateCheckSum(content); - this.save(); // also explicitly save note_attachment to update contentCheckSum + sql.transactional(() => { + this.contentCheckSum = this.calculateCheckSum(content); + this.save(); // also explicitly save note_attachment to update contentCheckSum - const pojo = { - noteAttachmentId: this.noteAttachmentId, - content: content, - utcDateModified: dateUtils.utcNowDateTime() - }; + const pojo = { + noteAttachmentId: this.noteAttachmentId, + content: content, + utcDateModified: dateUtils.utcNowDateTime() + }; - if (this.isProtected) { - if (protectedSessionService.isProtectedSessionAvailable()) { - pojo.content = protectedSessionService.encrypt(pojo.content); + if (this.isProtected) { + if (protectedSessionService.isProtectedSessionAvailable()) { + pojo.content = protectedSessionService.encrypt(pojo.content); + } else { + throw new Error(`Cannot update content of noteAttachmentId=${this.noteAttachmentId} since we're out of protected session.`); + } } - else { - throw new Error(`Cannot update content of noteAttachmentId=${this.noteAttachmentId} since we're out of protected session.`); - } - } - sql.upsert("note_attachment_contents", "noteAttachmentId", pojo); + sql.upsert("note_attachment_contents", "noteAttachmentId", pojo); - entityChangesService.addEntityChange({ - entityName: 'note_attachment_contents', - entityId: this.noteAttachmentId, - hash: this.contentCheckSum, - isErased: false, - utcDateChanged: pojo.utcDateModified, - isSynced: true + entityChangesService.addEntityChange({ + entityName: 'note_attachment_contents', + entityId: this.noteAttachmentId, + hash: this.contentCheckSum, + isErased: false, + utcDateChanged: pojo.utcDateModified, + isSynced: true + }); }); } diff --git a/src/services/image.js b/src/services/image.js index 68cc71256..4173b7acf 100644 --- a/src/services/image.js +++ b/src/services/image.js @@ -65,6 +65,24 @@ function getImageMimeFromExtension(ext) { return `image/${ext === 'svg' ? 'svg+xml' : ext}`; } +function runOcr(note, buffer) { + if (!optionService.getOptionBool('ocrImages')) { + return; + } + + const start = Date.now(); + const img = new Canvas.Image(); + img.src = buffer; + const canvas = new Canvas.createCanvas(img.width, img.height); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, img.width, img.height); + const plainText = OCRAD(canvas); + + log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`); + + note.saveNoteAttachment('plainText', 'text/plain', plainText); +} + function updateImage(noteId, uploadBuffer, originalName) { log.info(`Updating image ${noteId}: ${originalName}`); @@ -85,17 +103,7 @@ function updateImage(noteId, uploadBuffer, originalName) { note.setContent(buffer); }); - const start = Date.now(); - const img = new Canvas.Image(); - img.src = buffer; - const canvas = new Canvas.createCanvas(img.width, img.height); - const ctx = canvas.getContext('2d'); - ctx.drawImage(img, 0, 0, img.width, img.height); - const text = OCRAD(canvas); - - console.log(text); - - log.info(`OCR of ${buffer.byteLength} bytes took ${Date.now() - start}ms`); + runOcr(note, buffer); }); } @@ -136,7 +144,9 @@ function saveImage(parentNoteId, uploadBuffer, originalName, shrinkImageSwitch, note.save(); note.setContent(buffer); - }) + }); + + runOcr(note, buffer); }); return { diff --git a/src/services/options_init.js b/src/services/options_init.js index 40c2b79d5..7300a58d7 100644 --- a/src/services/options_init.js +++ b/src/services/options_init.js @@ -90,6 +90,7 @@ const defaultOptions = [ { name: 'checkForUpdates', value: 'true', isSynced: true }, { name: 'disableTray', value: 'false', isSynced: false }, { name: 'userGuideSha256Hash', value: '', isSynced: true }, + { name: 'ocrImages', value: 'true', isSynced: true }, ]; function initStartupOptions() { diff --git a/src/services/search/expressions/note_content_fulltext.js b/src/services/search/expressions/note_content_fulltext.js index d2837340b..4663054a4 100644 --- a/src/services/search/expressions/note_content_fulltext.js +++ b/src/services/search/expressions/note_content_fulltext.js @@ -40,63 +40,75 @@ class NoteContentFulltextExp extends Expression { const resultNoteSet = new NoteSet(); const sql = require('../../sql'); - for (let {noteId, type, mime, content, isProtected} of sql.iterateRows(` + for (const row of sql.iterateRows(` SELECT noteId, type, mime, content, isProtected FROM notes JOIN note_contents USING (noteId) WHERE type IN ('text', 'code', 'mermaid') AND isDeleted = 0`)) { - if (!inputNoteSet.hasNoteId(noteId) || !(noteId in becca.notes)) { - continue; - } + this.findInText(row, inputNoteSet, resultNoteSet); + } - if (isProtected) { - if (!protectedSessionService.isProtectedSessionAvailable()) { - continue; - } + for (const row of sql.iterateRows(` + SELECT noteId, 'plainText' as type, mime, content, isProtected + FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId) + WHERE name IN ('plainText') AND isDeleted = 0`)) { - try { - content = protectedSessionService.decryptString(content); - } catch (e) { - log.info(`Cannot decrypt content of note ${noteId}`); - continue; - } - } - - content = this.preprocessContent(content, type, mime); - - if (this.tokens.length === 1) { - const [token] = this.tokens; - - if ((this.operator === '=' && token === content) - || (this.operator === '!=' && token !== content) - || (this.operator === '*=' && content.endsWith(token)) - || (this.operator === '=*' && content.startsWith(token)) - || (this.operator === '*=*' && content.includes(token)) - || (this.operator === '%=' && getRegex(token).test(content))) { - - resultNoteSet.add(becca.notes[noteId]); - } - } - else { - const nonMatchingToken = this.tokens.find(token => - !content.includes(token) && - ( - // in case of default fulltext search we should consider both title, attrs and content - // so e.g. "hello world" should match when "hello" is in title and "world" in content - !this.flatText - || !becca.notes[noteId].getFlatText().includes(token) - ) - ); - - if (!nonMatchingToken) { - resultNoteSet.add(becca.notes[noteId]); - } - } + this.findInText(row, inputNoteSet, resultNoteSet); } return resultNoteSet; } + findInText({noteId, isProtected, content, type, mime}, inputNoteSet, resultNoteSet) { + if (!inputNoteSet.hasNoteId(noteId) || !(noteId in becca.notes)) { + return; + } + + if (isProtected) { + if (!protectedSessionService.isProtectedSessionAvailable()) { + return; + } + + try { + content = protectedSessionService.decryptString(content); + } catch (e) { + log.info(`Cannot decrypt content of note ${noteId}`); + return; + } + } + + content = this.preprocessContent(content, type, mime); + + if (this.tokens.length === 1) { + const [token] = this.tokens; + + if ((this.operator === '=' && token === content) + || (this.operator === '!=' && token !== content) + || (this.operator === '*=' && content.endsWith(token)) + || (this.operator === '=*' && content.startsWith(token)) + || (this.operator === '*=*' && content.includes(token)) + || (this.operator === '%=' && getRegex(token).test(content))) { + + resultNoteSet.add(becca.notes[noteId]); + } + } else { + const nonMatchingToken = this.tokens.find(token => + !content.includes(token) && + ( + // in case of default fulltext search we should consider both title, attrs and content + // so e.g. "hello world" should match when "hello" is in title and "world" in content + !this.flatText + || !becca.notes[noteId].getFlatText().includes(token) + ) + ); + + if (!nonMatchingToken) { + resultNoteSet.add(becca.notes[noteId]); + } + } + return content; + } + preprocessContent(content, type, mime) { content = utils.normalize(content.toString());