added image OCR and parsing text from PDF (and OCR of PDF images)

2025-06-06 18:08:33 +02:00 · 2023-01-26 20:32:27 +01:00 · 2023-01-26 20:32:27 +01:00 · ad887c4b12
commit ad887c4b12
parent 63c62df787
13 changed files with 380 additions and 189 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -24,7 +24,7 @@
    "test-jasmine": "jasmine",
    "test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
    "test": "npm run test-jasmine && npm run test-es6",
-    "postinstall": "rimraf ./node_modules/canvas"
+    "postinstall": "node src-build/fix_pdfjs.js"
  },
  "dependencies": {
    "@braintree/sanitize-url": "6.0.2",
@ -72,7 +72,7 @@
    "normalize-strings": "1.1.1",
    "ocrad.js": "antimatter15/ocrad.js#master",
    "open": "8.4.0",
-    "pdfjs-dist": "2.8.335",
+    "pdfjs-dist": "3.2.146",
    "rand-token": "1.0.1",
    "react": "17.0.2",
    "react-dom": "17.0.2",
--- a/src-build/fix_pdfjs.js
+++ b/src-build/fix_pdfjs.js
@ -0,0 +1,12 @@
+// Postinstall helper (wired up in package.json): repoints the "main" entry of the installed
+// pdfjs-dist package to its legacy build.
+const fs = require("fs");
+const path = require("path");
+
+// Resolved against this script's location instead of the current working directory, so the fix
+// also works when the script is not launched from the repository root.
+const PACKAGE_JSON_PATH = path.resolve(__dirname, '..', 'node_modules', 'pdfjs-dist', 'package.json');
+
+const packageJson = JSON.parse(fs.readFileSync(PACKAGE_JSON_PATH, 'utf8'));
+
+// non-legacy build doesn't work on node 16 at least
+packageJson.main = "legacy/build/pdf.js";
+
+fs.writeFileSync(PACKAGE_JSON_PATH, JSON.stringify(packageJson, null, 2));
--- a/src/becca/entities/bnote.js
+++ b/src/becca/entities/bnote.js
@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity {
            && this.mime === "text/html";
    }

+    /**
+     * A note counts as an image when its type is 'image', or when it is a 'file' note
+     * whose mime starts with 'image/'.
+     *
+     * @returns {boolean} true if this note is an image
+     */
+    isImage() {
+        if (this.type === 'image') {
+            return true;
+        }
+
+        // Boolean() keeps the documented return type - with a missing mime the optional chain
+        // would otherwise leak `undefined` to the caller
+        return this.type === 'file' && Boolean(this.mime?.startsWith('image/'));
+    }
+
    /** @returns {boolean} true if the note has string content (not binary) */
    isStringNote() {
        return utils.isStringNote(this.type, this.mime);
--- a/src/etapi/notes.js
+++ b/src/etapi/notes.js
@ -123,7 +123,7 @@ function register(router) {

        note.setContent(req.body);

-        noteService.scanForLinks(note);
+        noteService.asyncPostProcessContent(note, req.body);

        return res.sendStatus(204);
    });
--- a/src/routes/api/files.js
+++ b/src/routes/api/files.js
@ -3,7 +3,7 @@
 const protectedSessionService = require('../../services/protected_session');
 const utils = require('../../services/utils');
 const log = require('../../services/log');
-const noteRevisionService = require('../../services/note_revisions');
+const noteService = require('../../services/notes');
 const tmp = require('tmp');
 const fs = require('fs');
 const { Readable } = require('stream');
@ -31,21 +31,7 @@ function updateFile(req) {

    note.setLabel('originalFileName', file.originalname);

-    if (note.mime === 'application/pdf') {
-        const pdfjsLib = require("pdfjs-dist");
-
-        (async () =>
-        {
-            let doc = await pdfjsLib.getDocument({data: file.buffer}).promise;
-            let page1 = await doc.getPage(1);
-            let content = await page1.getTextContent();
-            let strings = content.items.map(function (item) {
-                return item.str;
-            });
-
-            console.log(strings);
-        })();
-    }
+    noteService.asyncPostProcessContent(note, file.buffer);

    return {
        uploaded: true
--- a/src/services/image.js
+++ b/src/services/image.js
@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) {
    return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
 }

-function runOcr(note, buffer) {
-    if (!optionService.getOptionBool('ocrImages')) {
-        return;
-    }
-
-    const start = Date.now();
-    const img = new Canvas.Image();
-    img.src = buffer;
-    const canvas = new Canvas.createCanvas(img.width, img.height);
-    const ctx = canvas.getContext('2d');
-    ctx.drawImage(img, 0, 0, img.width, img.height);
-    const plainText = OCRAD(canvas);
-
-    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
-
-    note.saveNoteAttachment('plainText', 'text/plain', plainText);
-}
-
 function updateImage(noteId, uploadBuffer, originalName) {
    log.info(`Updating image ${noteId}: ${originalName}`);

--- a/src/services/import/enex.js
+++ b/src/services/import/enex.js
@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) {
        // save updated content with links to files/images
        noteEntity.setContent(content);

-        noteService.scanForLinks(noteEntity);
+        noteService.asyncPostProcessContent(noteEntity, content);

        updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
    }
--- a/src/services/import/zip.js
+++ b/src/services/import/zip.js
@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) {
    });

    for (const noteId in createdNoteIds) { // now the noteIds are unique
-        noteService.scanForLinks(becca.getNote(noteId));
+        const note = becca.getNote(noteId);
+        await noteService.asyncPostProcessContent(note, note.getContent());

        if (!metaFile) {
            // if there's no meta file then the notes are created based on the order in that zip file but that
--- a/src/services/notes.js
+++ b/src/services/notes.js
@ -23,6 +23,7 @@ const dayjs = require("dayjs");
 const htmlSanitizer = require("./html_sanitizer");
 const ValidationError = require("../errors/validation_error");
 const noteTypesService = require("./note_types");
+const textExtractingService = require("./text_extracting");

 function getNewNotePosition(parentNoteId) {
    const note = becca.notes[parentNoteId];
@ -191,7 +192,7 @@ function createNewNote(params) {
            }
        }

-        scanForLinks(note);
+        asyncPostProcessContent(note, params.content);

        copyChildAttributes(parentNote, note);

@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
                if (updatedContent !== origContent) {
                    origNote.setContent(updatedContent);

-                    scanForLinks(origNote);
+                    asyncPostProcessContent(origNote, updatedContent);

                    eventService.emit(eventService.ENTITY_CHANGED, {
                        entityName: 'note_contents',
@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
                      AND parentNote.isDeleted = 0`, [noteId, deleteId]);
 }

-function scanForLinks(note) {
+function scanForLinks(note, content) {
    if (!note || !['text', 'relationMap'].includes(note.type)) {
        return;
    }

    try {
-        const content = note.getContent();
        const newContent = saveLinks(note, content);

        if (content !== newContent) {
@ -729,6 +729,30 @@ function scanForLinks(note) {
    }
 }

+/**
+ * OCRs the given content and stores the recognized text as the note's 'plainText' attachment.
+ * Does nothing unless the note is an image and the 'ocrImages' option is enabled.
+ * Failures are logged, never propagated to the caller.
+ *
+ * @param {BNote} note
+ * @param {Buffer} buffer - content handed over to textExtractingService.ocrTextFromBuffer()
+ * @returns {Promise<void>}
+ */
+async function runOcr(note, buffer) {
+    // missing note is tolerated the same way scanForLinks() tolerates it
+    if (!note?.isImage() || !optionService.getOptionBool('ocrImages')) {
+        return;
+    }
+
+    try {
+        // ocrTextFromBuffer() is async - without the await a pending Promise (instead of the text)
+        // would be saved as the attachment and its rejection would bypass this try/catch
+        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);
+
+        note.saveNoteAttachment('plainText', 'text/plain', plainText);
+    }
+    catch (e) {
+        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
+    }
+}
+
+/**
+ * Things which have to be executed after updating content, but asynchronously (separate transaction)
+ *
+ * Runs link scanning, image OCR and PDF text extraction one after another.
+ *
+ * @param {BNote} note
+ * @param {string|Buffer} content - the content which has just been set on the note
+ * @returns {Promise<void>}
+ */
+async function asyncPostProcessContent(note, content) {
+    scanForLinks(note, content);
+    await runOcr(note, content);
+    await textExtractingService.extractTextFromPdf(note, content);
+}
+
 function eraseNotes(noteIdsToErase) {
    if (noteIdsToErase.length === 0) {
        return;
@ -1006,7 +1030,6 @@ module.exports = {
    updateNoteData,
    undeleteNote,
    protectNoteRecursively,
-    scanForLinks,
    duplicateSubtree,
    duplicateSubtreeWithoutRoot,
    getUndeletedParentBranchIds,
@ -1014,5 +1037,6 @@ module.exports = {
    eraseDeletedNotesNow,
    eraseNotesWithDeleteId,
    saveNoteRevisionIfNeeded,
-    downloadImages
+    downloadImages,
+    asyncPostProcessContent
 };
--- a/src/services/options_init.js
+++ b/src/services/options_init.js
@ -91,6 +91,7 @@ const defaultOptions = [
    { name: 'disableTray', value: 'false', isSynced: false },
    { name: 'userGuideSha256Hash', value: '', isSynced: true },
    { name: 'ocrImages', value: 'true', isSynced: true },
+    { name: 'extractTextFromPdf', value: 'true', isSynced: true },
 ];

 function initStartupOptions() {
--- a/src/services/search/expressions/note_content_fulltext.js
+++ b/src/services/search/expressions/note_content_fulltext.js
@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression {
                FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId) 
                WHERE name IN ('plainText') AND isDeleted = 0`)) {

-            this.findInText(row, inputNoteSet, resultNoteSet);
+            if (!resultNoteSet.hasNoteId(row.noteId)) {
+                this.findInText(row, inputNoteSet, resultNoteSet);
+            }
        }

        return resultNoteSet;
--- a/src/services/text_extracting.js
+++ b/src/services/text_extracting.js
@ -0,0 +1,129 @@
+const Canvas = require("canvas");
+const OCRAD = require("ocrad.js");
+const log = require("./log.js");
+const optionService = require("./options.js");
+
+/**
+ * Runs OCR over raw, uncompressed pixel data by copying it onto a canvas first.
+ *
+ * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
+ * @returns {string|null} recognized text, or null when the pixel data is in an unsupported form
+ */
+function ocrFromByteArray(img) {
+    // layout of the raw pixel bytes is given by "kind":
+    //   1 - GRAYSCALE_1BPP (unsupported)
+    //   2 - RGB_24BPP
+    //   3 - RGBA_32BPP
+    const isSupportedKind = img.kind === 2 || img.kind === 3;
+
+    if (!isSupportedKind || !(img.data instanceof Uint8ClampedArray)) {
+        return null;
+    }
+
+    const start = Date.now();
+    const canvas = Canvas.createCanvas(img.width, img.height);
+    const ctx = canvas.getContext('2d');
+
+    const imageData = ctx.createImageData(img.width, img.height);
+    const target = imageData.data;
+    const pixelCount = img.width * img.height;
+
+    // target is always RGBA (4 bytes per pixel), source has 3 or 4 bytes per pixel depending on kind
+    for (let pixel = 0, src = 0; pixel < pixelCount; pixel++) {
+        const dst = pixel * 4;
+
+        target[dst] = img.data[src++];
+        target[dst + 1] = img.data[src++];
+        target[dst + 2] = img.data[src++];
+
+        if (img.kind === 2) {
+            // RGB source carries no alpha channel, so the pixel is made fully opaque
+            target[dst + 3] = 255;
+        }
+        else {
+            target[dst + 3] = img.data[src++];
+        }
+    }
+
+    ctx.putImageData(imageData, 0, 0);
+    const text = OCRAD(canvas);
+
+    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);
+
+    return text;
+}
+
+/**
+ * Walks the operator list of a PDF page, OCRs every painted image and appends the recognized
+ * texts to `strings`.
+ *
+ * @param pdfjsLib - the pdfjs-dist module (provides the OPS constants)
+ * @param page - pdf.js page object
+ * @param {string[]} strings - output array, recognized texts are pushed into it
+ * @returns {Promise<void>}
+ */
+async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
+    const {fnArray, argsArray} = await page.getOperatorList();
+
+    // fnArray and argsArray are parallel arrays, so they are walked by index - looking the index up
+    // with indexOf() is quadratic and yields the wrong position for repeated/null entries
+    for (let i = 0; i < argsArray.length; i++) {
+        const fn = fnArray[i];
+
+        if (fn !== pdfjsLib.OPS.paintXObject && fn !== pdfjsLib.OPS.paintImageXObject) {
+            continue;
+        }
+
+        const imgKey = argsArray[i]?.[0];
+
+        if (typeof imgKey !== 'string') {
+            continue;
+        }
+
+        // pdf.js keeps images shared across pages (ids prefixed with "g_") in commonObjs, the rest in
+        // the page's own objs; waiting on the wrong store would never resolve
+        const objs = imgKey.startsWith('g_') ? page.commonObjs : page.objs;
+        const img = await new Promise(res => objs.get(imgKey, res));
+
+        if (!img) {
+            continue;
+        }
+
+        const text = ocrFromByteArray(img);
+
+        if (text) {
+            strings.push(text);
+        }
+    }
+}
+
+/**
+ * Extracts the text of all pages of a PDF (plus OCR of embedded images when the 'ocrImages'
+ * option is on) and saves it as the note's 'plainText' attachment.
+ * Does nothing unless the note's mime is 'application/pdf' and the 'extractTextFromPdf' option is enabled.
+ * Failures are logged, never propagated to the caller.
+ *
+ * @param {BNote} note
+ * @param buffer - binary content of the PDF
+ * @returns {Promise<void>}
+ */
+async function extractTextFromPdf(note, buffer) {
+    if (!note || note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
+        return;
+    }
+
+    let doc = null;
+
+    try {
+        const pdfjsLib = require("pdfjs-dist");
+        doc = await pdfjsLib.getDocument({data: buffer}).promise;
+        let strings = [];
+
+        for (let p = 1; p <= doc.numPages; p++) {
+            const page = await doc.getPage(p);
+
+            const content = await page.getTextContent({
+                normalizeWhitespace: true,
+                disableCombineTextItems: false
+            });
+
+            content.items.forEach(({str}) => strings.push(str));
+
+            // OCR failure of one page must not throw away the text extracted so far
+            try {
+                if (optionService.getOptionBool('ocrImages')) {
+                    await ocrTextFromPdfImages(pdfjsLib, page, strings);
+                }
+            }
+            catch (e) {
+                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
+            }
+        }
+
+        // drops items without text as well as whitespace-only ones
+        strings = strings.filter(str => str?.trim());
+
+        note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
+    }
+    catch (e) {
+        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
+    }
+    finally {
+        // release the resources pdf.js holds for the loaded document
+        try {
+            await doc?.destroy();
+        }
+        catch (e) {
+            log.info(`Releasing PDF document of note '${note.noteId}' failed with error '${e.message}'`);
+        }
+    }
+}
+
+/**
+ * Decodes an encoded image into a canvas Image.
+ *
+ * @param buffer - encoded image data
+ * @returns {Promise} resolves with the loaded image, rejects when the image can't be loaded
+ */
+function decodeImage(buffer) {
+    return new Promise((resolve, reject) => {
+        const image = new Canvas.Image();
+
+        // handlers are attached before assigning src
+        image.onload = () => resolve(image);
+        image.onerror = err => reject(new Error("Can't load the image " + err));
+        image.src = buffer;
+    });
+}
+
+/**
+ * Runs OCR on an encoded image.
+ *
+ * @param buffer - expected to contain an image in JPEG, PNG etc.
+ * @returns {Promise<string>} the recognized text
+ */
+async function ocrTextFromBuffer(buffer) {
+    const start = Date.now();
+
+    const image = await decodeImage(buffer);
+    const {width, height} = image;
+
+    const canvas = Canvas.createCanvas(width, height);
+    canvas.getContext('2d').drawImage(image, 0, 0, width, height);
+
+    const plainText = OCRAD(canvas);
+
+    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
+
+    return plainText;
+}
+
+module.exports = {
+    ocrTextFromBuffer,
+    extractTextFromPdf
+};