added image OCR and parsing text from PDF (and OCR of PDF images)

2026-01-06 14:44:25 +01:00 · 2023-01-26 20:32:27 +01:00 · 2023-01-26 20:32:27 +01:00 · ad887c4b12
commit ad887c4b12
parent 63c62df787
13 changed files with 380 additions and 189 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -24,7 +24,7 @@
    "test-jasmine": "jasmine",
    "test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
    "test": "npm run test-jasmine && npm run test-es6",
-    "postinstall": "rimraf ./node_modules/canvas"
+    "postinstall": "node src-build/fix_pdfjs.js"
  },
  "dependencies": {
    "@braintree/sanitize-url": "6.0.2",
@ -72,7 +72,7 @@
    "normalize-strings": "1.1.1",
    "ocrad.js": "antimatter15/ocrad.js#master",
    "open": "8.4.0",
-    "pdfjs-dist": "2.8.335",
+    "pdfjs-dist": "3.2.146",
    "rand-token": "1.0.1",
    "react": "17.0.2",
    "react-dom": "17.0.2",
--- a/src-build/fix_pdfjs.js
+++ b/src-build/fix_pdfjs.js
@ -0,0 +1,12 @@
 const fs = require("fs");
 // Location of the pdfjs-dist manifest, relative to the directory the script is run from.
 const PACKAGE_JSON_PATH = './node_modules/pdfjs-dist/package.json';
 // Read the manifest as text and parse it into an object.
 const manifestSource = fs.readFileSync(PACKAGE_JSON_PATH).toString();
 const packageJson = JSON.parse(manifestSource);
 // non-legacy build doesn't work on node 16 at least, so point the package entry at the legacy bundle
 packageJson.main = "legacy/build/pdf.js";
 // Write the patched manifest back, pretty-printed with a 2-space indent.
 const patchedManifest = JSON.stringify(packageJson, null, 2);
 fs.writeFileSync(PACKAGE_JSON_PATH, patchedManifest);
--- a/src/becca/entities/bnote.js
+++ b/src/becca/entities/bnote.js
@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity {
            && this.mime === "text/html";
    }
    /** @returns {boolean} true if this note is an image */
    isImage() {
        return this.type === 'image'
            || (this.type === 'file' && this.mime?.startsWith('image/'));
    }
    /** @returns {boolean} true if the note has string content (not binary) */
    isStringNote() {
        return utils.isStringNote(this.type, this.mime);
--- a/src/etapi/notes.js
+++ b/src/etapi/notes.js
@ -123,7 +123,7 @@ function register(router) {
        note.setContent(req.body);
-        noteService.scanForLinks(note);
+        noteService.asyncPostProcessContent(note, req.body);
        return res.sendStatus(204);
    });
--- a/src/routes/api/files.js
+++ b/src/routes/api/files.js
@ -3,7 +3,7 @@
 const protectedSessionService = require('../../services/protected_session');
 const utils = require('../../services/utils');
 const log = require('../../services/log');
-const noteRevisionService = require('../../services/note_revisions');
+const noteService = require('../../services/notes');
 const tmp = require('tmp');
 const fs = require('fs');
 const { Readable } = require('stream');
@ -31,21 +31,7 @@ function updateFile(req) {
    note.setLabel('originalFileName', file.originalname);
-    if (note.mime === 'application/pdf') {
+    noteService.asyncPostProcessContent(note, file.buffer);
        const pdfjsLib = require("pdfjs-dist");
        (async () =>
        {
            let doc = await pdfjsLib.getDocument({data: file.buffer}).promise;
            let page1 = await doc.getPage(1);
            let content = await page1.getTextContent();
            let strings = content.items.map(function (item) {
                return item.str;
            });
            console.log(strings);
        })();
    }
    return {
        uploaded: true
--- a/src/services/image.js
+++ b/src/services/image.js
@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) {
    return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
 }
 /**
  * Runs OCR over an encoded image buffer and stores the recognized text
  * on the note as a 'plainText' attachment.
  *
  * @param note - note receiving the attachment (must provide saveNoteAttachment)
  * @param buffer - image bytes; its byteLength is reported in the log line
  */
 function runOcr(note, buffer) {
    // OCR is opt-in/out through the 'ocrImages' option
    if (!optionService.getOptionBool('ocrImages')) {
        return;
    }
    // timestamp used only for the duration reported in the log
    const start = Date.now();
    const img = new Canvas.Image();
    // NOTE(review): assumes width/height are populated synchronously once src is assigned - confirm for the canvas library in use
    img.src = buffer;
    const canvas = new Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    // paint the decoded image at full size so OCRAD sees every pixel
    ctx.drawImage(img, 0, 0, img.width, img.height);
    const plainText = OCRAD(canvas);
    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
    note.saveNoteAttachment('plainText', 'text/plain', plainText);
 }
 function updateImage(noteId, uploadBuffer, originalName) {
    log.info(`Updating image ${noteId}: ${originalName}`);
--- a/src/services/import/enex.js
+++ b/src/services/import/enex.js
@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) {
        // save updated content with links to files/images
        noteEntity.setContent(content);
-        noteService.scanForLinks(noteEntity);
+        noteService.asyncPostProcessContent(noteEntity, content);
        updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
    }
--- a/src/services/import/zip.js
+++ b/src/services/import/zip.js
@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) {
    });
    for (const noteId in createdNoteIds) { // now the noteIds are unique
-        noteService.scanForLinks(becca.getNote(noteId));
+        const note = becca.getNote(noteId);
        await noteService.asyncPostProcessContent(note, note.getContent());
        if (!metaFile) {
            // if there's no meta file then the notes are created based on the order in that zip file but that
--- a/src/services/notes.js
+++ b/src/services/notes.js
@ -23,6 +23,7 @@ const dayjs = require("dayjs");
 const htmlSanitizer = require("./html_sanitizer");
 const ValidationError = require("../errors/validation_error");
 const noteTypesService = require("./note_types");
 const textExtractingService = require("./text_extracting");
 function getNewNotePosition(parentNoteId) {
    const note = becca.notes[parentNoteId];
@ -191,7 +192,7 @@ function createNewNote(params) {
            }
        }
-        scanForLinks(note);
+        asyncPostProcessContent(note, params.content);
        copyChildAttributes(parentNote, note);
@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
                if (updatedContent !== origContent) {
                    origNote.setContent(updatedContent);
-                    scanForLinks(origNote);
+                    asyncPostProcessContent(origNote, updatedContent);
                    eventService.emit(eventService.ENTITY_CHANGED, {
                        entityName: 'note_contents',
@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
                      AND parentNote.isDeleted = 0`, [noteId, deleteId]);
 }
-function scanForLinks(note) {
+function scanForLinks(note, content) {
    if (!note || !['text', 'relationMap'].includes(note.type)) {
        return;
    }
    try {
        const content = note.getContent();
        const newContent = saveLinks(note, content);
        if (content !== newContent) {
@ -729,6 +729,30 @@ function scanForLinks(note) {
    }
 }
 /**
  * Runs OCR on an image note and stores the recognized text as the note's
  * 'plainText' attachment. Does nothing when the note is not an image or
  * when the 'ocrImages' option is disabled.
  *
  * Failures are logged and never propagate to the caller.
  *
  * @param note - note whose content is being processed
  * @param buffer - encoded image bytes passed on to the OCR service
  * @returns {Promise<void>} resolves once the attachment is saved or the failure is logged
  */
 async function runOcr(note, buffer) {
    if (!note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }
    try {
        // ocrTextFromBuffer is asynchronous - it has to be awaited, otherwise a pending Promise
        // (instead of the text) is saved and a rejection escapes this try/catch
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);
        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
 }
 /**
  * Things which have to be executed after updating content, but asynchronously (separate transaction):
  * link scanning, image OCR and PDF text extraction, in that order.
  *
  * @param note - note whose content was just updated
  * @param content - the new content (string or binary buffer, depending on the note)
  * @returns {Promise<void>} resolves once every post-processing step has finished
  */
 async function asyncPostProcessContent(note, content) {
    scanForLinks(note, content);
    // OCR may complete asynchronously; awaiting it makes the returned promise cover the OCR work too
    // (awaiting a plain synchronous return value is harmless)
    await runOcr(note, content);
    await textExtractingService.extractTextFromPdf(note, content);
 }
 function eraseNotes(noteIdsToErase) {
    if (noteIdsToErase.length === 0) {
        return;
@ -1006,7 +1030,6 @@ module.exports = {
    updateNoteData,
    undeleteNote,
    protectNoteRecursively,
    scanForLinks,
    duplicateSubtree,
    duplicateSubtreeWithoutRoot,
    getUndeletedParentBranchIds,
@ -1014,5 +1037,6 @@ module.exports = {
    eraseDeletedNotesNow,
    eraseNotesWithDeleteId,
    saveNoteRevisionIfNeeded,
-    downloadImages
+    downloadImages,
    asyncPostProcessContent
 };
--- a/src/services/options_init.js
+++ b/src/services/options_init.js
@ -91,6 +91,7 @@ const defaultOptions = [
    { name: 'disableTray', value: 'false', isSynced: false },
    { name: 'userGuideSha256Hash', value: '', isSynced: true },
    { name: 'ocrImages', value: 'true', isSynced: true },
    { name: 'extractTextFromPdf', value: 'true', isSynced: true },
 ];
 function initStartupOptions() {
--- a/src/services/search/expressions/note_content_fulltext.js
+++ b/src/services/search/expressions/note_content_fulltext.js
@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression {
                FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId) 
                WHERE name IN ('plainText') AND isDeleted = 0`)) {
-            this.findInText(row, inputNoteSet, resultNoteSet);
+            if (!resultNoteSet.hasNoteId(row.noteId)) {
                this.findInText(row, inputNoteSet, resultNoteSet);
            }
        }
        return resultNoteSet;
--- a/src/services/text_extracting.js
+++ b/src/services/text_extracting.js
@ -0,0 +1,129 @@
 const Canvas = require("canvas");
 const OCRAD = require("ocrad.js");
 const log = require("./log.js");
 const optionService = require("./options.js");
 /**
  * Runs OCR over raw, uncompressed pixel data by copying it onto a canvas as RGBA.
  *
  * Supported pixel layouts (img.kind):
  *   1 - GRAYSCALE_1BPP (unsupported)
  *   2 - RGB_24BPP
  *   3 - RGBA_32BPP
  *
  * @param img - object with `data` (Uint8ClampedArray of pixels), `kind`, `width` and `height`
  * @returns {string|null} recognized text, or null when the pixel data cannot be handled
  */
 function ocrFromByteArray(img) {
    if (!(img.data instanceof Uint8ClampedArray)) {
        return null;
    }
    if (img.kind !== 2 && img.kind !== 3) {
        return null;
    }

    const start = Date.now();

    const canvas = new Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    const imageData = ctx.createImageData(img.width, img.height);
    const rgba = imageData.data;

    const sourceHasAlpha = img.kind === 3;
    const pixelCount = img.width * img.height;

    for (let pixel = 0, src = 0; pixel < pixelCount; pixel++) {
        const dst = pixel * 4;

        rgba[dst] = img.data[src++];
        rgba[dst + 1] = img.data[src++];
        rgba[dst + 2] = img.data[src++];
        // RGB sources carry no alpha channel, so every pixel is made fully opaque
        rgba[dst + 3] = sourceHasAlpha ? img.data[src++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);

    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
 }
 /**
  * OCRs every image painted on a PDF page and appends the recognized texts to `strings`.
  *
  * @param pdfjsLib - the pdf.js module (its OPS table identifies the paint operators)
  * @param page - pdf.js page; its operator list and `objs` store are read
  * @param {string[]} strings - output accumulator, mutated in place
  * @returns {Promise<void>}
  */
 async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();
    const fns = ops.fnArray;
    const args = ops.argsArray;
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // fnArray and argsArray are parallel arrays, so they are walked by position; looking the index up
    // with indexOf() is quadratic and picks the wrong operator when an args entry repeats
    for (let i = 0; i < args.length; i++) {
        if (fns[i] !== paintXObject && fns[i] !== paintImageXObject) {
            continue;
        }

        const imgKey = args[i]?.[0];
        if (!imgKey) {
            // paint operator without an object reference - nothing to look up
            continue;
        }

        // objs.get() is callback based, so it is adapted to a promise here
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));
        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);
        if (text) {
            strings.push(text);
        }
    }
 }
 async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }
    try {
        const pdfjsLib = require("pdfjs-dist");
        const doc = await pdfjsLib.getDocument({data: buffer}).promise;
        let strings = [];
        for (let p = 1; p <= doc.numPages; p++) {
            const page = await doc.getPage(p);
            const content = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });
            content.items.forEach(({str}) => strings.push(str));
            try {
                if (optionService.getOptionBool('ocrImages')) {
                    await ocrTextFromPdfImages(pdfjsLib, page, strings);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }
        strings = strings.filter(str => str?.trim());
        note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
    }
    catch (e) {
        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
 }
 /**
  * Decodes an encoded image (JPEG, PNG etc.) from the buffer, paints it on a canvas
  * and runs OCR over it.
  *
  * @param buffer - encoded image bytes
  * @returns {Promise<string>} the recognized text; rejects when the image cannot be loaded
  */
 async function ocrTextFromBuffer(buffer) {
    const startedAt = Date.now();

    // image decoding reports its outcome through callbacks, hence the promise wrapper
    const decoded = await new Promise((resolve, reject) => {
        const image = new Canvas.Image();

        image.onload = () => resolve(image);
        image.onerror = err => reject(new Error("Can't load the image " + err));
        // handlers are attached first, assigning src is what starts the loading
        image.src = buffer;
    });

    const {width, height} = decoded;
    const canvas = new Canvas.createCanvas(width, height);

    canvas.getContext('2d').drawImage(decoded, 0, 0, width, height);

    const plainText = OCRAD(canvas);
    const elapsedMs = Date.now() - startedAt;

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${elapsedMs}ms`);

    return plainText;
 }
 // Public API of this module: OCR of an encoded image buffer and text extraction from PDF notes.
 // The other functions defined in this file are not exported.
 module.exports = {
    ocrTextFromBuffer,
    extractTextFromPdf
 };