added image OCR and parsing text from PDF (and OCR of PDF images)

This commit is contained in:
zadam 2023-01-26 20:32:27 +01:00
parent 63c62df787
commit ad887c4b12
13 changed files with 380 additions and 189 deletions

334
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@
"test-jasmine": "jasmine",
"test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
"test": "npm run test-jasmine && npm run test-es6",
"postinstall": "rimraf ./node_modules/canvas"
"postinstall": "node src-build/fix_pdfjs.js"
},
"dependencies": {
"@braintree/sanitize-url": "6.0.2",
@ -72,7 +72,7 @@
"normalize-strings": "1.1.1",
"ocrad.js": "antimatter15/ocrad.js#master",
"open": "8.4.0",
"pdfjs-dist": "2.8.335",
"pdfjs-dist": "3.2.146",
"rand-token": "1.0.1",
"react": "17.0.2",
"react-dom": "17.0.2",

12
src-build/fix_pdfjs.js Normal file
View File

@ -0,0 +1,12 @@
const fs = require("fs");
// Post-install helper: rewrites the "main" entry of the installed pdfjs-dist package manifest.
const PACKAGE_JSON_PATH = './node_modules/pdfjs-dist/package.json';

const manifestSource = fs.readFileSync(PACKAGE_JSON_PATH).toString();
const packageJson = JSON.parse(manifestSource);

// non-legacy build doesn't work on node 16 at least
packageJson.main = "legacy/build/pdf.js";

// write the manifest back in place, pretty-printed with 2 space indentation
const updatedManifest = JSON.stringify(packageJson, null, 2);
fs.writeFileSync(PACKAGE_JSON_PATH, updatedManifest);

View File

@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity {
&& this.mime === "text/html";
}
/** @returns {boolean} true if this note is an image */
isImage() {
return this.type === 'image'
|| (this.type === 'file' && this.mime?.startsWith('image/'));
}
/** @returns {boolean} true if the note has string content (not binary) */
isStringNote() {
return utils.isStringNote(this.type, this.mime);

View File

@ -123,7 +123,7 @@ function register(router) {
note.setContent(req.body);
noteService.scanForLinks(note);
noteService.asyncPostProcessContent(note, req.body);
return res.sendStatus(204);
});

View File

@ -3,7 +3,7 @@
const protectedSessionService = require('../../services/protected_session');
const utils = require('../../services/utils');
const log = require('../../services/log');
const noteRevisionService = require('../../services/note_revisions');
const noteService = require('../../services/notes');
const tmp = require('tmp');
const fs = require('fs');
const { Readable } = require('stream');
@ -31,21 +31,7 @@ function updateFile(req) {
note.setLabel('originalFileName', file.originalname);
if (note.mime === 'application/pdf') {
const pdfjsLib = require("pdfjs-dist");
(async () =>
{
let doc = await pdfjsLib.getDocument({data: file.buffer}).promise;
let page1 = await doc.getPage(1);
let content = await page1.getTextContent();
let strings = content.items.map(function (item) {
return item.str;
});
console.log(strings);
})();
}
noteService.asyncPostProcessContent(note, file.buffer);
return {
uploaded: true

View File

@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) {
return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
}
/**
 * Draws the given image bytes onto a canvas, runs OCRAD over it and stores the
 * recognized text as the 'plainText' attachment of the note.
 * Does nothing when the 'ocrImages' option is turned off.
 *
 * @param note - note which receives the 'plainText' attachment
 * @param buffer - image bytes assigned to the canvas image source
 */
function runOcr(note, buffer) {
    const ocrEnabled = optionService.getOptionBool('ocrImages');

    if (ocrEnabled) {
        const start = Date.now();

        const image = new Canvas.Image();
        image.src = buffer;

        const {width, height} = image;
        const canvas = new Canvas.createCanvas(width, height);
        canvas.getContext('2d').drawImage(image, 0, 0, width, height);

        const plainText = OCRAD(canvas);

        log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
}
function updateImage(noteId, uploadBuffer, originalName) {
log.info(`Updating image ${noteId}: ${originalName}`);

View File

@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) {
// save updated content with links to files/images
noteEntity.setContent(content);
noteService.scanForLinks(noteEntity);
noteService.asyncPostProcessContent(noteEntity, content);
updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
}

View File

@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) {
});
for (const noteId in createdNoteIds) { // now the noteIds are unique
noteService.scanForLinks(becca.getNote(noteId));
const note = becca.getNote(noteId);
await noteService.asyncPostProcessContent(note, note.getContent());
if (!metaFile) {
// if there's no meta file then the notes are created based on the order in that zip file but that

View File

@ -23,6 +23,7 @@ const dayjs = require("dayjs");
const htmlSanitizer = require("./html_sanitizer");
const ValidationError = require("../errors/validation_error");
const noteTypesService = require("./note_types");
const textExtractingService = require("./text_extracting");
function getNewNotePosition(parentNoteId) {
const note = becca.notes[parentNoteId];
@ -191,7 +192,7 @@ function createNewNote(params) {
}
}
scanForLinks(note);
asyncPostProcessContent(note, params.content);
copyChildAttributes(parentNote, note);
@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
if (updatedContent !== origContent) {
origNote.setContent(updatedContent);
scanForLinks(origNote);
asyncPostProcessContent(origNote, updatedContent);
eventService.emit(eventService.ENTITY_CHANGED, {
entityName: 'note_contents',
@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
AND parentNote.isDeleted = 0`, [noteId, deleteId]);
}
function scanForLinks(note) {
function scanForLinks(note, content) {
if (!note || !['text', 'relationMap'].includes(note.type)) {
return;
}
try {
const content = note.getContent();
const newContent = saveLinks(note, content);
if (content !== newContent) {
@ -729,6 +729,30 @@ function scanForLinks(note) {
}
}
/**
 * Runs OCR over the content of an image note and saves the recognized text
 * as the 'plainText' attachment of the note.
 *
 * Does nothing for non-image notes or when the 'ocrImages' option is turned off.
 * Failures are logged and never propagated to the caller.
 *
 * @param note - note whose content is being processed
 * @param buffer - image bytes handed over to the OCR service
 * @returns {Promise<void>} resolved once the OCR attempt has finished
 */
async function runOcr(note, buffer) {
    if (!note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }

    try {
        // ocrTextFromBuffer is async - without await a pending Promise would be saved
        // as the attachment content and a rejection would bypass the catch below
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
/**
 * Things which have to be executed after updating content, but asynchronously (separate transaction)
 *
 * The steps run in sequence: link scanning, OCR of image notes, text extraction from PDF notes.
 * Every step is awaited so that the returned promise settles only after all of them have finished.
 *
 * @param note - note whose content has been updated
 * @param content - the new content of the note (string or binary buffer)
 * @returns {Promise<void>}
 */
async function asyncPostProcessContent(note, content) {
    scanForLinks(note, content);

    // the OCR work is asynchronous; awaiting is harmless should runOcr return synchronously
    await runOcr(note, content);

    await textExtractingService.extractTextFromPdf(note, content);
}
function eraseNotes(noteIdsToErase) {
if (noteIdsToErase.length === 0) {
return;
@ -1006,7 +1030,6 @@ module.exports = {
updateNoteData,
undeleteNote,
protectNoteRecursively,
scanForLinks,
duplicateSubtree,
duplicateSubtreeWithoutRoot,
getUndeletedParentBranchIds,
@ -1014,5 +1037,6 @@ module.exports = {
eraseDeletedNotesNow,
eraseNotesWithDeleteId,
saveNoteRevisionIfNeeded,
downloadImages
downloadImages,
asyncPostProcessContent
};

View File

@ -91,6 +91,7 @@ const defaultOptions = [
{ name: 'disableTray', value: 'false', isSynced: false },
{ name: 'userGuideSha256Hash', value: '', isSynced: true },
{ name: 'ocrImages', value: 'true', isSynced: true },
{ name: 'extractTextFromPdf', value: 'true', isSynced: true },
];
function initStartupOptions() {

View File

@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression {
FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId)
WHERE name IN ('plainText') AND isDeleted = 0`)) {
this.findInText(row, inputNoteSet, resultNoteSet);
if (!resultNoteSet.hasNoteId(row.noteId)) {
this.findInText(row, inputNoteSet, resultNoteSet);
}
}
return resultNoteSet;

View File

@ -0,0 +1,129 @@
const Canvas = require("canvas");
const OCRAD = require("ocrad.js");
const log = require("./log.js");
const optionService = require("./options.js");
/**
 * Runs OCR over an image given as raw uncompressed pixel data.
 *
 * Supported values of img.kind:
 *   2 - RGB_24BPP
 *   3 - RGBA_32BPP
 * (kind 1 - GRAYSCALE_1BPP - is unsupported)
 *
 * @param img - object with `data` (Uint8ClampedArray of pixels), `kind`, `width` and `height`
 * @returns {string|null} recognized text, or null when the pixel format is not supported
 */
function ocrFromByteArray(img) {
    const isRawPixelData = img.data instanceof Uint8ClampedArray;
    const isSupportedKind = img.kind === 2 || img.kind === 3;

    if (!isRawPixelData || !isSupportedKind) {
        return null;
    }

    const start = Date.now();

    const canvas = new Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    const imageData = ctx.createImageData(img.width, img.height);
    const target = imageData.data;

    const sourceHasAlpha = img.kind !== 2;
    const targetLength = img.width * img.height * 4;
    let sourcePos = 0;

    for (let targetPos = 0; targetPos < targetLength; targetPos += 4) {
        target[targetPos] = img.data[sourcePos++];
        target[targetPos + 1] = img.data[sourcePos++];
        target[targetPos + 2] = img.data[sourcePos++];

        // RGB source pixels carry no alpha channel, so a fully opaque one is added
        target[targetPos + 3] = sourceHasAlpha ? img.data[sourcePos++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);

    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
/**
 * Runs OCR over all images painted on the given PDF page and appends the recognized
 * texts to `strings`.
 *
 * @param pdfjsLib - the pdfjs-dist module (its OPS table identifies the image painting operators)
 * @param page - PDF page; must provide getOperatorList() and the `objs` image store
 * @param {string[]} strings - output array, recognized texts are pushed into it
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const {fnArray, argsArray} = await page.getOperatorList();
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // operators and their arguments are parallel arrays, so walk them by index
    // (looking the index up with indexOf() for every operator is quadratic)
    for (let i = 0; i < fnArray.length; i++) {
        if (fnArray[i] !== paintXObject && fnArray[i] !== paintImageXObject) {
            continue;
        }

        // argument list may be missing, in which case there is no image to look up
        const imgKey = argsArray[i]?.[0];

        if (typeof imgKey !== 'string') {
            continue;
        }

        // NOTE(review): PDF.js keeps images shared across pages (ids prefixed with "g_") in
        // commonObjs; asking page.objs for such an id would never invoke the callback - confirm
        // against the PDF.js version in use
        const store = (imgKey.startsWith('g_') && page.commonObjs) || page.objs;

        const img = await new Promise(res => store.get(imgKey, res));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
async function extractTextFromPdf(note, buffer) {
if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
return;
}
try {
const pdfjsLib = require("pdfjs-dist");
const doc = await pdfjsLib.getDocument({data: buffer}).promise;
let strings = [];
for (let p = 1; p <= doc.numPages; p++) {
const page = await doc.getPage(p);
const content = await page.getTextContent({
normalizeWhitespace: true,
disableCombineTextItems: false
});
content.items.forEach(({str}) => strings.push(str));
try {
if (optionService.getOptionBool('ocrImages')) {
await ocrTextFromPdfImages(pdfjsLib, page, strings);
}
}
catch (e) {
log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
}
}
strings = strings.filter(str => str?.trim());
note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
}
catch (e) {
log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
}
}
/**
 * Runs OCR over an encoded image (JPEG, PNG etc.) and returns the recognized text.
 *
 * @param buffer - bytes of the encoded image
 * @returns {Promise<string>} text recognized in the image
 * @throws {Error} when the image can't be loaded
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    // the image reports the outcome of loading through its onload / onerror handlers
    const loading = new Promise((resolve, reject) => {
        const image = new Canvas.Image();

        image.onload = () => resolve(image);
        image.onerror = err => reject(new Error("Can't load the image " + err));
        image.src = buffer;
    });

    const loadedImage = await loading;
    const {width, height} = loadedImage;

    const canvas = new Canvas.createCanvas(width, height);
    canvas.getContext('2d').drawImage(loadedImage, 0, 0, width, height);

    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);

    return plainText;
}
// Public API of this module: OCR of an encoded image buffer and text extraction from PDF notes.
// ocrFromByteArray and ocrTextFromPdfImages are not exported.
module.exports = {
ocrTextFromBuffer,
extractTextFromPdf
};