mirror of
https://github.com/zadam/trilium.git
synced 2025-03-01 14:22:32 +01:00
added image OCR and parsing text from PDF (and OCR of PDF images)
This commit is contained in:
parent
63c62df787
commit
ad887c4b12
334
package-lock.json
generated
334
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -24,7 +24,7 @@
|
||||
"test-jasmine": "jasmine",
|
||||
"test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
|
||||
"test": "npm run test-jasmine && npm run test-es6",
|
||||
"postinstall": "rimraf ./node_modules/canvas"
|
||||
"postinstall": "node src-build/fix_pdfjs.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@braintree/sanitize-url": "6.0.2",
|
||||
@ -72,7 +72,7 @@
|
||||
"normalize-strings": "1.1.1",
|
||||
"ocrad.js": "antimatter15/ocrad.js#master",
|
||||
"open": "8.4.0",
|
||||
"pdfjs-dist": "2.8.335",
|
||||
"pdfjs-dist": "3.2.146",
|
||||
"rand-token": "1.0.1",
|
||||
"react": "17.0.2",
|
||||
"react-dom": "17.0.2",
|
||||
|
12
src-build/fix_pdfjs.js
Normal file
12
src-build/fix_pdfjs.js
Normal file
@ -0,0 +1,12 @@
|
||||
/*
 * Post-install step (wired up as the "postinstall" npm script): rewrites the "main"
 * entry of the installed pdfjs-dist package manifest so that requiring the package
 * resolves to its legacy build.
 */
const fs = require("fs");

const PACKAGE_JSON_PATH = './node_modules/pdfjs-dist/package.json';

// load and parse the manifest as it was installed
const manifestText = fs.readFileSync(PACKAGE_JSON_PATH).toString();
const packageJson = JSON.parse(manifestText);

// non-legacy build doesn't work on node 16 at least
packageJson.main = "legacy/build/pdf.js";

// write the patched manifest back in place, pretty-printed with 2-space indentation
const patchedManifestText = JSON.stringify(packageJson, null, 2);
fs.writeFileSync(PACKAGE_JSON_PATH, patchedManifestText);
|
@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity {
|
||||
&& this.mime === "text/html";
|
||||
}
|
||||
|
||||
/** @returns {boolean} true if this note is an image */
|
||||
isImage() {
|
||||
return this.type === 'image'
|
||||
|| (this.type === 'file' && this.mime?.startsWith('image/'));
|
||||
}
|
||||
|
||||
/** @returns {boolean} true if the note has string content (not binary) */
|
||||
isStringNote() {
|
||||
return utils.isStringNote(this.type, this.mime);
|
||||
|
@ -123,7 +123,7 @@ function register(router) {
|
||||
|
||||
note.setContent(req.body);
|
||||
|
||||
noteService.scanForLinks(note);
|
||||
noteService.asyncPostProcessContent(note, req.body);
|
||||
|
||||
return res.sendStatus(204);
|
||||
});
|
||||
|
@ -3,7 +3,7 @@
|
||||
const protectedSessionService = require('../../services/protected_session');
|
||||
const utils = require('../../services/utils');
|
||||
const log = require('../../services/log');
|
||||
const noteRevisionService = require('../../services/note_revisions');
|
||||
const noteService = require('../../services/notes');
|
||||
const tmp = require('tmp');
|
||||
const fs = require('fs');
|
||||
const { Readable } = require('stream');
|
||||
@ -31,21 +31,7 @@ function updateFile(req) {
|
||||
|
||||
note.setLabel('originalFileName', file.originalname);
|
||||
|
||||
if (note.mime === 'application/pdf') {
|
||||
const pdfjsLib = require("pdfjs-dist");
|
||||
|
||||
(async () =>
|
||||
{
|
||||
let doc = await pdfjsLib.getDocument({data: file.buffer}).promise;
|
||||
let page1 = await doc.getPage(1);
|
||||
let content = await page1.getTextContent();
|
||||
let strings = content.items.map(function (item) {
|
||||
return item.str;
|
||||
});
|
||||
|
||||
console.log(strings);
|
||||
})();
|
||||
}
|
||||
noteService.asyncPostProcessContent(note, file.buffer);
|
||||
|
||||
return {
|
||||
uploaded: true
|
||||
|
@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) {
|
||||
return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
|
||||
}
|
||||
|
||||
function runOcr(note, buffer) {
|
||||
if (!optionService.getOptionBool('ocrImages')) {
|
||||
return;
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
const img = new Canvas.Image();
|
||||
img.src = buffer;
|
||||
const canvas = new Canvas.createCanvas(img.width, img.height);
|
||||
const ctx = canvas.getContext('2d');
|
||||
ctx.drawImage(img, 0, 0, img.width, img.height);
|
||||
const plainText = OCRAD(canvas);
|
||||
|
||||
log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
|
||||
|
||||
note.saveNoteAttachment('plainText', 'text/plain', plainText);
|
||||
}
|
||||
|
||||
function updateImage(noteId, uploadBuffer, originalName) {
|
||||
log.info(`Updating image ${noteId}: ${originalName}`);
|
||||
|
||||
|
@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) {
|
||||
// save updated content with links to files/images
|
||||
noteEntity.setContent(content);
|
||||
|
||||
noteService.scanForLinks(noteEntity);
|
||||
noteService.asyncPostProcessContent(noteEntity, content);
|
||||
|
||||
updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
|
||||
}
|
||||
|
@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) {
|
||||
});
|
||||
|
||||
for (const noteId in createdNoteIds) { // now the noteIds are unique
|
||||
noteService.scanForLinks(becca.getNote(noteId));
|
||||
const note = becca.getNote(noteId);
|
||||
await noteService.asyncPostProcessContent(note, note.getContent());
|
||||
|
||||
if (!metaFile) {
|
||||
// if there's no meta file then the notes are created based on the order in that zip file but that
|
||||
|
@ -23,6 +23,7 @@ const dayjs = require("dayjs");
|
||||
const htmlSanitizer = require("./html_sanitizer");
|
||||
const ValidationError = require("../errors/validation_error");
|
||||
const noteTypesService = require("./note_types");
|
||||
const textExtractingService = require("./text_extracting");
|
||||
|
||||
function getNewNotePosition(parentNoteId) {
|
||||
const note = becca.notes[parentNoteId];
|
||||
@ -191,7 +192,7 @@ function createNewNote(params) {
|
||||
}
|
||||
}
|
||||
|
||||
scanForLinks(note);
|
||||
asyncPostProcessContent(note, params.content);
|
||||
|
||||
copyChildAttributes(parentNote, note);
|
||||
|
||||
@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
|
||||
if (updatedContent !== origContent) {
|
||||
origNote.setContent(updatedContent);
|
||||
|
||||
scanForLinks(origNote);
|
||||
asyncPostProcessContent(origNote, updatedContent);
|
||||
|
||||
eventService.emit(eventService.ENTITY_CHANGED, {
|
||||
entityName: 'note_contents',
|
||||
@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
|
||||
AND parentNote.isDeleted = 0`, [noteId, deleteId]);
|
||||
}
|
||||
|
||||
function scanForLinks(note) {
|
||||
function scanForLinks(note, content) {
|
||||
if (!note || !['text', 'relationMap'].includes(note.type)) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const content = note.getContent();
|
||||
const newContent = saveLinks(note, content);
|
||||
|
||||
if (content !== newContent) {
|
||||
@ -729,6 +729,30 @@ function scanForLinks(note) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs OCR on the content of an image note and stores the recognized text
 * as the note's 'plainText' attachment.
 *
 * Does nothing unless the note is an image and the 'ocrImages' option is enabled.
 * Failures are logged and never propagated to the caller.
 *
 * @param {BNote} note - note whose content is being processed
 * @param {Buffer} buffer - content of the note (presumably the encoded image bytes)
 * @returns {Promise<void>} settles once the OCR attempt has finished
 */
async function runOcr(note, buffer) {
    if (!note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }

    try {
        // ocrTextFromBuffer is async - without the await a pending Promise would be stored
        // instead of the text and a rejection would escape this try/catch
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}

/**
 * Things which have to be executed after updating content, but asynchronously (separate transaction)
 *
 * Runs link scanning, image OCR and PDF text extraction, in that order.
 *
 * @param {BNote} note - note whose content was just updated
 * @param {string|Buffer} content - the new content of the note
 * @returns {Promise<void>} settles after all post-processing steps have finished
 */
async function asyncPostProcessContent(note, content) {
    scanForLinks(note, content);

    // awaited so that callers which await the post-processing also wait for the OCR
    await runOcr(note, content);

    await textExtractingService.extractTextFromPdf(note, content);
}
|
||||
|
||||
function eraseNotes(noteIdsToErase) {
|
||||
if (noteIdsToErase.length === 0) {
|
||||
return;
|
||||
@ -1006,7 +1030,6 @@ module.exports = {
|
||||
updateNoteData,
|
||||
undeleteNote,
|
||||
protectNoteRecursively,
|
||||
scanForLinks,
|
||||
duplicateSubtree,
|
||||
duplicateSubtreeWithoutRoot,
|
||||
getUndeletedParentBranchIds,
|
||||
@ -1014,5 +1037,6 @@ module.exports = {
|
||||
eraseDeletedNotesNow,
|
||||
eraseNotesWithDeleteId,
|
||||
saveNoteRevisionIfNeeded,
|
||||
downloadImages
|
||||
downloadImages,
|
||||
asyncPostProcessContent
|
||||
};
|
||||
|
@ -91,6 +91,7 @@ const defaultOptions = [
|
||||
{ name: 'disableTray', value: 'false', isSynced: false },
|
||||
{ name: 'userGuideSha256Hash', value: '', isSynced: true },
|
||||
{ name: 'ocrImages', value: 'true', isSynced: true },
|
||||
{ name: 'extractTextFromPdf', value: 'true', isSynced: true },
|
||||
];
|
||||
|
||||
function initStartupOptions() {
|
||||
|
@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression {
|
||||
FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId)
|
||||
WHERE name IN ('plainText') AND isDeleted = 0`)) {
|
||||
|
||||
this.findInText(row, inputNoteSet, resultNoteSet);
|
||||
if (!resultNoteSet.hasNoteId(row.noteId)) {
|
||||
this.findInText(row, inputNoteSet, resultNoteSet);
|
||||
}
|
||||
}
|
||||
|
||||
return resultNoteSet;
|
||||
|
129
src/services/text_extracting.js
Normal file
129
src/services/text_extracting.js
Normal file
@ -0,0 +1,129 @@
|
||||
const Canvas = require("canvas");
|
||||
const OCRAD = require("ocrad.js");
|
||||
const log = require("./log.js");
|
||||
const optionService = require("./options.js");
|
||||
|
||||
/**
 * Runs OCR on raw, uncompressed pixel data by painting it onto a canvas first.
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img - image object
 *        as handed over by ocrTextFromPdfImages (resolved from the page objects of a PDF page)
 * @returns {string|null} recognized text, or null if the image has an unsupported format or no pixels
 */
function ocrFromByteArray(img) {
    // byte array contains raw uncompressed pixel data
    // kind: 1 - GRAYSCALE_1BPP (unsupported)
    // kind: 2 - RGB_24BPP
    // kind: 3 - RGBA_32BPP

    if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    // an image without any pixels has nothing to recognize
    if (!(img.width > 0 && img.height > 0)) {
        return null;
    }

    const start = Date.now();
    // createCanvas is a factory function, it is not meant to be called with "new"
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;

    const sourceHasAlpha = img.kind === 3;
    const pixelCount = img.width * img.height;

    for (let pixel = 0, src = 0, dst = 0; pixel < pixelCount; pixel++) {
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        imageBytes[dst++] = sourceHasAlpha ? img.data[src++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
|
||||
|
||||
/**
 * Runs OCR on the images painted by the given PDF page and appends the recognized texts to `strings`.
 *
 * @param {object} pdfjsLib - the pdfjs library, used for its OPS operator codes
 * @param {object} page - PDF page providing getOperatorList() and objs.get(key, callback)
 * @param {string[]} strings - output array, recognized (non-empty) texts are pushed into it
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();

    // parallel arrays: fns[i] is the operator code, args[i] its arguments
    const fns = ops.fnArray;
    const args = ops.argsArray;

    // iterating by index keeps operator and arguments paired; looking the position up with indexOf()
    // was quadratic and resolved to the wrong operator for repeated or null argument entries
    for (let i = 0; i < args.length; i++) {
        if (fns[i] !== pdfjsLib.OPS.paintXObject && fns[i] !== pdfjsLib.OPS.paintImageXObject) {
            continue;
        }

        const imgKey = args[i]?.[0];

        if (imgKey === undefined || imgKey === null) {
            // paint operator without an object key, there is nothing to resolve
            continue;
        }

        // objs.get() is callback based, adapt it to a promise
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
|
||||
|
||||
async function extractTextFromPdf(note, buffer) {
|
||||
if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const pdfjsLib = require("pdfjs-dist");
|
||||
const doc = await pdfjsLib.getDocument({data: buffer}).promise;
|
||||
let strings = [];
|
||||
|
||||
for (let p = 1; p <= doc.numPages; p++) {
|
||||
const page = await doc.getPage(p);
|
||||
|
||||
const content = await page.getTextContent({
|
||||
normalizeWhitespace: true,
|
||||
disableCombineTextItems: false
|
||||
});
|
||||
|
||||
content.items.forEach(({str}) => strings.push(str));
|
||||
|
||||
try {
|
||||
if (optionService.getOptionBool('ocrImages')) {
|
||||
await ocrTextFromPdfImages(pdfjsLib, page, strings);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
|
||||
}
|
||||
}
|
||||
|
||||
strings = strings.filter(str => str?.trim());
|
||||
|
||||
note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
|
||||
}
|
||||
catch (e) {
|
||||
log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs OCR on an encoded image.
 *
 * @param {Buffer} buffer - expected to contain an image in JPEG, PNG etc.
 * @returns {Promise<string>} text recognized in the image
 * @throws {Error} (as a rejection) if the image can't be loaded; the original error is kept as `cause`
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    // loading is reported through onload/onerror callbacks, adapt them to a promise
    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        image.onerror = err => rej(new Error("Can't load the image " + err, { cause: err }));
        image.src = buffer;
    });

    // createCanvas is a factory function, it is not meant to be called with "new"
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);
    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);

    return plainText;
}
|
||||
|
||||
module.exports = {
|
||||
ocrTextFromBuffer,
|
||||
extractTextFromPdf
|
||||
};
|
Loading…
x
Reference in New Issue
Block a user