mirror of
https://github.com/zadam/trilium.git
synced 2025-03-01 14:22:32 +01:00
added image OCR and parsing text from PDF (and OCR of PDF images)
This commit is contained in:
parent
63c62df787
commit
ad887c4b12
334
package-lock.json
generated
334
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -24,7 +24,7 @@
|
|||||||
"test-jasmine": "jasmine",
|
"test-jasmine": "jasmine",
|
||||||
"test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
|
"test-es6": "node -r esm spec-es6/attribute_parser.spec.js ",
|
||||||
"test": "npm run test-jasmine && npm run test-es6",
|
"test": "npm run test-jasmine && npm run test-es6",
|
||||||
"postinstall": "rimraf ./node_modules/canvas"
|
"postinstall": "node src-build/fix_pdfjs.js"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@braintree/sanitize-url": "6.0.2",
|
"@braintree/sanitize-url": "6.0.2",
|
||||||
@ -72,7 +72,7 @@
|
|||||||
"normalize-strings": "1.1.1",
|
"normalize-strings": "1.1.1",
|
||||||
"ocrad.js": "antimatter15/ocrad.js#master",
|
"ocrad.js": "antimatter15/ocrad.js#master",
|
||||||
"open": "8.4.0",
|
"open": "8.4.0",
|
||||||
"pdfjs-dist": "2.8.335",
|
"pdfjs-dist": "3.2.146",
|
||||||
"rand-token": "1.0.1",
|
"rand-token": "1.0.1",
|
||||||
"react": "17.0.2",
|
"react": "17.0.2",
|
||||||
"react-dom": "17.0.2",
|
"react-dom": "17.0.2",
|
||||||
|
12
src-build/fix_pdfjs.js
Normal file
12
src-build/fix_pdfjs.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
// Post-install patch: rewrites the "main" entry in the package.json of the installed
// pdfjs-dist package so that it points to the legacy build.
const fs = require("fs");

const PACKAGE_JSON_PATH = './node_modules/pdfjs-dist/package.json';

const manifestSource = fs.readFileSync(PACKAGE_JSON_PATH).toString();
const manifest = JSON.parse(manifestSource);

// non-legacy build doesn't work on node 16 at least
manifest.main = "legacy/build/pdf.js";

const patchedManifestSource = JSON.stringify(manifest, null, 2);

fs.writeFileSync(PACKAGE_JSON_PATH, patchedManifestSource);
|
@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity {
|
|||||||
&& this.mime === "text/html";
|
&& this.mime === "text/html";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @returns {boolean} true if this note is an image */
|
||||||
|
isImage() {
|
||||||
|
return this.type === 'image'
|
||||||
|
|| (this.type === 'file' && this.mime?.startsWith('image/'));
|
||||||
|
}
|
||||||
|
|
||||||
/** @returns {boolean} true if the note has string content (not binary) */
|
/** @returns {boolean} true if the note has string content (not binary) */
|
||||||
isStringNote() {
|
isStringNote() {
|
||||||
return utils.isStringNote(this.type, this.mime);
|
return utils.isStringNote(this.type, this.mime);
|
||||||
|
@ -123,7 +123,7 @@ function register(router) {
|
|||||||
|
|
||||||
note.setContent(req.body);
|
note.setContent(req.body);
|
||||||
|
|
||||||
noteService.scanForLinks(note);
|
noteService.asyncPostProcessContent(note, req.body);
|
||||||
|
|
||||||
return res.sendStatus(204);
|
return res.sendStatus(204);
|
||||||
});
|
});
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
const protectedSessionService = require('../../services/protected_session');
|
const protectedSessionService = require('../../services/protected_session');
|
||||||
const utils = require('../../services/utils');
|
const utils = require('../../services/utils');
|
||||||
const log = require('../../services/log');
|
const log = require('../../services/log');
|
||||||
const noteRevisionService = require('../../services/note_revisions');
|
const noteService = require('../../services/notes');
|
||||||
const tmp = require('tmp');
|
const tmp = require('tmp');
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const { Readable } = require('stream');
|
const { Readable } = require('stream');
|
||||||
@ -31,21 +31,7 @@ function updateFile(req) {
|
|||||||
|
|
||||||
note.setLabel('originalFileName', file.originalname);
|
note.setLabel('originalFileName', file.originalname);
|
||||||
|
|
||||||
if (note.mime === 'application/pdf') {
|
noteService.asyncPostProcessContent(note, file.buffer);
|
||||||
const pdfjsLib = require("pdfjs-dist");
|
|
||||||
|
|
||||||
(async () =>
|
|
||||||
{
|
|
||||||
let doc = await pdfjsLib.getDocument({data: file.buffer}).promise;
|
|
||||||
let page1 = await doc.getPage(1);
|
|
||||||
let content = await page1.getTextContent();
|
|
||||||
let strings = content.items.map(function (item) {
|
|
||||||
return item.str;
|
|
||||||
});
|
|
||||||
|
|
||||||
console.log(strings);
|
|
||||||
})();
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
uploaded: true
|
uploaded: true
|
||||||
|
@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) {
|
|||||||
return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
|
return `image/${ext === 'svg' ? 'svg+xml' : ext}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
function runOcr(note, buffer) {
|
|
||||||
if (!optionService.getOptionBool('ocrImages')) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const start = Date.now();
|
|
||||||
const img = new Canvas.Image();
|
|
||||||
img.src = buffer;
|
|
||||||
const canvas = new Canvas.createCanvas(img.width, img.height);
|
|
||||||
const ctx = canvas.getContext('2d');
|
|
||||||
ctx.drawImage(img, 0, 0, img.width, img.height);
|
|
||||||
const plainText = OCRAD(canvas);
|
|
||||||
|
|
||||||
log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
|
|
||||||
|
|
||||||
note.saveNoteAttachment('plainText', 'text/plain', plainText);
|
|
||||||
}
|
|
||||||
|
|
||||||
function updateImage(noteId, uploadBuffer, originalName) {
|
function updateImage(noteId, uploadBuffer, originalName) {
|
||||||
log.info(`Updating image ${noteId}: ${originalName}`);
|
log.info(`Updating image ${noteId}: ${originalName}`);
|
||||||
|
|
||||||
|
@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) {
|
|||||||
// save updated content with links to files/images
|
// save updated content with links to files/images
|
||||||
noteEntity.setContent(content);
|
noteEntity.setContent(content);
|
||||||
|
|
||||||
noteService.scanForLinks(noteEntity);
|
noteService.asyncPostProcessContent(noteEntity, content);
|
||||||
|
|
||||||
updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
|
updateDates(noteEntity.noteId, utcDateCreated, utcDateModified);
|
||||||
}
|
}
|
||||||
|
@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
for (const noteId in createdNoteIds) { // now the noteIds are unique
|
for (const noteId in createdNoteIds) { // now the noteIds are unique
|
||||||
noteService.scanForLinks(becca.getNote(noteId));
|
const note = becca.getNote(noteId);
|
||||||
|
await noteService.asyncPostProcessContent(note, note.getContent());
|
||||||
|
|
||||||
if (!metaFile) {
|
if (!metaFile) {
|
||||||
// if there's no meta file then the notes are created based on the order in that zip file but that
|
// if there's no meta file then the notes are created based on the order in that zip file but that
|
||||||
|
@ -23,6 +23,7 @@ const dayjs = require("dayjs");
|
|||||||
const htmlSanitizer = require("./html_sanitizer");
|
const htmlSanitizer = require("./html_sanitizer");
|
||||||
const ValidationError = require("../errors/validation_error");
|
const ValidationError = require("../errors/validation_error");
|
||||||
const noteTypesService = require("./note_types");
|
const noteTypesService = require("./note_types");
|
||||||
|
const textExtractingService = require("./text_extracting");
|
||||||
|
|
||||||
function getNewNotePosition(parentNoteId) {
|
function getNewNotePosition(parentNoteId) {
|
||||||
const note = becca.notes[parentNoteId];
|
const note = becca.notes[parentNoteId];
|
||||||
@ -191,7 +192,7 @@ function createNewNote(params) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
scanForLinks(note);
|
asyncPostProcessContent(note, params.content);
|
||||||
|
|
||||||
copyChildAttributes(parentNote, note);
|
copyChildAttributes(parentNote, note);
|
||||||
|
|
||||||
@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
|
|||||||
if (updatedContent !== origContent) {
|
if (updatedContent !== origContent) {
|
||||||
origNote.setContent(updatedContent);
|
origNote.setContent(updatedContent);
|
||||||
|
|
||||||
scanForLinks(origNote);
|
asyncPostProcessContent(origNote, updatedContent);
|
||||||
|
|
||||||
eventService.emit(eventService.ENTITY_CHANGED, {
|
eventService.emit(eventService.ENTITY_CHANGED, {
|
||||||
entityName: 'note_contents',
|
entityName: 'note_contents',
|
||||||
@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
|
|||||||
AND parentNote.isDeleted = 0`, [noteId, deleteId]);
|
AND parentNote.isDeleted = 0`, [noteId, deleteId]);
|
||||||
}
|
}
|
||||||
|
|
||||||
function scanForLinks(note) {
|
function scanForLinks(note, content) {
|
||||||
if (!note || !['text', 'relationMap'].includes(note.type)) {
|
if (!note || !['text', 'relationMap'].includes(note.type)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const content = note.getContent();
|
|
||||||
const newContent = saveLinks(note, content);
|
const newContent = saveLinks(note, content);
|
||||||
|
|
||||||
if (content !== newContent) {
|
if (content !== newContent) {
|
||||||
@ -729,6 +729,30 @@ function scanForLinks(note) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Runs OCR over the content of an image note and saves the recognized text
 * into the note's 'plainText' attachment.
 *
 * Does nothing when the note is not an image or when the 'ocrImages' option is disabled.
 * Failures are logged and not propagated to the caller.
 *
 * @param note - note entity; must provide isImage(), saveNoteAttachment() and noteId
 * @param buffer - content of the note, passed as-is to textExtractingService.ocrTextFromBuffer()
 * @returns {Promise<void>}
 */
async function runOcr(note, buffer) {
    if (!note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }

    try {
        // ocrTextFromBuffer() is async - without awaiting it a Promise (instead of the text)
        // would be saved into the attachment and its rejection would bypass this try/catch
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}

/**
 * Things which have to be executed after updating content, but asynchronously (separate transaction)
 *
 * Runs link scanning, image OCR and PDF text extraction in this order; each step
 * decides on its own whether it applies to the given note.
 *
 * @param note - note whose content has been updated
 * @param content - the new content of the note
 * @returns {Promise<void>}
 */
async function asyncPostProcessContent(note, content) {
    scanForLinks(note, content);

    await runOcr(note, content);

    await textExtractingService.extractTextFromPdf(note, content);
}
|
||||||
|
|
||||||
function eraseNotes(noteIdsToErase) {
|
function eraseNotes(noteIdsToErase) {
|
||||||
if (noteIdsToErase.length === 0) {
|
if (noteIdsToErase.length === 0) {
|
||||||
return;
|
return;
|
||||||
@ -1006,7 +1030,6 @@ module.exports = {
|
|||||||
updateNoteData,
|
updateNoteData,
|
||||||
undeleteNote,
|
undeleteNote,
|
||||||
protectNoteRecursively,
|
protectNoteRecursively,
|
||||||
scanForLinks,
|
|
||||||
duplicateSubtree,
|
duplicateSubtree,
|
||||||
duplicateSubtreeWithoutRoot,
|
duplicateSubtreeWithoutRoot,
|
||||||
getUndeletedParentBranchIds,
|
getUndeletedParentBranchIds,
|
||||||
@ -1014,5 +1037,6 @@ module.exports = {
|
|||||||
eraseDeletedNotesNow,
|
eraseDeletedNotesNow,
|
||||||
eraseNotesWithDeleteId,
|
eraseNotesWithDeleteId,
|
||||||
saveNoteRevisionIfNeeded,
|
saveNoteRevisionIfNeeded,
|
||||||
downloadImages
|
downloadImages,
|
||||||
|
asyncPostProcessContent
|
||||||
};
|
};
|
||||||
|
@ -91,6 +91,7 @@ const defaultOptions = [
|
|||||||
{ name: 'disableTray', value: 'false', isSynced: false },
|
{ name: 'disableTray', value: 'false', isSynced: false },
|
||||||
{ name: 'userGuideSha256Hash', value: '', isSynced: true },
|
{ name: 'userGuideSha256Hash', value: '', isSynced: true },
|
||||||
{ name: 'ocrImages', value: 'true', isSynced: true },
|
{ name: 'ocrImages', value: 'true', isSynced: true },
|
||||||
|
{ name: 'extractTextFromPdf', value: 'true', isSynced: true },
|
||||||
];
|
];
|
||||||
|
|
||||||
function initStartupOptions() {
|
function initStartupOptions() {
|
||||||
|
@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression {
|
|||||||
FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId)
|
FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId)
|
||||||
WHERE name IN ('plainText') AND isDeleted = 0`)) {
|
WHERE name IN ('plainText') AND isDeleted = 0`)) {
|
||||||
|
|
||||||
this.findInText(row, inputNoteSet, resultNoteSet);
|
if (!resultNoteSet.hasNoteId(row.noteId)) {
|
||||||
|
this.findInText(row, inputNoteSet, resultNoteSet);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return resultNoteSet;
|
return resultNoteSet;
|
||||||
|
129
src/services/text_extracting.js
Normal file
129
src/services/text_extracting.js
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
const Canvas = require("canvas");
|
||||||
|
const OCRAD = require("ocrad.js");
|
||||||
|
const log = require("./log.js");
|
||||||
|
const optionService = require("./options.js");
|
||||||
|
|
||||||
|
/**
 * Runs OCR over an image given as raw uncompressed pixel data.
 *
 * Supported values of img.kind:
 *   1 - GRAYSCALE_1BPP (unsupported)
 *   2 - RGB_24BPP
 *   3 - RGBA_32BPP
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 * @returns {string|null} text produced by OCRAD, or null when the pixel data
 *                        is not in one of the supported formats
 */
function ocrFromByteArray(img) {
    const isSupported = img.data instanceof Uint8ClampedArray && [2, 3].includes(img.kind);

    if (!isSupported) {
        return null;
    }

    const start = Date.now();
    // createCanvas is a factory function, it is not meant to be called with "new"
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;

    const sourceHasAlpha = img.kind === 3;
    const pixelCount = img.width * img.height;

    // the canvas always expects 4 bytes (RGBA) per pixel
    for (let pixel = 0, src = 0, dst = 0; pixel < pixelCount; pixel++) {
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        imageBytes[dst++] = sourceHasAlpha ? img.data[src++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);

    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
|
||||||
|
|
||||||
|
/**
 * Finds the images painted on the given PDF page, runs OCR on each of them
 * and appends every non-empty recognized text to `strings`.
 *
 * @param pdfjsLib - the pdfjs-dist module (its OPS table is used to recognize image painting operators)
 * @param page - PDF.js page; must provide getOperatorList() and objs.get(key, callback)
 * @param {string[]} strings - output array, recognized texts are pushed into it
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();

    const fns = ops.fnArray;
    const args = ops.argsArray;

    // fnArray and argsArray are parallel arrays, so they are walked by index - looking the index up
    // via indexOf() is quadratic and finds the wrong operator when the same args value repeats
    for (let i = 0; i < args.length; i++) {
        if (fns[i] !== pdfjsLib.OPS.paintXObject && fns[i] !== pdfjsLib.OPS.paintImageXObject) {
            continue;
        }

        const arg = args[i];

        if (!Array.isArray(arg)) {
            // no image key to look up
            continue;
        }

        const imgKey = arg[0];
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Extracts text from a PDF note and saves it into the note's 'plainText' attachment.
 *
 * The text consists of the text content of every page and, when the 'ocrImages' option
 * is enabled, of the text recognized by OCR in the images of every page.
 *
 * Does nothing when the note's mime is not 'application/pdf' or when
 * the 'extractTextFromPdf' option is disabled. Failures are logged and not propagated.
 *
 * @param note - note entity; must provide mime, noteId and saveNoteAttachment()
 * @param buffer - binary content of the PDF, passed as `data` to pdfjs getDocument()
 * @returns {Promise<void>}
 */
async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    let doc = null;

    try {
        const pdfjsLib = require("pdfjs-dist");
        doc = await pdfjsLib.getDocument({data: buffer}).promise;
        let strings = [];

        for (let p = 1; p <= doc.numPages; p++) {
            const page = await doc.getPage(p);

            const content = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });

            content.items.forEach(({str}) => strings.push(str));

            // OCR of images is best effort - its failure must not discard the text extracted so far
            try {
                if (optionService.getOptionBool('ocrImages')) {
                    await ocrTextFromPdfImages(pdfjsLib, page, strings);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }

        // drop empty and whitespace-only fragments
        strings = strings.filter(str => str?.trim());

        note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
    }
    catch (e) {
        log.error(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
    finally {
        if (doc) {
            // release the resources held by the loaded document
            try {
                await doc.destroy();
            }
            catch (e) {
                log.info(`Could not release PDF document of note '${note.noteId}': '${e.message}'`);
            }
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Runs OCR over an encoded image.
 *
 * @param buffer - image in JPEG, PNG etc.; it is assigned to the `src` of a canvas Image
 * @returns {Promise<string>} text produced by OCRAD
 * @throws {Error} (as a rejection) when the image can't be loaded
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    const image = await new Promise((resolve, reject) => {
        const loadingImage = new Canvas.Image();

        // handlers are registered before "src" is assigned so that the load result is not missed
        loadingImage.onload = () => resolve(loadingImage);
        loadingImage.onerror = err => reject(new Error("Can't load the image " + err));
        loadingImage.src = buffer;
    });

    // createCanvas is a factory function, it is not meant to be called with "new"
    const canvas = Canvas.createCanvas(image.width, image.height);
    const ctx = canvas.getContext('2d');

    ctx.drawImage(image, 0, 0, image.width, image.height);

    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);

    return plainText;
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
ocrTextFromBuffer,
|
||||||
|
extractTextFromPdf
|
||||||
|
};
|
Loading…
x
Reference in New Issue
Block a user