This commit is contained in:
zadam 2023-02-01 21:07:23 +01:00
parent 37ba76fdd8
commit 88e09eb279
7 changed files with 258 additions and 1229 deletions

1413
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -33,7 +33,7 @@
"archiver": "5.3.1",
"async-mutex": "0.4.0",
"axios": "1.2.5",
"better-sqlite3": "7.4.5",
"better-sqlite3": "8.0.1",
"canvas": "2.11.0",
"chokidar": "3.5.3",
"cls-hooked": "4.2.2",
@ -95,7 +95,7 @@
},
"devDependencies": {
"cross-env": "7.0.3",
"electron": "16.2.8",
"electron": "23.0.0-beta.6",
"electron-builder": "23.6.0",
"electron-packager": "17.1.1",
"electron-rebuild": "3.2.9",

View File

@ -48,6 +48,14 @@ function isEntityEventsDisabled() {
return !!namespace.get('disableEntityEvents');
}
/**
 * Reports whether the 'disableOcr' flag stored in `namespace` is set.
 *
 * @returns {boolean} true when the stored flag is truthy, false otherwise (including when it was never set)
 */
function isOcrDisabled() {
    const flag = namespace.get('disableOcr');

    return Boolean(flag);
}
/**
 * Sets the 'disableOcr' flag in `namespace` to true; isOcrDisabled() reads the same key.
 */
function disableOcr() {
    const flagName = 'disableOcr';

    namespace.set(flagName, true);
}
function getAndClearEntityChangeIds() {
const entityChangeIds = namespace.get('entityChangeIds') || [];
@ -92,5 +100,7 @@ module.exports = {
reset,
getAndClearEntityChangeIds,
addEntityChange,
ignoreEntityChangeIds
ignoreEntityChangeIds,
isOcrDisabled,
disableOcr
};

View File

@ -12,8 +12,7 @@ const sanitizeFilename = require('sanitize-filename');
const isSvg = require('is-svg');
const isAnimated = require('is-animated');
const htmlSanitizer = require("./html_sanitizer");
const OCRAD = require('ocrad.js');
const Canvas = require('canvas');
const textExtractingService = require("./text_extracting");
async function processImage(uploadBuffer, originalName, shrinkImageSwitch) {
const compressImages = optionService.getOptionBool("compressImages");
@ -128,7 +127,7 @@ function saveImage(parentNoteId, uploadBuffer, originalName, shrinkImageSwitch,
note.setContent(buffer);
});
runOcr(note, buffer);
textExtractingService.runOcr(note, buffer);
});
return {

View File

@ -729,27 +729,12 @@ function scanForLinks(note, content) {
}
}
/**
 * Runs OCR over the content of an image note and stores the recognized text
 * as the note's 'plainText' ancillary.
 *
 * Does nothing when the note is not an image or the 'ocrImages' option is off.
 * OCR failures are logged and never propagated to the caller.
 *
 * @param note - note the buffer belongs to; must provide isImage(), saveNoteAncillary() and noteId
 * @param buffer - image content to recognize
 * @returns {Promise<void>} resolves once the text has been saved (or the failure logged)
 */
async function runOcr(note, buffer) {
    if (!note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }

    try {
        // ocrTextFromBuffer() is async: without the await a pending Promise would be saved
        // instead of the text and a rejection would escape this try/catch.
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);

        note.saveNoteAncillary('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
/**
 * Things which have to be executed after updating content, but asynchronously (separate transaction)
 *
 * Steps, in order: link scanning, OCR of the content, PDF text extraction.
 *
 * @param note - note whose content has been updated
 * @param content - the updated content, handed to every step unchanged
 * @returns {Promise<void>} resolves after both text extracting steps have finished
 */
async function asyncPostProcessContent(note, content) {
    scanForLinks(note, content);

    // OCR is triggered exactly once and awaited; a second, un-awaited OCR call here
    // would recognize the same content twice.
    await textExtractingService.runOcr(note, content);
    await textExtractingService.extractTextFromPdf(note, content);
}

View File

@ -1,7 +1,8 @@
const Canvas = require("canvas");
const OCRAD = require("ocrad.js");
const log = require("./log.js");
const optionService = require("./options.js");
const log = require("./log");
const optionService = require("./options");
const cls = require("./cls");
function ocrFromByteArray(img) {
// byte array contains raw uncompressed pixel data
@ -85,7 +86,7 @@ async function extractTextFromPdf(note, buffer) {
content.items.forEach(({str}) => strings.push(str));
try {
if (optionService.getOptionBool('ocrImages')) {
if (optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled()) {
await ocrTextFromPdfImages(pdfjsLib, page, strings);
}
}
@ -117,13 +118,37 @@ async function ocrTextFromBuffer(buffer) {
const canvas = new Canvas.createCanvas(img.width, img.height);
const ctx = canvas.getContext('2d');
ctx.drawImage(img, 0, 0, img.width, img.height);
const plainText = OCRAD(canvas);
log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
return plainText;
}
/**
 * Runs OCR over the content of an image note and stores the recognized text
 * as the note's 'plainText' ancillary.
 *
 * Nothing happens when the note is not an image, the 'ocrImages' option is off,
 * OCR is disabled via cls, or there is no content to recognize.
 * OCR failures are logged and never propagated to the caller.
 *
 * @param note - note the buffer belongs to; must provide isImage(), saveNoteAncillary() and noteId
 * @param buffer - image content to recognize
 * @returns {Promise<void>} resolves once the text has been saved (or the failure logged)
 */
async function runOcr(note, buffer) {
    if (!note.isImage()
        || !optionService.getOptionBool('ocrImages')
        || cls.isOcrDisabled()
        // a missing buffer is treated like an empty one instead of throwing on .length
        || !buffer
        || buffer.length === 0
    ) {
        return;
    }

    try {
        const plainText = await ocrTextFromBuffer(buffer);

        // no console output here: recognized text is user content and does not belong on stdout
        note.saveNoteAncillary('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
// Functions made available to other modules through require().
module.exports = {
ocrTextFromBuffer,
runOcr,
extractTextFromPdf
};

View File

@ -13,6 +13,7 @@ const yauzl = require("yauzl");
const htmlSanitizer = require('./html_sanitizer');
const sql = require('./sql');
const options = require('./options');
const cls = require('./cls');
const {USER_GUIDE_ZIP_DIR} = require('./resource_dir');
async function importUserGuideIfNeeded() {
@ -33,6 +34,8 @@ async function importUserGuideIfNeeded() {
const hiddenRoot = becca.getNote("_hidden");
const data = await fs.readFile(USER_GUIDE_ZIP_DIR + "/user-guide.zip", "binary");
cls.disableOcr(); // no OCR needed for user guide images
await importZip(Buffer.from(data, 'binary'), hiddenRoot);
options.setOption('userGuideSha256Hash', userGuideSha256HashInFile);