feat(import/single): support UTF-16 LE with BOM for HTML

This commit is contained in:
Elian Doran 2025-02-22 00:50:19 +02:00
parent 39f00bd568
commit fd4f35e879
No known key found for this signature in database
5 changed files with 85 additions and 45 deletions

30
package-lock.json generated
View File

@ -31,6 +31,7 @@
"better-sqlite3": "11.8.1",
"bootstrap": "5.3.3",
"boxicons": "2.1.4",
"chardet": "2.0.0",
"cheerio": "1.0.0",
"chokidar": "4.0.3",
"cls-hooked": "4.2.2",
@ -97,6 +98,7 @@
"source-map-support": "0.5.21",
"split.js": "1.6.5",
"stream-throttle": "0.1.3",
"strip-bom": "5.0.0",
"striptags": "3.2.0",
"swagger-ui-express": "5.0.1",
"tmp": "0.2.3",
@ -6175,6 +6177,12 @@
"url": "https://github.com/chalk/chalk?sponsor=1"
}
},
"node_modules/chardet": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/chardet/-/chardet-2.0.0.tgz",
"integrity": "sha512-xVgPpulCooDjY6zH4m9YW3jbkaBe3FKIAvF5sj5t7aBNsVl2ljIE+xwJ4iNgiDZHFQvNIpjdKdVOQvvk5ZfxbQ==",
"license": "MIT"
},
"node_modules/check-error": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz",
@ -11889,6 +11897,16 @@
"node": ">=4"
}
},
"node_modules/load-json-file/node_modules/strip-bom": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz",
"integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=4"
}
},
"node_modules/loader-runner": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.0.tgz",
@ -15917,13 +15935,15 @@
}
},
"node_modules/strip-bom": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz",
"integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==",
"dev": true,
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-5.0.0.tgz",
"integrity": "sha512-p+byADHF7SzEcVnLvc/r3uognM1hUhObuHXxJcgLCfD194XAkaLbjq3Wzb0N5G2tgIjH0dgT708Z51QxMeu60A==",
"license": "MIT",
"engines": {
"node": ">=4"
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/strip-eof": {

View File

@ -26,7 +26,6 @@
"server:start-test": "npm run server:switch && rimraf ./data-test && cross-env TRILIUM_DATA_DIR=./data-test TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev TRILIUM_PORT=9999 nodemon src/main.ts",
"server:qstart": "npm run server:switch && npm run server:start",
"server:switch": "rimraf ./node_modules/better-sqlite3 && npm install",
"electron:start": "cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_DATA_DIR=./data TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev electron ./electron-main.ts --inspect=5858 .",
"electron:start-no-dir": "cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_ENV=dev electron --inspect=5858 .",
"electron:start-nix": "electron-rebuild --version 33.3.1 && cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_DATA_DIR=./data TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev nix-shell -p electron_33 --run \"electron ./electron-main.ts --inspect=5858 .\"",
@ -37,30 +36,23 @@
"electron:start-prod-nix-no-dir": "electron-rebuild --version 33.3.1 && npm run build:prepare-dist && cross-env TRILIUM_ENV=dev nix-shell -p electron_33 --run \"electron ./dist/electron-main.js --inspect=5858 .\"",
"electron:qstart": "npm run electron:switch && npm run electron:start",
"electron:switch": "electron-rebuild",
"electron-forge:start": "npm run build:prepare-dist && electron-forge start",
"electron-forge:make": "npm run build:prepare-dist && electron-forge make",
"electron-forge:package": "npm run build:prepare-dist && electron-forge package",
"docs:build-backend": "rimraf ./docs/backend_api && typedoc ./docs/backend_api src/becca/entities/*.ts src/services/backend_script_api.ts src/services/sql.ts",
"docs:build-frontend": "rimraf ./docs/frontend_api && jsdoc -c jsdoc-conf.json -d ./docs/frontend_api src/public/app/entities/*.js src/public/app/services/frontend_script_api.js src/public/app/widgets/basic_widget.js src/public/app/widgets/note_context_aware_widget.js src/public/app/widgets/right_panel_widget.js",
"docs:build": "npm run docs:build-backend && npm run docs:build-frontend",
"build:webpack": "tsx node_modules/webpack/bin/webpack.js -c webpack.config.ts",
"build:prepare-dist": "npm run build:webpack && rimraf ./dist && tsc && tsx ./bin/copy-dist.ts",
"test": "cross-env TRILIUM_DATA_DIR=./integration-tests/db TRILIUM_INTEGRATION_TEST=memory vitest",
"test:coverage": "cross-env TRILIUM_DATA_DIR=./integration-tests/db vitest --coverage",
"test:playwright": "playwright test",
"test:integration-edit-db": "cross-env TRILIUM_INTEGRATION_TEST=edit TRILIUM_PORT=8081 TRILIUM_ENV=dev TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts",
"test:integration-mem-db": "cross-env TRILIUM_INTEGRATION_TEST=memory TRILIUM_PORT=8082 TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts",
"test:integration-mem-db-dev": "cross-env TRILIUM_INTEGRATION_TEST=memory TRILIUM_PORT=8082 TRILIUM_ENV=dev TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts",
"dev:watch-dist": "tsx ./bin/watch-dist.ts",
"dev:prettier-check": "prettier . --check",
"dev:prettier-fix": "prettier . --write",
"chore:update-build-info": "tsx bin/update-build-info.ts",
"chore:ci-update-nightly-version": "tsx ./bin/update-nightly-version.ts",
"chore:generate-document": "cross-env nodemon ./bin/generate_document.ts 1000",
@ -89,6 +81,7 @@
"better-sqlite3": "11.8.1",
"bootstrap": "5.3.3",
"boxicons": "2.1.4",
"chardet": "2.0.0",
"cheerio": "1.0.0",
"chokidar": "4.0.3",
"cls-hooked": "4.2.2",
@ -155,6 +148,7 @@
"source-map-support": "0.5.21",
"split.js": "1.6.5",
"stream-throttle": "0.1.3",
"strip-bom": "5.0.0",
"striptags": "3.2.0",
"swagger-ui-express": "5.0.1",
"tmp": "0.2.3",

View File

@ -1,4 +1,4 @@
import { describe, expect, it } from "vitest";
import { beforeAll, describe, expect, it } from "vitest";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
@ -12,38 +12,47 @@ import { initializeTranslations } from "../i18n.js";
import single from "./single.js";
const scriptDir = dirname(fileURLToPath(import.meta.url));
describe("processNoteContent", () => {
it("treats single MDX as Markdown", async () => {
const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", "Text Note.mdx"));
const taskContext = TaskContext.getInstance("import-mdx", "import", {
textImportedAsText: true
});
async function testImport(fileName: string, mimetype: string): Promise<BNote> {
const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", fileName));
const taskContext = TaskContext.getInstance("import-mdx", "import", {
textImportedAsText: true
});
await new Promise<void>((resolve, reject) => {
cls.init(async () => {
initializeTranslations();
sql_init.initializeDb();
await sql_init.dbReady;
return new Promise<BNote>((resolve, reject) => {
cls.init(async () => {
const rootNote = becca.getNote("root");
if (!rootNote) {
reject("Missing root note.");
}
const rootNote = becca.getNote("root");
if (!rootNote) {
reject("Missing root note.");
}
const importedNote = single.importSingleFile(taskContext, {
originalname: "Text Note.mdx",
mimetype: "text/mdx",
buffer: mdxSample
}, rootNote as BNote);
try {
expect(importedNote.mime).toBe("text/html");
expect(importedNote.type).toBe("text");
expect(importedNote.title).toBe("Text Note");
} catch (e) {
reject(e);
}
resolve();
});
const importedNote = single.importSingleFile(taskContext, {
originalname: fileName,
mimetype,
buffer: mdxSample
}, rootNote as BNote);
resolve(importedNote);
});
});
}
describe("processNoteContent", () => {
    // Translations and the database are initialized once for the whole suite.
    beforeAll(async () => {
        initializeTranslations();
        sql_init.initializeDb();
        await sql_init.dbReady;
    });

    // An .mdx sample imported as "text/mdx" must end up as an HTML text note.
    it("treats single MDX as Markdown", async () => {
        const note = await testImport("Text Note.mdx", "text/mdx");
        expect(note.mime).toBe("text/html");
        expect(note.type).toBe("text");
        expect(note.title).toBe("Text Note");
    });

    // The sample is a UTF-16 file with a BOM; once imported, the content must start directly with the <html tag.
    it("supports HTML note with UTF-16 (w/ BOM) from Microsoft Outlook", async () => {
        const note = await testImport("IREN Reports Q2 FY25 Results.htm", "text/html");
        expect(note.mime).toBe("text/html");
        expect(note.title).toBe("IREN Reports Q2 FY25 Results");

        const contentStart = note.getContent().toString().slice(0, 5);
        expect(contentStart).toBe("<html");
    });
});

View File

@ -3,6 +3,8 @@
import type BNote from "../../becca/entities/bnote.js";
import type TaskContext from "../task_context.js";
import chardet from "chardet";
import stripBom from "strip-bom";
import noteService from "../../services/notes.js";
import imageService from "../../services/image.js";
import protectedSessionService from "../protected_session.js";
@ -146,8 +148,23 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
return note;
}
/**
 * Converts the content of an imported file to a string.
 *
 * Strings are returned unchanged. Buffers are run through `chardet` to detect their
 * character encoding: content detected as UTF-16LE is decoded as such, anything else
 * is decoded as UTF-8. A leading byte order mark is removed from the decoded text in
 * both cases, so that it does not leak into the content that is processed afterwards.
 *
 * @param data the raw file content, either an already decoded string or a buffer.
 * @returns the content as a string; when decoded from a buffer, without a leading BOM.
 */
function processStringOrBuffer(data: string | Buffer): string {
    if (!Buffer.isBuffer(data)) {
        return data;
    }

    const detectedEncoding = chardet.detect(data);
    switch (detectedEncoding) {
        case "UTF-16LE":
            return stripBom(data.toString("utf-16le"));
        case "UTF-8":
        default:
            // UTF-8 files may start with a BOM as well (EF BB BF), which decodes to U+FEFF;
            // strip it here too so both branches return BOM-free text.
            return stripBom(data.toString("utf-8"));
    }
}
function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
let content = file.buffer.toString("utf-8");
let content = processStringOrBuffer(file.buffer);
// Try to get title from HTML first, fall back to filename
// We do this before sanitization since that turns all <h1>s into <h2>