From 47c05b2c6d591227b4ce2913809818cc605d28d1 Mon Sep 17 00:00:00 2001 From: maphew Date: Sat, 16 Nov 2024 09:06:58 -0700 Subject: [PATCH] feat: prefer HTML title tag over filename during import When importing HTML files, extract and use the title from the <title> tag if available, falling back to the filename only when no title tag is found. This improves handling of titles with special characters that can't be represented in filenames. --- src/services/import/single.ts | 15 +++++++++------ src/services/import/utils.ts | 8 +++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/services/import/single.ts b/src/services/import/single.ts index 465d3b7a8..18b22329f 100644 --- a/src/services/import/single.ts +++ b/src/services/import/single.ts @@ -149,15 +149,18 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote) } function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { - const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); let content = file.buffer.toString("utf-8"); - + if (taskContext?.data?.safeImport) { content = htmlSanitizer.sanitize(content); } - + + // Try to get title from HTML first, fall back to filename + const htmlTitle = importUtils.extractHtmlTitle(content); + const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); + content = importUtils.handleH1(content, title); - + const {note} = noteService.createNewNote({ parentNoteId: parentNote.noteId, title, @@ -166,9 +169,9 @@ function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { mime: 'text/html', isProtected: parentNote.isProtected && protectedSessionService.isProtectedSessionAvailable(), }); - + taskContext.increaseProgressCount(); - + return note; } diff --git a/src/services/import/utils.ts b/src/services/import/utils.ts index b85700230..ea3cb075e 100644 --- a/src/services/import/utils.ts +++ 
/** Named character references decoded in titles; any other named reference is left as written. */
const TITLE_NAMED_ENTITIES = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", "\u00a0"]
]);

/**
 * Decodes numeric (`&#8211;`, `&#x2013;`) and a small set of named character
 * references in a single pass, so already-escaped text such as `&amp;lt;`
 * becomes `&lt;` rather than being decoded twice.
 *
 * @param text - text that may contain HTML character references.
 * @returns the text with recognised references replaced by their characters.
 */
function decodeTitleEntities(text: string): string {
    return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, (whole: string, ref: string) => {
        if (!ref.startsWith("#")) {
            // A Map (not a plain object) so names like "constructor" cannot hit prototype members.
            return TITLE_NAMED_ENTITIES.get(ref) ?? whole;
        }

        const isHex = ref.charAt(1).toLowerCase() === "x";
        const codePoint = parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;

        // String.fromCodePoint throws a RangeError above U+10FFFF; NUL and lone
        // surrogates are not meaningful title characters either, so keep the raw text.
        if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
            return whole;
        }

        return String.fromCodePoint(codePoint);
    });
}

/**
 * Extracts a plain-text title from the first non-empty `<title>` element of an
 * HTML string.
 *
 * The element's text is entity-decoded, runs of whitespace (including line
 * breaks) are collapsed to single spaces, and the result is trimmed.
 *
 * @param content - HTML markup to search.
 * @returns the cleaned title, or `null` when no `<title>` element with text is
 *          found or its text is only whitespace.
 */
function extractHtmlTitle(content: string): string | null {
    // Require whitespace before any attributes so only a real <title> tag matches,
    // and tolerate whitespace before the closing ">" of the end tag.
    const titleMatch = /<title(?:\s[^>]*)?>([^<]+)<\/title\s*>/i.exec(content);
    if (!titleMatch) {
        return null;
    }

    const title = decodeTitleEntities(titleMatch[1] ?? "")
        .replace(/\s+/g, " ")
        .trim();

    return title.length > 0 ? title : null;
}