Merge pull request #1022 from TriliumNext/fix_import-utils-#1016

fix(import/utils.handleH1): fix stripping of all <h1> tags that match title tag
2026-02-22 13:44:25 +01:00 · 2025-01-26 14:28:52 +02:00 · 2025-01-26 14:28:52 +02:00 · 7ae7831a27
commit 7ae7831a27
parent 189dfdb5cd 05b433d44e
2 changed files with 114 additions and 6 deletions
--- a/src/services/import/utils.spec.ts
+++ b/src/services/import/utils.spec.ts
@ -0,0 +1,103 @@
+import { describe, it, expect } from "vitest";
+import importUtils from "./utils.js";
+
+type TestCase<T extends (...args: any) => any> = [desc: string, fnParams: Parameters<T>, expected: ReturnType<T>];
+
+describe("#extractHtmlTitle", () => {
+    const htmlWithNoTitle = `
+  <html>
+    <body>
+      <div>abc</div>
+    </body>
+  </html>`;
+
+    const htmlWithTitle = `
+  <html><head>
+    <title>Test Title</title>
+  </head>
+  <body>
+    <div>abc</div>
+  </body>
+  </html>`;
+
+    const htmlWithTitleWOpeningBracket = `
+  <html><head>
+  <title>Test < Title</title>
+  </head>
+  <body>
+    <div>abc</div>
+  </body>
+  </html>`;
+
+    // prettier-ignore
+    const testCases: TestCase<typeof importUtils.extractHtmlTitle>[] = [
+        [
+            "w/ existing <title> tag, it should return the content of the title tag",
+            [htmlWithTitle],
+            "Test Title"
+        ],
+        [
+            // @TriliumNextTODO: returning null here looks like unwanted behaviour – check whether it should be fixed rather than asserted
+            "with existing <title> tag, that includes an opening HTML tag '<', it should return null",
+            [htmlWithTitleWOpeningBracket], 
+            null
+        ],
+        [
+            "w/o an existing <title> tag, it should reutrn null",
+            [htmlWithNoTitle],
+            null
+        ],
+        [
+            "w/ empty string content, it should return null",
+            [""],
+            null
+        ]
+    ];
+
+    testCases.forEach((testCase) => {
+        const [desc, fnParams, expected] = testCase;
+        return it(desc, () => {
+            const actual = importUtils.extractHtmlTitle(...fnParams);
+            expect(actual).toStrictEqual(expected);
+        });
+    });
+});
+
+describe("#handleH1", () => {
+    // prettier-ignore
+    const testCases: TestCase<typeof importUtils.handleH1>[] = [
+        [
+            "w/ single <h1> tag w/ identical text content as the title tag: the <h1> tag should be stripped",
+            ["<h1>Title</h1>", "Title"],
+            ""
+        ],
+        [
+            "w/ multiple <h1> tags, with the fist matching the title tag: the first <h1> tag should be stripped and subsequent tags converted to <h2>",
+            ["<h1>Title</h1><h1>Header 1</h1><h1>Header 2</h1>", "Title"],
+            "<h2>Header 1</h2><h2>Header 2</h2>"
+        ],
+        [
+            "w/ no <h1> tag and only <h2> tags, it should not cause any changes and return the same content",
+            ["<h2>Heading 1</h2><h2>Heading 2</h2>", "Title"],
+            "<h2>Heading 1</h2><h2>Heading 2</h2>"
+        ],
+        [
+            "w/ multiple <h1> tags, and the 1st matching the title tag, it should strip ONLY the very first occurence of the <h1> tags in the returned content",
+            ["<h1>Topic ABC</h1><h1>Heading 1</h1><h1>Topic ABC</h1>", "Topic ABC"],
+            "<h2>Heading 1</h2><h2>Topic ABC</h2>"
+        ],
+        [
+            "w/ multiple <h1> tags, and the 1st matching NOT the title tag, it should NOT strip any other <h1> tags",
+            ["<h1>Introduction</h1><h1>Topic ABC</h1><h1>Summary</h1>", "Topic ABC"],
+            "<h2>Introduction</h2><h2>Topic ABC</h2><h2>Summary</h2>"
+        ]
+    ];
+
+    testCases.forEach((testCase) => {
+        const [desc, fnParams, expected] = testCase;
+        return it(desc, () => {
+            const actual = importUtils.handleH1(...fnParams);
+            expect(actual).toStrictEqual(expected);
+        });
+    });
+});
--- a/src/services/import/utils.ts
+++ b/src/services/import/utils.ts
@ -1,14 +1,19 @@
 "use strict";

 function handleH1(content: string, title: string) {
-    content = content.replace(/<h1[^>]*>([^<]*)<\/h1>/gi, (match, text) => {
-        if (title.trim() === text.trim()) {
-            return ""; // remove whole H1 tag
-        } else {
-            return `<h2>${text}</h2>`;
+    let isFirstH1Handled = false;
+
+    return content.replace(/<h1[^>]*>([^<]*)<\/h1>/gi, (match, text) => {
+        const convertedContent = `<h2>${text}</h2>`;
+
+        // strip away the very first h1 tag found, but only if its text matches the title
+        if (!isFirstH1Handled) {
+            isFirstH1Handled = true;
+            return title.trim() === text.trim() ? "" : convertedContent;
        }
+
+        return convertedContent;
    });
-    return content;
 }

 function extractHtmlTitle(content: string): string | null {