From cb5b4d870f5f70a7fdd3e0b87d555926e25d2143 Mon Sep 17 00:00:00 2001
From: Elian Doran <contact@eliandoran.me>
Date: Wed, 11 Mar 2026 18:26:44 +0200
Subject: [PATCH] refactor(server/search): extract fulltext preprocessing to
 separate file

---
 .../expressions/note_content_fulltext.spec.ts |  15 +-
 .../expressions/note_content_fulltext.ts      | 187 ++++--------------
 ...note_content_fulltext_preprocessor.spec.ts |  18 ++
 .../note_content_fulltext_preprocessor.ts     | 116 +++++++++++
 4 files changed, 170 insertions(+), 166 deletions(-)
 create mode 100644 apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts
 create mode 100644 apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts

diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts b/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts
index d0d51e0877..50d9123fdf 100644
--- a/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts
+++ b/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts
@@ -1,18 +1,7 @@
-import { describe, it, expect } from "vitest";
-import { processMindmapContent } from "./note_content_fulltext.js";
+import { describe, expect,it } from "vitest";
+
 import NoteContentFulltextExp from "./note_content_fulltext.js";
 
-describe("processMindmapContent", () => {
-    it("supports empty JSON", () => {
-        expect(processMindmapContent("{}")).toEqual("");
-    });
-
-    it("supports blank text / invalid JSON", () => {
-        expect(processMindmapContent("")).toEqual("");
-        expect(processMindmapContent(`{ "node": " }`)).toEqual("");
-    });
-});
-
 describe("Fuzzy Search Operators", () => {
     it("~= operator works with typos", () => {
         // Test that the ~= operator can handle common typos
diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts
index 89ba1bc984..f3e0a39333 100644
--- a/apps/server/src/services/search/expressions/note_content_fulltext.ts
+++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts
@@ -1,24 +1,19 @@
-"use strict";
-
 import type { NoteRow } from "@triliumnext/commons";
-import type SearchContext from "../search_context.js";
 
-import Expression from "./expression.js";
-import NoteSet from "../note_set.js";
-import log from "../../log.js";
 import becca from "../../../becca/becca.js";
+import log from "../../log.js";
 import protectedSessionService from "../../protected_session.js";
-import striptags from "striptags";
-import { normalize } from "../../utils.js";
 import sql from "../../sql.js";
-import { 
-    normalizeSearchText, 
-    calculateOptimizedEditDistance, 
-    validateFuzzySearchTokens, 
-    validateAndPreprocessContent,
+import NoteSet from "../note_set.js";
+import type SearchContext from "../search_context.js";
+import {
+    FUZZY_SEARCH_CONFIG,
     fuzzyMatchWord,
-    FUZZY_SEARCH_CONFIG 
-} from "../utils/text_utils.js";
+    normalizeSearchText,
+    validateAndPreprocessContent,
+    validateFuzzySearchTokens} from "../utils/text_utils.js";
+import Expression from "./expression.js";
+import preprocessContent from "./note_content_fulltext_preprocessor.js";
 
 const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
 
@@ -218,7 +213,7 @@ class NoteContentFulltextExp extends Expression {
             return;
         }
 
-        content = this.preprocessContent(content, type, mime);
+        content = preprocessContent(content, type, mime, this.raw);
 
         // Apply content size validation and preprocessing
         const processedContent = validateAndPreprocessContent(content, noteId);
@@ -295,59 +290,22 @@ class NoteContentFulltextExp extends Expression {
         return content;
     }
 
-    preprocessContent(content: string | Buffer, type: string, mime: string) {
-        content = normalize(content.toString());
-
-        if (type === "text" && mime === "text/html") {
-            if (!this.raw) {
-                // Content size already filtered at DB level, safe to process
-                content = this.stripTags(content);
-            }
-
-            content = content.replace(/&nbsp;/g, " ");
-        } else if (type === "mindMap" && mime === "application/json") {
-            content = processMindmapContent(content);
-        } else if (type === "canvas" && mime === "application/json") {
-            interface Element {
-                type: string;
-                text?: string; // Optional since not all objects have a `text` property
-                id: string;
-                [key: string]: any; // Other properties that may exist
-            }
-
-            const canvasContent = JSON.parse(content);
-            const elements = canvasContent.elements;
-
-            if (Array.isArray(elements)) {
-                const texts = elements
-                    .filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property
-                    .map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering
-
-                content = normalize(texts.join(" "));
-            } else {
-                content = "";
-            }
-        }
-
-        return content.trim();
-    }
-
     /**
      * Checks if a token matches content with optional fuzzy matching
      */
     private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
         const normalizedToken = normalizeSearchText(token);
         const normalizedContent = normalizeSearchText(content);
-        
+
         if (normalizedContent.includes(normalizedToken)) {
             return true;
         }
-        
+
         // Check flat text for default fulltext search
         if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
             return false;
         }
-        
+
         return true;
     }
 
@@ -358,15 +316,15 @@ class NoteContentFulltextExp extends Expression {
         try {
             const normalizedContent = normalizeSearchText(content);
             const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
-            
+
             // For phrase matching, check if tokens appear within reasonable proximity
             if (this.tokens.length > 1) {
                 return this.matchesPhrase(normalizedContent, flatText);
             }
-            
+
             // Single token fuzzy matching
             const token = normalizeSearchText(this.tokens[0]);
-            return this.fuzzyMatchToken(token, normalizedContent) || 
+            return this.fuzzyMatchToken(token, normalizedContent) ||
                    (this.flatText && this.fuzzyMatchToken(token, flatText));
         } catch (error) {
             log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
@@ -379,45 +337,45 @@ class NoteContentFulltextExp extends Expression {
      */
     private matchesPhrase(content: string, flatText: string): boolean {
         const searchText = this.flatText ? `${content} ${flatText}` : content;
-        
+
         // Apply content size limits for phrase matching
         const limitedText = validateAndPreprocessContent(searchText);
         if (!limitedText) {
             return false;
         }
-        
+
         const words = limitedText.toLowerCase().split(/\s+/);
-        
+
         // Only skip phrase matching for truly extreme word counts that could crash the system
         if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
             console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
             return false;
         }
-        
+
         // Warn about large word counts but still attempt matching
         if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
             console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
         }
-        
+
         // Find positions of each token
         const tokenPositions: number[][] = this.tokens.map(token => {
             const normalizedToken = normalizeSearchText(token);
             const positions: number[] = [];
-            
+
             words.forEach((word, index) => {
                 if (this.fuzzyMatchSingle(normalizedToken, word)) {
                     positions.push(index);
                 }
             });
-            
+
             return positions;
         });
-        
+
         // Check if we found all tokens
         if (tokenPositions.some(positions => positions.length === 0)) {
             return false;
         }
-        
+
         // Check for phrase proximity using configurable distance
         return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
     }
@@ -431,18 +389,18 @@ class NoteContentFulltextExp extends Expression {
             const [pos1, pos2] = tokenPositions;
             return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
         }
-        
+
         // For more tokens, check if we can find a sequence where all tokens are within range
         const findSequence = (remaining: number[][], currentPos: number): boolean => {
             if (remaining.length === 0) return true;
-            
+
             const [nextPositions, ...rest] = remaining;
-            return nextPositions.some(pos => 
-                Math.abs(pos - currentPos) <= maxDistance && 
+            return nextPositions.some(pos =>
+                Math.abs(pos - currentPos) <= maxDistance &&
                 findSequence(rest, pos)
             );
         };
-        
+
         const [firstPositions, ...rest] = tokenPositions;
         return firstPositions.some(startPos => findSequence(rest, startPos));
     }
@@ -455,12 +413,12 @@ class NoteContentFulltextExp extends Expression {
             // For short tokens, require exact match to avoid too many false positives
             return content.includes(token);
         }
-        
+
         const words = content.split(/\s+/);
-        
+
         // Only limit word processing for truly extreme cases to prevent system instability
         const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
-        
+
         return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
     }
 
@@ -471,83 +429,6 @@ class NoteContentFulltextExp extends Expression {
         // Use shared optimized fuzzy matching logic
         return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
     }
-
-
-    stripTags(content: string) {
-        // we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
-        // we want to insert space in place of block tags (because they imply text separation)
-        // but we don't want to insert text for typical formatting inline tags which can occur within one word
-        const linkTag = "a";
-        const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"];
-
-        // replace tags which imply text separation with a space
-        content = striptags(content, [linkTag, ...inlineFormattingTags], " ");
-
-        // replace the inline formatting tags (but not links) without a space
-        content = striptags(content, [linkTag], "");
-
-        // at least the closing link tag can be easily stripped
-        return content.replace(/<\/a>/gi, "");
-    }
-}
-
-export function processMindmapContent(content: string) {
-    let mindMapcontent;
-
-    try {
-        mindMapcontent = JSON.parse(content);
-    } catch (e) {
-        return "";
-    }
-
-    // Define interfaces for the JSON structure
-    interface MindmapNode {
-        id: string;
-        topic: string;
-        children: MindmapNode[]; // Recursive structure
-        direction?: number;
-        expanded?: boolean;
-    }
-
-    interface MindmapData {
-        nodedata: MindmapNode;
-        arrows: any[]; // If you know the structure, replace `any` with the correct type
-        summaries: any[];
-        direction: number;
-        theme: {
-            name: string;
-            type: string;
-            palette: string[];
-            cssvar: Record<string, string>; // Object with string keys and string values
-        };
-    }
-
-    // Recursive function to collect all topics
-    function collectTopics(node?: MindmapNode): string[] {
-        if (!node) {
-            return [];
-        }
-
-        // Collect the current node's topic
-        let topics = [node.topic];
-
-        // If the node has children, collect topics recursively
-        if (node.children && node.children.length > 0) {
-            for (const child of node.children) {
-                topics = topics.concat(collectTopics(child));
-            }
-        }
-
-        return topics;
-    }
-
-    // Start extracting from the root node
-    const topicsArray = collectTopics(mindMapcontent.nodedata);
-
-    // Combine topics into a single string
-    const topicsString = topicsArray.join(", ");
-
-    return normalize(topicsString.toString());
 }
 
 export default NoteContentFulltextExp;
diff --git a/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts
new file mode 100644
index 0000000000..e01e7f2aa8
--- /dev/null
+++ b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts
@@ -0,0 +1,18 @@
+import { NoteType } from "@triliumnext/commons";
+import { describe, expect,it } from "vitest";
+
+import preprocessContent from "./note_content_fulltext_preprocessor";
+
+describe("Mind map preprocessing", () => {
+    const type: NoteType = "mindMap";
+    const mime = "application/json";
+
+    it("supports empty JSON", () => {
+        expect(preprocessContent("{}", type, mime)).toEqual(""); // valid JSON, but no `nodedata` to read topics from
+    });
+
+    it("supports blank text / invalid JSON", () => {
+        expect(preprocessContent("", type, mime)).toEqual(""); // an empty string is not valid JSON
+        expect(preprocessContent(`{ "node": " }`, type, mime)).toEqual(""); // unterminated string literal
+    });
+});
\ No newline at end of file
diff --git a/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts
new file mode 100644
index 0000000000..ff0893b083
--- /dev/null
+++ b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts
@@ -0,0 +1,116 @@
+import striptags from "striptags";
+
+import { normalize } from "../../utils.js";
+
+/**
+ * Converts raw note content into the plain text that full-text search matches against.
+ *
+ * The content is first passed through `normalize()`. HTML text notes then get their tags
+ * stripped (unless `raw` is set) and `&nbsp;` entities turned into spaces; mind maps are
+ * reduced to their topics and canvases to the text of their `text` elements. Any other
+ * type/mime combination is only normalized. Canvas JSON that cannot be parsed yields ""
+ * (as mind maps already do) instead of throwing. The result is always trimmed.
+ */
+export default function preprocessContent(content: string | Buffer, type: string, mime: string, raw?: boolean) {
+    content = normalize(content.toString());
+
+    if (type === "text" && mime === "text/html") {
+        if (!raw) {
+            // Content size already filtered at DB level, safe to process
+            content = stripTags(content);
+        }
+        content = content.replace(/&nbsp;/g, " ");
+    } else if (type === "mindMap" && mime === "application/json") {
+        content = processMindmapContent(content);
+    } else if (type === "canvas" && mime === "application/json") {
+        let elements: unknown;
+        try {
+            elements = JSON.parse(content)?.elements;
+        } catch {
+            // Malformed canvas JSON: leave `elements` undefined so the content becomes "".
+        }
+        // Keep only `text` elements that carry a non-empty `text`.
+        const texts = Array.isArray(elements)
+            ? elements.filter((el) => el?.type === "text" && el.text).map((el) => el.text)
+            : null;
+        content = texts ? normalize(texts.join(" ")) : "";
+    }
+
+    return content.trim();
+}
+
+/**
+ * Extracts the searchable text of a mind map note.
+ *
+ * The content is parsed as JSON and the tree rooted at `nodedata` is walked depth-first
+ * (a node before its children); the `topic` of every node is collected and the topics
+ * are joined with ", " before being normalized.
+ *
+ * Blank text, invalid JSON and JSON without a usable `nodedata` (including a bare `null`)
+ * all produce an empty string instead of throwing.
+ *
+ * @param content the JSON text of the mind map.
+ * @returns the normalized, comma-separated topics, or "" when there are none.
+ */
+function processMindmapContent(content: string) {
+    // Only the fields read below are declared; the stored document carries more.
+    interface MindmapNode {
+        topic?: unknown;
+        children?: unknown; // Expected to be a list of nested nodes
+    }
+
+    let root: MindmapNode | null | undefined;
+
+    try {
+        // `?.` guards against valid JSON that is not an object, such as `null`.
+        root = JSON.parse(content)?.nodedata;
+    } catch (e) {
+        return "";
+    }
+
+    const topics: string[] = [];
+
+    // Appends the topics of `node` and of its descendants to the shared `topics` list,
+    // which avoids copying the accumulated array at every level of the tree.
+    function collectTopics(node?: MindmapNode | null) {
+        if (!node) {
+            return;
+        }
+
+        // Nodes without a topic are skipped so they do not leave stray separators.
+        if (node.topic != null) {
+            topics.push(String(node.topic));
+        }
+
+        if (Array.isArray(node.children)) {
+            for (const child of node.children) {
+                collectTopics(child);
+            }
+        }
+    }
+
+    // Start extracting from the root node
+    collectTopics(root);
+
+    // Combine topics into a single string
+    const topicsString = topics.join(", ");
+
+    return normalize(topicsString);
+}
+
+/** Strips HTML tags, keeping opening link tags so that their URLs stay searchable. */
+function stripTags(content: string) {
+    // Links are allowed through to preserve URLs: https://github.com/zadam/trilium/issues/2412
+    const keptInBothPasses = ["a"];
+    // Inline formatting can occur within one word, so its removal must not insert any text.
+    const keptInFirstPass = [...keptInBothPasses, "b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"];
+
+    // First pass: every remaining tag implies text separation and is replaced with a space.
+    const withoutBlockTags = striptags(content, keptInFirstPass, " ");
+
+    // Second pass: the inline formatting tags (but not links) are removed without a space.
+    const withoutFormatting = striptags(withoutBlockTags, keptInBothPasses, "");
+
+    // At least the closing link tag can be easily stripped.
+    return withoutFormatting.replace(/<\/a>/gi, "");
+}