From cb5b4d870f5f70a7fdd3e0b87d555926e25d2143 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Wed, 11 Mar 2026 18:26:44 +0200 Subject: [PATCH] refactor(server/search): extract fulltext preprocessing to separate file --- .../expressions/note_content_fulltext.spec.ts | 15 +- .../expressions/note_content_fulltext.ts | 187 ++++-------------- ...note_content_fulltext_preprocessor.spec.ts | 18 ++ .../note_content_fulltext_preprocessor.ts | 116 +++++++++++ 4 files changed, 170 insertions(+), 166 deletions(-) create mode 100644 apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts create mode 100644 apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts b/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts index d0d51e0877..50d9123fdf 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.spec.ts @@ -1,18 +1,7 @@ -import { describe, it, expect } from "vitest"; -import { processMindmapContent } from "./note_content_fulltext.js"; +import { describe, expect,it } from "vitest"; + import NoteContentFulltextExp from "./note_content_fulltext.js"; -describe("processMindmapContent", () => { - it("supports empty JSON", () => { - expect(processMindmapContent("{}")).toEqual(""); - }); - - it("supports blank text / invalid JSON", () => { - expect(processMindmapContent("")).toEqual(""); - expect(processMindmapContent(`{ "node": " }`)).toEqual(""); - }); -}); - describe("Fuzzy Search Operators", () => { it("~= operator works with typos", () => { // Test that the ~= operator can handle common typos diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index 89ba1bc984..f3e0a39333 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -1,24 +1,19 @@ -"use strict"; - import type { NoteRow } from "@triliumnext/commons"; -import type SearchContext from "../search_context.js"; -import Expression from "./expression.js"; -import NoteSet from "../note_set.js"; -import log from "../../log.js"; import becca from "../../../becca/becca.js"; +import log from "../../log.js"; import protectedSessionService from "../../protected_session.js"; -import striptags from "striptags"; -import { normalize } from "../../utils.js"; import sql from "../../sql.js"; -import { - normalizeSearchText, - calculateOptimizedEditDistance, - validateFuzzySearchTokens, - validateAndPreprocessContent, +import NoteSet from "../note_set.js"; +import type SearchContext from "../search_context.js"; +import { + FUZZY_SEARCH_CONFIG, fuzzyMatchWord, - FUZZY_SEARCH_CONFIG -} from "../utils/text_utils.js"; + normalizeSearchText, + validateAndPreprocessContent, + validateFuzzySearchTokens} from "../utils/text_utils.js"; +import Expression from "./expression.js"; +import preprocessContent from "./note_content_fulltext_preprocessor.js"; const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]); @@ -218,7 +213,7 @@ class NoteContentFulltextExp extends Expression { return; } - content = this.preprocessContent(content, type, mime); + content = preprocessContent(content, type, mime, this.raw); // Apply content size validation and preprocessing const processedContent = validateAndPreprocessContent(content, noteId); @@ -295,59 +290,22 @@ class NoteContentFulltextExp extends Expression { return content; } - preprocessContent(content: string | Buffer, type: string, mime: string) { - content = normalize(content.toString()); - - if (type === "text" && mime === "text/html") { - if (!this.raw) { - // Content size already filtered at DB level, safe to process - content = this.stripTags(content); - } - - content = content.replace(/ /g, " "); - } else if (type === "mindMap" && mime === "application/json") { - content = processMindmapContent(content); - } else if (type === "canvas" && mime === "application/json") { - interface Element { - type: string; - text?: string; // Optional since not all objects have a `text` property - id: string; - [key: string]: any; // Other properties that may exist - } - - const canvasContent = JSON.parse(content); - const elements = canvasContent.elements; - - if (Array.isArray(elements)) { - const texts = elements - .filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property - .map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering - - content = normalize(texts.join(" ")); - } else { - content = ""; - } - } - - return content.trim(); - } - /** * Checks if a token matches content with optional fuzzy matching */ private tokenMatchesContent(token: string, content: string, noteId: string): boolean { const normalizedToken = normalizeSearchText(token); const normalizedContent = normalizeSearchText(content); - + if (normalizedContent.includes(normalizedToken)) { return true; } - + // Check flat text for default fulltext search if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) { return false; } - + return true; } @@ -358,15 +316,15 @@ class NoteContentFulltextExp extends Expression { try { const normalizedContent = normalizeSearchText(content); const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : ""; - + // For phrase matching, check if tokens appear within reasonable proximity if (this.tokens.length > 1) { return this.matchesPhrase(normalizedContent, flatText); } - + // Single token fuzzy matching const token = normalizeSearchText(this.tokens[0]); - return this.fuzzyMatchToken(token, normalizedContent) || + return this.fuzzyMatchToken(token, normalizedContent) || (this.flatText && this.fuzzyMatchToken(token, flatText)); } catch (error) { log.error(`Error in fuzzy matching for note ${noteId}: ${error}`); @@ -379,45 +337,45 @@ class NoteContentFulltextExp extends Expression { */ private matchesPhrase(content: string, flatText: string): boolean { const searchText = this.flatText ? `${content} ${flatText}` : content; - + // Apply content size limits for phrase matching const limitedText = validateAndPreprocessContent(searchText); if (!limitedText) { return false; } - + const words = limitedText.toLowerCase().split(/\s+/); - + // Only skip phrase matching for truly extreme word counts that could crash the system if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) { console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`); return false; } - + // Warn about large word counts but still attempt matching if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) { console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`); } - + // Find positions of each token const tokenPositions: number[][] = this.tokens.map(token => { const normalizedToken = normalizeSearchText(token); const positions: number[] = []; - + words.forEach((word, index) => { if (this.fuzzyMatchSingle(normalizedToken, word)) { positions.push(index); } }); - + return positions; }); - + // Check if we found all tokens if (tokenPositions.some(positions => positions.length === 0)) { return false; } - + // Check for phrase proximity using configurable distance return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY); } @@ -431,18 +389,18 @@ class NoteContentFulltextExp extends Expression { const [pos1, pos2] = tokenPositions; return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance)); } - + // For more tokens, check if we can find a sequence where all tokens are within range const findSequence = (remaining: number[][], currentPos: number): boolean => { if (remaining.length === 0) return true; - + const [nextPositions, ...rest] = remaining; - return nextPositions.some(pos => - Math.abs(pos - currentPos) <= maxDistance && + return nextPositions.some(pos => + Math.abs(pos - currentPos) <= maxDistance && findSequence(rest, pos) ); }; - + const [firstPositions, ...rest] = tokenPositions; return firstPositions.some(startPos => findSequence(rest, startPos)); } @@ -455,12 +413,12 @@ class NoteContentFulltextExp extends Expression { // For short tokens, require exact match to avoid too many false positives return content.includes(token); } - + const words = content.split(/\s+/); - + // Only limit word processing for truly extreme cases to prevent system instability const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT); - + return limitedWords.some(word => this.fuzzyMatchSingle(token, word)); } @@ -471,83 +429,6 @@ class NoteContentFulltextExp extends Expression { // Use shared optimized fuzzy matching logic return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE); } - - - stripTags(content: string) { - // we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412 - // we want to insert space in place of block tags (because they imply text separation) - // but we don't want to insert text for typical formatting inline tags which can occur within one word - const linkTag = "a"; - const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"]; - - // replace tags which imply text separation with a space - content = striptags(content, [linkTag, ...inlineFormattingTags], " "); - - // replace the inline formatting tags (but not links) without a space - content = striptags(content, [linkTag], ""); - - // at least the closing link tag can be easily stripped - return content.replace(/<\/a>/gi, ""); - } -} - -export function processMindmapContent(content: string) { - let mindMapcontent; - - try { - mindMapcontent = JSON.parse(content); - } catch (e) { - return ""; - } - - // Define interfaces for the JSON structure - interface MindmapNode { - id: string; - topic: string; - children: MindmapNode[]; // Recursive structure - direction?: number; - expanded?: boolean; - } - - interface MindmapData { - nodedata: MindmapNode; - arrows: any[]; // If you know the structure, replace `any` with the correct type - summaries: any[]; - direction: number; - theme: { - name: string; - type: string; - palette: string[]; - cssvar: Record; // Object with string keys and string values - }; - } - - // Recursive function to collect all topics - function collectTopics(node?: MindmapNode): string[] { - if (!node) { - return []; - } - - // Collect the current node's topic - let topics = [node.topic]; - - // If the node has children, collect topics recursively - if (node.children && node.children.length > 0) { - for (const child of node.children) { - topics = topics.concat(collectTopics(child)); - } - } - - return topics; - } - - // Start extracting from the root node - const topicsArray = collectTopics(mindMapcontent.nodedata); - - // Combine topics into a single string - const topicsString = topicsArray.join(", "); - - return normalize(topicsString.toString()); } export default NoteContentFulltextExp; diff --git a/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts new file mode 100644 index 0000000000..e01e7f2aa8 --- /dev/null +++ b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.spec.ts @@ -0,0 +1,18 @@ +import { NoteType } from "@triliumnext/commons"; +import { describe, expect,it } from "vitest"; + +import preprocessContent from "./note_content_fulltext_preprocessor"; + +describe("Mind map preprocessing", () => { + const type: NoteType = "mindMap"; + const mime = "application/json"; + + it("supports empty JSON", () => { + expect(preprocessContent("{}", type, mime)).toEqual(""); + }); + + it("supports blank text / invalid JSON", () => { + expect(preprocessContent("", type, mime)).toEqual(""); + expect(preprocessContent(`{ "node": " }`, type, mime)).toEqual(""); + }); +}); \ No newline at end of file diff --git a/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts new file mode 100644 index 0000000000..ff0893b083 --- /dev/null +++ b/apps/server/src/services/search/expressions/note_content_fulltext_preprocessor.ts @@ -0,0 +1,116 @@ +import striptags from "striptags"; + +import { normalize } from "../../utils.js"; + +export default function preprocessContent(content: string | Buffer, type: string, mime: string, raw?: boolean) { + content = normalize(content.toString()); + + if (type === "text" && mime === "text/html") { + if (!raw) { + // Content size already filtered at DB level, safe to process + content = stripTags(content); + } + + content = content.replace(/ /g, " "); + } else if (type === "mindMap" && mime === "application/json") { + content = processMindmapContent(content); + } else if (type === "canvas" && mime === "application/json") { + interface Element { + type: string; + text?: string; // Optional since not all objects have a `text` property + id: string; + [key: string]: any; // Other properties that may exist + } + + const canvasContent = JSON.parse(content); + const elements = canvasContent.elements; + + if (Array.isArray(elements)) { + const texts = elements + .filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property + .map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering + + content = normalize(texts.join(" ")); + } else { + content = ""; + } + } + + return content.trim(); +} + +function processMindmapContent(content: string) { + let mindMapcontent; + + try { + mindMapcontent = JSON.parse(content); + } catch (e) { + return ""; + } + + // Define interfaces for the JSON structure + interface MindmapNode { + id: string; + topic: string; + children: MindmapNode[]; // Recursive structure + direction?: number; + expanded?: boolean; + } + + interface MindmapData { + nodedata: MindmapNode; + arrows: any[]; // If you know the structure, replace `any` with the correct type + summaries: any[]; + direction: number; + theme: { + name: string; + type: string; + palette: string[]; + cssvar: Record; // Object with string keys and string values + }; + } + + // Recursive function to collect all topics + function collectTopics(node?: MindmapNode): string[] { + if (!node) { + return []; + } + + // Collect the current node's topic + let topics = [node.topic]; + + // If the node has children, collect topics recursively + if (node.children && node.children.length > 0) { + for (const child of node.children) { + topics = topics.concat(collectTopics(child)); + } + } + + return topics; + } + + // Start extracting from the root node + const topicsArray = collectTopics(mindMapcontent.nodedata); + + // Combine topics into a single string + const topicsString = topicsArray.join(", "); + + return normalize(topicsString.toString()); +} + +function stripTags(content: string) { + // we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412 + // we want to insert space in place of block tags (because they imply text separation) + // but we don't want to insert text for typical formatting inline tags which can occur within one word + const linkTag = "a"; + const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"]; + + // replace tags which imply text separation with a space + content = striptags(content, [linkTag, ...inlineFormattingTags], " "); + + // replace the inline formatting tags (but not links) without a space + content = striptags(content, [linkTag], ""); + + // at least the closing link tag can be easily stripped + return content.replace(/<\/a>/gi, ""); +}