mirror of
https://github.com/zadam/trilium.git
synced 2026-03-13 11:53:38 +01:00
refactor(server/search): extract fulltext preprocessing to separate file
This commit is contained in:
parent
f81aef2de5
commit
cb5b4d870f
@ -1,18 +1,7 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { processMindmapContent } from "./note_content_fulltext.js";
|
||||
import { describe, expect,it } from "vitest";
|
||||
|
||||
import NoteContentFulltextExp from "./note_content_fulltext.js";
|
||||
|
||||
describe("processMindmapContent", () => {
|
||||
it("supports empty JSON", () => {
|
||||
expect(processMindmapContent("{}")).toEqual("");
|
||||
});
|
||||
|
||||
it("supports blank text / invalid JSON", () => {
|
||||
expect(processMindmapContent("")).toEqual("");
|
||||
expect(processMindmapContent(`{ "node": " }`)).toEqual("");
|
||||
});
|
||||
});
|
||||
|
||||
describe("Fuzzy Search Operators", () => {
|
||||
it("~= operator works with typos", () => {
|
||||
// Test that the ~= operator can handle common typos
|
||||
|
||||
@ -1,24 +1,19 @@
|
||||
"use strict";
|
||||
|
||||
import type { NoteRow } from "@triliumnext/commons";
|
||||
import type SearchContext from "../search_context.js";
|
||||
|
||||
import Expression from "./expression.js";
|
||||
import NoteSet from "../note_set.js";
|
||||
import log from "../../log.js";
|
||||
import becca from "../../../becca/becca.js";
|
||||
import log from "../../log.js";
|
||||
import protectedSessionService from "../../protected_session.js";
|
||||
import striptags from "striptags";
|
||||
import { normalize } from "../../utils.js";
|
||||
import sql from "../../sql.js";
|
||||
import {
|
||||
normalizeSearchText,
|
||||
calculateOptimizedEditDistance,
|
||||
validateFuzzySearchTokens,
|
||||
validateAndPreprocessContent,
|
||||
import NoteSet from "../note_set.js";
|
||||
import type SearchContext from "../search_context.js";
|
||||
import {
|
||||
FUZZY_SEARCH_CONFIG,
|
||||
fuzzyMatchWord,
|
||||
FUZZY_SEARCH_CONFIG
|
||||
} from "../utils/text_utils.js";
|
||||
normalizeSearchText,
|
||||
validateAndPreprocessContent,
|
||||
validateFuzzySearchTokens} from "../utils/text_utils.js";
|
||||
import Expression from "./expression.js";
|
||||
import preprocessContent from "./note_content_fulltext_preprocessor.js";
|
||||
|
||||
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
|
||||
|
||||
@ -218,7 +213,7 @@ class NoteContentFulltextExp extends Expression {
|
||||
return;
|
||||
}
|
||||
|
||||
content = this.preprocessContent(content, type, mime);
|
||||
content = preprocessContent(content, type, mime, this.raw);
|
||||
|
||||
// Apply content size validation and preprocessing
|
||||
const processedContent = validateAndPreprocessContent(content, noteId);
|
||||
@ -295,59 +290,22 @@ class NoteContentFulltextExp extends Expression {
|
||||
return content;
|
||||
}
|
||||
|
||||
preprocessContent(content: string | Buffer, type: string, mime: string) {
|
||||
content = normalize(content.toString());
|
||||
|
||||
if (type === "text" && mime === "text/html") {
|
||||
if (!this.raw) {
|
||||
// Content size already filtered at DB level, safe to process
|
||||
content = this.stripTags(content);
|
||||
}
|
||||
|
||||
content = content.replace(/ /g, " ");
|
||||
} else if (type === "mindMap" && mime === "application/json") {
|
||||
content = processMindmapContent(content);
|
||||
} else if (type === "canvas" && mime === "application/json") {
|
||||
interface Element {
|
||||
type: string;
|
||||
text?: string; // Optional since not all objects have a `text` property
|
||||
id: string;
|
||||
[key: string]: any; // Other properties that may exist
|
||||
}
|
||||
|
||||
const canvasContent = JSON.parse(content);
|
||||
const elements = canvasContent.elements;
|
||||
|
||||
if (Array.isArray(elements)) {
|
||||
const texts = elements
|
||||
.filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property
|
||||
.map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering
|
||||
|
||||
content = normalize(texts.join(" "));
|
||||
} else {
|
||||
content = "";
|
||||
}
|
||||
}
|
||||
|
||||
return content.trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a token matches content with optional fuzzy matching
|
||||
*/
|
||||
private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
|
||||
const normalizedToken = normalizeSearchText(token);
|
||||
const normalizedContent = normalizeSearchText(content);
|
||||
|
||||
|
||||
if (normalizedContent.includes(normalizedToken)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Check flat text for default fulltext search
|
||||
if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -358,15 +316,15 @@ class NoteContentFulltextExp extends Expression {
|
||||
try {
|
||||
const normalizedContent = normalizeSearchText(content);
|
||||
const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
|
||||
|
||||
|
||||
// For phrase matching, check if tokens appear within reasonable proximity
|
||||
if (this.tokens.length > 1) {
|
||||
return this.matchesPhrase(normalizedContent, flatText);
|
||||
}
|
||||
|
||||
|
||||
// Single token fuzzy matching
|
||||
const token = normalizeSearchText(this.tokens[0]);
|
||||
return this.fuzzyMatchToken(token, normalizedContent) ||
|
||||
return this.fuzzyMatchToken(token, normalizedContent) ||
|
||||
(this.flatText && this.fuzzyMatchToken(token, flatText));
|
||||
} catch (error) {
|
||||
log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
|
||||
@ -379,45 +337,45 @@ class NoteContentFulltextExp extends Expression {
|
||||
*/
|
||||
private matchesPhrase(content: string, flatText: string): boolean {
|
||||
const searchText = this.flatText ? `${content} ${flatText}` : content;
|
||||
|
||||
|
||||
// Apply content size limits for phrase matching
|
||||
const limitedText = validateAndPreprocessContent(searchText);
|
||||
if (!limitedText) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
const words = limitedText.toLowerCase().split(/\s+/);
|
||||
|
||||
|
||||
// Only skip phrase matching for truly extreme word counts that could crash the system
|
||||
if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
|
||||
console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Warn about large word counts but still attempt matching
|
||||
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
|
||||
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
|
||||
}
|
||||
|
||||
|
||||
// Find positions of each token
|
||||
const tokenPositions: number[][] = this.tokens.map(token => {
|
||||
const normalizedToken = normalizeSearchText(token);
|
||||
const positions: number[] = [];
|
||||
|
||||
|
||||
words.forEach((word, index) => {
|
||||
if (this.fuzzyMatchSingle(normalizedToken, word)) {
|
||||
positions.push(index);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
return positions;
|
||||
});
|
||||
|
||||
|
||||
// Check if we found all tokens
|
||||
if (tokenPositions.some(positions => positions.length === 0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Check for phrase proximity using configurable distance
|
||||
return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
|
||||
}
|
||||
@ -431,18 +389,18 @@ class NoteContentFulltextExp extends Expression {
|
||||
const [pos1, pos2] = tokenPositions;
|
||||
return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
|
||||
}
|
||||
|
||||
|
||||
// For more tokens, check if we can find a sequence where all tokens are within range
|
||||
const findSequence = (remaining: number[][], currentPos: number): boolean => {
|
||||
if (remaining.length === 0) return true;
|
||||
|
||||
|
||||
const [nextPositions, ...rest] = remaining;
|
||||
return nextPositions.some(pos =>
|
||||
Math.abs(pos - currentPos) <= maxDistance &&
|
||||
return nextPositions.some(pos =>
|
||||
Math.abs(pos - currentPos) <= maxDistance &&
|
||||
findSequence(rest, pos)
|
||||
);
|
||||
};
|
||||
|
||||
|
||||
const [firstPositions, ...rest] = tokenPositions;
|
||||
return firstPositions.some(startPos => findSequence(rest, startPos));
|
||||
}
|
||||
@ -455,12 +413,12 @@ class NoteContentFulltextExp extends Expression {
|
||||
// For short tokens, require exact match to avoid too many false positives
|
||||
return content.includes(token);
|
||||
}
|
||||
|
||||
|
||||
const words = content.split(/\s+/);
|
||||
|
||||
|
||||
// Only limit word processing for truly extreme cases to prevent system instability
|
||||
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
|
||||
|
||||
|
||||
return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
|
||||
}
|
||||
|
||||
@ -471,83 +429,6 @@ class NoteContentFulltextExp extends Expression {
|
||||
// Use shared optimized fuzzy matching logic
|
||||
return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
}
|
||||
|
||||
|
||||
stripTags(content: string) {
|
||||
// we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
|
||||
// we want to insert space in place of block tags (because they imply text separation)
|
||||
// but we don't want to insert text for typical formatting inline tags which can occur within one word
|
||||
const linkTag = "a";
|
||||
const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"];
|
||||
|
||||
// replace tags which imply text separation with a space
|
||||
content = striptags(content, [linkTag, ...inlineFormattingTags], " ");
|
||||
|
||||
// replace the inline formatting tags (but not links) without a space
|
||||
content = striptags(content, [linkTag], "");
|
||||
|
||||
// at least the closing link tag can be easily stripped
|
||||
return content.replace(/<\/a>/gi, "");
|
||||
}
|
||||
}
|
||||
|
||||
export function processMindmapContent(content: string) {
|
||||
let mindMapcontent;
|
||||
|
||||
try {
|
||||
mindMapcontent = JSON.parse(content);
|
||||
} catch (e) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Define interfaces for the JSON structure
|
||||
interface MindmapNode {
|
||||
id: string;
|
||||
topic: string;
|
||||
children: MindmapNode[]; // Recursive structure
|
||||
direction?: number;
|
||||
expanded?: boolean;
|
||||
}
|
||||
|
||||
interface MindmapData {
|
||||
nodedata: MindmapNode;
|
||||
arrows: any[]; // If you know the structure, replace `any` with the correct type
|
||||
summaries: any[];
|
||||
direction: number;
|
||||
theme: {
|
||||
name: string;
|
||||
type: string;
|
||||
palette: string[];
|
||||
cssvar: Record<string, string>; // Object with string keys and string values
|
||||
};
|
||||
}
|
||||
|
||||
// Recursive function to collect all topics
|
||||
function collectTopics(node?: MindmapNode): string[] {
|
||||
if (!node) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Collect the current node's topic
|
||||
let topics = [node.topic];
|
||||
|
||||
// If the node has children, collect topics recursively
|
||||
if (node.children && node.children.length > 0) {
|
||||
for (const child of node.children) {
|
||||
topics = topics.concat(collectTopics(child));
|
||||
}
|
||||
}
|
||||
|
||||
return topics;
|
||||
}
|
||||
|
||||
// Start extracting from the root node
|
||||
const topicsArray = collectTopics(mindMapcontent.nodedata);
|
||||
|
||||
// Combine topics into a single string
|
||||
const topicsString = topicsArray.join(", ");
|
||||
|
||||
return normalize(topicsString.toString());
|
||||
}
|
||||
|
||||
export default NoteContentFulltextExp;
|
||||
|
||||
@ -0,0 +1,18 @@
|
||||
import { NoteType } from "@triliumnext/commons";
|
||||
import { describe, expect,it } from "vitest";
|
||||
|
||||
import preprocessContent from "./note_content_fulltext_preprocessor";
|
||||
|
||||
describe("Mind map preprocessing", () => {
|
||||
const type: NoteType = "mindMap";
|
||||
const mime = "application/json";
|
||||
|
||||
it("supports empty JSON", () => {
|
||||
expect(preprocessContent("{}", type, mime)).toEqual("");
|
||||
});
|
||||
|
||||
it("supports blank text / invalid JSON", () => {
|
||||
expect(preprocessContent("", type, mime)).toEqual("");
|
||||
expect(preprocessContent(`{ "node": " }`, type, mime)).toEqual("");
|
||||
});
|
||||
});
|
||||
@ -0,0 +1,116 @@
|
||||
import striptags from "striptags";
|
||||
|
||||
import { normalize } from "../../utils.js";
|
||||
|
||||
export default function preprocessContent(content: string | Buffer, type: string, mime: string, raw?: boolean) {
|
||||
content = normalize(content.toString());
|
||||
|
||||
if (type === "text" && mime === "text/html") {
|
||||
if (!raw) {
|
||||
// Content size already filtered at DB level, safe to process
|
||||
content = stripTags(content);
|
||||
}
|
||||
|
||||
content = content.replace(/ /g, " ");
|
||||
} else if (type === "mindMap" && mime === "application/json") {
|
||||
content = processMindmapContent(content);
|
||||
} else if (type === "canvas" && mime === "application/json") {
|
||||
interface Element {
|
||||
type: string;
|
||||
text?: string; // Optional since not all objects have a `text` property
|
||||
id: string;
|
||||
[key: string]: any; // Other properties that may exist
|
||||
}
|
||||
|
||||
const canvasContent = JSON.parse(content);
|
||||
const elements = canvasContent.elements;
|
||||
|
||||
if (Array.isArray(elements)) {
|
||||
const texts = elements
|
||||
.filter((element: Element) => element.type === "text" && element.text) // Filter for 'text' type elements with a 'text' property
|
||||
.map((element: Element) => element.text!); // Use `!` to assert `text` is defined after filtering
|
||||
|
||||
content = normalize(texts.join(" "));
|
||||
} else {
|
||||
content = "";
|
||||
}
|
||||
}
|
||||
|
||||
return content.trim();
|
||||
}
|
||||
|
||||
function processMindmapContent(content: string) {
|
||||
let mindMapcontent;
|
||||
|
||||
try {
|
||||
mindMapcontent = JSON.parse(content);
|
||||
} catch (e) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Define interfaces for the JSON structure
|
||||
interface MindmapNode {
|
||||
id: string;
|
||||
topic: string;
|
||||
children: MindmapNode[]; // Recursive structure
|
||||
direction?: number;
|
||||
expanded?: boolean;
|
||||
}
|
||||
|
||||
interface MindmapData {
|
||||
nodedata: MindmapNode;
|
||||
arrows: any[]; // If you know the structure, replace `any` with the correct type
|
||||
summaries: any[];
|
||||
direction: number;
|
||||
theme: {
|
||||
name: string;
|
||||
type: string;
|
||||
palette: string[];
|
||||
cssvar: Record<string, string>; // Object with string keys and string values
|
||||
};
|
||||
}
|
||||
|
||||
// Recursive function to collect all topics
|
||||
function collectTopics(node?: MindmapNode): string[] {
|
||||
if (!node) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Collect the current node's topic
|
||||
let topics = [node.topic];
|
||||
|
||||
// If the node has children, collect topics recursively
|
||||
if (node.children && node.children.length > 0) {
|
||||
for (const child of node.children) {
|
||||
topics = topics.concat(collectTopics(child));
|
||||
}
|
||||
}
|
||||
|
||||
return topics;
|
||||
}
|
||||
|
||||
// Start extracting from the root node
|
||||
const topicsArray = collectTopics(mindMapcontent.nodedata);
|
||||
|
||||
// Combine topics into a single string
|
||||
const topicsString = topicsArray.join(", ");
|
||||
|
||||
return normalize(topicsString.toString());
|
||||
}
|
||||
|
||||
function stripTags(content: string) {
|
||||
// we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
|
||||
// we want to insert space in place of block tags (because they imply text separation)
|
||||
// but we don't want to insert text for typical formatting inline tags which can occur within one word
|
||||
const linkTag = "a";
|
||||
const inlineFormattingTags = ["b", "strong", "em", "i", "span", "big", "small", "font", "sub", "sup"];
|
||||
|
||||
// replace tags which imply text separation with a space
|
||||
content = striptags(content, [linkTag, ...inlineFormattingTags], " ");
|
||||
|
||||
// replace the inline formatting tags (but not links) without a space
|
||||
content = striptags(content, [linkTag], "");
|
||||
|
||||
// at least the closing link tag can be easily stripped
|
||||
return content.replace(/<\/a>/gi, "");
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user