mirror of
https://github.com/zadam/trilium.git
synced 2025-10-19 22:58:52 +02:00
feat(search): improve search weights and operators (#6536)
This commit is contained in:
commit
6e37c9ee5a
@ -2255,6 +2255,13 @@ footer.webview-footer button {
|
||||
padding: 1px 10px 1px 10px;
|
||||
}
|
||||
|
||||
/* Search result highlighting */
|
||||
.search-result-title b,
|
||||
.search-result-content b {
|
||||
font-weight: 900;
|
||||
color: var(--admonition-warning-accent-color);
|
||||
}
|
||||
|
||||
/* Customized icons */
|
||||
|
||||
.bx-tn-toc::before {
|
||||
|
@ -23,12 +23,52 @@ const TPL = /*html*/`
|
||||
|
||||
.quick-search .dropdown-menu {
|
||||
max-height: 600px;
|
||||
max-width: 400px;
|
||||
max-width: 600px;
|
||||
overflow-y: auto;
|
||||
overflow-x: hidden;
|
||||
text-overflow: ellipsis;
|
||||
box-shadow: -30px 50px 93px -50px black;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item {
|
||||
white-space: normal;
|
||||
padding: 12px 16px;
|
||||
line-height: 1.4;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item:not(:last-child)::after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
width: 80%;
|
||||
height: 2px;
|
||||
background: var(--main-border-color);
|
||||
border-radius: 1px;
|
||||
opacity: 0.4;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item:last-child::after {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item.disabled::after {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item.show-in-full-search::after {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-item:hover {
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
.quick-search .dropdown-divider {
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
<div class="input-group-prepend">
|
||||
@ -40,11 +80,21 @@ const TPL = /*html*/`
|
||||
<input type="text" class="form-control form-control-sm search-string" placeholder="${t("quick-search.placeholder")}">
|
||||
</div>`;
|
||||
|
||||
const MAX_DISPLAYED_NOTES = 15;
|
||||
const INITIAL_DISPLAYED_NOTES = 15;
|
||||
const LOAD_MORE_BATCH_SIZE = 10;
|
||||
|
||||
// TODO: Deduplicate with server.
|
||||
interface QuickSearchResponse {
|
||||
searchResultNoteIds: string[];
|
||||
searchResults?: Array<{
|
||||
notePath: string;
|
||||
noteTitle: string;
|
||||
notePathTitle: string;
|
||||
highlightedNotePathTitle: string;
|
||||
contentSnippet?: string;
|
||||
highlightedContentSnippet?: string;
|
||||
icon: string;
|
||||
}>;
|
||||
error: string;
|
||||
}
|
||||
|
||||
@ -53,6 +103,12 @@ export default class QuickSearchWidget extends BasicWidget {
|
||||
private dropdown!: bootstrap.Dropdown;
|
||||
private $searchString!: JQuery<HTMLElement>;
|
||||
private $dropdownMenu!: JQuery<HTMLElement>;
|
||||
|
||||
// State for infinite scrolling
|
||||
private allSearchResults: Array<any> = [];
|
||||
private allSearchResultNoteIds: string[] = [];
|
||||
private currentDisplayedCount: number = 0;
|
||||
private isLoadingMore: boolean = false;
|
||||
|
||||
doRender() {
|
||||
this.$widget = $(TPL);
|
||||
@ -68,6 +124,11 @@ export default class QuickSearchWidget extends BasicWidget {
|
||||
});
|
||||
|
||||
this.$widget.find(".input-group-prepend").on("shown.bs.dropdown", () => this.search());
|
||||
|
||||
// Add scroll event listener for infinite scrolling
|
||||
this.$dropdownMenu.on("scroll", () => {
|
||||
this.handleScroll();
|
||||
});
|
||||
|
||||
if (utils.isMobile()) {
|
||||
this.$searchString.keydown((e) => {
|
||||
@ -112,10 +173,16 @@ export default class QuickSearchWidget extends BasicWidget {
|
||||
return;
|
||||
}
|
||||
|
||||
// Reset state for new search
|
||||
this.allSearchResults = [];
|
||||
this.allSearchResultNoteIds = [];
|
||||
this.currentDisplayedCount = 0;
|
||||
this.isLoadingMore = false;
|
||||
|
||||
this.$dropdownMenu.empty();
|
||||
this.$dropdownMenu.append(`<span class="dropdown-item disabled"><span class="bx bx-loader bx-spin"></span>${t("quick-search.searching")}</span>`);
|
||||
|
||||
const { searchResultNoteIds, error } = await server.get<QuickSearchResponse>(`quick-search/${encodeURIComponent(searchString)}`);
|
||||
const { searchResultNoteIds, searchResults, error } = await server.get<QuickSearchResponse>(`quick-search/${encodeURIComponent(searchString)}`);
|
||||
|
||||
if (error) {
|
||||
let tooltip = new Tooltip(this.$searchString[0], {
|
||||
@ -129,47 +196,148 @@ export default class QuickSearchWidget extends BasicWidget {
|
||||
setTimeout(() => tooltip.dispose(), 4000);
|
||||
}
|
||||
|
||||
const displayedNoteIds = searchResultNoteIds.slice(0, Math.min(MAX_DISPLAYED_NOTES, searchResultNoteIds.length));
|
||||
// Store all results for infinite scrolling
|
||||
this.allSearchResults = searchResults || [];
|
||||
this.allSearchResultNoteIds = searchResultNoteIds || [];
|
||||
|
||||
this.$dropdownMenu.empty();
|
||||
|
||||
if (displayedNoteIds.length === 0) {
|
||||
if (this.allSearchResults.length === 0 && this.allSearchResultNoteIds.length === 0) {
|
||||
this.$dropdownMenu.append(`<span class="dropdown-item disabled">${t("quick-search.no-results")}</span>`);
|
||||
return;
|
||||
}
|
||||
|
||||
for (const note of await froca.getNotes(displayedNoteIds)) {
|
||||
const $link = await linkService.createLink(note.noteId, { showNotePath: true, showNoteIcon: true });
|
||||
$link.addClass("dropdown-item");
|
||||
$link.attr("tabIndex", "0");
|
||||
$link.on("click", (e) => {
|
||||
this.dropdown.hide();
|
||||
// Display initial batch
|
||||
await this.displayMoreResults(INITIAL_DISPLAYED_NOTES);
|
||||
this.addShowInFullSearchButton();
|
||||
|
||||
this.dropdown.update();
|
||||
}
|
||||
|
||||
private async displayMoreResults(batchSize: number) {
|
||||
if (this.isLoadingMore) return;
|
||||
this.isLoadingMore = true;
|
||||
|
||||
// Remove the "Show in full search" button temporarily
|
||||
this.$dropdownMenu.find('.show-in-full-search').remove();
|
||||
this.$dropdownMenu.find('.dropdown-divider').remove();
|
||||
|
||||
// Use highlighted search results if available, otherwise fall back to basic display
|
||||
if (this.allSearchResults.length > 0) {
|
||||
const startIndex = this.currentDisplayedCount;
|
||||
const endIndex = Math.min(startIndex + batchSize, this.allSearchResults.length);
|
||||
const resultsToDisplay = this.allSearchResults.slice(startIndex, endIndex);
|
||||
|
||||
for (const result of resultsToDisplay) {
|
||||
const noteId = result.notePath.split("/").pop();
|
||||
if (!noteId) continue;
|
||||
|
||||
const $item = $('<a class="dropdown-item" tabindex="0" href="javascript:">');
|
||||
|
||||
// Build the display HTML with content snippet below the title
|
||||
let itemHtml = `<div style="display: flex; flex-direction: column;">
|
||||
<div style="display: flex; align-items: flex-start; gap: 6px;">
|
||||
<span class="${result.icon}" style="flex-shrink: 0; margin-top: 1px;"></span>
|
||||
<span style="flex: 1;" class="search-result-title">${result.highlightedNotePathTitle}</span>
|
||||
</div>`;
|
||||
|
||||
// Add content snippet below the title if available
|
||||
if (result.highlightedContentSnippet) {
|
||||
itemHtml += `<div style="font-size: 0.85em; color: var(--main-text-color); opacity: 0.7; margin-left: 20px; margin-top: 4px; line-height: 1.3;" class="search-result-content">${result.highlightedContentSnippet}</div>`;
|
||||
}
|
||||
|
||||
itemHtml += `</div>`;
|
||||
|
||||
$item.html(itemHtml);
|
||||
|
||||
$item.on("click", (e) => {
|
||||
this.dropdown.hide();
|
||||
e.preventDefault();
|
||||
|
||||
const activeContext = appContext.tabManager.getActiveContext();
|
||||
if (activeContext) {
|
||||
activeContext.setNote(noteId);
|
||||
}
|
||||
});
|
||||
|
||||
shortcutService.bindElShortcut($item, "return", () => {
|
||||
this.dropdown.hide();
|
||||
|
||||
const activeContext = appContext.tabManager.getActiveContext();
|
||||
if (activeContext) {
|
||||
activeContext.setNote(noteId);
|
||||
}
|
||||
});
|
||||
|
||||
this.$dropdownMenu.append($item);
|
||||
}
|
||||
|
||||
this.currentDisplayedCount = endIndex;
|
||||
} else {
|
||||
// Fallback to original behavior if no highlighted results
|
||||
const startIndex = this.currentDisplayedCount;
|
||||
const endIndex = Math.min(startIndex + batchSize, this.allSearchResultNoteIds.length);
|
||||
const noteIdsToDisplay = this.allSearchResultNoteIds.slice(startIndex, endIndex);
|
||||
|
||||
for (const note of await froca.getNotes(noteIdsToDisplay)) {
|
||||
const $link = await linkService.createLink(note.noteId, { showNotePath: true, showNoteIcon: true });
|
||||
$link.addClass("dropdown-item");
|
||||
$link.attr("tabIndex", "0");
|
||||
$link.on("click", (e) => {
|
||||
this.dropdown.hide();
|
||||
|
||||
if (!e.target || e.target.nodeName !== "A") {
|
||||
// click on the link is handled by link handling, but we want the whole item clickable
|
||||
const activeContext = appContext.tabManager.getActiveContext();
|
||||
if (activeContext) {
|
||||
activeContext.setNote(note.noteId);
|
||||
}
|
||||
}
|
||||
});
|
||||
shortcutService.bindElShortcut($link, "return", () => {
|
||||
this.dropdown.hide();
|
||||
|
||||
if (!e.target || e.target.nodeName !== "A") {
|
||||
// click on the link is handled by link handling, but we want the whole item clickable
|
||||
const activeContext = appContext.tabManager.getActiveContext();
|
||||
if (activeContext) {
|
||||
activeContext.setNote(note.noteId);
|
||||
}
|
||||
}
|
||||
});
|
||||
shortcutService.bindElShortcut($link, "return", () => {
|
||||
this.dropdown.hide();
|
||||
});
|
||||
|
||||
const activeContext = appContext.tabManager.getActiveContext();
|
||||
if (activeContext) {
|
||||
activeContext.setNote(note.noteId);
|
||||
}
|
||||
});
|
||||
this.$dropdownMenu.append($link);
|
||||
}
|
||||
|
||||
this.$dropdownMenu.append($link);
|
||||
this.currentDisplayedCount = endIndex;
|
||||
}
|
||||
|
||||
if (searchResultNoteIds.length > MAX_DISPLAYED_NOTES) {
|
||||
const numRemainingResults = searchResultNoteIds.length - MAX_DISPLAYED_NOTES;
|
||||
this.$dropdownMenu.append(`<span class="dropdown-item disabled">${t("quick-search.more-results", { number: numRemainingResults })}</span>`);
|
||||
}
|
||||
this.isLoadingMore = false;
|
||||
}
|
||||
|
||||
const $showInFullButton = $('<a class="dropdown-item" tabindex="0">').text(t("quick-search.show-in-full-search"));
|
||||
private handleScroll() {
|
||||
if (this.isLoadingMore) return;
|
||||
|
||||
const dropdown = this.$dropdownMenu[0];
|
||||
const scrollTop = dropdown.scrollTop;
|
||||
const scrollHeight = dropdown.scrollHeight;
|
||||
const clientHeight = dropdown.clientHeight;
|
||||
|
||||
// Trigger loading more when user scrolls near the bottom (within 50px)
|
||||
if (scrollTop + clientHeight >= scrollHeight - 50) {
|
||||
const totalResults = this.allSearchResults.length > 0 ? this.allSearchResults.length : this.allSearchResultNoteIds.length;
|
||||
|
||||
if (this.currentDisplayedCount < totalResults) {
|
||||
this.displayMoreResults(LOAD_MORE_BATCH_SIZE).then(() => {
|
||||
this.addShowInFullSearchButton();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private addShowInFullSearchButton() {
|
||||
// Remove existing button if it exists
|
||||
this.$dropdownMenu.find('.show-in-full-search').remove();
|
||||
this.$dropdownMenu.find('.dropdown-divider').remove();
|
||||
|
||||
const $showInFullButton = $('<a class="dropdown-item show-in-full-search" tabindex="0">').text(t("quick-search.show-in-full-search"));
|
||||
|
||||
this.$dropdownMenu.append($(`<div class="dropdown-divider">`));
|
||||
this.$dropdownMenu.append($showInFullButton);
|
||||
|
@ -52,10 +52,15 @@ function quickSearch(req: Request) {
|
||||
fuzzyAttributeSearch: false
|
||||
});
|
||||
|
||||
const resultNoteIds = searchService.findResultsWithQuery(searchString, searchContext).map((sr) => sr.noteId);
|
||||
// Use the same highlighting logic as autocomplete for consistency
|
||||
const searchResults = searchService.searchNotesForAutocomplete(searchString, false);
|
||||
|
||||
// Extract note IDs for backward compatibility
|
||||
const resultNoteIds = searchResults.map((result) => result.notePath.split("/").pop()).filter(Boolean) as string[];
|
||||
|
||||
return {
|
||||
searchResultNoteIds: resultNoteIds,
|
||||
searchResults: searchResults,
|
||||
error: searchContext.getError()
|
||||
};
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { processMindmapContent } from "./note_content_fulltext.js";
|
||||
import NoteContentFulltextExp from "./note_content_fulltext.js";
|
||||
|
||||
describe("processMindmapContent", () => {
|
||||
it("supports empty JSON", () => {
|
||||
@ -11,3 +12,19 @@ describe("processMindmapContent", () => {
|
||||
expect(processMindmapContent(`{ "node": " }`)).toEqual("");
|
||||
});
|
||||
});
|
||||
|
||||
describe("Fuzzy Search Operators", () => {
|
||||
it("~= operator works with typos", () => {
|
||||
// Test that the ~= operator can handle common typos
|
||||
const expression = new NoteContentFulltextExp("~=", { tokens: ["hello"] });
|
||||
expect(expression.tokens).toEqual(["hello"]);
|
||||
expect(() => new NoteContentFulltextExp("~=", { tokens: ["he"] })).toThrow(); // Too short
|
||||
});
|
||||
|
||||
it("~* operator works with fuzzy contains", () => {
|
||||
// Test that the ~* operator handles fuzzy substring matching
|
||||
const expression = new NoteContentFulltextExp("~*", { tokens: ["world"] });
|
||||
expect(expression.tokens).toEqual(["world"]);
|
||||
expect(() => new NoteContentFulltextExp("~*", { tokens: ["wo"] })).toThrow(); // Too short
|
||||
});
|
||||
});
|
||||
|
@ -11,8 +11,19 @@ import protectedSessionService from "../../protected_session.js";
|
||||
import striptags from "striptags";
|
||||
import { normalize } from "../../utils.js";
|
||||
import sql from "../../sql.js";
|
||||
import {
|
||||
normalizeSearchText,
|
||||
calculateOptimizedEditDistance,
|
||||
validateFuzzySearchTokens,
|
||||
validateAndPreprocessContent,
|
||||
fuzzyMatchWord,
|
||||
FUZZY_SEARCH_CONFIG
|
||||
} from "../utils/text_utils.js";
|
||||
|
||||
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%="]);
|
||||
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
|
||||
|
||||
// Maximum content size for search processing (2MB)
|
||||
const MAX_SEARCH_CONTENT_SIZE = 2 * 1024 * 1024;
|
||||
|
||||
const cachedRegexes: Record<string, RegExp> = {};
|
||||
|
||||
@ -41,6 +52,16 @@ class NoteContentFulltextExp extends Expression {
|
||||
constructor(operator: string, { tokens, raw, flatText }: ConstructorOpts) {
|
||||
super();
|
||||
|
||||
if (!operator || !tokens || !Array.isArray(tokens)) {
|
||||
throw new Error('Invalid parameters: operator and tokens are required');
|
||||
}
|
||||
|
||||
// Validate fuzzy search tokens
|
||||
const validation = validateFuzzySearchTokens(tokens, operator);
|
||||
if (!validation.isValid) {
|
||||
throw new Error(validation.error!);
|
||||
}
|
||||
|
||||
this.operator = operator;
|
||||
this.tokens = tokens;
|
||||
this.raw = !!raw;
|
||||
@ -59,7 +80,9 @@ class NoteContentFulltextExp extends Expression {
|
||||
for (const row of sql.iterateRows<SearchRow>(`
|
||||
SELECT noteId, type, mime, content, isProtected
|
||||
FROM notes JOIN blobs USING (blobId)
|
||||
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') AND isDeleted = 0`)) {
|
||||
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
|
||||
AND isDeleted = 0
|
||||
AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
|
||||
this.findInText(row, inputNoteSet, resultNoteSet);
|
||||
}
|
||||
|
||||
@ -89,6 +112,13 @@ class NoteContentFulltextExp extends Expression {
|
||||
}
|
||||
|
||||
content = this.preprocessContent(content, type, mime);
|
||||
|
||||
// Apply content size validation and preprocessing
|
||||
const processedContent = validateAndPreprocessContent(content, noteId);
|
||||
if (!processedContent) {
|
||||
return; // Content too large or invalid
|
||||
}
|
||||
content = processedContent;
|
||||
|
||||
if (this.tokens.length === 1) {
|
||||
const [token] = this.tokens;
|
||||
@ -99,21 +129,27 @@ class NoteContentFulltextExp extends Expression {
|
||||
(this.operator === "*=" && content.endsWith(token)) ||
|
||||
(this.operator === "=*" && content.startsWith(token)) ||
|
||||
(this.operator === "*=*" && content.includes(token)) ||
|
||||
(this.operator === "%=" && getRegex(token).test(content))
|
||||
(this.operator === "%=" && getRegex(token).test(content)) ||
|
||||
(this.operator === "~=" && this.matchesWithFuzzy(content, noteId)) ||
|
||||
(this.operator === "~*" && this.fuzzyMatchToken(normalizeSearchText(token), normalizeSearchText(content)))
|
||||
) {
|
||||
resultNoteSet.add(becca.notes[noteId]);
|
||||
}
|
||||
} else {
|
||||
const nonMatchingToken = this.tokens.find(
|
||||
(token) =>
|
||||
!content?.includes(token) &&
|
||||
// in case of default fulltext search, we should consider both title, attrs and content
|
||||
// so e.g. "hello world" should match when "hello" is in title and "world" in content
|
||||
(!this.flatText || !becca.notes[noteId].getFlatText().includes(token))
|
||||
);
|
||||
// Multi-token matching with fuzzy support and phrase proximity
|
||||
if (this.operator === "~=" || this.operator === "~*") {
|
||||
if (this.matchesWithFuzzy(content, noteId)) {
|
||||
resultNoteSet.add(becca.notes[noteId]);
|
||||
}
|
||||
} else {
|
||||
const nonMatchingToken = this.tokens.find(
|
||||
(token) =>
|
||||
!this.tokenMatchesContent(token, content, noteId)
|
||||
);
|
||||
|
||||
if (!nonMatchingToken) {
|
||||
resultNoteSet.add(becca.notes[noteId]);
|
||||
if (!nonMatchingToken) {
|
||||
resultNoteSet.add(becca.notes[noteId]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -124,8 +160,8 @@ class NoteContentFulltextExp extends Expression {
|
||||
content = normalize(content.toString());
|
||||
|
||||
if (type === "text" && mime === "text/html") {
|
||||
if (!this.raw && content.length < 20000) {
|
||||
// striptags is slow for very large notes
|
||||
if (!this.raw) {
|
||||
// Content size already filtered at DB level, safe to process
|
||||
content = this.stripTags(content);
|
||||
}
|
||||
|
||||
@ -152,6 +188,147 @@ class NoteContentFulltextExp extends Expression {
|
||||
return content.trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a token matches content with optional fuzzy matching
|
||||
*/
|
||||
private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
|
||||
const normalizedToken = normalizeSearchText(token);
|
||||
const normalizedContent = normalizeSearchText(content);
|
||||
|
||||
if (normalizedContent.includes(normalizedToken)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check flat text for default fulltext search
|
||||
if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs fuzzy matching with edit distance and phrase proximity
|
||||
*/
|
||||
private matchesWithFuzzy(content: string, noteId: string): boolean {
|
||||
try {
|
||||
const normalizedContent = normalizeSearchText(content);
|
||||
const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
|
||||
|
||||
// For phrase matching, check if tokens appear within reasonable proximity
|
||||
if (this.tokens.length > 1) {
|
||||
return this.matchesPhrase(normalizedContent, flatText);
|
||||
}
|
||||
|
||||
// Single token fuzzy matching
|
||||
const token = normalizeSearchText(this.tokens[0]);
|
||||
return this.fuzzyMatchToken(token, normalizedContent) ||
|
||||
(this.flatText && this.fuzzyMatchToken(token, flatText));
|
||||
} catch (error) {
|
||||
log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if multiple tokens match as a phrase with proximity consideration
|
||||
*/
|
||||
private matchesPhrase(content: string, flatText: string): boolean {
|
||||
const searchText = this.flatText ? `${content} ${flatText}` : content;
|
||||
|
||||
// Apply content size limits for phrase matching
|
||||
const limitedText = validateAndPreprocessContent(searchText);
|
||||
if (!limitedText) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const words = limitedText.toLowerCase().split(/\s+/);
|
||||
|
||||
// Only skip phrase matching for truly extreme word counts that could crash the system
|
||||
if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
|
||||
console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Warn about large word counts but still attempt matching
|
||||
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
|
||||
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
|
||||
}
|
||||
|
||||
// Find positions of each token
|
||||
const tokenPositions: number[][] = this.tokens.map(token => {
|
||||
const normalizedToken = normalizeSearchText(token);
|
||||
const positions: number[] = [];
|
||||
|
||||
words.forEach((word, index) => {
|
||||
if (this.fuzzyMatchSingle(normalizedToken, word)) {
|
||||
positions.push(index);
|
||||
}
|
||||
});
|
||||
|
||||
return positions;
|
||||
});
|
||||
|
||||
// Check if we found all tokens
|
||||
if (tokenPositions.some(positions => positions.length === 0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for phrase proximity using configurable distance
|
||||
return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if token positions indicate a phrase match within max distance
|
||||
*/
|
||||
private hasProximityMatch(tokenPositions: number[][], maxDistance: number): boolean {
|
||||
// For 2 tokens, simple proximity check
|
||||
if (tokenPositions.length === 2) {
|
||||
const [pos1, pos2] = tokenPositions;
|
||||
return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
|
||||
}
|
||||
|
||||
// For more tokens, check if we can find a sequence where all tokens are within range
|
||||
const findSequence = (remaining: number[][], currentPos: number): boolean => {
|
||||
if (remaining.length === 0) return true;
|
||||
|
||||
const [nextPositions, ...rest] = remaining;
|
||||
return nextPositions.some(pos =>
|
||||
Math.abs(pos - currentPos) <= maxDistance &&
|
||||
findSequence(rest, pos)
|
||||
);
|
||||
};
|
||||
|
||||
const [firstPositions, ...rest] = tokenPositions;
|
||||
return firstPositions.some(startPos => findSequence(rest, startPos));
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs fuzzy matching for a single token against content
|
||||
*/
|
||||
private fuzzyMatchToken(token: string, content: string): boolean {
|
||||
if (token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
|
||||
// For short tokens, require exact match to avoid too many false positives
|
||||
return content.includes(token);
|
||||
}
|
||||
|
||||
const words = content.split(/\s+/);
|
||||
|
||||
// Only limit word processing for truly extreme cases to prevent system instability
|
||||
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
|
||||
|
||||
return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
|
||||
}
|
||||
|
||||
/**
|
||||
* Fuzzy matches a single token against a single word
|
||||
*/
|
||||
private fuzzyMatchSingle(token: string, word: string): boolean {
|
||||
// Use shared optimized fuzzy matching logic
|
||||
return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
}
|
||||
|
||||
|
||||
stripTags(content: string) {
|
||||
// we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
|
||||
// we want to insert space in place of block tags (because they imply text separation)
|
||||
|
@ -7,6 +7,7 @@ import Expression from "./expression.js";
|
||||
import NoteSet from "../note_set.js";
|
||||
import becca from "../../../becca/becca.js";
|
||||
import { normalize } from "../../utils.js";
|
||||
import { normalizeSearchText, fuzzyMatchWord, fuzzyMatchWordWithResult } from "../utils/text_utils.js";
|
||||
import beccaService from "../../../becca/becca_service.js";
|
||||
|
||||
class NoteFlatTextExp extends Expression {
|
||||
@ -15,7 +16,8 @@ class NoteFlatTextExp extends Expression {
|
||||
constructor(tokens: string[]) {
|
||||
super();
|
||||
|
||||
this.tokens = tokens;
|
||||
// Normalize tokens using centralized normalization function
|
||||
this.tokens = tokens.map(token => normalizeSearchText(token));
|
||||
}
|
||||
|
||||
execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) {
|
||||
@ -55,14 +57,18 @@ class NoteFlatTextExp extends Expression {
|
||||
const foundAttrTokens: string[] = [];
|
||||
|
||||
for (const token of remainingTokens) {
|
||||
if (note.type.includes(token) || note.mime.includes(token)) {
|
||||
// Add defensive checks for undefined properties
|
||||
const typeMatches = note.type && note.type.includes(token);
|
||||
const mimeMatches = note.mime && note.mime.includes(token);
|
||||
|
||||
if (typeMatches || mimeMatches) {
|
||||
foundAttrTokens.push(token);
|
||||
}
|
||||
}
|
||||
|
||||
for (const attribute of note.getOwnedAttributes()) {
|
||||
const normalizedName = normalize(attribute.name);
|
||||
const normalizedValue = normalize(attribute.value);
|
||||
const normalizedName = normalizeSearchText(attribute.name);
|
||||
const normalizedValue = normalizeSearchText(attribute.value);
|
||||
|
||||
for (const token of remainingTokens) {
|
||||
if (normalizedName.includes(token) || normalizedValue.includes(token)) {
|
||||
@ -72,11 +78,11 @@ class NoteFlatTextExp extends Expression {
|
||||
}
|
||||
|
||||
for (const parentNote of note.parents) {
|
||||
const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
|
||||
const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
|
||||
const foundTokens: string[] = foundAttrTokens.slice();
|
||||
|
||||
for (const token of remainingTokens) {
|
||||
if (title.includes(token)) {
|
||||
if (this.smartMatch(title, token, searchContext)) {
|
||||
foundTokens.push(token);
|
||||
}
|
||||
}
|
||||
@ -91,7 +97,7 @@ class NoteFlatTextExp extends Expression {
|
||||
}
|
||||
};
|
||||
|
||||
const candidateNotes = this.getCandidateNotes(inputNoteSet);
|
||||
const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext);
|
||||
|
||||
for (const note of candidateNotes) {
|
||||
// autocomplete should be able to find notes by their noteIds as well (only leafs)
|
||||
@ -103,23 +109,27 @@ class NoteFlatTextExp extends Expression {
|
||||
const foundAttrTokens: string[] = [];
|
||||
|
||||
for (const token of this.tokens) {
|
||||
if (note.type.includes(token) || note.mime.includes(token)) {
|
||||
// Add defensive checks for undefined properties
|
||||
const typeMatches = note.type && note.type.includes(token);
|
||||
const mimeMatches = note.mime && note.mime.includes(token);
|
||||
|
||||
if (typeMatches || mimeMatches) {
|
||||
foundAttrTokens.push(token);
|
||||
}
|
||||
|
||||
for (const attribute of note.ownedAttributes) {
|
||||
if (normalize(attribute.name).includes(token) || normalize(attribute.value).includes(token)) {
|
||||
if (normalizeSearchText(attribute.name).includes(token) || normalizeSearchText(attribute.value).includes(token)) {
|
||||
foundAttrTokens.push(token);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const parentNote of note.parents) {
|
||||
const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
|
||||
const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
|
||||
const foundTokens = foundAttrTokens.slice();
|
||||
|
||||
for (const token of this.tokens) {
|
||||
if (title.includes(token)) {
|
||||
if (this.smartMatch(title, token, searchContext)) {
|
||||
foundTokens.push(token);
|
||||
}
|
||||
}
|
||||
@ -152,12 +162,13 @@ class NoteFlatTextExp extends Expression {
|
||||
/**
|
||||
* Returns noteIds which have at least one matching tokens
|
||||
*/
|
||||
getCandidateNotes(noteSet: NoteSet): BNote[] {
|
||||
getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] {
|
||||
const candidateNotes: BNote[] = [];
|
||||
|
||||
for (const note of noteSet.notes) {
|
||||
const normalizedFlatText = normalizeSearchText(note.getFlatText());
|
||||
for (const token of this.tokens) {
|
||||
if (note.getFlatText().includes(token)) {
|
||||
if (this.smartMatch(normalizedFlatText, token, searchContext)) {
|
||||
candidateNotes.push(note);
|
||||
break;
|
||||
}
|
||||
@ -166,6 +177,34 @@ class NoteFlatTextExp extends Expression {
|
||||
|
||||
return candidateNotes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Smart matching that tries exact match first, then fuzzy fallback
|
||||
* @param text The text to search in
|
||||
* @param token The token to search for
|
||||
* @param searchContext The search context to track matched words for highlighting
|
||||
* @returns True if match found (exact or fuzzy)
|
||||
*/
|
||||
private smartMatch(text: string, token: string, searchContext?: SearchContext): boolean {
|
||||
// Exact match has priority
|
||||
if (text.includes(token)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fuzzy fallback only if enabled and for tokens >= 4 characters
|
||||
if (searchContext?.enableFuzzyMatching && token.length >= 4) {
|
||||
const matchedWord = fuzzyMatchWordWithResult(token, text);
|
||||
if (matchedWord) {
|
||||
// Track the fuzzy matched word for highlighting
|
||||
if (!searchContext.highlightedTokens.includes(matchedWord)) {
|
||||
searchContext.highlightedTokens.push(matchedWord);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
export default NoteFlatTextExp;
|
||||
|
@ -18,6 +18,7 @@ class SearchContext {
|
||||
debug?: boolean;
|
||||
debugInfo: {} | null;
|
||||
fuzzyAttributeSearch: boolean;
|
||||
enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase
|
||||
highlightedTokens: string[];
|
||||
originalQuery: string;
|
||||
fulltextQuery: string;
|
||||
@ -45,6 +46,7 @@ class SearchContext {
|
||||
this.debug = params.debug;
|
||||
this.debugInfo = null;
|
||||
this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch;
|
||||
this.enableFuzzyMatching = true; // Default to true for backward compatibility
|
||||
this.highlightedTokens = [];
|
||||
this.originalQuery = "";
|
||||
this.fulltextQuery = ""; // complete fulltext part
|
||||
|
@ -2,17 +2,46 @@
|
||||
|
||||
import beccaService from "../../becca/becca_service.js";
|
||||
import becca from "../../becca/becca.js";
|
||||
import {
|
||||
normalizeSearchText,
|
||||
calculateOptimizedEditDistance,
|
||||
FUZZY_SEARCH_CONFIG
|
||||
} from "./utils/text_utils.js";
|
||||
|
||||
// Scoring constants for better maintainability
|
||||
const SCORE_WEIGHTS = {
|
||||
NOTE_ID_EXACT_MATCH: 1000,
|
||||
TITLE_EXACT_MATCH: 2000,
|
||||
TITLE_PREFIX_MATCH: 500,
|
||||
TITLE_WORD_MATCH: 300,
|
||||
TOKEN_EXACT_MATCH: 4,
|
||||
TOKEN_PREFIX_MATCH: 2,
|
||||
TOKEN_CONTAINS_MATCH: 1,
|
||||
TOKEN_FUZZY_MATCH: 0.5,
|
||||
TITLE_FACTOR: 2.0,
|
||||
PATH_FACTOR: 0.3,
|
||||
HIDDEN_NOTE_PENALTY: 3,
|
||||
// Score caps to prevent fuzzy matches from outranking exact matches
|
||||
MAX_FUZZY_SCORE_PER_TOKEN: 3, // Cap fuzzy token contributions to stay below exact matches
|
||||
MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER: 3, // Limit token length impact for fuzzy matches
|
||||
MAX_TOTAL_FUZZY_SCORE: 200 // Total cap on fuzzy scoring per search
|
||||
} as const;
|
||||
|
||||
|
||||
class SearchResult {
|
||||
notePathArray: string[];
|
||||
score: number;
|
||||
notePathTitle: string;
|
||||
highlightedNotePathTitle?: string;
|
||||
contentSnippet?: string;
|
||||
highlightedContentSnippet?: string;
|
||||
private fuzzyScore: number; // Track fuzzy score separately
|
||||
|
||||
constructor(notePathArray: string[]) {
|
||||
this.notePathArray = notePathArray;
|
||||
this.notePathTitle = beccaService.getNoteTitleForPath(notePathArray);
|
||||
this.score = 0;
|
||||
this.fuzzyScore = 0;
|
||||
}
|
||||
|
||||
get notePath() {
|
||||
@ -23,53 +52,117 @@ class SearchResult {
|
||||
return this.notePathArray[this.notePathArray.length - 1];
|
||||
}
|
||||
|
||||
computeScore(fulltextQuery: string, tokens: string[]) {
|
||||
computeScore(fulltextQuery: string, tokens: string[], enableFuzzyMatching: boolean = true) {
|
||||
this.score = 0;
|
||||
this.fuzzyScore = 0; // Reset fuzzy score tracking
|
||||
|
||||
const note = becca.notes[this.noteId];
|
||||
const normalizedQuery = fulltextQuery.toLowerCase();
|
||||
const normalizedTitle = note.title.toLowerCase();
|
||||
const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase());
|
||||
const normalizedTitle = normalizeSearchText(note.title.toLowerCase());
|
||||
|
||||
// Note ID exact match, much higher score
|
||||
if (note.noteId.toLowerCase() === fulltextQuery) {
|
||||
this.score += 1000;
|
||||
this.score += SCORE_WEIGHTS.NOTE_ID_EXACT_MATCH;
|
||||
}
|
||||
|
||||
// Title matching scores, make sure to always win
|
||||
// Title matching scores with fuzzy matching support
|
||||
if (normalizedTitle === normalizedQuery) {
|
||||
this.score += 2000; // Increased from 1000 to ensure exact matches always win
|
||||
this.score += SCORE_WEIGHTS.TITLE_EXACT_MATCH;
|
||||
} else if (normalizedTitle.startsWith(normalizedQuery)) {
|
||||
this.score += 500; // Increased to give more weight to prefix matches
|
||||
} else if (normalizedTitle.includes(` ${normalizedQuery} `) || normalizedTitle.startsWith(`${normalizedQuery} `) || normalizedTitle.endsWith(` ${normalizedQuery}`)) {
|
||||
this.score += 300; // Increased to better distinguish word matches
|
||||
this.score += SCORE_WEIGHTS.TITLE_PREFIX_MATCH;
|
||||
} else if (this.isWordMatch(normalizedTitle, normalizedQuery)) {
|
||||
this.score += SCORE_WEIGHTS.TITLE_WORD_MATCH;
|
||||
} else if (enableFuzzyMatching) {
|
||||
// Try fuzzy matching for typos only if enabled
|
||||
const fuzzyScore = this.calculateFuzzyTitleScore(normalizedTitle, normalizedQuery);
|
||||
this.score += fuzzyScore;
|
||||
this.fuzzyScore += fuzzyScore; // Track fuzzy score contributions
|
||||
}
|
||||
|
||||
// Add scores for partial matches with adjusted weights
|
||||
this.addScoreForStrings(tokens, note.title, 2.0); // Increased to give more weight to title matches
|
||||
this.addScoreForStrings(tokens, this.notePathTitle, 0.3); // Reduced to further de-emphasize path matches
|
||||
// Add scores for token matches
|
||||
this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR, enableFuzzyMatching);
|
||||
this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR, enableFuzzyMatching);
|
||||
|
||||
if (note.isInHiddenSubtree()) {
|
||||
this.score = this.score / 3; // Increased penalty for hidden notes
|
||||
this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
|
||||
}
|
||||
}
|
||||
|
||||
addScoreForStrings(tokens: string[], str: string, factor: number) {
|
||||
const chunks = str.toLowerCase().split(" ");
|
||||
addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) {
|
||||
const normalizedStr = normalizeSearchText(str.toLowerCase());
|
||||
const chunks = normalizedStr.split(" ");
|
||||
|
||||
let tokenScore = 0;
|
||||
for (const chunk of chunks) {
|
||||
for (const token of tokens) {
|
||||
if (chunk === token) {
|
||||
tokenScore += 4 * token.length * factor;
|
||||
} else if (chunk.startsWith(token)) {
|
||||
tokenScore += 2 * token.length * factor;
|
||||
} else if (chunk.includes(token)) {
|
||||
tokenScore += token.length * factor;
|
||||
const normalizedToken = normalizeSearchText(token.toLowerCase());
|
||||
|
||||
if (chunk === normalizedToken) {
|
||||
tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
|
||||
} else if (chunk.startsWith(normalizedToken)) {
|
||||
tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor;
|
||||
} else if (chunk.includes(normalizedToken)) {
|
||||
tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor;
|
||||
} else {
|
||||
// Try fuzzy matching for individual tokens with caps applied
|
||||
const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||
this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
|
||||
|
||||
const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
// Apply caps: limit token length multiplier and per-token contribution
|
||||
const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
|
||||
const fuzzyTokenScore = Math.min(
|
||||
fuzzyWeight * cappedTokenLength * factor,
|
||||
SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
|
||||
);
|
||||
|
||||
tokenScore += fuzzyTokenScore;
|
||||
this.fuzzyScore += fuzzyTokenScore;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this.score += tokenScore;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the query matches as a complete word in the text
|
||||
*/
|
||||
private isWordMatch(text: string, query: string): boolean {
|
||||
return text.includes(` ${query} `) ||
|
||||
text.startsWith(`${query} `) ||
|
||||
text.endsWith(` ${query}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates fuzzy matching score for title matches with caps applied
|
||||
*/
|
||||
private calculateFuzzyTitleScore(title: string, query: string): number {
|
||||
// Check if we've already hit the fuzzy scoring cap
|
||||
if (this.fuzzyScore >= SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
const maxLen = Math.max(title.length, query.length);
|
||||
|
||||
// Only apply fuzzy matching if the query is reasonably long and edit distance is small
|
||||
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
editDistance / maxLen <= 0.3) {
|
||||
const similarity = 1 - (editDistance / maxLen);
|
||||
const baseFuzzyScore = SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
|
||||
|
||||
// Apply cap to ensure fuzzy title matches don't exceed reasonable bounds
|
||||
return Math.min(baseFuzzyScore, SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE * 0.3);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
export default SearchResult;
|
||||
|
@ -1,3 +1,5 @@
|
||||
import { normalizeSearchText, fuzzyMatchWord, FUZZY_SEARCH_CONFIG } from "../utils/text_utils.js";
|
||||
|
||||
const cachedRegexes: Record<string, RegExp> = {};
|
||||
|
||||
function getRegex(str: string) {
|
||||
@ -20,7 +22,41 @@ const stringComparators: Record<string, Comparator<string>> = {
|
||||
"*=": (comparedValue) => (val) => !!val && val.endsWith(comparedValue),
|
||||
"=*": (comparedValue) => (val) => !!val && val.startsWith(comparedValue),
|
||||
"*=*": (comparedValue) => (val) => !!val && val.includes(comparedValue),
|
||||
"%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val)
|
||||
"%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val),
|
||||
"~=": (comparedValue) => (val) => {
|
||||
if (!val || !comparedValue) return false;
|
||||
|
||||
// Validate minimum length for fuzzy search to prevent false positives
|
||||
if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
|
||||
return val.includes(comparedValue);
|
||||
}
|
||||
|
||||
const normalizedVal = normalizeSearchText(val);
|
||||
const normalizedCompared = normalizeSearchText(comparedValue);
|
||||
|
||||
// First try exact substring match
|
||||
if (normalizedVal.includes(normalizedCompared)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Then try fuzzy word matching
|
||||
const words = normalizedVal.split(/\s+/);
|
||||
return words.some(word => fuzzyMatchWord(normalizedCompared, word));
|
||||
},
|
||||
"~*": (comparedValue) => (val) => {
|
||||
if (!val || !comparedValue) return false;
|
||||
|
||||
// Validate minimum length for fuzzy search
|
||||
if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
|
||||
return val.includes(comparedValue);
|
||||
}
|
||||
|
||||
const normalizedVal = normalizeSearchText(val);
|
||||
const normalizedCompared = normalizeSearchText(comparedValue);
|
||||
|
||||
// For ~* operator, use fuzzy matching across the entire content
|
||||
return fuzzyMatchWord(normalizedCompared, normalizedVal);
|
||||
}
|
||||
};
|
||||
|
||||
const numericComparators: Record<string, Comparator<number>> = {
|
||||
|
@ -40,7 +40,7 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext) {
|
||||
}
|
||||
}
|
||||
|
||||
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%="]);
|
||||
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%=", "~=", "~*"]);
|
||||
|
||||
function isOperator(token: TokenData) {
|
||||
if (Array.isArray(token)) {
|
||||
|
@ -0,0 +1,241 @@
|
||||
import { describe, it, expect, beforeEach } from "vitest";
|
||||
import searchService from "./search.js";
|
||||
import BNote from "../../../becca/entities/bnote.js";
|
||||
import BBranch from "../../../becca/entities/bbranch.js";
|
||||
import SearchContext from "../search_context.js";
|
||||
import becca from "../../../becca/becca.js";
|
||||
import { findNoteByTitle, note, NoteBuilder } from "../../../test/becca_mocking.js";
|
||||
|
||||
describe("Progressive Search Strategy", () => {
|
||||
let rootNote: any;
|
||||
|
||||
beforeEach(() => {
|
||||
becca.reset();
|
||||
|
||||
rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" }));
|
||||
new BBranch({
|
||||
branchId: "none_root",
|
||||
noteId: "root",
|
||||
parentNoteId: "none",
|
||||
notePosition: 10
|
||||
});
|
||||
});
|
||||
|
||||
describe("Phase 1: Exact Matches Only", () => {
|
||||
it("should complete search with exact matches when sufficient results found", () => {
|
||||
// Create notes with exact matches
|
||||
rootNote
|
||||
.child(note("Document Analysis One"))
|
||||
.child(note("Document Report Two"))
|
||||
.child(note("Document Review Three"))
|
||||
.child(note("Document Summary Four"))
|
||||
.child(note("Document Overview Five"))
|
||||
.child(note("Documnt Analysis Six")); // This has a typo that should require fuzzy matching
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("document", searchContext);
|
||||
|
||||
// Should find 5 exact matches and not need fuzzy matching
|
||||
expect(searchResults.length).toEqual(5);
|
||||
|
||||
// Verify all results have high scores (exact matches)
|
||||
const highQualityResults = searchResults.filter(result => result.score >= 10);
|
||||
expect(highQualityResults.length).toEqual(5);
|
||||
|
||||
// The typo document should not be in results since we have enough exact matches
|
||||
expect(findNoteByTitle(searchResults, "Documnt Analysis Six")).toBeFalsy();
|
||||
});
|
||||
|
||||
it("should use exact match scoring only in Phase 1", () => {
|
||||
rootNote
|
||||
.child(note("Testing Exact Match"))
|
||||
.child(note("Test Document"))
|
||||
.child(note("Another Test"));
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test", searchContext);
|
||||
|
||||
// All results should have scores from exact matching only
|
||||
for (const result of searchResults) {
|
||||
expect(result.score).toBeGreaterThan(0);
|
||||
// Scores should be from exact/prefix/contains matches, not fuzzy
|
||||
expect(result.score % 0.5).not.toBe(0); // Fuzzy scores are multiples of 0.5
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("Phase 2: Fuzzy Fallback", () => {
|
||||
it("should trigger fuzzy matching when insufficient exact matches", () => {
|
||||
// Create only a few notes, some with typos
|
||||
rootNote
|
||||
.child(note("Document One"))
|
||||
.child(note("Report Two"))
|
||||
.child(note("Anaylsis Three")) // Typo: "Analysis"
|
||||
.child(note("Sumary Four")); // Typo: "Summary"
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
|
||||
|
||||
// Should find the typo through fuzzy matching
|
||||
expect(searchResults.length).toBeGreaterThan(0);
|
||||
expect(findNoteByTitle(searchResults, "Anaylsis Three")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("should merge exact and fuzzy results with exact matches always ranked higher", () => {
|
||||
rootNote
|
||||
.child(note("Analysis Report")) // Exact match
|
||||
.child(note("Data Analysis")) // Exact match
|
||||
.child(note("Anaylsis Doc")) // Fuzzy match
|
||||
.child(note("Statistical Anlaysis")); // Fuzzy match
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
|
||||
|
||||
expect(searchResults.length).toBe(4);
|
||||
|
||||
// Get the note titles in result order
|
||||
const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
|
||||
|
||||
// Find positions of exact and fuzzy matches
|
||||
const exactPositions = resultTitles.map((title, index) =>
|
||||
title.toLowerCase().includes("analysis") ? index : -1
|
||||
).filter(pos => pos !== -1);
|
||||
|
||||
const fuzzyPositions = resultTitles.map((title, index) =>
|
||||
(title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
|
||||
).filter(pos => pos !== -1);
|
||||
|
||||
expect(exactPositions.length).toBe(2);
|
||||
expect(fuzzyPositions.length).toBe(2);
|
||||
|
||||
// CRITICAL: All exact matches must come before all fuzzy matches
|
||||
const lastExactPosition = Math.max(...exactPositions);
|
||||
const firstFuzzyPosition = Math.min(...fuzzyPositions);
|
||||
|
||||
expect(lastExactPosition).toBeLessThan(firstFuzzyPosition);
|
||||
});
|
||||
|
||||
it("should not duplicate results between phases", () => {
|
||||
rootNote
|
||||
.child(note("Test Document")) // Would match in both phases
|
||||
.child(note("Tset Report")); // Only fuzzy match
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test", searchContext);
|
||||
|
||||
// Should only have unique results
|
||||
const noteIds = searchResults.map(r => r.noteId);
|
||||
const uniqueNoteIds = [...new Set(noteIds)];
|
||||
|
||||
expect(noteIds.length).toBe(uniqueNoteIds.length);
|
||||
expect(findNoteByTitle(searchResults, "Test Document")).toBeTruthy();
|
||||
expect(findNoteByTitle(searchResults, "Tset Report")).toBeTruthy();
|
||||
});
|
||||
});
|
||||
|
||||
describe("Result Sufficiency Thresholds", () => {
|
||||
it("should respect minimum result count threshold", () => {
|
||||
// Create exactly 4 high-quality results (below threshold of 5)
|
||||
rootNote
|
||||
.child(note("Test One"))
|
||||
.child(note("Test Two"))
|
||||
.child(note("Test Three"))
|
||||
.child(note("Test Four"))
|
||||
.child(note("Tset Five")); // Typo that should be found via fuzzy
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test", searchContext);
|
||||
|
||||
// Should proceed to Phase 2 and include fuzzy match
|
||||
expect(searchResults.length).toBe(5);
|
||||
expect(findNoteByTitle(searchResults, "Tset Five")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("should respect minimum quality score threshold", () => {
|
||||
// Create notes that might have low exact match scores
|
||||
rootNote
|
||||
.child(note("Testing Document")) // Should have decent score
|
||||
.child(note("Document with test inside")) // Lower score due to position
|
||||
.child(note("Another test case"))
|
||||
.child(note("Test case example"))
|
||||
.child(note("Tset with typo")); // Fuzzy match
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test", searchContext);
|
||||
|
||||
// Should include fuzzy results if exact results don't meet quality threshold
|
||||
expect(searchResults.length).toBeGreaterThan(4);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Fuzzy Score Management", () => {
|
||||
it("should cap fuzzy token scores to prevent outranking exact matches", () => {
|
||||
// Create note with exact match
|
||||
rootNote.child(note("Test Document"));
|
||||
// Create note that could accumulate high fuzzy scores
|
||||
rootNote.child(note("Tset Documnt with many fuzzy tockens for testng")); // Multiple typos
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test document", searchContext);
|
||||
|
||||
expect(searchResults.length).toBe(2);
|
||||
|
||||
// Find the exact and fuzzy match results
|
||||
const exactResult = searchResults.find(r => becca.notes[r.noteId].title === "Test Document");
|
||||
const fuzzyResult = searchResults.find(r => becca.notes[r.noteId].title.includes("Tset"));
|
||||
|
||||
expect(exactResult).toBeTruthy();
|
||||
expect(fuzzyResult).toBeTruthy();
|
||||
|
||||
// Exact match should always score higher than fuzzy, even with multiple fuzzy matches
|
||||
expect(exactResult!.score).toBeGreaterThan(fuzzyResult!.score);
|
||||
});
|
||||
|
||||
it("should enforce maximum total fuzzy score per search", () => {
|
||||
// Create note with many potential fuzzy matches
|
||||
rootNote.child(note("Tset Documnt Anaylsis Sumary Reportng")); // Many typos
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("test document analysis summary reporting", searchContext);
|
||||
|
||||
expect(searchResults.length).toBe(1);
|
||||
|
||||
// Total score should be bounded despite many fuzzy matches
|
||||
expect(searchResults[0].score).toBeLessThan(500); // Should not exceed reasonable bounds due to caps
|
||||
});
|
||||
});
|
||||
|
||||
describe("SearchContext Integration", () => {
|
||||
it("should respect enableFuzzyMatching flag", () => {
|
||||
rootNote
|
||||
.child(note("Test Document"))
|
||||
.child(note("Tset Report")); // Typo
|
||||
|
||||
// Test with fuzzy matching disabled
|
||||
const exactOnlyContext = new SearchContext();
|
||||
exactOnlyContext.enableFuzzyMatching = false;
|
||||
|
||||
const exactResults = searchService.findResultsWithQuery("test", exactOnlyContext);
|
||||
expect(exactResults.length).toBe(1);
|
||||
expect(findNoteByTitle(exactResults, "Test Document")).toBeTruthy();
|
||||
expect(findNoteByTitle(exactResults, "Tset Report")).toBeFalsy();
|
||||
|
||||
// Test with fuzzy matching enabled (default)
|
||||
const fuzzyContext = new SearchContext();
|
||||
const fuzzyResults = searchService.findResultsWithQuery("test", fuzzyContext);
|
||||
expect(fuzzyResults.length).toBe(2);
|
||||
expect(findNoteByTitle(fuzzyResults, "Tset Report")).toBeTruthy();
|
||||
});
|
||||
});
|
||||
|
||||
describe("Edge Cases", () => {
|
||||
it("should handle empty search results gracefully", () => {
|
||||
rootNote.child(note("Unrelated Content"));
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("nonexistent", searchContext);
|
||||
|
||||
expect(searchResults.length).toBe(0);
|
||||
});
|
||||
});
|
||||
});
|
@ -553,6 +553,70 @@ describe("Search", () => {
|
||||
expect(becca.notes[searchResults[0].noteId].title).toEqual("Reddit is bad");
|
||||
});
|
||||
|
||||
it("search completes in reasonable time", () => {
|
||||
// Create a moderate-sized dataset to test performance
|
||||
const countries = ["Austria", "Belgium", "Croatia", "Denmark", "Estonia", "Finland", "Germany", "Hungary", "Ireland", "Japan"];
|
||||
const europeanCountries = note("Europe");
|
||||
|
||||
countries.forEach(country => {
|
||||
europeanCountries.child(note(country).label("type", "country").label("continent", "Europe"));
|
||||
});
|
||||
|
||||
rootNote.child(europeanCountries);
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const startTime = Date.now();
|
||||
|
||||
// Perform a search that exercises multiple features
|
||||
const searchResults = searchService.findResultsWithQuery("#type=country AND continent", searchContext);
|
||||
|
||||
const endTime = Date.now();
|
||||
const duration = endTime - startTime;
|
||||
|
||||
// Search should complete in under 1 second for reasonable dataset
|
||||
expect(duration).toBeLessThan(1000);
|
||||
expect(searchResults.length).toEqual(10);
|
||||
});
|
||||
|
||||
it("progressive search always puts exact matches before fuzzy matches", () => {
|
||||
rootNote
|
||||
.child(note("Analysis Report")) // Exact match
|
||||
.child(note("Data Analysis")) // Exact match
|
||||
.child(note("Test Analysis")) // Exact match
|
||||
.child(note("Advanced Anaylsis")) // Fuzzy match (typo)
|
||||
.child(note("Quick Anlaysis")); // Fuzzy match (typo)
|
||||
|
||||
const searchContext = new SearchContext();
|
||||
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
|
||||
|
||||
// With only 3 exact matches (below threshold), fuzzy should be triggered
|
||||
// Should find all 5 matches but exact ones should come first
|
||||
expect(searchResults.length).toEqual(5);
|
||||
|
||||
// Get note titles in result order
|
||||
const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
|
||||
|
||||
// Find all exact matches (contain "analysis")
|
||||
const exactMatchIndices = resultTitles.map((title, index) =>
|
||||
title.toLowerCase().includes("analysis") ? index : -1
|
||||
).filter(index => index !== -1);
|
||||
|
||||
// Find all fuzzy matches (contain typos)
|
||||
const fuzzyMatchIndices = resultTitles.map((title, index) =>
|
||||
(title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
|
||||
).filter(index => index !== -1);
|
||||
|
||||
expect(exactMatchIndices.length).toEqual(3);
|
||||
expect(fuzzyMatchIndices.length).toEqual(2);
|
||||
|
||||
// CRITICAL: All exact matches must appear before all fuzzy matches
|
||||
const lastExactIndex = Math.max(...exactMatchIndices);
|
||||
const firstFuzzyIndex = Math.min(...fuzzyMatchIndices);
|
||||
|
||||
expect(lastExactIndex).toBeLessThan(firstFuzzyIndex);
|
||||
});
|
||||
|
||||
|
||||
// FIXME: test what happens when we order without any filter criteria
|
||||
|
||||
// it("comparison between labels", () => {
|
||||
|
@ -17,6 +17,8 @@ import type { SearchParams, TokenStructure } from "./types.js";
|
||||
import type Expression from "../expressions/expression.js";
|
||||
import sql from "../../sql.js";
|
||||
import scriptService from "../../script.js";
|
||||
import striptags from "striptags";
|
||||
import protectedSessionService from "../../protected_session.js";
|
||||
|
||||
export interface SearchNoteResult {
|
||||
searchResultNoteIds: string[];
|
||||
@ -235,6 +237,41 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
|
||||
loadNeededInfoFromDatabase();
|
||||
}
|
||||
|
||||
// If there's an explicit orderBy clause, skip progressive search
|
||||
// as it would interfere with the ordering
|
||||
if (searchContext.orderBy) {
|
||||
// For ordered queries, don't use progressive search but respect
|
||||
// the original fuzzy matching setting
|
||||
return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
|
||||
}
|
||||
|
||||
// If fuzzy matching is explicitly disabled, skip progressive search
|
||||
if (!searchContext.enableFuzzyMatching) {
|
||||
return performSearch(expression, searchContext, false);
|
||||
}
|
||||
|
||||
// Phase 1: Try exact matches first (without fuzzy matching)
|
||||
const exactResults = performSearch(expression, searchContext, false);
|
||||
|
||||
// Check if we have sufficient high-quality results
|
||||
const minResultThreshold = 5;
|
||||
const minScoreForQuality = 10; // Minimum score to consider a result "high quality"
|
||||
|
||||
const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality);
|
||||
|
||||
// If we have enough high-quality exact matches, return them
|
||||
if (highQualityResults.length >= minResultThreshold) {
|
||||
return exactResults;
|
||||
}
|
||||
|
||||
// Phase 2: Add fuzzy matching as fallback when exact matches are insufficient
|
||||
const fuzzyResults = performSearch(expression, searchContext, true);
|
||||
|
||||
// Merge results, ensuring exact matches always rank higher than fuzzy matches
|
||||
return mergeExactAndFuzzyResults(exactResults, fuzzyResults);
|
||||
}
|
||||
|
||||
function performSearch(expression: Expression, searchContext: SearchContext, enableFuzzyMatching: boolean): SearchResult[] {
|
||||
const allNoteSet = becca.getAllNoteSet();
|
||||
|
||||
const noteIdToNotePath: Record<string, string[]> = {};
|
||||
@ -242,6 +279,10 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
|
||||
noteIdToNotePath
|
||||
};
|
||||
|
||||
// Store original fuzzy setting and temporarily override it
|
||||
const originalFuzzyMatching = searchContext.enableFuzzyMatching;
|
||||
searchContext.enableFuzzyMatching = enableFuzzyMatching;
|
||||
|
||||
const noteSet = expression.execute(allNoteSet, executionContext, searchContext);
|
||||
|
||||
const searchResults = noteSet.notes.map((note) => {
|
||||
@ -255,9 +296,12 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
|
||||
});
|
||||
|
||||
for (const res of searchResults) {
|
||||
res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens);
|
||||
res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens, enableFuzzyMatching);
|
||||
}
|
||||
|
||||
// Restore original fuzzy setting
|
||||
searchContext.enableFuzzyMatching = originalFuzzyMatching;
|
||||
|
||||
if (!noteSet.sorted) {
|
||||
searchResults.sort((a, b) => {
|
||||
if (a.score > b.score) {
|
||||
@ -279,6 +323,49 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
|
||||
return searchResults;
|
||||
}
|
||||
|
||||
function mergeExactAndFuzzyResults(exactResults: SearchResult[], fuzzyResults: SearchResult[]): SearchResult[] {
|
||||
// Create a map of exact result note IDs for deduplication
|
||||
const exactNoteIds = new Set(exactResults.map(result => result.noteId));
|
||||
|
||||
// Add fuzzy results that aren't already in exact results
|
||||
const additionalFuzzyResults = fuzzyResults.filter(result => !exactNoteIds.has(result.noteId));
|
||||
|
||||
// Sort exact results by score (best exact matches first)
|
||||
exactResults.sort((a, b) => {
|
||||
if (a.score > b.score) {
|
||||
return -1;
|
||||
} else if (a.score < b.score) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// if score does not decide then sort results by depth of the note.
|
||||
if (a.notePathArray.length === b.notePathArray.length) {
|
||||
return a.notePathTitle < b.notePathTitle ? -1 : 1;
|
||||
}
|
||||
|
||||
return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
|
||||
});
|
||||
|
||||
// Sort fuzzy results by score (best fuzzy matches first)
|
||||
additionalFuzzyResults.sort((a, b) => {
|
||||
if (a.score > b.score) {
|
||||
return -1;
|
||||
} else if (a.score < b.score) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// if score does not decide then sort results by depth of the note.
|
||||
if (a.notePathArray.length === b.notePathArray.length) {
|
||||
return a.notePathTitle < b.notePathTitle ? -1 : 1;
|
||||
}
|
||||
|
||||
return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
|
||||
});
|
||||
|
||||
// CRITICAL: Always put exact matches before fuzzy matches, regardless of scores
|
||||
return [...exactResults, ...additionalFuzzyResults];
|
||||
}
|
||||
|
||||
function parseQueryToExpression(query: string, searchContext: SearchContext) {
|
||||
const { fulltextQuery, fulltextTokens, expressionTokens } = lex(query);
|
||||
searchContext.fulltextQuery = fulltextQuery;
|
||||
@ -328,6 +415,16 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear
|
||||
return [];
|
||||
}
|
||||
|
||||
// If the query starts with '#', it's a pure expression query.
|
||||
// Don't use progressive search for these as they may have complex
|
||||
// ordering or other logic that shouldn't be interfered with.
|
||||
const isPureExpressionQuery = query.trim().startsWith('#');
|
||||
|
||||
if (isPureExpressionQuery) {
|
||||
// For pure expression queries, use standard search without progressive phases
|
||||
return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
|
||||
}
|
||||
|
||||
return findResultsWithExpression(expression, searchContext);
|
||||
}
|
||||
|
||||
@ -337,6 +434,91 @@ function findFirstNoteWithQuery(query: string, searchContext: SearchContext): BN
|
||||
return searchResults.length > 0 ? becca.notes[searchResults[0].noteId] : null;
|
||||
}
|
||||
|
||||
function extractContentSnippet(noteId: string, searchTokens: string[], maxLength: number = 200): string {
|
||||
const note = becca.notes[noteId];
|
||||
if (!note) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Only extract content for text-based notes
|
||||
if (!["text", "code", "mermaid", "canvas", "mindMap"].includes(note.type)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
try {
|
||||
let content = note.getContent();
|
||||
|
||||
if (!content || typeof content !== "string") {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Handle protected notes
|
||||
if (note.isProtected && protectedSessionService.isProtectedSessionAvailable()) {
|
||||
try {
|
||||
content = protectedSessionService.decryptString(content) || "";
|
||||
} catch (e) {
|
||||
return ""; // Can't decrypt, don't show content
|
||||
}
|
||||
} else if (note.isProtected) {
|
||||
return ""; // Protected but no session available
|
||||
}
|
||||
|
||||
// Strip HTML tags for text notes
|
||||
if (note.type === "text") {
|
||||
content = striptags(content);
|
||||
}
|
||||
|
||||
// Normalize whitespace
|
||||
content = content.replace(/\s+/g, " ").trim();
|
||||
|
||||
if (!content) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Try to find a snippet around the first matching token
|
||||
const normalizedContent = normalizeString(content.toLowerCase());
|
||||
let snippetStart = 0;
|
||||
let matchFound = false;
|
||||
|
||||
for (const token of searchTokens) {
|
||||
const normalizedToken = normalizeString(token.toLowerCase());
|
||||
const matchIndex = normalizedContent.indexOf(normalizedToken);
|
||||
|
||||
if (matchIndex !== -1) {
|
||||
// Center the snippet around the match
|
||||
snippetStart = Math.max(0, matchIndex - maxLength / 2);
|
||||
matchFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract snippet
|
||||
let snippet = content.substring(snippetStart, snippetStart + maxLength);
|
||||
|
||||
// Try to start/end at word boundaries
|
||||
if (snippetStart > 0) {
|
||||
const firstSpace = snippet.indexOf(" ");
|
||||
if (firstSpace > 0 && firstSpace < 20) {
|
||||
snippet = snippet.substring(firstSpace + 1);
|
||||
}
|
||||
snippet = "..." + snippet;
|
||||
}
|
||||
|
||||
if (snippetStart + maxLength < content.length) {
|
||||
const lastSpace = snippet.lastIndexOf(" ");
|
||||
if (lastSpace > snippet.length - 20) {
|
||||
snippet = snippet.substring(0, lastSpace);
|
||||
}
|
||||
snippet = snippet + "...";
|
||||
}
|
||||
|
||||
return snippet;
|
||||
} catch (e) {
|
||||
log.error(`Error extracting content snippet for note ${noteId}: ${e}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
|
||||
const searchContext = new SearchContext({
|
||||
fastSearch: fastSearch,
|
||||
@ -351,6 +533,11 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
|
||||
|
||||
const trimmed = allSearchResults.slice(0, 200);
|
||||
|
||||
// Extract content snippets
|
||||
for (const result of trimmed) {
|
||||
result.contentSnippet = extractContentSnippet(result.noteId, searchContext.highlightedTokens);
|
||||
}
|
||||
|
||||
highlightSearchResults(trimmed, searchContext.highlightedTokens, searchContext.ignoreInternalAttributes);
|
||||
|
||||
return trimmed.map((result) => {
|
||||
@ -360,6 +547,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
|
||||
noteTitle: title,
|
||||
notePathTitle: result.notePathTitle,
|
||||
highlightedNotePathTitle: result.highlightedNotePathTitle,
|
||||
contentSnippet: result.contentSnippet,
|
||||
highlightedContentSnippet: result.highlightedContentSnippet,
|
||||
icon: icon ?? "bx bx-note"
|
||||
};
|
||||
});
|
||||
@ -381,26 +570,11 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
|
||||
highlightedTokens.sort((a, b) => (a.length > b.length ? -1 : 1));
|
||||
|
||||
for (const result of searchResults) {
|
||||
const note = becca.notes[result.noteId];
|
||||
|
||||
result.highlightedNotePathTitle = result.notePathTitle.replace(/[<{}]/g, "");
|
||||
|
||||
if (highlightedTokens.find((token) => note.type.includes(token))) {
|
||||
result.highlightedNotePathTitle += ` "type: ${note.type}'`;
|
||||
}
|
||||
|
||||
if (highlightedTokens.find((token) => note.mime.includes(token))) {
|
||||
result.highlightedNotePathTitle += ` "mime: ${note.mime}'`;
|
||||
}
|
||||
|
||||
for (const attr of note.getAttributes()) {
|
||||
if (attr.type === "relation" && attr.name === "internalLink" && ignoreInternalAttributes) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (highlightedTokens.find((token) => normalize(attr.name).includes(token) || normalize(attr.value).includes(token))) {
|
||||
result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`;
|
||||
}
|
||||
|
||||
// Initialize highlighted content snippet
|
||||
if (result.contentSnippet) {
|
||||
result.highlightedContentSnippet = escapeHtml(result.contentSnippet).replace(/[<{}]/g, "");
|
||||
}
|
||||
}
|
||||
|
||||
@ -419,40 +593,36 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
|
||||
const tokenRegex = new RegExp(escapeRegExp(token), "gi");
|
||||
let match;
|
||||
|
||||
// Find all matches
|
||||
if (!result.highlightedNotePathTitle) {
|
||||
continue;
|
||||
// Highlight in note path title
|
||||
if (result.highlightedNotePathTitle) {
|
||||
const titleRegex = new RegExp(escapeRegExp(token), "gi");
|
||||
while ((match = titleRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
|
||||
result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
|
||||
// 2 characters are added, so we need to adjust the index
|
||||
titleRegex.lastIndex += 2;
|
||||
}
|
||||
}
|
||||
while ((match = tokenRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
|
||||
result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
|
||||
|
||||
// 2 characters are added, so we need to adjust the index
|
||||
tokenRegex.lastIndex += 2;
|
||||
// Highlight in content snippet
|
||||
if (result.highlightedContentSnippet) {
|
||||
const contentRegex = new RegExp(escapeRegExp(token), "gi");
|
||||
while ((match = contentRegex.exec(normalizeString(result.highlightedContentSnippet))) !== null) {
|
||||
result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}");
|
||||
// 2 characters are added, so we need to adjust the index
|
||||
contentRegex.lastIndex += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const result of searchResults) {
|
||||
if (!result.highlightedNotePathTitle) {
|
||||
continue;
|
||||
if (result.highlightedNotePathTitle) {
|
||||
result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/{/g, "<b>").replace(/}/g, "</b>");
|
||||
}
|
||||
result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/"/g, "<small>").replace(/'/g, "</small>").replace(/{/g, "<b>").replace(/}/g, "</b>");
|
||||
}
|
||||
}
|
||||
|
||||
function formatAttribute(attr: BAttribute) {
|
||||
if (attr.type === "relation") {
|
||||
return `~${escapeHtml(attr.name)}=…`;
|
||||
} else if (attr.type === "label") {
|
||||
let label = `#${escapeHtml(attr.name)}`;
|
||||
|
||||
if (attr.value) {
|
||||
const val = /[^\w-]/.test(attr.value) ? `"${attr.value}"` : attr.value;
|
||||
|
||||
label += `=${escapeHtml(val)}`;
|
||||
|
||||
if (result.highlightedContentSnippet) {
|
||||
result.highlightedContentSnippet = result.highlightedContentSnippet.replace(/{/g, "<b>").replace(/}/g, "</b>");
|
||||
}
|
||||
|
||||
return label;
|
||||
}
|
||||
}
|
||||
|
||||
|
65
apps/server/src/services/search/utils/text_utils.spec.ts
Normal file
65
apps/server/src/services/search/utils/text_utils.spec.ts
Normal file
@ -0,0 +1,65 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
|
||||
|
||||
describe('Fuzzy Search Core', () => {
|
||||
describe('calculateOptimizedEditDistance', () => {
|
||||
it('calculates edit distance for common typos', () => {
|
||||
expect(calculateOptimizedEditDistance('hello', 'helo')).toBe(1);
|
||||
expect(calculateOptimizedEditDistance('world', 'wrold')).toBe(2);
|
||||
expect(calculateOptimizedEditDistance('cafe', 'café')).toBe(1);
|
||||
expect(calculateOptimizedEditDistance('identical', 'identical')).toBe(0);
|
||||
});
|
||||
|
||||
it('handles performance safety with oversized input', () => {
|
||||
const longString = 'a'.repeat(2000);
|
||||
const result = calculateOptimizedEditDistance(longString, 'short');
|
||||
expect(result).toBeGreaterThan(2); // Should use fallback heuristic
|
||||
});
|
||||
});
|
||||
|
||||
describe('validateFuzzySearchTokens', () => {
|
||||
it('validates minimum length requirements for fuzzy operators', () => {
|
||||
const result1 = validateFuzzySearchTokens(['ab'], '~=');
|
||||
expect(result1.isValid).toBe(false);
|
||||
expect(result1.error).toContain('at least 3 characters');
|
||||
|
||||
const result2 = validateFuzzySearchTokens(['hello'], '~=');
|
||||
expect(result2.isValid).toBe(true);
|
||||
|
||||
const result3 = validateFuzzySearchTokens(['ok'], '=');
|
||||
expect(result3.isValid).toBe(true); // Non-fuzzy operators allow short tokens
|
||||
});
|
||||
|
||||
it('validates token types and empty arrays', () => {
|
||||
expect(validateFuzzySearchTokens([], '=')).toEqual({
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: at least one token is required'
|
||||
});
|
||||
|
||||
expect(validateFuzzySearchTokens([''], '=')).toEqual({
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('fuzzyMatchWord', () => {
|
||||
it('matches words with diacritics normalization', () => {
|
||||
expect(fuzzyMatchWord('cafe', 'café')).toBe(true);
|
||||
expect(fuzzyMatchWord('naive', 'naïve')).toBe(true);
|
||||
});
|
||||
|
||||
it('matches with typos within distance threshold', () => {
|
||||
expect(fuzzyMatchWord('hello', 'helo')).toBe(true);
|
||||
expect(fuzzyMatchWord('world', 'wrold')).toBe(true);
|
||||
expect(fuzzyMatchWord('test', 'tset')).toBe(true);
|
||||
expect(fuzzyMatchWord('test', 'xyz')).toBe(false);
|
||||
});
|
||||
|
||||
it('handles edge cases safely', () => {
|
||||
expect(fuzzyMatchWord('', 'test')).toBe(false);
|
||||
expect(fuzzyMatchWord('test', '')).toBe(false);
|
||||
expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
|
||||
});
|
||||
});
|
||||
});
|
334
apps/server/src/services/search/utils/text_utils.ts
Normal file
334
apps/server/src/services/search/utils/text_utils.ts
Normal file
@ -0,0 +1,334 @@
|
||||
"use strict";
|
||||
|
||||
import { normalize } from "../../utils.js";
|
||||
|
||||
/**
|
||||
* Shared text processing utilities for search functionality
|
||||
*/
|
||||
|
||||
// Configuration constants for fuzzy matching
|
||||
export const FUZZY_SEARCH_CONFIG = {
|
||||
// Minimum token length for fuzzy operators to prevent false positives
|
||||
MIN_FUZZY_TOKEN_LENGTH: 3,
|
||||
// Maximum edit distance for fuzzy matching
|
||||
MAX_EDIT_DISTANCE: 2,
|
||||
// Maximum proximity distance for phrase matching (in words)
|
||||
MAX_PHRASE_PROXIMITY: 10,
|
||||
// Absolute hard limits for extreme cases - only to prevent system crashes
|
||||
ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
|
||||
ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
|
||||
// Performance warning thresholds - inform user but still attempt search
|
||||
PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
|
||||
PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
|
||||
// Progressive processing thresholds for very large content
|
||||
PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
|
||||
PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
|
||||
// Performance thresholds
|
||||
EARLY_TERMINATION_THRESHOLD: 3,
|
||||
} as const;
|
||||
|
||||
/**
|
||||
* Normalizes text by removing diacritics and converting to lowercase.
|
||||
* This is the centralized text normalization function used across all search components.
|
||||
* Uses the shared normalize function from utils for consistency.
|
||||
*
|
||||
* Examples:
|
||||
* - "café" -> "cafe"
|
||||
* - "naïve" -> "naive"
|
||||
* - "HELLO WORLD" -> "hello world"
|
||||
*
|
||||
* @param text The text to normalize
|
||||
* @returns The normalized text
|
||||
*/
|
||||
export function normalizeSearchText(text: string): string {
|
||||
if (!text || typeof text !== 'string') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// Use shared normalize function for consistency across the codebase
|
||||
return normalize(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimized edit distance calculation using single array and early termination.
|
||||
* This is significantly more memory efficient than the 2D matrix approach and includes
|
||||
* early termination optimizations for better performance.
|
||||
*
|
||||
* @param str1 First string
|
||||
* @param str2 Second string
|
||||
* @param maxDistance Maximum allowed distance (for early termination)
|
||||
* @returns The edit distance between the strings, or maxDistance + 1 if exceeded
|
||||
*/
|
||||
export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
|
||||
// Input validation
|
||||
if (typeof str1 !== 'string' || typeof str2 !== 'string') {
|
||||
throw new Error('Both arguments must be strings');
|
||||
}
|
||||
|
||||
if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
|
||||
throw new Error('maxDistance must be a non-negative integer');
|
||||
}
|
||||
|
||||
const len1 = str1.length;
|
||||
const len2 = str2.length;
|
||||
|
||||
// Performance guard: if strings are too long, limit processing
|
||||
const maxStringLength = 1000;
|
||||
if (len1 > maxStringLength || len2 > maxStringLength) {
|
||||
// For very long strings, fall back to simple length-based heuristic
|
||||
return Math.abs(len1 - len2) <= maxDistance ? Math.abs(len1 - len2) : maxDistance + 1;
|
||||
}
|
||||
|
||||
// Early termination: if length difference exceeds max distance
|
||||
if (Math.abs(len1 - len2) > maxDistance) {
|
||||
return maxDistance + 1;
|
||||
}
|
||||
|
||||
// Handle edge cases
|
||||
if (len1 === 0) return len2 <= maxDistance ? len2 : maxDistance + 1;
|
||||
if (len2 === 0) return len1 <= maxDistance ? len1 : maxDistance + 1;
|
||||
|
||||
// Use single array optimization for better memory usage
|
||||
let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
|
||||
let currentRow = new Array(len2 + 1);
|
||||
|
||||
for (let i = 1; i <= len1; i++) {
|
||||
currentRow[0] = i;
|
||||
let minInRow = i;
|
||||
|
||||
for (let j = 1; j <= len2; j++) {
|
||||
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
|
||||
currentRow[j] = Math.min(
|
||||
previousRow[j] + 1, // deletion
|
||||
currentRow[j - 1] + 1, // insertion
|
||||
previousRow[j - 1] + cost // substitution
|
||||
);
|
||||
|
||||
// Track minimum value in current row for early termination
|
||||
if (currentRow[j] < minInRow) {
|
||||
minInRow = currentRow[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Early termination: if minimum distance in row exceeds threshold
|
||||
if (minInRow > maxDistance) {
|
||||
return maxDistance + 1;
|
||||
}
|
||||
|
||||
// Swap arrays for next iteration
|
||||
[previousRow, currentRow] = [currentRow, previousRow];
|
||||
}
|
||||
|
||||
const result = previousRow[len2];
|
||||
return result <= maxDistance ? result : maxDistance + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates that tokens meet minimum requirements for fuzzy operators.
|
||||
*
|
||||
* @param tokens Array of search tokens
|
||||
* @param operator The search operator being used
|
||||
* @returns Validation result with success status and error message
|
||||
*/
|
||||
export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
|
||||
if (!operator || typeof operator !== 'string') {
|
||||
return {
|
||||
isValid: false,
|
||||
error: 'Invalid operator: operator must be a non-empty string'
|
||||
};
|
||||
}
|
||||
|
||||
if (!Array.isArray(tokens)) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: tokens must be an array'
|
||||
};
|
||||
}
|
||||
|
||||
if (tokens.length === 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: at least one token is required'
|
||||
};
|
||||
}
|
||||
|
||||
// Check for null, undefined, or non-string tokens
|
||||
const invalidTypeTokens = tokens.filter(token =>
|
||||
token == null || typeof token !== 'string'
|
||||
);
|
||||
|
||||
if (invalidTypeTokens.length > 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: all tokens must be non-null strings'
|
||||
};
|
||||
}
|
||||
|
||||
// Check for empty string tokens
|
||||
const emptyTokens = tokens.filter(token => token.trim().length === 0);
|
||||
|
||||
if (emptyTokens.length > 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
|
||||
};
|
||||
}
|
||||
|
||||
if (operator !== '~=' && operator !== '~*') {
|
||||
return { isValid: true };
|
||||
}
|
||||
|
||||
// Check minimum token length for fuzzy operators
|
||||
const shortTokens = tokens.filter(token => token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH);
|
||||
|
||||
if (shortTokens.length > 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: `Fuzzy search operators (~=, ~*) require tokens of at least ${FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH} characters. Invalid tokens: ${shortTokens.join(', ')}`
|
||||
};
|
||||
}
|
||||
|
||||
// Check for excessively long tokens that could cause performance issues
|
||||
const maxTokenLength = 100; // Reasonable limit for search tokens
|
||||
const longTokens = tokens.filter(token => token.length > maxTokenLength);
|
||||
|
||||
if (longTokens.length > 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${longTokens.map(t => t.substring(0, 20) + '...').join(', ')}`
|
||||
};
|
||||
}
|
||||
|
||||
return { isValid: true };
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates and preprocesses content for search operations.
|
||||
* Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
|
||||
*
|
||||
* @param content The content to validate and preprocess
|
||||
* @param noteId The note ID (for logging purposes)
|
||||
* @returns Processed content, only null for truly extreme cases that could cause system instability
|
||||
*/
|
||||
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
|
||||
if (!content || typeof content !== 'string') {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Only block content that could actually crash the system (100MB+)
|
||||
if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) {
|
||||
console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`);
|
||||
// Only in truly extreme cases, truncate to prevent system crash
|
||||
return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
|
||||
}
|
||||
|
||||
// Warn about very large content but still process it
|
||||
if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
|
||||
console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
|
||||
}
|
||||
|
||||
// For word count, be even more permissive - only block truly extreme cases
|
||||
const wordCount = content.split(/\s+/).length;
|
||||
if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
|
||||
console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
|
||||
// Only in truly extreme cases, truncate to prevent system crash
|
||||
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
|
||||
}
|
||||
|
||||
// Warn about high word counts but still process them
|
||||
if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
|
||||
console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
|
||||
}
|
||||
|
||||
// Progressive processing warning for very large content
|
||||
if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
|
||||
console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes special regex characters in a string for use in RegExp constructor
|
||||
*/
|
||||
function escapeRegExp(string: string): string {
|
||||
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a word matches a token with fuzzy matching and returns the matched word.
|
||||
* Optimized for common case where distances are small.
|
||||
*
|
||||
* @param token The search token (should be normalized)
|
||||
* @param text The text to match against (should be normalized)
|
||||
* @param maxDistance Maximum allowed edit distance
|
||||
* @returns The matched word if found, null otherwise
|
||||
*/
|
||||
export function fuzzyMatchWordWithResult(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): string | null {
|
||||
// Input validation
|
||||
if (typeof token !== 'string' || typeof text !== 'string') {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (token.length === 0 || text.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
// Normalize both strings for comparison
|
||||
const normalizedToken = token.toLowerCase();
|
||||
const normalizedText = text.toLowerCase();
|
||||
|
||||
// Exact match check first (most common case)
|
||||
if (normalizedText.includes(normalizedToken)) {
|
||||
// Find the exact match in the original text to preserve case
|
||||
const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i'));
|
||||
return exactMatch ? exactMatch[0] : token;
|
||||
}
|
||||
|
||||
// For fuzzy matching, we need to check individual words in the text
|
||||
// Split the text into words and check each word against the token
|
||||
const words = normalizedText.split(/\s+/).filter(word => word.length > 0);
|
||||
const originalWords = text.split(/\s+/).filter(word => word.length > 0);
|
||||
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
const word = words[i];
|
||||
const originalWord = originalWords[i];
|
||||
|
||||
// Skip if word is too different in length for fuzzy matching
|
||||
if (Math.abs(word.length - normalizedToken.length) > maxDistance) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// For very short tokens or very different lengths, be more strict
|
||||
if (normalizedToken.length < 4 || Math.abs(word.length - normalizedToken.length) > 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Use optimized edit distance calculation
|
||||
const distance = calculateOptimizedEditDistance(normalizedToken, word, maxDistance);
|
||||
if (distance <= maxDistance) {
|
||||
return originalWord; // Return the original word with case preserved
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
// Log error and return null for safety
|
||||
console.warn('Error in fuzzy word matching:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a word matches a token with fuzzy matching.
|
||||
* Optimized for common case where distances are small.
|
||||
*
|
||||
* @param token The search token (should be normalized)
|
||||
* @param word The word to match against (should be normalized)
|
||||
* @param maxDistance Maximum allowed edit distance
|
||||
* @returns True if the word matches the token within the distance threshold
|
||||
*/
|
||||
export function fuzzyMatchWord(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
|
||||
return fuzzyMatchWordWithResult(token, text, maxDistance) !== null;
|
||||
}
|
@ -3,13 +3,50 @@
|
||||
|
||||
The _Quick search_ function does a full-text search (that is, it searches through the content of notes and not just the title of a note) and displays the result in an easy-to-access manner.
|
||||
|
||||
The alternative to the quick search is the <a class="reference-link" href="Search.md">Search</a> function, which opens in a dedicated tab and has support for advanced queries.
|
||||
The alternative to the quick search is the <a class="reference-link" href="Search.md">Search</a> function, which opens in a dedicated tab and has support for advanced queries.
|
||||
|
||||
For even faster navigation, it's possible to use <a class="reference-link" href="Jump%20to.md">Jump to Note</a> which will only search through the note titles instead of the content.
|
||||
For even faster navigation, it's possible to use <a class="reference-link" href="Jump%20to.md">Jump to Note</a> which will only search through the note titles instead of the content.
|
||||
|
||||
## Layout
|
||||
|
||||
Based on the <a class="reference-link" href="../UI%20Elements/Vertical%20and%20horizontal%20layout.md">Vertical and horizontal layout</a>, the quick search is placed:
|
||||
Based on the <a class="reference-link" href="../UI%20Elements/Vertical%20and%20horizontal%20layout.md">Vertical and horizontal layout</a>, the quick search is placed:
|
||||
|
||||
* On the vertical layout, it is displayed right above the <a class="reference-link" href="../UI%20Elements/Note%20Tree.md">Note Tree</a>.
|
||||
* On the horizontal layout, it is displayed in the <a class="reference-link" href="../UI%20Elements/Launch%20Bar.md">Launch Bar</a>, where it can be positioned just like any other icon.
|
||||
* On the vertical layout, it is displayed right above the <a class="reference-link" href="../UI%20Elements/Note%20Tree.md">Note Tree</a>.
|
||||
* On the horizontal layout, it is displayed in the <a class="reference-link" href="../UI%20Elements/Launch%20Bar.md">Launch Bar</a>, where it can be positioned just like any other icon.
|
||||
|
||||
## Search Features
|
||||
|
||||
Quick search includes the following features:
|
||||
|
||||
### Content Previews
|
||||
Search results now display a 200-character preview of the note content below the note title. This preview shows the context where your search terms appear, making it easier to identify the right note without opening it.
|
||||
|
||||
### Infinite Scrolling
|
||||
Results are loaded progressively as you scroll:
|
||||
- Initial display shows 15 results
|
||||
- Scrolling near the bottom automatically loads 10 more results
|
||||
- Continue scrolling to load all matching notes
|
||||
|
||||
### Visual Features
|
||||
- **Highlighting**: Search terms appear in bold with accent colors
|
||||
- **Separation**: Results are separated with dividers
|
||||
- **Theme Support**: Highlighting colors adapt to light/dark themes
|
||||
|
||||
### Search Behavior
|
||||
Quick search uses progressive search:
|
||||
1. Shows exact matches first
|
||||
2. Includes fuzzy matches when exact results are fewer than 5
|
||||
3. Exact matches appear before fuzzy matches
|
||||
|
||||
### Keyboard Navigation
|
||||
- Press `Enter` to open the first result
|
||||
- Use arrow keys to navigate through results
|
||||
- Press `Escape` to close the quick search
|
||||
|
||||
## Using Quick Search
|
||||
|
||||
1. **Typo tolerance**: Search finds results despite minor typos
|
||||
2. **Content previews**: 200-character snippets show match context
|
||||
3. **Infinite scrolling**: Additional results load on scroll
|
||||
4. **Specific terms**: Specific search terms return more focused results
|
||||
5. **Match locations**: Bold text indicates where matches occur
|
@ -66,11 +66,25 @@ The options available are:
|
||||
* `#book #publicationYear = 1954`: Find notes with the "book" label and "publicationYear" set to 1954.
|
||||
* `#genre *=* fan`: Find notes with the "genre" label containing the substring "fan". Additional operators include `*=*` for "contains", `=*` for "starts with", `*=` for "ends with", and `!=` for "is not equal to".
|
||||
* `#book #publicationYear >= 1950 #publicationYear < 1960`: Use numeric operators to find all books published in the 1950s.
|
||||
* `#dateNote >= TODAY-30`: A "smart search" to find notes with the "dateNote" label within the last 30 days. Supported smart values include NOW +- seconds, TODAY +- days, MONTH +- months, YEAR +- years.
|
||||
* `#dateNote >= TODAY-30`: Find notes with the "dateNote" label within the last 30 days. Supported date values include NOW +- seconds, TODAY +- days, MONTH +- months, YEAR +- years.
|
||||
* `~author.title *=* Tolkien`: Find notes related to an author whose title contains "Tolkien".
|
||||
* `#publicationYear %= '19[0-9]{2}'`: Use the '%=' operator to match a regular expression (regex). This feature has been available since Trilium 0.52.
|
||||
* `note.content %= '\\d{2}:\\d{2} (PM|AM)'`: Find notes that mention a time. Backslashes in a regex must be escaped.
|
||||
|
||||
### Fuzzy Search
|
||||
|
||||
Trilium supports fuzzy search operators that find results with typos or spelling variations:
|
||||
|
||||
* `#title ~= trilim`: Fuzzy exact match - finds notes with titles like "Trilium" even if you typed "trilim" (with typo)
|
||||
* `#content ~* progra`: Fuzzy contains match - finds notes containing words like "program", "programmer", "programming" even with slight misspellings
|
||||
* `note.content ~* develpment`: Will find notes containing "development" despite the typo
|
||||
|
||||
**Important notes about fuzzy search:**
|
||||
- Fuzzy search requires at least 3 characters in the search term
|
||||
- Maximum edit distance is 2 characters (number of character changes needed)
|
||||
- Diacritics are normalized (e.g., "café" matches "cafe")
|
||||
- Fuzzy matches work best for finding content with minor typos or spelling variations
|
||||
|
||||
### Advanced Use Cases
|
||||
|
||||
* `~author.relations.son.title = 'Christopher Tolkien'`: Search for notes with an "author" relation to a note that has a "son" relation to "Christopher Tolkien". This can be modeled with the following note structure:
|
||||
@ -117,6 +131,32 @@ Some queries can only be expressed with negation:
|
||||
|
||||
This query finds all book notes not in the "Tolkien" subtree.
|
||||
|
||||
## Progressive Search Strategy
|
||||
|
||||
Trilium uses a progressive search strategy that performs exact matching first, then adds fuzzy matching when needed.
|
||||
|
||||
### How Progressive Search Works
|
||||
|
||||
1. **Phase 1 - Exact Matching**: When you search, Trilium first looks for exact matches of your search terms. This handles the vast majority of searches (90%+) and returns results almost instantly.
|
||||
|
||||
2. **Phase 2 - Fuzzy Fallback**: If Phase 1 doesn't find enough high-quality results (fewer than 5 results with good relevance scores), Trilium automatically adds fuzzy matching to find results with typos or spelling variations.
|
||||
|
||||
3. **Result Ordering**: Exact matches always appear before fuzzy matches, regardless of individual scores. This ensures that when you search for "project", notes containing the exact word "project" will appear before notes containing similar words like "projects" or "projection".
|
||||
|
||||
### Progressive Search Behavior
|
||||
|
||||
- **Speed**: Most searches complete using only exact matching
|
||||
- **Ordering**: Exact matches appear before fuzzy matches
|
||||
- **Fallback**: Fuzzy matching activates when exact matches return fewer than 5 results
|
||||
- **Identification**: Results indicate whether they are exact or fuzzy matches
|
||||
|
||||
### Search Performance
|
||||
|
||||
Search system specifications:
|
||||
- Content size limit: 10MB per note (previously 50KB)
|
||||
- Edit distance calculations for fuzzy matching
|
||||
- Infinite scrolling in Quick Search
|
||||
|
||||
## Under the Hood
|
||||
|
||||
### Label and Relation Shortcuts
|
||||
@ -142,7 +182,7 @@ However, common label and relation searches have shortcut syntax:
|
||||
|
||||
### Separating Full-Text and Attribute Parts
|
||||
|
||||
Search syntax allows combining full-text search with attribute-based search seamlessly. For example, `tolkien #book` contains:
|
||||
Search syntax allows combining full-text search with attribute-based search. For example, `tolkien #book` contains:
|
||||
|
||||
1. Full-text tokens - `tolkien`
|
||||
2. Attribute expressions - `#book`
|
||||
@ -181,4 +221,21 @@ This finds notes created in May 2019. Numeric operators like `#publicationYear >
|
||||
|
||||
You can open Trilium and automatically trigger a search by including the search [url encoded](https://meyerweb.com/eric/tools/dencoder/) string in the URL:
|
||||
|
||||
`http://localhost:8080/#?searchString=abc`
|
||||
`http://localhost:8080/#?searchString=abc`
|
||||
|
||||
## Search Configuration
|
||||
|
||||
### Parameters
|
||||
|
||||
| Parameter | Value | Description |
|
||||
|-----------|-------|-------------|
|
||||
| MIN_FUZZY_TOKEN_LENGTH | 3 | Minimum characters for fuzzy matching |
|
||||
| MAX_EDIT_DISTANCE | 2 | Maximum character changes allowed |
|
||||
| RESULT_SUFFICIENCY_THRESHOLD | 5 | Minimum exact results before fuzzy fallback |
|
||||
| MAX_CONTENT_SIZE | 10MB | Maximum note content size for search processing |
|
||||
|
||||
### Limits
|
||||
|
||||
* Searched note content is limited to 10MB per note to prevent performance issues
|
||||
* Notes exceeding this limit will still be included in title and attribute searches
|
||||
* Fuzzy matching requires tokens of at least 3 characters
|
25
docs/User Guide/User Guide/FAQ.md
vendored
25
docs/User Guide/User Guide/FAQ.md
vendored
@ -54,4 +54,27 @@ More detailed answer:
|
||||
* files are stored in no particular order and the user can't change this
|
||||
* Trilium allows storing note [attributes](Advanced%20Usage/Attributes.md) which could be represented in extended user attributes, but support for them differs greatly across filesystems and operating systems
|
||||
* Trilium makes links / relations between different notes which can be quickly retrieved / navigated (e.g. for [note map](Advanced%20Usage/Note%20Map%20\(Link%20map%2C%20Tree%20map\).md)). There's no such support in file systems which means these would have to be stored in some kind of side-car files (mini-databases).
|
||||
* Filesystems are generally not transactional. While this is not strictly required for a note-taking application, having transactions makes it much easier to keep notes and their metadata in a predictable and consistent state.
|
||||
* Filesystems are generally not transactional. While this is not strictly required for a note-taking application, having transactions makes it much easier to keep notes and their metadata in a predictable and consistent state.
|
||||
|
||||
## Search-related Questions
|
||||
|
||||
### Why does search sometimes find results with typos?
|
||||
|
||||
Trilium uses a progressive search strategy that includes fuzzy matching when exact matches return fewer than 5 results. This helps find notes despite minor typos in your search query. You can also use the fuzzy search operators explicitly (`~=` for fuzzy exact match and `~*` for fuzzy contains). See the <a class="reference-link" href="Basic%20Concepts%20and%20Features/Navigation/Search.md">Search</a> documentation for details.
|
||||
|
||||
### How can I search for notes when I'm not sure of the exact spelling?
|
||||
|
||||
Use the fuzzy search operators:
|
||||
- `#title ~= "projct"` - finds notes with titles like "project" despite the typo
|
||||
- `note.content ~* "algoritm"` - finds content containing "algorithm" or similar words
|
||||
|
||||
### Why do some search results appear before others with lower scores?
|
||||
|
||||
Trilium places exact matches before fuzzy matches. When you search for "project", notes containing exactly "project" appear before notes with variations like "projects" or "projection", regardless of other scoring factors.
|
||||
|
||||
### How can I make my searches faster?
|
||||
|
||||
1. Use the "Fast search" option to search only titles and attributes (not content)
|
||||
2. Limit search scope using the "Ancestor" field
|
||||
3. Set a result limit to prevent loading too many results
|
||||
4. For large databases, consider archiving old notes to reduce search scope
|
Loading…
x
Reference in New Issue
Block a user