trilium/src/services/note_cache/similarity.js
2020-09-16 11:50:47 +02:00

225 lines
5.9 KiB
JavaScript

const noteCache = require('./note_cache');
const noteCacheService = require('./note_cache_service.js');
const dateUtils = require('../date_utils');
// Attribute names which are skipped when attribute *names* are turned into rewards/scores.
// The values of such attributes are still taken into account.
const IGNORED_ATTR_NAMES = ['includenotelink', 'internallink', 'imagelink', 'relationmaplink'];
/**
 * Builds a lookup of lower-cased word -> reward for the given note.
 *
 * Words are taken from (base reward in parentheses):
 *  - titles of decrypted ancestors and branch prefixes of all ancestors (0.4)
 *  - note type (0.2) and trimmed MIME type (0.3)
 *  - the note's own title, if decrypted, and its branch prefixes (1)
 *  - attribute names and values (0.8 for attributes owned by the note, 0.5 otherwise);
 *    names listed in IGNORED_ATTR_NAMES are skipped, their values are not
 *
 * Each occurrence of a word adds `baseReward * sqrt(word.length)` to that word's reward.
 *
 * @param {Note} note
 * @returns {Object<string, number>} prototype-less object mapping word to its accumulated reward
 */
function buildRewardMap(note) {
    // Prototype-less object: words such as "constructor" or "__proto__" must not resolve to members
    // inherited from Object.prototype, which would turn the numeric rewards into strings.
    const map = Object.create(null);

    function addToRewardMap(text, baseReward) {
        if (!text) {
            return;
        }

        for (const word of text.toLowerCase().split(/\W+/)) {
            // splitting may produce empty chunks at the start/end of the text
            if (word) {
                // reward grows with the length of matched string
                map[word] = (map[word] || 0) + baseReward * Math.sqrt(word.length);
            }
        }
    }

    for (const ancestorNote of note.ancestors) {
        if (ancestorNote.isDecrypted) {
            addToRewardMap(ancestorNote.title, 0.4);
        }

        for (const branch of ancestorNote.parentBranches) {
            addToRewardMap(branch.prefix, 0.4);
        }
    }

    addToRewardMap(note.type, 0.2);
    addToRewardMap(trimMime(note.mime), 0.3);

    if (note.isDecrypted) {
        addToRewardMap(note.title, 1);
    }

    for (const branch of note.parentBranches) {
        addToRewardMap(branch.prefix, 1);
    }

    for (const attr of note.attributes) {
        // attributes whose noteId matches the note's own are rewarded more than the others
        const reward = note.noteId === attr.noteId ? 0.8 : 0.5;

        if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
            addToRewardMap(attr.name, reward);
        }

        addToRewardMap(attr.value, reward);
    }

    return map;
}
/**
 * Reduces a MIME type to its subtype, e.g. "text/html" -> "html".
 * A leading "x-" of the subtype is removed as well, e.g. "application/x-javascript" -> "javascript".
 *
 * @param {string} [mime]
 * @returns {string|undefined} trimmed subtype; undefined when mime is empty or contains no "/"
 */
function trimMime(mime) {
    if (!mime) {
        return;
    }

    const chunks = mime.split('/');

    if (chunks.length < 2) {
        return;
    }

    // we're not interested in 'text/' or 'application/' prefix
    const subtype = chunks[1];

    // the "x-" marker of non-registered subtypes carries no information either
    return subtype.startsWith('x-') ? subtype.slice(2) : subtype;
}
/**
 * Finds notes similar to the given note.
 *
 * Every other note in the note cache is scored by summing the rewards (see buildRewardMap) of the
 * words found in its type, MIME type, title, branch prefixes, attributes and in those ancestors it does
 * not share with the base note. Notes created within 30 minutes of the base note get a bonus, unless
 * they were created within 5 seconds of it. Candidates scoring below 4 are dropped, candidates without
 * a path are dropped and candidates on an archived path are penalized by 1.
 *
 * @param {string} noteId - ID of the note to find similar notes for
 * @returns {Array<{score: number, notePath: *, noteId: string}>} at most 200 results sorted by descending
 *          score (notePath is whatever noteCacheService.getSomePath returns); empty array when the note
 *          is not in the note cache
 */
function findSimilarNotes(noteId) {
    const start = Date.now();

    const baseNote = noteCache.notes[noteId];

    if (!baseNote) {
        return [];
    }

    // the Date constructor works with milliseconds, so both windows have to be expressed in them
    const NEARBY_WINDOW_MS = 1800 * 1000;
    const IMPORT_WINDOW_MS = 5 * 1000;

    // Normalize to epoch milliseconds regardless of whether parseDateTime returns a Date or a number -
    // arithmetic done directly on a Date would make "+" concatenate strings.
    const dateCreatedTs = new Date(dateUtils.parseDateTime(baseNote.utcDateCreated)).getTime();

    const dates = {
        minDate: dateUtils.utcDateStr(new Date(dateCreatedTs - NEARBY_WINDOW_MS)),
        minExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs - IMPORT_WINDOW_MS)),
        maxExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs + IMPORT_WINDOW_MS)),
        maxDate: dateUtils.utcDateStr(new Date(dateCreatedTs + NEARBY_WINDOW_MS)),
    };

    const rewardMap = buildRewardMap(baseNote);
    const ancestorNoteIds = new Set(baseNote.ancestors.map(note => note.noteId));
    const results = [];

    function gatherRewards(text) {
        if (!text) {
            return 0;
        }

        let counter = 0;

        for (const word of text.toLowerCase().split(/\W+/)) {
            const reward = rewardMap[word];

            // only genuine rewards count - a lookup of e.g. "constructor" must never leak a non-number
            // into the score
            if (typeof reward === 'number') {
                counter += reward;
            }
        }

        return counter;
    }

    function computeScore(candidateNote) {
        let score = gatherRewards(candidateNote.type)
                  + gatherRewards(trimMime(candidateNote.mime));

        if (candidateNote.isDecrypted) {
            score += gatherRewards(candidateNote.title);
        }

        for (const ancestorNote of candidateNote.ancestors) {
            // ancestors shared with the base note are not scored
            if (ancestorNoteIds.has(ancestorNote.noteId)) {
                continue;
            }

            if (ancestorNote.isDecrypted) {
                score += gatherRewards(ancestorNote.title);
            }

            for (const branch of ancestorNote.parentBranches) {
                score += gatherRewards(branch.prefix);
            }
        }

        for (const branch of candidateNote.parentBranches) {
            score += gatherRewards(branch.prefix);
        }

        for (const attr of candidateNote.attributes) {
            if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
                score += gatherRewards(attr.name);
            }

            score += gatherRewards(attr.value);
        }

        /**
         * We want to improve standing of notes which have been created in similar time to each other since
         * there's a good chance they are related.
         *
         * But there's an exception - if they were created really close to each other (within few seconds) then
         * they are probably part of the import and not created by hand - these OTOH should not benefit.
         */
        const {utcDateCreated} = candidateNote;

        const isCreatedNearby = utcDateCreated >= dates.minDate
                             && utcDateCreated <= dates.maxDate;
        const isProbablyImported = utcDateCreated >= dates.minExcludedDate
                                && utcDateCreated <= dates.maxExcludedDate;

        if (isCreatedNearby && !isProbablyImported) {
            score += 3;
        }

        return score;
    }

    function evaluateSimilarity(candidateNote) {
        let score = computeScore(candidateNote);

        if (score < 4) {
            return;
        }

        const notePath = noteCacheService.getSomePath(candidateNote);

        // this takes care of note hoisting
        if (!notePath) {
            return;
        }

        if (noteCacheService.isNotePathArchived(notePath)) {
            score -= 1; // archived penalization
        }

        results.push({score, notePath, noteId: candidateNote.noteId});
    }

    for (const candidateNote of Object.values(noteCache.notes)) {
        if (candidateNote.noteId !== baseNote.noteId) {
            evaluateSimilarity(candidateNote);
        }
    }

    // highest score first; a numeric difference is a consistent comparator (returns 0 for ties)
    results.sort((a, b) => b.score - a.score);

    console.log("Similarity search took", Date.now() - start, "ms");

    return results.slice(0, 200);
}
/**
 * Returns a promise which gets resolved (without a value) from a zero-delay timer callback.
 * Awaiting it inside a long running loop gives other pending work a chance to run,
 * see https://snyk.io/blog/nodejs-how-even-quick-async-functions-can-block-the-event-loop-starve-io/
 *
 * @returns {Promise<void>}
 */
function setImmediatePromise() {
    return new Promise(resolve => {
        const onTimeout = () => resolve();

        setTimeout(onTimeout, 0);
    });
}
// public API of this module
module.exports = {findSimilarNotes};