trilium/src/services/note_cache/similarity.js

452 lines
13 KiB
JavaScript

const noteCache = require('./note_cache');
const noteCacheService = require('./note_cache_service.js');
const dateUtils = require('../date_utils');
const repository = require('../repository');
const { JSDOM } = require("jsdom");
/**
 * When switched on, findSimilarNotes() prints the reward map and a per-result
 * breakdown of the computed score to the console.
 */
const DEBUG = false;

/**
 * Attributes with these names are skipped entirely during scoring - neither
 * their name nor their value contributes any reward.
 */
const IGNORED_ATTRS = ["datenote", "monthnote", "yearnote"];
/**
 * For attributes with these names the *name* itself earns no reward
 * (the attribute value is still scored as usual).
 */
const IGNORED_ATTR_NAMES = [
    // link-like relations
    "includenotelink", "internallink", "imagelink", "relationmaplink",

    // behavioural / configuration attributes
    "template", "disableversioning", "archived", "hidepromotedattributes",
    "keyboardshortcut", "bookzoomlevel",

    // widget switches
    "noteinfowidgetdisabled", "linkmapwidgetdisabled", "noterevisionswidgetdisabled",
    "whatlinksherewidgetdisabled", "similarnoteswidgetdisabled",

    // miscellaneous
    "disableinclusion", "rendernote", "pageurl",
];
/**
 * Strips the low-information parts of a URL - the http(s) scheme, "www."
 * and a handful of common top level domains - so that only the more
 * distinctive parts remain for word matching.
 *
 * @param {string} value - attribute value containing a URL
 * @returns {string} the value with the noise parts removed
 */
function filterUrlValue(value) {
    const noisePatterns = [
        /https?:\/\//ig,
        /www\./ig,
        /(\.net|\.com|\.org|\.info|\.edu)/ig,
    ];

    let filtered = value;

    for (const pattern of noisePatterns) {
        filtered = filtered.replace(pattern, "");
    }

    return filtered;
}
/**
 * Builds the "reward map" of the given note: a dictionary of word => reward
 * which says how much a candidate note gains when it contains that word.
 *
 * Words are collected from (with decreasing weight) the note's own title and
 * branch prefixes, its attributes, headings of its text content, its mime
 * type and the titles / branch prefixes of its ancestors.
 *
 * @param {Note} note - note for which the rewards are built
 * @returns {Object} prototype-less object mapping a word to its accumulated (numeric) reward
 */
function buildRewardMap(note) {
    // Prototype-less object: with a plain {} the word "constructor" would read the
    // inherited Object function and the "+=" below would turn the entry into a string,
    // silently dropping the reward for that word.
    const map = Object.create(null);

    function addToRewardMap(text, rewardFactor) {
        if (!text) {
            return;
        }

        for (const word of splitToWords(text)) {
            if (word) {
                map[word] = map[word] || 0;

                // reward grows with the length of matched string
                const length = word.length
                    - 0.9; // to penalize specifically very short words - 1 and 2 characters

                map[word] += rewardFactor * Math.pow(length, 0.7);
            }
        }
    }

    for (const ancestorNote of note.ancestors) {
        if (ancestorNote.noteId === 'root') {
            continue;
        }

        // titles are only used when the note is decrypted
        if (ancestorNote.isDecrypted) {
            addToRewardMap(ancestorNote.title, 0.3);
        }

        for (const branch of ancestorNote.parentBranches) {
            addToRewardMap(branch.prefix, 0.3);
        }
    }

    addToRewardMap(trimMime(note.mime), 0.5);

    if (note.isDecrypted) {
        addToRewardMap(note.title, 1);
    }

    for (const branch of note.parentBranches) {
        addToRewardMap(branch.prefix, 1);
    }

    for (const attr of note.attributes) {
        if (attr.name.startsWith('child:')
            || attr.name.startsWith('relation:')
            || attr.name.startsWith('label:')) {
            continue;
        }

        // attributes not owned by the note itself (inherited) get small penalization
        let reward = note.noteId === attr.noteId ? 0.8 : 0.5;

        if (IGNORED_ATTRS.includes(attr.name)) {
            continue;
        }

        if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
            addToRewardMap(attr.name, reward);
        }

        // the value of cliptype is worth only half
        if (attr.name === 'cliptype') {
            reward /= 2;
        }

        let value = attr.value;

        if (value.startsWith('http')) {
            value = filterUrlValue(value);

            // words in URLs are not that valuable
            reward = reward / 2;
        }

        addToRewardMap(value, reward);
    }

    if (note.type === 'text' && note.isDecrypted) {
        const noteEntity = repository.getNote(note.noteId);
        const content = noteEntity.getContent();
        const dom = new JSDOM(content);

        const addHeadingsToRewardMap = (elName, rewardFactor) => {
            for (const el of dom.window.document.querySelectorAll(elName)) {
                addToRewardMap(el.textContent, rewardFactor);
            }
        };

        // title is the top with weight 1 so smaller headings will have lower weight

        // technically H1 is not supported but for the case it's present let's weigh it just as H2
        addHeadingsToRewardMap("h1", 0.9);
        addHeadingsToRewardMap("h2", 0.9);
        addHeadingsToRewardMap("h3", 0.8);
        addHeadingsToRewardMap("h4", 0.7);
        addHeadingsToRewardMap("h5", 0.6);
        addHeadingsToRewardMap("h6", 0.5);
    }

    return map;
}
// mime => trimmed mime; prototype-less so that the "in" lookup can't hit inherited keys
const mimeCache = Object.create(null);

/**
 * Reduces a mime type to the part which is interesting for word matching:
 * the subtype without the generic type ("text/", "application/", ...) and
 * without the "x-" prefix of non-standard subtypes, e.g. "text/x-python" => "python".
 *
 * @param {string} [mime] - mime type of a note
 * @returns {string|undefined} trimmed subtype, empty string when the mime has no "/" separator,
 *                             undefined for a missing mime and for "text/html"
 */
function trimMime(mime) {
    if (!mime || mime === 'text/html') {
        return;
    }

    if (!(mime in mimeCache)) {
        const chunks = mime.split('/');

        let str = "";

        if (chunks.length >= 2) {
            // we're not interested in 'text/' or 'application/' prefix
            str = chunks[1];

            // non-standard subtypes are prefixed with "x-" which carries no meaning
            if (str.startsWith('x-')) {
                str = str.slice(2);
            }
        }

        mimeCache[mime] = str;
    }

    return mimeCache[mime];
}
/**
 * Computes the creation-date boundaries used for the "created at a similar time" bonus,
 * all relative to the creation time of the base note:
 *  - minDate / maxDate: one hour before / after
 *  - minExcludedDate / maxExcludedDate: five seconds before / after
 *
 * @param {Note} baseNote - note whose utcDateCreated is the reference point
 * @returns {{minDate, minExcludedDate, maxExcludedDate, maxDate}} boundaries as formatted by dateUtils.utcDateStr()
 */
function buildDateLimits(baseNote) {
    const HOUR_MS = 3600 * 1000;
    const EXCLUSION_MS = 5 * 1000;

    const createdMs = dateUtils.parseDateTime(baseNote.utcDateCreated).getTime();
    const shiftedBy = (offsetMs) => dateUtils.utcDateStr(new Date(createdMs + offsetMs));

    return {
        minDate: shiftedBy(-HOUR_MS),
        minExcludedDate: shiftedBy(-EXCLUSION_MS),
        maxExcludedDate: shiftedBy(EXCLUSION_MS),
        maxDate: shiftedBy(HOUR_MS),
    };
}
// text => array of words; prototype-less so that texts like "constructor" or "toString"
// are not answered with members inherited from Object.prototype
const wordCache = Object.create(null);

// very common words which carry no information about the topic of a note
const WORD_BLACKLIST = [
    "a", "the", "in", "for", "from", "but", "s", "so", "if", "while", "until",
    "whether", "after", "before", "because", "since", "when", "where", "how",
    "than", "then", "and", "either", "or", "neither", "nor", "both", "also"
];

/**
 * Splits text into lower-cased words (runs of unicode letters and digits).
 * Blacklisted words are replaced with an empty string (callers skip those) and
 * a crude english plural normalization is applied: a trailing "es" is removed
 * from words longer than 2 characters, otherwise a trailing "s" from words longer than 1.
 *
 * Results are cached per input text and the cached array itself is returned,
 * so callers must not modify it.
 *
 * @param {string} text - non-empty text to split
 * @returns {string[]} words in order of appearance, possibly containing empty strings
 */
function splitToWords(text) {
    let words = wordCache[text];

    if (!words) {
        words = text.toLowerCase().split(/[^\p{L}\p{N}]+/u).map((word) => {
            if (WORD_BLACKLIST.includes(word)) {
                return "";
            }

            // special case for english plurals
            if (word.length > 2 && word.endsWith("es")) {
                return word.slice(0, word.length - 2);
            }

            if (word.length > 1 && word.endsWith("s")) {
                return word.slice(0, word.length - 1);
            }

            return word;
        });

        wordCache[text] = words;
    }

    return words;
}
/**
 * Looks for an includeNoteLink or imageLink relation leading from sourceNote to targetNote.
 * Such a relation means the notes are related so obviously that there's no point
 * in presenting the relationship to the user.
 *
 * @param {Note} sourceNote - note whose attributes are searched
 * @param {Note} targetNote - note the relation has to point to
 * @returns {Attribute|undefined} the first matching relation, undefined when there is none
 */
function hasConnectingRelation(sourceNote, targetNote) {
    const connectingNames = ['includenotelink', 'imagelink'];

    return sourceNote.attributes.find((attr) => {
        if (attr.type !== 'relation') {
            return false;
        }

        if (!connectingNames.includes(attr.name)) {
            return false;
        }

        return attr.value === targetNote.noteId;
    });
}
/**
 * Finds notes similar to the given note.
 *
 * Every note in the note cache (except the base note itself and notes connected with it by
 * an includeNoteLink / imageLink relation) is scored by how many "rewarded" words of the base
 * note appear in its title, branch prefixes, attributes, mime type and ancestors. Bonuses are
 * added for the same note type and for a similar creation time. Candidates with score of
 * at least 1.5 which have a note path are returned; archived paths are penalized by 0.5.
 *
 * @param {string} noteId - ID of the base note
 * @returns {Promise} resolves to an array of at most 200 objects {score, notePath, noteId}
 *                    sorted by score descending; empty array when the base note is not in the cache
 */
async function findSimilarNotes(noteId) {
    const results = [];
    let i = 0;

    const baseNote = noteCache.notes[noteId];

    if (!baseNote) {
        return [];
    }

    const dateLimits = buildDateLimits(baseNote);
    const rewardMap = buildRewardMap(baseNote);
    let ancestorRewardCache = {};
    // the base note and its ancestors never contribute to the ancestor rewards of a candidate
    const ancestorNoteIds = new Set(baseNote.ancestors.map(note => note.noteId));
    ancestorNoteIds.add(baseNote.noteId);

    let displayRewards = false;

    function gatherRewards(text, factor = 1) {
        if (!text) {
            return 0;
        }

        let counter = 0;

        // when the title is very long then weight of each individual word should be lower
        // also pretty important in e.g. long URLs in label values
        const lengthPenalization = 1 / Math.pow(text.length, 0.3);

        for (const word of splitToWords(text)) {
            // words missing from the reward map yield NaN which is turned into 0
            const reward = (rewardMap[word] * factor * lengthPenalization) || 0;

            if (displayRewards && reward > 0) {
                console.log(`Reward ${Math.round(reward * 10) / 10} for word: ${word}`);
                console.log(`Before: ${counter}, add ${reward}, res: ${counter + reward}`);
                console.log(`${rewardMap[word]} * ${factor} * ${lengthPenalization}`);
            }

            counter += reward;
        }

        return counter;
    }

    function gatherAncestorRewards(note) {
        if (ancestorNoteIds.has(note.noteId)) {
            return 0;
        }

        if (!(note.noteId in ancestorRewardCache)) {
            let score = 0;

            for (const parentNote of note.parents) {
                if (!ancestorNoteIds.has(parentNote.noteId)) {
                    if (displayRewards) {
                        console.log("Considering", parentNote.title);
                    }

                    if (parentNote.isDecrypted) {
                        score += gatherRewards(parentNote.title, 0.3);
                    }

                    for (const branch of parentNote.parentBranches) {
                        score += gatherRewards(branch.prefix, 0.3)
                            + gatherAncestorRewards(branch.parentNote);
                    }
                }
            }

            ancestorRewardCache[note.noteId] = score;
        }

        return ancestorRewardCache[note.noteId];
    }

    function computeScore(candidateNote) {
        let score = gatherRewards(trimMime(candidateNote.mime))
            + gatherAncestorRewards(candidateNote);

        if (candidateNote.isDecrypted) {
            score += gatherRewards(candidateNote.title);
        }

        for (const branch of candidateNote.parentBranches) {
            score += gatherRewards(branch.prefix);
        }

        for (const attr of candidateNote.attributes) {
            if (attr.name.startsWith('child:')
                || attr.name.startsWith('relation:')
                || attr.name.startsWith('label:')) {
                continue;
            }

            if (IGNORED_ATTRS.includes(attr.name)) {
                continue;
            }

            if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
                score += gatherRewards(attr.name);
            }

            let value = attr.value;
            let factor = 1;

            if (value.startsWith('http')) {
                value = filterUrlValue(value);

                // words in URLs are not that valuable
                factor = 0.5;
            }

            // score the (possibly URL-filtered) value, not the raw attribute value
            score += gatherRewards(value, factor);
        }

        if (candidateNote.type === baseNote.type) {
            if (displayRewards) {
                console.log("Adding reward for same note type");
            }

            score += 0.2;
        }

        /**
         * We want to improve standing of notes which have been created in similar time to each other since
         * there's a good chance they are related.
         *
         * But there's an exception - if they were created really close to each other (within few seconds) then
         * they are probably part of the import and not created by hand - these OTOH should not benefit.
         */
        const {utcDateCreated} = candidateNote;

        if (utcDateCreated < dateLimits.minExcludedDate || utcDateCreated > dateLimits.maxExcludedDate) {
            if (utcDateCreated >= dateLimits.minDate && utcDateCreated <= dateLimits.maxDate) {
                if (displayRewards) {
                    console.log("Adding reward for very similar date of creation");
                }

                score += 1;
            }
            else if (utcDateCreated.substr(0, 10) === dateLimits.minDate.substr(0, 10)
                || utcDateCreated.substr(0, 10) === dateLimits.maxDate.substr(0, 10)) {
                if (displayRewards) {
                    console.log("Adding reward for same day of creation");
                }

                // smaller bonus when outside of the window but within same date
                score += 0.5;
            }
        }

        return score;
    }

    for (const candidateNote of Object.values(noteCache.notes)) {
        if (candidateNote.noteId === baseNote.noteId
            || hasConnectingRelation(candidateNote, baseNote)
            || hasConnectingRelation(baseNote, candidateNote)) {
            continue;
        }

        let score = computeScore(candidateNote);

        if (score >= 1.5) {
            const notePath = noteCacheService.getSomePath(candidateNote);

            // this takes care of note hoisting - a candidate without a path is only skipped,
            // it must not abort the whole search
            if (notePath) {
                if (noteCacheService.isNotePathArchived(notePath)) {
                    score -= 0.5; // archived penalization
                }

                results.push({score, notePath, noteId: candidateNote.noteId});
            }
        }

        i++;

        // give the event loop a chance every 1000 scored candidates
        if (i % 1000 === 0) {
            await setImmediatePromise();
        }
    }

    // highest score first; a proper numeric comparator (returns 0 for equal scores)
    results.sort((a, b) => b.score - a.score);

    if (DEBUG) {
        console.log("REWARD MAP", rewardMap);

        if (results.length >= 1) {
            for (const {noteId} of results) {
                const note = noteCache.notes[noteId];

                console.log("NOTE", note.pojo);

                displayRewards = true;
                ancestorRewardCache = {}; // reset cache

                const totalReward = computeScore(note);

                console.log("Total reward:", Math.round(totalReward * 10) / 10);
            }
        }
    }

    return results.length > 200 ? results.slice(0, 200) : results;
}
/**
 * Returns a promise which gets resolved from a zero-delay timer. Awaiting it splits
 * a long running synchronous computation into chunks so that it doesn't block
 * everything else for its whole duration.
 * see https://snyk.io/blog/nodejs-how-even-quick-async-functions-can-block-the-event-loop-starve-io/
 *
 * @returns {Promise} resolved (without a value) once the timer fires
 */
function setImmediatePromise() {
    return new Promise((resolve) => {
        setTimeout(resolve, 0);
    });
}
// public API of this module - only the similarity search itself is exported
module.exports = {
findSimilarNotes
};