custom implementation of similar notes algorithm

This commit is contained in:
zadam 2020-09-15 22:46:51 +02:00
parent eeacd8118f
commit d345c4850f
6 changed files with 74 additions and 41 deletions

5
package-lock.json generated
View File

@ -8158,11 +8158,6 @@
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-0.1.2.tgz", "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-0.1.2.tgz",
"integrity": "sha1-gIudDlb8Jz2Am6VzOOkpkZoanxo=" "integrity": "sha1-gIudDlb8Jz2Am6VzOOkpkZoanxo="
}, },
"string-similarity": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/string-similarity/-/string-similarity-4.0.2.tgz",
"integrity": "sha512-eCsPPyoQBgY4TMpVD6DVfO7pLrimUONriaO4Xjp3WPUW0YnNLqdHgRj23xotLlqrL90eJhBeq3zdAJf2mQgfBQ=="
},
"string-width": { "string-width": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz",

View File

@ -36,6 +36,11 @@ const TPL = `
overflow: hidden; overflow: hidden;
text-overflow: ellipsis; text-overflow: ellipsis;
} }
.note-path-list {
max-height: 600px;
overflow-y: auto;
}
</style> </style>
<div class="current-path"></div> <div class="current-path"></div>

View File

@ -141,7 +141,7 @@ export default class SimilarNotesWidget extends TabAwareWidget {
} }
const $item = (await linkService.createNoteLink(similarNote.notePath.join("/"))) const $item = (await linkService.createNoteLink(similarNote.notePath.join("/")))
.css("font-size", 24 * similarNote.score); .css("font-size", 24 * (1 - 1 / (similarNote.score - 1)));
$list.append($item); $list.append($item);
} }

View File

@ -649,7 +649,7 @@ a.external:not(.no-arrow):after, a[href^="http://"]:not(.no-arrow):after, a[href
} }
.component { .component {
contain: layout size; contain: size;
} }
.toast { .toast {

View File

@ -1,6 +1,6 @@
"use strict"; "use strict";
const noteCacheService = require('../../services/note_cache/note_cache_service'); const similarityService = require('../../services/note_cache/similarity.js');
const repository = require('../../services/repository'); const repository = require('../../services/repository');
async function getSimilarNotes(req) { async function getSimilarNotes(req) {
@ -12,10 +12,7 @@ async function getSimilarNotes(req) {
return [404, `Note ${noteId} not found.`]; return [404, `Note ${noteId} not found.`];
} }
const results = await noteCacheService.findSimilarNotes(noteId); return await similarityService.findSimilarNotes(noteId);
return results
.filter(note => note.noteId !== noteId);
} }
module.exports = { module.exports = {

View File

@ -2,10 +2,47 @@ const noteCache = require('./note_cache');
const noteCacheService = require('./note_cache_service.js'); const noteCacheService = require('./note_cache_service.js');
const dateUtils = require('../date_utils'); const dateUtils = require('../date_utils');
/**
 * Sums the rewards of all words contained in the given text.
 *
 * The text is lower-cased and split on runs of non-word characters (`\W+`). Every resulting
 * token is looked up in the reward map and its reward, if present, is added to the total.
 *
 * @param {Object<string, number>} rewardMap - lookup of word => reward
 * @param {string} [text] - text to evaluate; a falsy value yields 0
 * @returns {number} sum of the rewards of all matched words
 */
function gatherRewards(rewardMap, text) {
    if (!text) {
        return 0;
    }

    let counter = 0;

    for (const word of text.toLowerCase().split(/\W+/)) {
        // Count own entries only: a bare rewardMap[word] also resolves inherited members
        // (e.g. the word "constructor" yields a function), which would turn the numeric
        // sum into a string.
        if (Object.prototype.hasOwnProperty.call(rewardMap, word)) {
            counter += rewardMap[word] || 0;
        }
    }

    return counter;
}
function computeScore(candidateNote, ancestorNoteIds, rewardMap, dates) {
let score =
gatherRewards(rewardMap, candidateNote.title)
+ gatherRewards(rewardMap, candidateNote.type);
+ gatherRewards(rewardMap, trimMime(candidateNote.mime));
for (const ancestorNote of candidateNote.ancestors) {
if (!ancestorNoteIds.includes(ancestorNote.noteId)) {
score += gatherRewards(rewardMap, ancestorNote.title);
for (const branch of ancestorNote.parentBranches) {
score += gatherRewards(rewardMap, branch.prefix);
}
}
}
for (const branch of candidateNote.parentBranches) {
score += gatherRewards(rewardMap, branch.prefix);
}
for (const attr of candidateNote.attributes) {
if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
score += gatherRewards(rewardMap, attr.name);
}
score += gatherRewards(rewardMap, attr.value);
}
/** /**
* We want to improve standing of notes which have been created in similar time to each other since * We want to improve standing of notes which have been created in similar time to each other since
@ -19,16 +56,16 @@ function computeScore(candidateNote, dates) {
if (utcDateCreated >= dates.minDate && utcDateCreated <= dates.maxDate if (utcDateCreated >= dates.minDate && utcDateCreated <= dates.maxDate
&& utcDateCreated < dates.minExcludedDate && utcDateCreated > dates.maxExcludedDate) { && utcDateCreated < dates.minExcludedDate && utcDateCreated > dates.maxExcludedDate) {
score += 0.3; score += 3;
} }
return score; return score;
} }
function evaluateSimilarity(sourceNote, candidateNote, rewardMap, dates, results) { function evaluateSimilarity(sourceNote, candidateNote, ancestorNoteIds, rewardMap, dates, results) {
let score = computeScore(candidateNote, rewardMap, dates); let score = computeScore(candidateNote, ancestorNoteIds, rewardMap, dates);
if (score > 0.5) { if (score >= 4) {
const notePath = noteCacheService.getSomePath(candidateNote); const notePath = noteCacheService.getSomePath(candidateNote);
// this takes care of note hoisting // this takes care of note hoisting
@ -37,7 +74,7 @@ function evaluateSimilarity(sourceNote, candidateNote, rewardMap, dates, results
} }
if (noteCacheService.isNotePathArchived(notePath)) { if (noteCacheService.isNotePathArchived(notePath)) {
score -= 0.2; // archived penalization score -= 1; // archived penalization
} }
results.push({score, notePath, noteId: candidateNote.noteId}); results.push({score, notePath, noteId: candidateNote.noteId});
@ -68,36 +105,36 @@ function buildRewardMap(note) {
const map = {}; const map = {};
for (const ancestorNote of note.ancestors) { for (const ancestorNote of note.ancestors) {
updateMap(map, ancestorNote.title, 0.4); addToRewardMap(map, ancestorNote.title, 0.4);
for (const branch of ancestorNote.parentBranches) { for (const branch of ancestorNote.parentBranches) {
updateMap(map, branch.prefix, 0.4); addToRewardMap(map, branch.prefix, 0.4);
} }
} }
updateMap(map, note.type, 0.2); addToRewardMap(map, note.type, 0.2);
updateMap(map, processMime(note.mime), 0.3); addToRewardMap(map, trimMime(note.mime), 0.3);
updateMap(map, note.title, 1); addToRewardMap(map, note.title, 1);
for (const branch of note.parentBranches) { for (const branch of note.parentBranches) {
updateMap(map, branch.prefix, 1); addToRewardMap(map, branch.prefix, 1);
} }
for (const attr of note.attributes) { for (const attr of note.attributes) {
const reward = note.noteId === attr.noteId ? 0.8 : 0.5; const reward = note.noteId === attr.noteId ? 0.8 : 0.5;
if (!IGNORED_ATTR_NAMES.includes(attr.name)) { if (!IGNORED_ATTR_NAMES.includes(attr.name)) {
updateMap(map, attr.name, reward); addToRewardMap(map, attr.name, reward);
} }
updateMap(map, attr.value, reward); addToRewardMap(map, attr.value, reward);
} }
return map; return map;
} }
function processMime(mime) { function trimMime(mime) {
if (!mime) { if (!mime) {
return; return;
} }
@ -118,21 +155,19 @@ function processMime(mime) {
return str; return str;
} }
/**
 * Registers every word of the given text in the reward map.
 *
 * The text is lower-cased and split on runs of non-word characters (`\W+`). Each non-empty
 * token has `baseReward * sqrt(token length)` added to its entry in the map, so the same
 * word seen several times accumulates reward.
 *
 * @param {Object<string, number>} map - reward map to update in place (word => reward)
 * @param {string} [text] - text whose words are rewarded; a falsy value is a no-op
 * @param {number} baseReward - reward weight applied to every word of this text
 */
function addToRewardMap(map, text, baseReward) {
    if (!text) {
        return;
    }

    for (const word of text.toLowerCase().split(/\W+/)) {
        if (!word) {
            // splitting leaves empty tokens when the text starts or ends with a separator
            continue;
        }

        // Read own entries only: map[word] would otherwise resolve inherited members
        // (e.g. "constructor" yields a function) and the += below would build a string.
        const current = Object.prototype.hasOwnProperty.call(map, word) ? map[word] : 0;

        // reward grows with the length of matched string
        map[word] = current + baseReward * Math.sqrt(word.length);
    }
}
async function findSimilarNotes(noteId) { async function findSimilarNotes(noteId) {
@ -155,24 +190,25 @@ async function findSimilarNotes(noteId) {
}; };
const rewardMap = buildRewardMap(baseNote); const rewardMap = buildRewardMap(baseNote);
const ancestorNoteIds = baseNote.ancestors.map(note => note.noteId);
for (const candidateNote of Object.values(noteCache.notes)) { for (const candidateNote of Object.values(noteCache.notes)) {
if (candidateNote.noteId === baseNote.noteId) { if (candidateNote.noteId === baseNote.noteId) {
continue; continue;
} }
evaluateSimilarity(baseNote, candidateNote, rewardMap, dates, results); evaluateSimilarity(baseNote, candidateNote, ancestorNoteIds, rewardMap, dates, results);
i++; i++;
if (i % 200 === 0) { if (i % 1000 === 0) {
await setImmediatePromise(); await setImmediatePromise();
} }
} }
results.sort((a, b) => a.score > b.score ? -1 : 1); results.sort((a, b) => a.score > b.score ? -1 : 1);
return results.length > 50 ? results.slice(0, 200) : results; return results.length > 200 ? results.slice(0, 200) : results;
} }
module.exports = { module.exports = {