From eeacd8118f706c37b5d6325ffa2eeabe0fec7665 Mon Sep 17 00:00:00 2001 From: zadam Date: Tue, 15 Sep 2020 16:46:03 +0200 Subject: [PATCH] similar notes changes --- package-lock.json | 12 +- package.json | 5 +- src/public/app/widgets/similar_notes.js | 2 +- src/services/note_cache/note_cache_service.js | 85 +-------- src/services/note_cache/similarity.js | 180 ++++++++++++++++++ 5 files changed, 190 insertions(+), 94 deletions(-) create mode 100644 src/services/note_cache/similarity.js diff --git a/package-lock.json b/package-lock.json index be9127bb6..9a7f96b4c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3143,9 +3143,9 @@ } }, "electron": { - "version": "9.3.0", - "resolved": "https://registry.npmjs.org/electron/-/electron-9.3.0.tgz", - "integrity": "sha512-7zPLEZ+kOjVJqfawMQ0vVuZZRqvZIeiID3tbjjbVybbxXIlFMpZ2jogoh7PV3rLrtm+dKRfu7Qc4E7ob1d0FqQ==", + "version": "9.3.1", + "resolved": "https://registry.npmjs.org/electron/-/electron-9.3.1.tgz", + "integrity": "sha512-DScrhqBT4a54KfdF0EoipALpHmdQTn3m7SSCtbpTcEcG+UDUiXad2cOfW6DHeVH7N+CVDKDG12q2PhVJjXkFAA==", "dev": true, "requires": { "@electron/get": "^1.0.1", @@ -4184,9 +4184,9 @@ } }, "file-type": { - "version": "15.0.0", - "resolved": "https://registry.npmjs.org/file-type/-/file-type-15.0.0.tgz", - "integrity": "sha512-l0JCuF5F7NIybCfa9G2H0lKhhGaf0z+HJyLOmB2feknY7/HBVNyD4PLesGKLGqznwyVXGNnfpIOr+Fvca6bOEg==", + "version": "15.0.1", + "resolved": "https://registry.npmjs.org/file-type/-/file-type-15.0.1.tgz", + "integrity": "sha512-0LieQlSA3bWUdErNrxzxfI4rhsvNAVPBO06R8pTc1hp9SE6nhqlVyvhcaXoMmtXkBTPnQenbMPLW9X76hH76oQ==", "requires": { "readable-web-to-node-stream": "^2.0.0", "strtok3": "^6.0.3", diff --git a/package.json b/package.json index 378dbc390..1af49a519 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "electron-window-state": "5.0.3", "express": "4.17.1", "express-session": "1.17.1", - "file-type": "15.0.0", + "file-type": "15.0.1", "fs-extra": "9.0.1", "helmet": "4.1.1", "html": "1.0.0", @@ -66,7 +66,6 @@ "semver": "7.3.2", "serve-favicon": "2.5.0", "session-file-store": "1.4.0", - "string-similarity": "4.0.2", "striptags": "3.1.1", "turndown": "6.0.0", "turndown-plugin-gfm": "1.0.2", @@ -77,7 +76,7 @@ }, "devDependencies": { "cross-env": "7.0.2", - "electron": "9.3.0", + "electron": "9.3.1", "electron-builder": "22.8.0", "electron-packager": "15.1.0", "electron-rebuild": "2.0.3", diff --git a/src/public/app/widgets/similar_notes.js b/src/public/app/widgets/similar_notes.js index 5d297996b..1cb426efb 100644 --- a/src/public/app/widgets/similar_notes.js +++ b/src/public/app/widgets/similar_notes.js @@ -141,7 +141,7 @@ export default class SimilarNotesWidget extends TabAwareWidget { } const $item = (await linkService.createNoteLink(similarNote.notePath.join("/"))) - .css("font-size", 24 * similarNote.coeff); + .css("font-size", 24 * similarNote.score); $list.append($item); } diff --git a/src/services/note_cache/note_cache_service.js b/src/services/note_cache/note_cache_service.js index 6763f8048..98c6b3f7c 100644 --- a/src/services/note_cache/note_cache_service.js +++ b/src/services/note_cache/note_cache_service.js @@ -3,9 +3,7 @@ const noteCache = require('./note_cache'); const hoistedNoteService = require('../hoisted_note'); const protectedSessionService = require('../protected_session'); -const stringSimilarity = require('string-similarity'); const log = require('../log'); -const dateUtils = require('../date_utils'); function isNotePathArchived(notePath) { const noteId = notePath[notePath.length - 1]; @@ -175,87 +173,6 @@ function getNotePath(noteId) { } } -function evaluateSimilarity(sourceNote, candidateNote, dates, results) { - let coeff = stringSimilarity.compareTwoStrings(sourceNote.flatText, candidateNote.flatText); - const {utcDateCreated} = candidateNote; - - /** - * We want to improve standing of notes which have been created in similar time to each other since - * there's a good chance they are related. - * - * But there's an exception - if they were created really close to each other (withing few seconds) then - * they are probably part of the import and not created by hand - these OTOH should not benefit. - */ - if (utcDateCreated >= dates.minDate && utcDateCreated <= dates.maxDate - && utcDateCreated < dates.minExcludedDate && utcDateCreated > dates.maxExcludedDate) { - - coeff += 0.3; - } - - if (coeff > 0.5) { - const notePath = getSomePath(candidateNote); - - // this takes care of note hoisting - if (!notePath) { - return; - } - - if (isNotePathArchived(notePath)) { - coeff -= 0.2; // archived penalization - } - - results.push({coeff, notePath, noteId: candidateNote.noteId}); - } -} - -/** - * Point of this is to break up long running sync process to avoid blocking - * see https://snyk.io/blog/nodejs-how-even-quick-async-functions-can-block-the-event-loop-starve-io/ - */ -function setImmediatePromise() { - return new Promise((resolve) => { - setTimeout(() => resolve(), 0); - }); -} - -async function findSimilarNotes(noteId) { - const results = []; - let i = 0; - - const origNote = noteCache.notes[noteId]; - - if (!origNote) { - return []; - } - - const dateCreatedTs = dateUtils.parseDateTime(origNote.utcDateCreated); - - const dates = { - minDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 1800)), - minExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 5)), - maxExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 5)), - maxDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 1800)), - }; - - for (const note of Object.values(noteCache.notes)) { - if (note.noteId === origNote.noteId) { - continue; - } - - evaluateSimilarity(origNote, note, dates, results); - - i++; - - if (i % 200 === 0) { - await setImmediatePromise(); - } - } - - results.sort((a, b) => a.coeff > b.coeff ? -1 : 1); - - return results.length > 50 ? results.slice(0, 200) : results; -} - /** * @param noteId * @returns {boolean} - true if note exists (is not deleted) and is available in current note hoisting @@ -274,5 +191,5 @@ module.exports = { isAvailable, isArchived, isInAncestor, - findSimilarNotes + isNotePathArchived }; diff --git a/src/services/note_cache/similarity.js b/src/services/note_cache/similarity.js new file mode 100644 index 000000000..9d34d81b8 --- /dev/null +++ b/src/services/note_cache/similarity.js @@ -0,0 +1,180 @@ +const noteCache = require('./note_cache'); +const noteCacheService = require('./note_cache_service.js'); +const dateUtils = require('../date_utils'); + +function computeScore(candidateNote, dates) { + let score = 0; + + + + /** + * We want to improve standing of notes which have been created in similar time to each other since + * there's a good chance they are related. + * + * But there's an exception - if they were created really close to each other (withing few seconds) then + * they are probably part of the import and not created by hand - these OTOH should not benefit. + */ + const {utcDateCreated} = candidateNote; + + if (utcDateCreated >= dates.minDate && utcDateCreated <= dates.maxDate + && utcDateCreated < dates.minExcludedDate && utcDateCreated > dates.maxExcludedDate) { + + score += 0.3; + } + + return score; +} + +function evaluateSimilarity(sourceNote, candidateNote, rewardMap, dates, results) { + let score = computeScore(candidateNote, rewardMap, dates); + + if (score > 0.5) { + const notePath = noteCacheService.getSomePath(candidateNote); + + // this takes care of note hoisting + if (!notePath) { + return; + } + + if (noteCacheService.isNotePathArchived(notePath)) { + score -= 0.2; // archived penalization + } + + results.push({score, notePath, noteId: candidateNote.noteId}); + } +} + +/** + * Point of this is to break up long running sync process to avoid blocking + * see https://snyk.io/blog/nodejs-how-even-quick-async-functions-can-block-the-event-loop-starve-io/ + */ +function setImmediatePromise() { + return new Promise((resolve) => { + setTimeout(() => resolve(), 0); + }); +} + +const IGNORED_ATTR_NAMES = [ + "includenotelink", + "internallink", + "imagelink", + "relationmaplink" +]; + +/** + * @param {Note} note + */ +function buildRewardMap(note) { + const map = {}; + + for (const ancestorNote of note.ancestors) { + updateMap(map, ancestorNote.title, 0.4); + + for (const branch of ancestorNote.parentBranches) { + updateMap(map, branch.prefix, 0.4); + } + } + + updateMap(map, note.type, 0.2); + updateMap(map, processMime(note.mime), 0.3); + + updateMap(map, note.title, 1); + + for (const branch of note.parentBranches) { + updateMap(map, branch.prefix, 1); + } + + for (const attr of note.attributes) { + const reward = note.noteId === attr.noteId ? 0.8 : 0.5; + + if (!IGNORED_ATTR_NAMES.includes(attr.name)) { + updateMap(map, attr.name, reward); + } + + updateMap(map, attr.value, reward); + } + + return map; +} + +function processMime(mime) { + if (!mime) { + return; + } + + const chunks = mime.split('/'); + + if (chunks.length < 2) { + return; + } + + // we're not interested in 'text/' or 'application/' prefix + let str = chunks[1]; + + if (str.startsWith('-x')) { + str = str.substr(2); + } + + return str; +} + +function updateMap(map, text, baseReward) { + if (!text) { + return; + } + + for (const word of text.split(/\W+/)) { + map[word] = map[word] || 0; + + // reward grows with the length of matched string + map[word] += baseReward * Math.sqrt(word.length); + } +} + +function tokenize(str) { + return ; +} + +async function findSimilarNotes(noteId) { + const results = []; + let i = 0; + + const baseNote = noteCache.notes[noteId]; + + if (!baseNote) { + return []; + } + + const dateCreatedTs = dateUtils.parseDateTime(baseNote.utcDateCreated); + + const dates = { + minDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 1800)), + minExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 5)), + maxExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 5)), + maxDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 1800)), + }; + + const rewardMap = buildRewardMap(baseNote); + + for (const candidateNote of Object.values(noteCache.notes)) { + if (candidateNote.noteId === baseNote.noteId) { + continue; + } + + evaluateSimilarity(baseNote, candidateNote, rewardMap, dates, results); + + i++; + + if (i % 200 === 0) { + await setImmediatePromise(); + } + } + + results.sort((a, b) => a.score > b.score ? -1 : 1); + + return results.length > 50 ? results.slice(0, 200) : results; +} + +module.exports = { + findSimilarNotes +}; \ No newline at end of file