similarity takes into account also headings in the base note plus additional fixes

This commit is contained in:
zadam 2020-09-22 23:10:01 +02:00
parent 15796b6870
commit 995f4f2582
3 changed files with 138 additions and 133 deletions

185
package-lock.json generated
View File

@ -1092,9 +1092,9 @@
"dev": true
},
"abab": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/abab/-/abab-2.0.3.tgz",
"integrity": "sha512-tsFzPpcttalNjFBCFMqsKYQcWxxen1pgJR56by//QwvJc4/OUS3kPOOttx2tSIfjsylB0pYu7f5D3K1RCxUnUg=="
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/abab/-/abab-2.0.5.tgz",
"integrity": "sha512-9IK9EadsbHo6jLWIpxpR6pL0sazTXV6+SQv25ZB+F7Bj9mJNaOc4nCRabwd5M/JwmUa8idz6Eci6eKfJryPs6Q=="
},
"abbrev": {
"version": "1.1.1",
@ -1112,9 +1112,9 @@
}
},
"acorn": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-7.1.1.tgz",
"integrity": "sha512-add7dgA5ppRPxCFJoAGfMDi7PIBXq1RtGo7BhbLaxwrXPOmw8gq48Y9ozT01hUKy9byMjlR20EJhu5zlkErEkg=="
"version": "7.4.0",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.0.tgz",
"integrity": "sha512-+G7P8jJmCHr+S+cLfQxygbWhXy+8YTVGzAkpEbcLo2mLoL7tij/VG41QSHACSf5QgYRhMZYHuNc6drJaO0Da+w=="
},
"acorn-globals": {
"version": "6.0.0",
@ -1126,9 +1126,9 @@
}
},
"acorn-walk": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.1.1.tgz",
"integrity": "sha512-wdlPY2tm/9XBr7QkKlq0WQVgiuGTX6YWPyRyBviSoScBuLfTVQhvwg6wJ369GJ/1nPfTLMfnrFIfjqVg6d+jQQ=="
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.2.0.tgz",
"integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA=="
},
"agent-base": {
"version": "6.0.0",
@ -2781,9 +2781,9 @@
"integrity": "sha512-p3pvU7r1MyyqbTk+WbNJIgJjG2VmTIaB10rI93LzVPrmDJKkzKYMtxxyAvQXR/NS6otuzveI7+7BBq3SjBS2mw=="
},
"cssstyle": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.2.0.tgz",
"integrity": "sha512-sEb3XFPx3jNnCAMtqrXPDeSgQr+jojtCeNf8cvMNMh1cG970+lljssvQDzPq6lmmJu2Vhqood/gtEomBiHOGnA==",
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.3.0.tgz",
"integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==",
"requires": {
"cssom": "~0.3.6"
},
@ -3089,6 +3089,13 @@
"integrity": "sha512-yxJ2mFy/sibVQlu5qHjOkf9J3K6zgmCxgJ94u2EdvDOV09H+32LtRswEcUsmUWN72pVLOEnTSRaIVVzVQgS0dg==",
"requires": {
"webidl-conversions": "^5.0.0"
},
"dependencies": {
"webidl-conversions": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz",
"integrity": "sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA=="
}
}
},
"domhandler": {
@ -3838,9 +3845,9 @@
"integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ="
},
"escodegen": {
"version": "1.14.1",
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.14.1.tgz",
"integrity": "sha512-Bmt7NcRySdIfNPfU2ZoXDrrXsG9ZjvDxcAlMfDUgRBjLOWTuIACXPBFJH7Z+cLb40JeQco5toikyc9t9P8E9SQ==",
"version": "1.14.3",
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.14.3.tgz",
"integrity": "sha512-qFcX0XJkdg+PB3xjZZG/wKSuT1PnQWx57+TVSjIMmILd2yC/6ByYElPwJnslDsuWuSAp4AwJGumarAAmJch5Kw==",
"requires": {
"esprima": "^4.0.1",
"estraverse": "^4.2.0",
@ -5004,6 +5011,11 @@
"integrity": "sha512-e0/LknJ8wpMMhTiWcjivB+ESwIuvHnBSlBbmP/pSb8CQJldoj1p2qv7xGZ/+BtbTziYRFSz8OsvdbiX45LtYQA==",
"dev": true
},
"ip-regex": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz",
"integrity": "sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk="
},
"ipaddr.js": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.0.tgz",
@ -5403,9 +5415,9 @@
}
},
"jsdom": {
"version": "16.2.1",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.2.1.tgz",
"integrity": "sha512-3p0gHs5EfT7PxW9v8Phz3mrq//4Dy8MQenU/PoKxhdT+c45S7NjIjKbGT3Ph0nkICweE1r36+yaknXA5WfVNAg==",
"version": "16.4.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.4.0.tgz",
"integrity": "sha512-lYMm3wYdgPhrl7pDcRmvzPhhrGVBeVhPIqeHjzeiHN3DFmD1RBpbExbi8vU7BJdH8VAZYovR8DMt0PNNDM7k8w==",
"requires": {
"abab": "^2.0.3",
"acorn": "^7.1.1",
@ -5427,84 +5439,18 @@
"tough-cookie": "^3.0.1",
"w3c-hr-time": "^1.0.2",
"w3c-xmlserializer": "^2.0.0",
"webidl-conversions": "^5.0.0",
"webidl-conversions": "^6.1.0",
"whatwg-encoding": "^1.0.5",
"whatwg-mimetype": "^2.3.0",
"whatwg-url": "^8.0.0",
"ws": "^7.2.1",
"ws": "^7.2.3",
"xml-name-validator": "^3.0.0"
},
"dependencies": {
"ip-regex": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz",
"integrity": "sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk="
},
"parse5": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-5.1.1.tgz",
"integrity": "sha512-ugq4DFI0Ptb+WWjAdOK16+u/nHfiIrcE+sh8kZMaM0WllQKLI9rOUq6c2b7cwPkXdzfQESqvoqK6ug7U/Yyzug=="
},
"qs": {
"version": "6.5.2",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
"integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
},
"request": {
"version": "2.88.2",
"resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
"integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==",
"requires": {
"aws-sign2": "~0.7.0",
"aws4": "^1.8.0",
"caseless": "~0.12.0",
"combined-stream": "~1.0.6",
"extend": "~3.0.2",
"forever-agent": "~0.6.1",
"form-data": "~2.3.2",
"har-validator": "~5.1.3",
"http-signature": "~1.2.0",
"is-typedarray": "~1.0.0",
"isstream": "~0.1.2",
"json-stringify-safe": "~5.0.1",
"mime-types": "~2.1.19",
"oauth-sign": "~0.9.0",
"performance-now": "^2.1.0",
"qs": "~6.5.2",
"safe-buffer": "^5.1.2",
"tough-cookie": "~2.5.0",
"tunnel-agent": "^0.6.0",
"uuid": "^3.3.2"
},
"dependencies": {
"tough-cookie": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
"integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
"requires": {
"psl": "^1.1.28",
"punycode": "^2.1.1"
}
}
}
},
"tough-cookie": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-3.0.1.tgz",
"integrity": "sha512-yQyJ0u4pZsv9D4clxO69OEjLWYw+jbgspjTue4lTQZLfV0c5l1VmK2y1JK8E9ahdpltPOaAThPcp5nKPUgSnsg==",
"requires": {
"ip-regex": "^2.1.0",
"psl": "^1.1.28",
"punycode": "^2.1.1"
}
},
"tunnel-agent": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
"integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=",
"requires": {
"safe-buffer": "^5.0.1"
}
}
}
},
@ -7322,21 +7268,32 @@
}
},
"request-promise-core": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.3.tgz",
"integrity": "sha512-QIs2+ArIGQVp5ZYbWD5ZLCY29D5CfWizP8eWnm8FoGD1TX61veauETVQbrV60662V0oFBkrDOuaBI8XgtuyYAQ==",
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.4.tgz",
"integrity": "sha512-TTbAfBBRdWD7aNNOoVOBH4pN/KigV6LyapYNNlAPA8JwbovRti1E88m3sYAwsLi5ryhPKsE9APwnjFTgdUjTpw==",
"requires": {
"lodash": "^4.17.15"
"lodash": "^4.17.19"
}
},
"request-promise-native": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/request-promise-native/-/request-promise-native-1.0.8.tgz",
"integrity": "sha512-dapwLGqkHtwL5AEbfenuzjTYg35Jd6KPytsC2/TLkVMz8rm+tNt72MGUWT1RP/aYawMpN6HqbNGBQaRcBtjQMQ==",
"version": "1.0.9",
"resolved": "https://registry.npmjs.org/request-promise-native/-/request-promise-native-1.0.9.tgz",
"integrity": "sha512-wcW+sIUiWnKgNY0dqCpOZkUbF/I+YPi+f09JZIDa39Ec+q82CpSYniDp+ISgTTbKmnpJWASeJBPZmoxH84wt3g==",
"requires": {
"request-promise-core": "1.1.3",
"request-promise-core": "1.1.4",
"stealthy-require": "^1.1.1",
"tough-cookie": "^2.3.3"
},
"dependencies": {
"tough-cookie": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
"integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
"requires": {
"psl": "^1.1.28",
"punycode": "^2.1.1"
}
}
}
},
"require-directory": {
@ -7565,9 +7522,9 @@
"integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw=="
},
"saxes": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.0.tgz",
"integrity": "sha512-LXTZygxhf8lfwKaTP/8N9CsVdjTlea3teze4lL6u37ivbgGbV0GGMuNtS/I9rnD/HC2/txUM7Df4S2LVl1qhiA==",
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz",
"integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==",
"requires": {
"xmlchars": "^2.2.0"
}
@ -8625,19 +8582,13 @@
}
},
"tough-cookie": {
"version": "2.4.3",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.4.3.tgz",
"integrity": "sha512-Q5srk/4vDM54WJsJio3XNn6K2sCG+CQ8G5Wz6bZhRZoAe/+TxjWB/GlFAnYEbkYVlON9FMk/fE3h2RLpPXo4lQ==",
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-3.0.1.tgz",
"integrity": "sha512-yQyJ0u4pZsv9D4clxO69OEjLWYw+jbgspjTue4lTQZLfV0c5l1VmK2y1JK8E9ahdpltPOaAThPcp5nKPUgSnsg==",
"requires": {
"psl": "^1.1.24",
"punycode": "^1.4.1"
},
"dependencies": {
"punycode": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz",
"integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4="
}
"ip-regex": "^2.1.0",
"psl": "^1.1.28",
"punycode": "^2.1.1"
}
},
"tr46": {
@ -9066,9 +9017,9 @@
}
},
"webidl-conversions": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz",
"integrity": "sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA=="
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-6.1.0.tgz",
"integrity": "sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w=="
},
"webpack": {
"version": "5.0.0-beta.32",
@ -9375,13 +9326,13 @@
"integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g=="
},
"whatwg-url": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.0.0.tgz",
"integrity": "sha512-41ou2Dugpij8/LPO5Pq64K5q++MnRCBpEHvQr26/mArEKTkCV5aoXIqyhuYtE0pkqScXwhf2JP57rkRTYM29lQ==",
"version": "8.2.2",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.2.2.tgz",
"integrity": "sha512-PcVnO6NiewhkmzV0qn7A+UZ9Xx4maNTI+O+TShmfE4pqjoCMwUMjkvoNhNHPTvgR7QH9Xt3R13iHuWy2sToFxQ==",
"requires": {
"lodash.sortby": "^4.7.0",
"tr46": "^2.0.0",
"webidl-conversions": "^5.0.0"
"tr46": "^2.0.2",
"webidl-conversions": "^6.1.0"
}
},
"which": {

View File

@ -51,6 +51,7 @@
"ini": "1.3.5",
"is-svg": "4.2.1",
"jimp": "0.16.1",
"jsdom": "^16.4.0",
"mime-types": "2.1.27",
"multer": "1.4.2",
"node-abi": "2.19.1",

View File

@ -1,8 +1,10 @@
const noteCache = require('./note_cache');
const noteCacheService = require('./note_cache_service.js');
const dateUtils = require('../date_utils');
const repository = require('../repository');
const { JSDOM } = require("jsdom");
const DEBUG = true;
const DEBUG = false;
const IGNORED_ATTRS = [
"datenote",
@ -116,6 +118,28 @@ function buildRewardMap(note) {
addToRewardMap(value, reward);
}
if (note.type === 'text' && note.isDecrypted) {
const noteEntity = repository.getNote(note.noteId);
const content = noteEntity.getContent();
const dom = new JSDOM(content);
function addHeadingsToRewardMap(elName, rewardFactor) {
for (const el of dom.window.document.querySelectorAll(elName)) {
addToRewardMap(el.textContent, rewardFactor);
}
}
// title is the top with weight 1 so smaller headings will have lower weight
// technically H1 is not supported but for the case it's present let's weigh it just as H2
addHeadingsToRewardMap("h1", 0.9);
addHeadingsToRewardMap("h2", 0.9);
addHeadingsToRewardMap("h3", 0.8);
addHeadingsToRewardMap("h4", 0.7);
addHeadingsToRewardMap("h5", 0.6);
addHeadingsToRewardMap("h6", 0.5);
}
return map;
}
@ -148,13 +172,13 @@ function trimMime(mime) {
}
function buildDateLimits(baseNote) {
const dateCreatedTs = dateUtils.parseDateTime(baseNote.utcDateCreated);
const dateCreatedTs = dateUtils.parseDateTime(baseNote.utcDateCreated).getTime();
return {
minDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 3600)),
minExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 5)),
maxExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 5)),
maxDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 3600)),
minDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 3600 * 1000)),
minExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs - 5 * 1000)),
maxExcludedDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 5 * 1000)),
maxDate: dateUtils.utcDateStr(new Date(dateCreatedTs + 3600 * 1000)),
};
}
@ -211,8 +235,10 @@ async function findSimilarNotes(noteId) {
const dateLimits = buildDateLimits(baseNote);
const rewardMap = buildRewardMap(baseNote);
const ancestorRewardCache = {};
let ancestorRewardCache = {};
const ancestorNoteIds = new Set(baseNote.ancestors.map(note => note.noteId));
ancestorNoteIds.add(baseNote.noteId);
let displayRewards = false;
function gatherRewards(text, factor = 1) {
@ -227,10 +253,12 @@ async function findSimilarNotes(noteId) {
const lengthPenalization = 1 / Math.pow(text.length, 0.3);
for (const word of splitToWords(text)) {
const reward = rewardMap[word] * factor * lengthPenalization || 0;
const reward = (rewardMap[word] * factor * lengthPenalization) || 0;
if (displayRewards && reward > 0) {
console.log(`Reward ${Math.round(reward * 10) / 10} for word: ${word}`);
console.log(`Before: ${counter}, add ${reward}, res: ${counter + reward}`);
console.log(`${rewardMap[word]} * ${factor} * ${lengthPenalization}`);
}
counter += reward;
@ -240,11 +268,20 @@ async function findSimilarNotes(noteId) {
}
function gatherAncestorRewards(note) {
if (ancestorNoteIds.has(note.noteId)) {
return 0;
}
if (!(note.noteId in ancestorRewardCache)) {
let score = 0;
for (const parentNote of note.parents) {
if (!ancestorNoteIds.has(parentNote.noteId)) {
if (displayRewards) {
console.log("Considering", parentNote.title);
}
if (parentNote.isDecrypted) {
score += gatherRewards(parentNote.title, 0.3);
}
@ -303,6 +340,10 @@ async function findSimilarNotes(noteId) {
}
if (candidateNote.type === baseNote.type) {
if (displayRewards) {
console.log("Adding reward for same note type");
}
score += 0.2;
}
@ -315,11 +356,20 @@ async function findSimilarNotes(noteId) {
*/
const {utcDateCreated} = candidateNote;
if (utcDateCreated < dateLimits.minExcludedDate && utcDateCreated > dateLimits.maxExcludedDate) {
if (utcDateCreated < dateLimits.minExcludedDate || utcDateCreated > dateLimits.maxExcludedDate) {
if (utcDateCreated >= dateLimits.minDate && utcDateCreated <= dateLimits.maxDate) {
if (displayRewards) {
console.log("Adding reward for very similar date of creation");
}
score += 1;
}
else if (utcDateCreated.substr(0, 10) === dateLimits.minDate.substr(0, 10) || utcDateCreated.substr(0, 10) === dateLimits.maxDate.substr(0, 10)) {
else if (utcDateCreated.substr(0, 10) === dateLimits.minDate.substr(0, 10)
|| utcDateCreated.substr(0, 10) === dateLimits.maxDate.substr(0, 10)) {
if (displayRewards) {
console.log("Adding reward for same day of creation");
}
// smaller bonus when outside of the window but within same date
score += 0.5;
}
@ -337,7 +387,7 @@ async function findSimilarNotes(noteId) {
let score = computeScore(candidateNote);
if (score >= 2) {
if (score >= 1.5) {
const notePath = noteCacheService.getSomePath(candidateNote);
// this takes care of note hoisting
@ -365,14 +415,17 @@ async function findSimilarNotes(noteId) {
console.log("REWARD MAP", rewardMap);
if (results.length >= 1) {
const note = noteCache.notes[results[0].noteId];
for (const {noteId} of results) {
const note = noteCache.notes[noteId];
console.log("WINNER", note.pojo);
console.log("NOTE", note.pojo);
displayRewards = true;
const totalReward = computeScore(note);
displayRewards = true;
ancestorRewardCache = {}; // reset cache
const totalReward = computeScore(note);
console.log("Total reward:", Math.round(totalReward * 10) / 10);
console.log("Total reward:", Math.round(totalReward * 10) / 10);
}
}
}