trilium/src/services/import/zip.js

"use strict";

const BAttribute = require('../../becca/entities/battribute');
const utils = require('../../services/utils');
const log = require('../../services/log');
const noteService = require('../../services/notes');
const attributeService = require('../../services/attributes');
const BBranch = require('../../becca/entities/bbranch');
const path = require('path');
const commonmark = require('commonmark');
const protectedSessionService = require('../protected_session');
const mimeService = require("./mime");
const treeService = require("../tree");
const yauzl = require("yauzl");
const htmlSanitizer = require('../html_sanitizer');
const becca = require("../../becca/becca");

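/**
 * Imports the content of a ZIP archive as notes placed under the given note.
 *
 * @param {TaskContext} taskContext - import task; its `data` holds the import options and its progress counter
 *                                    is increased for every processed ZIP entry and every created note
 * @param {Buffer} fileBuffer - content of the ZIP file
 * @param {BNote} importRootNote - note under which the imported notes are created
 * @return {Promise<BNote|null>} the first note created by this import, or null when no note was created
 */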
async function importZip(taskContext, fileBuffer, importRootNote) {
    // maps from original noteId (in ZIP file) to newly generated noteId
    const noteIdMap = {};
    const attributes = [];
    // path => noteId, used only when meta file is not available
    const createdPaths = { '/': importRootNote.noteId, '\\': importRootNote.noteId };
    const mdReader = new commonmark.Parser();
    const mdWriter = new commonmark.HtmlRenderer();
    let metaFile = null;
    let firstNote = null;
    const createdNoteIds = {};

    /**
     * Maps a noteId found in the ZIP file to the noteId used for the imported note.
     * Generated ids are remembered in noteIdMap, so the same original id always maps to the same new id.
     *
     * @param {string} origNoteId - noteId as written in the ZIP file
     * @returns {string} empty string for a blank input, the input itself for 'root' and "_"-prefixed ids,
     *                   otherwise the id stored in noteIdMap (generated on first use)
     */
    function getNewNoteId(origNoteId) {
        // a blank original noteId is not expected, but it is answered with an empty string as a precaution
        if (origNoteId.trim() === "") {
            return "";
        }

        // "named" noteIds are the same in every Trilium instance, so they are kept untouched
        const isNamedNoteId = origNoteId === 'root' || origNoteId.startsWith("_");

        if (isNamedNoteId) {
            return origNoteId;
        }

        const mappedNoteId = noteIdMap[origNoteId] || utils.newEntityId();
        noteIdMap[origNoteId] = mappedNoteId;

        return mappedNoteId;
    }

    /**
     * Finds the meta records belonging to a path inside the ZIP by walking metaFile.files one path segment
     * at a time. A segment matches a record through its dataFileName or its dirFileName.
     *
     * @param {string} filePath - path inside the ZIP, segments separated by "/" or "\"
     * @returns {{parentNoteMeta: object, noteMeta: (object|undefined)}|{}} an empty object when there is no
     *          meta file or when the walk reaches a record without children before the path is consumed;
     *          otherwise the record of the last segment (undefined if it did not match) and of its parent.
     *          For top-level paths the parent is a synthetic record flagged with isImportRoot.
     */
    function getMeta(filePath) {
        if (!metaFile) {
            return {};
        }

        let parentNoteMeta;
        // synthetic starting record standing for the import root
        let noteMeta = {
            isImportRoot: true,
            children: metaFile.files
        };

        for (const segment of filePath.split(/[\/\\]/g)) {
            const candidates = noteMeta?.children;

            if (!candidates || candidates.length === 0) {
                return {};
            }

            parentNoteMeta = noteMeta;
            noteMeta = candidates.find(file => file.dataFileName === segment || file.dirFileName === segment);
        }

        return { parentNoteMeta, noteMeta };
    }

    /**
     * Determines the noteId of the note under which the note for filePath belongs.
     *
     * @param {string} filePath - path of the entry inside the ZIP
     * @param {object} [parentNoteMeta] - meta record of the parent; when missing, the parent is derived
     *                                    from the directory part of filePath
     * @returns {string} noteId of the parent note (whatever saveDirectory() returns in the last case)
     */
    function getParentNoteId(filePath, parentNoteMeta) {
        if (parentNoteMeta) {
            return parentNoteMeta.isImportRoot
                ? importRootNote.noteId
                : getNewNoteId(parentNoteMeta.noteId);
        }

        const parentPath = path.dirname(filePath);

        // entries without a directory part hang directly under the import root
        if (parentPath === '.') {
            return importRootNote.noteId;
        }

        if (parentPath in createdPaths) {
            return createdPaths[parentPath];
        }

        // ZIP records can be out of order - a file may precede its own directory in the stream,
        // in which case the directory note is created right now
        return saveDirectory(parentPath);
    }

    /**
     * Returns the noteId to use for the note stored at filePath.
     *
     * @param {object} [noteMeta] - meta record of the note; when present, its noteId is translated
     *                              through getNewNoteId()
     * @param {string} filePath - path inside the ZIP; used only when there is no meta record
     * @returns {string} noteId; without meta, one id is kept in createdPaths per path stripped by
     *                   utils.removeTextFileExtension()
     */
    function getNoteId(noteMeta, filePath) {
        if (noteMeta) {
            return getNewNoteId(noteMeta.noteId);
        }

        const pathKey = utils.removeTextFileExtension(filePath);

        if (!(pathKey in createdPaths)) {
            createdPaths[pathKey] = utils.newEntityId();
        }

        return createdPaths[pathKey];
    }

    /**
     * Derives MIME and note type for a file from its path via mimeService.
     *
     * @param {TaskContext} taskContext - its `data` is handed to mimeService.getType()
     * @param {string} filePath - path of the file inside the ZIP
     * @returns {{mime: string, type: string}} "application/octet-stream" is used when no MIME is detected
     */
    function detectFileTypeAndMime(taskContext, filePath) {
        const detectedMime = mimeService.getMime(filePath);
        const mime = detectedMime ? detectedMime : "application/octet-stream";

        return {
            mime,
            type: mimeService.getType(taskContext.data, mime)
        };
    }

    /**
     * Queues the attributes listed in a note's meta record into the `attributes` array.
     * Nothing is written here; the queued records are the same objects as in the meta record,
     * adjusted in place.
     *
     * @param {BNote} note - note the attributes belong to (only its noteId is used)
     * @param {object} [noteMeta] - meta record of the note; nothing happens when it is missing
     */
    function saveAttributes(note, noteMeta) {
        if (!noteMeta) {
            return;
        }

        // a meta record without an attribute list is handled like an empty list
        for (const attr of noteMeta.attributes || []) {
            attr.noteId = note.noteId;

            // definition types are turned into labels whose name carries the "label:" / "relation:" prefix
            if (attr.type === 'label-definition') {
                attr.type = 'label';
                attr.name = `label:${attr.name}`;
            }
            else if (attr.type === 'relation-definition') {
                attr.type = 'label';
                attr.name = `relation:${attr.name}`;
            }

            if (!attributeService.isAttributeType(attr.type)) {
                log.error(`Unrecognized attribute type ${attr.type}`);
                continue;
            }

            if (attr.type === 'relation' && ['internalLink', 'imageLink', 'relationMapLink', 'includeNoteLink'].includes(attr.name)) {
                // these relations are created automatically and as such don't need to be duplicated in the import
                continue;
            }

            // relation values are noteIds from the ZIP, so they have to point to the translated ids
            if (attr.type === 'relation') {
                attr.value = getNewNoteId(attr.value);
            }

            if (taskContext.data.safeImport && attributeService.isAttributeDangerous(attr.type, attr.name)) {
                attr.name = `disabled:${attr.name}`;
            }

            if (taskContext.data.safeImport) {
                attr.name = htmlSanitizer.sanitize(attr.name);
                attr.value = htmlSanitizer.sanitize(attr.value);
            }

            attributes.push(attr);
        }
    }

    /**
     * Makes sure a note exists for a directory entry of the ZIP, creating an empty note when needed.
     *
     * @param {string} filePath - path of the directory inside the ZIP
     * @returns {string} noteId of the directory note, whether it was created now or already existed
     */
    function saveDirectory(filePath) {
        const { parentNoteMeta, noteMeta } = getMeta(filePath);

        const noteId = getNoteId(noteMeta, filePath);
        const noteTitle = utils.getNoteTitle(filePath, taskContext.data.replaceUnderscoresWithSpaces, noteMeta);
        const parentNoteId = getParentNoteId(filePath, parentNoteMeta);

        if (becca.getNote(noteId)) {
            // nothing to create, but the id is still returned because getParentNoteId()
            // uses the return value as the parent noteId
            return noteId;
        }

        const { note } = noteService.createNewNote({
            parentNoteId: parentNoteId,
            title: noteTitle,
            content: '',
            noteId: noteId,
            type: resolveNoteType(noteMeta?.type),
            mime: noteMeta ? noteMeta.mime : 'text/html',
            prefix: noteMeta ? noteMeta.prefix : '',
            isExpanded: noteMeta ? noteMeta.isExpanded : false,
            // the position from the meta record is not applied to the first created note
            notePosition: (noteMeta && firstNote) ? noteMeta.notePosition : undefined,
            isProtected: importRootNote.isProtected && protectedSessionService.isProtectedSessionAvailable(),
        });

        createdNoteIds[note.noteId] = true;

        saveAttributes(note, noteMeta);

        if (!firstNote) {
            firstNote = note;
        }

        return noteId;
    }

    /**
     * Resolves a relative URL used inside a note's content to the noteId of the note it points to.
     *
     * @param {string} url - relative URL; leading "./" and "../" segments are interpreted
     * @param {string} filePath - path (inside the ZIP) of the note containing the URL
     * @returns {string|null} noteId of the target, or null when the meta file has no record for the
     *                        resolved path
     */
    function getNoteIdFromRelativeUrl(url, filePath) {
        let relativeUrl = url;

        while (relativeUrl.startsWith("./")) {
            relativeUrl = relativeUrl.slice(2);
        }

        // every leading "../" moves one directory up from the directory of the referencing file
        let baseDir = path.dirname(filePath);

        while (relativeUrl.startsWith("../")) {
            baseDir = path.dirname(baseDir);
            relativeUrl = relativeUrl.slice(3);
        }

        if (baseDir === '.') {
            baseDir = '';
        }

        const absUrl = baseDir.length > 0 ? `${baseDir}/${relativeUrl}` : relativeUrl;

        const {noteMeta} = getMeta(absUrl);

        if (!noteMeta) {
            log.info(`Could not find note meta for URL '${absUrl}'.`);

            return null;
        }

        return getNoteId(noteMeta, absUrl);
    }

    /**
     * Adapts the HTML of an imported text note: drops or demotes H1 headings, sanitizes the HTML,
     * strips the html/body wrapper and rewrites image sources, links and note references so that
     * they point to the imported notes.
     *
     * @param {string} content - HTML content of the note
     * @param {string} noteTitle - title of the note, compared with H1 headings
     * @param {string} filePath - path of the note inside the ZIP, base for relative URLs
     * @param {object} [noteMeta] - meta record of the note (its includeNoteLink relations are used)
     * @returns {string} processed HTML
     */
    function processTextNoteContent(content, noteTitle, filePath, noteMeta) {
        // true for URLs starting with "//" or with "<scheme>://"
        function isUrlAbsolute(url) {
            return /^(?:[a-z]+:)?\/\//i.test(url);
        }

        // an H1 repeating the note title is dropped, any other H1 is demoted to H2
        content = content.replace(/<h1>([^<]*)<\/h1>/gi, (match, text) => {
            if (noteTitle.trim() === text.trim()) {
                return ""; // remove whole H1 tag
            } else {
                return `<h2>${text}</h2>`;
            }
        });

        content = htmlSanitizer.sanitize(content);

        content = content.replace(/<html.*<body[^>]*>/gis, "");
        content = content.replace(/<\/body>.*<\/html>/gis, "");

        content = content.replace(/src="([^"]*)"/g, (match, url) => {
            try {
                url = decodeURIComponent(url);
            } catch (e) {
                log.error(`Cannot parse image URL '${url}', keeping original (${e}).`);
                return `src="${url}"`;
            }

            if (isUrlAbsolute(url) || url.startsWith("/")) {
                return match;
            }

            const targetNoteId = getNoteIdFromRelativeUrl(url, filePath);

            if (!targetNoteId) {
                return match;
            }

            return `src="api/images/${targetNoteId}/${path.basename(url)}"`;
        });

        content = content.replace(/href="([^"]*)"/g, (match, url) => {
            try {
                url = decodeURIComponent(url);
            } catch (e) {
                log.error(`Cannot parse link URL '${url}', keeping original (${e}).`);
                return `href="${url}"`;
            }

            if (url.startsWith('#') // already a note path (probably)
                || isUrlAbsolute(url)) {
                return match;
            }

            const targetNoteId = getNoteIdFromRelativeUrl(url, filePath);

            if (!targetNoteId) {
                return match;
            }

            return `href="#root/${targetNoteId}"`;
        });

        content = content.replace(/data-note-path="([^"]*)"/g, (match, notePath) => {
            const noteId = notePath.split("/").pop();

            // getNewNoteId() keeps 'root' and "_"-prefixed ids as they are and creates the mapping on
            // first use, so a reference to a note which was not mapped yet still gets the id that
            // note will be given later (instead of "undefined")
            return `data-note-path="root/${getNewNoteId(noteId)}"`;
        });

        if (noteMeta) {
            const includeNoteLinks = (noteMeta.attributes || [])
                .filter(attr => attr.type === 'relation' && attr.name === 'includeNoteLink');

            for (const link of includeNoteLinks) {
                // no need to escape the regexp find string since it's a noteId which doesn't contain any special characters
                content = content.replace(new RegExp(link.value, "g"), getNewNoteId(link.value));
            }
        }

        return content;
    }

    /**
     * Prepares the content of an imported note for saving: renders markdown to HTML, post-processes
     * text notes and rewrites note references of relation maps.
     *
     * @param {object} [noteMeta] - meta record of the note, if there is one
     * @param {string} type - note type
     * @param {string} mime - MIME type of the content
     * @param {string|Buffer} content - content of the note
     * @param {string} noteTitle - title of the note
     * @param {string} filePath - path of the note inside the ZIP
     * @returns {string|Buffer} processed content
     */
    function processNoteContent(noteMeta, type, mime, content, noteTitle, filePath) {
        // with a meta record its "format" decides; without one the MIME decides (when the
        // textImportedAsText option is set)
        const isMarkdown = noteMeta
            ? noteMeta.format === 'markdown'
            : (taskContext.data.textImportedAsText && ['text/markdown', 'text/x-markdown'].includes(mime));

        let processed = content;

        if (isMarkdown) {
            processed = mdWriter.render(mdReader.parse(processed));
        }

        if (type === 'text') {
            processed = processTextNoteContent(processed, noteTitle, filePath, noteMeta);
        }

        if (type !== 'relationMap' || !noteMeta) {
            return processed;
        }

        // relation map content refers to notes by their original ids, which are replaced with the new ones;
        // no need to escape the regexp find string since it's a noteId which doesn't contain any special characters
        return (noteMeta.attributes || [])
            .filter(attr => attr.type === 'relation' && attr.name === 'relationMapLink')
            .reduce(
                (text, link) => text.replace(new RegExp(link.value, "g"), getNewNoteId(link.value)),
                processed
            );
    }

    /**
     * Imports one file entry of the ZIP: creates the note, fills in a note which already exists,
     * or only adds a branch when the meta record marks the entry as a clone.
     *
     * @param {string} filePath - normalized path of the entry inside the ZIP
     * @param {Buffer} content - raw content of the entry
     * @throws {Error} when no parent note can be determined for the entry
     */
    function saveNote(filePath, content) {
        const {parentNoteMeta, noteMeta} = getMeta(filePath);

        if (noteMeta?.noImport) {
            return;
        }

        const noteId = getNoteId(noteMeta, filePath);
        const parentNoteId = getParentNoteId(filePath, parentNoteMeta);

        if (!parentNoteId) {
            throw new Error(`Cannot find parentNoteId for ${filePath}`);
        }

        // a clone carries no content of its own, it only places the note under another parent
        if (noteMeta?.isClone) {
            if (!becca.getBranchFromChildAndParent(noteId, parentNoteId)) {
                new BBranch({
                    noteId,
                    parentNoteId,
                    isExpanded: noteMeta.isExpanded,
                    prefix: noteMeta.prefix,
                    notePosition: noteMeta.notePosition
                }).save();
            }

            return;
        }

        let {type, mime} = noteMeta ? noteMeta : detectFileTypeAndMime(taskContext, filePath);
        type = resolveNoteType(type);

        // file and image notes keep the raw buffer, everything else is handled as text
        if (type !== 'file' && type !== 'image') {
            content = content.toString("UTF-8");
        }

        const noteTitle = utils.getNoteTitle(filePath, taskContext.data.replaceUnderscoresWithSpaces, noteMeta);

        content = processNoteContent(noteMeta, type, mime, content, noteTitle, filePath);

        let note = becca.getNote(noteId);

        const isProtected = importRootNote.isProtected && protectedSessionService.isProtectedSessionAvailable();

        if (note) {
            // only skeleton was created because of altered order of cloned notes in ZIP, we need to update
            // https://github.com/zadam/trilium/issues/2440
            if (note.type === undefined) {
                note.type = type;
                note.mime = mime;
                note.title = noteTitle;
                note.isProtected = isProtected;
                note.save();
            }

            note.setContent(content);

            if (!becca.getBranchFromChildAndParent(noteId, parentNoteId)) {
                // noteMeta is missing when the ZIP has no meta file, hence the optional access
                new BBranch({
                    noteId,
                    parentNoteId,
                    isExpanded: noteMeta?.isExpanded,
                    prefix: noteMeta?.prefix,
                    notePosition: noteMeta?.notePosition
                }).save();
            }
        }
        else {
            ({note} = noteService.createNewNote({
                parentNoteId: parentNoteId,
                title: noteTitle,
                content: content,
                noteId,
                type,
                mime,
                prefix: noteMeta ? noteMeta.prefix : '',
                isExpanded: noteMeta ? noteMeta.isExpanded : false,
                // root notePosition should be ignored since it relates to original document
                // now import root should be placed after existing notes into new parent
                notePosition: (noteMeta && firstNote) ? noteMeta.notePosition : undefined,
                isProtected: isProtected,
            }));

            createdNoteIds[note.noteId] = true;

            saveAttributes(note, noteMeta);

            if (!firstNote) {
                firstNote = note;
            }
        }

        // without a meta record the file name would be lost, so it is kept in a label
        if (!noteMeta && (type === 'file' || type === 'image')) {
            attributes.push({
                noteId,
                type: 'label',
                name: 'originalFileName',
                value: path.basename(filePath)
            });
        }
    }

    // we're running two passes to make sure that the meta file is loaded before the rest of the files is processed.

    // first pass: only looks for '!!!meta.json' and parses it into metaFile, all other entries are skipped
    await readZipFile(fileBuffer, async (zipfile, entry) => {
        const filePath = normalizeFilePath(entry.fileName);

        if (filePath === '!!!meta.json') {
            const content = await readContent(zipfile, entry);

            metaFile = JSON.parse(content.toString("UTF-8"));
        }

        // asks for the next entry
        zipfile.readEntry();
    });

    // second pass: imports every entry except the meta file
    await readZipFile(fileBuffer, async (zipfile, entry) => {
        const filePath = normalizeFilePath(entry.fileName);

        // the check uses the raw entry name since normalizeFilePath() strips the trailing slash
        // which marks a directory entry
        if (/\/$/.test(entry.fileName)) {
            saveDirectory(filePath);
        }
        else if (filePath !== '!!!meta.json') {
            const content = await readContent(zipfile, entry);

            saveNote(filePath, content);
        }

        taskContext.increaseProgressCount();
        zipfile.readEntry();
    });

    for (const noteId in createdNoteIds) { // now the noteIds are unique
        noteService.scanForLinks(becca.getNote(noteId));

        if (!metaFile) {
            // if there's no meta file then the notes are created based on the order in that zip file but that
            // is usually quite random, so we sort the notes in the way they would appear in the file manager
            treeService.sortNotes(noteId, 'title', false, true);
        }

        taskContext.increaseProgressCount();
    }

    // we're saving attributes and links only now so that all relation and link target notes
    // are already in the database (we don't want to have "broken" relations, not even transitionally)
    for (const attr of attributes) {
        // a relation is saved only when its target note is known to becca, other attribute types always
        if (attr.type !== 'relation' || attr.value in becca.notes) {
            new BAttribute(attr).save();
        }
        else {
            log.info(`Relation not imported since the target note doesn't exist: ${JSON.stringify(attr)}`);
        }
    }

    // null when the import did not create any note
    return firstNote;
}

/**
 * Brings a path from the ZIP into a uniform shape.
 *
 * @param {string} filePath - entry path as stored in the ZIP
 * @returns {string} path with backslashes converted to forward slashes and with one leading and
 *                   one trailing slash removed (if present)
 */
function normalizeFilePath(filePath) {
    const unified = filePath.replace(/\\/g, "/");
    const withoutLeadingSlash = unified.slice(unified.startsWith("/") ? 1 : 0);

    return withoutLeadingSlash.endsWith("/")
        ? withoutLeadingSlash.slice(0, -1)
        : withoutLeadingSlash;
}

/**
 * Collects everything a readable stream emits into a single Buffer.
 *
 * @param {stream.Readable} stream - stream emitting Buffer chunks
 * @returns {Promise<Buffer>} resolves with the concatenated chunks once the stream ends,
 *                            rejects with the error when the stream emits 'error'
 */
function streamToBuffer(stream) {
    return new Promise((res, rej) => {
        const chunks = [];

        stream.on('data', chunk => chunks.push(chunk));
        stream.on('end', () => res(Buffer.concat(chunks)));
        // without this listener a failing stream would leave the promise pending forever
        stream.on('error', rej);
    });
}

/**
 * Reads the whole content of one ZIP entry.
 *
 * @param {object} zipfile - opened ZIP file providing openReadStream(entry, callback)
 * @param {object} entry - entry whose content should be read
 * @returns {Promise<Buffer>} content of the entry; rejects when the read stream cannot be opened
 *                            or when reading it fails
 */
function readContent(zipfile, entry) {
    return new Promise((res, rej) => {
        zipfile.openReadStream(entry, (err, readStream) => {
            if (err) {
                // there is no stream to read from in this case, so stop here
                rej(err);
                return;
            }

            streamToBuffer(readStream).then(res, rej);
        });
    });
}

/**
 * Opens a ZIP file held in memory and hands its entries, one at a time, to the callback.
 * Only the first entry is requested here; the callback has to call zipfile.readEntry()
 * to receive the next one.
 *
 * @param {Buffer} buffer - content of the ZIP file
 * @param {function(object, object): (Promise<void>|void)} processEntryCallback - called with
 *        (zipfile, entry) for every entry
 * @returns {Promise<void>} resolves on the zipfile's "end" event; rejects when the ZIP cannot be
 *                          opened, when the zipfile emits "error" or when the callback fails
 */
function readZipFile(buffer, processEntryCallback) {
    return new Promise((res, rej) => {
        yauzl.fromBuffer(buffer, {lazyEntries: true, validateEntrySizes: false}, (err, zipfile) => {
            if (err) {
                // throwing from this callback could not be caught by the caller, so the promise is rejected
                rej(err);
                return;
            }

            zipfile.on("entry", async entry => {
                try {
                    await processEntryCallback(zipfile, entry);
                } catch (e) {
                    // covers both synchronous throws and rejections of the callback
                    rej(e);
                }
            });
            zipfile.on("end", res);
            zipfile.on("error", rej);

            // the listeners are registered first so that no event can be missed
            zipfile.readEntry();
        });
    });
}

/**
 * Translates a note type read from the ZIP into the note type name used by this code.
 *
 * @param {string} [type] - note type from the meta file or from file type detection
 * @returns {string} 'text' for a missing/empty type, the camelCase name for the dash-separated
 *                   legacy names, otherwise the type unchanged
 */
function resolveNoteType(type) {
    // BC for ZIPs created in Trilium 0.57 and older
    switch (type || 'text') {
        case 'relation-map':
            return 'relationMap';
        case 'note-map':
            return 'noteMap';
        case 'web-view':
            return 'webView';
        default:
            return type || 'text';
    }
}


// importZip is the only function exported from this module
module.exports = {
    importZip
};