// trilium/src/services/import/tar.js

"use strict";

const Attribute = require('../../entities/attribute');
const utils = require('../../services/utils');
const log = require('../../services/log');
const repository = require('../../services/repository');
const noteService = require('../../services/notes');
const attributeService = require('../../services/attributes');
const Branch = require('../../entities/branch');
const tar = require('tar-stream');
const stream = require('stream');
const path = require('path');
const commonmark = require('commonmark');
const TaskContext = require('../task_context.js');
const protectedSessionService = require('../protected_session');
const mimeService = require("./mime");
const treeService = require("../tree");

/**
 * Imports the content of a tar archive as notes placed under the given root note.
 *
 * Archive entries are processed one at a time: the entry named '!!!meta.json' is parsed as import metadata,
 * directory and file entries are turned into notes. Attributes are only collected while the archive is read
 * and are saved after all entries have been processed.
 *
 * @param {TaskContext} taskContext - progress is reported through increaseProgressCount(); its `data`
 *                                    (safeImport, textImportedAsText) influences how entries are imported
 * @param {Buffer} fileBuffer - raw bytes of the tar archive
 * @param {Note} importRootNote - note under which the top-level entries of the archive are created
 * @return {Promise<*>} resolves with the first note created by the import (null when no note was created)
 */
async function importTar(taskContext, fileBuffer, importRootNote) {
    // maps from original noteId (in tar file) to newly generated noteId
    const noteIdMap = {};
    // attributes gathered while the entries are processed; they are saved in the 'finish' handler
    const attributes = [];
    // path => noteId; both separator variants of the root path point to the import root note
    const createdPaths = { '/': importRootNote.noteId, '\\': importRootNote.noteId };
    const mdReader = new commonmark.Parser();
    const mdWriter = new commonmark.HtmlRenderer();
    // parsed content of '!!!meta.json', stays null until such an entry has been read
    let metaFile = null;
    // first note created by this import, it becomes the resolved value of the returned promise
    let firstNote = null;

    const extract = tar.extract();

    /**
     * Translates a noteId used inside the archive to the noteId of the imported copy.
     * The first request for an original noteId generates a new entity id, later requests return the same one.
     *
     * @param {string} origNoteId - noteId as written in the archive
     * @return {string} newly generated noteId, or "" when the original noteId is missing or blank
     */
    function getNewNoteId(origNoteId) {
        // in case the original noteId is empty. This probably shouldn't happen, but still good to have this precaution
        // (the type check also covers undefined/null on which .trim() would throw)
        if (typeof origNoteId !== 'string' || !origNoteId.trim()) {
            return "";
        }

        // only own entries count - an id such as "constructor" must not resolve to a member of Object.prototype
        let newNoteId = Object.prototype.hasOwnProperty.call(noteIdMap, origNoteId)
            ? noteIdMap[origNoteId]
            : undefined;

        if (!newNoteId) {
            newNoteId = utils.newEntityId();
            noteIdMap[origNoteId] = newNoteId;
        }

        return newNoteId;
    }

    /**
     * Looks up the metadata record describing the given archive path.
     * The path is split on both slash kinds and each segment is matched against the dataFileName / dirFileName
     * of the records on the current level of the metadata tree.
     *
     * @param {string} filePath - path of the entry inside the archive
     * @return {{parentNoteMeta: object, noteMeta: object}|{}} record of the entry and of its parent; an empty
     *         object when there is no metadata file or the tree ends before the path does; noteMeta is
     *         undefined when only the last segment has no matching record
     */
    function getMeta(filePath) {
        if (!metaFile) {
            return {};
        }

        let parentNoteMeta;
        // virtual record representing the import root, its children are the top-level records
        let noteMeta = {
            isImportRoot: true,
            children: metaFile.files
        };

        for (const segment of filePath.split(/[\/\\]/g)) {
            const cannotDescend = !noteMeta || !noteMeta.children || noteMeta.children.length === 0;

            if (cannotDescend) {
                return {};
            }

            parentNoteMeta = noteMeta;
            noteMeta = parentNoteMeta.children.find(child => child.dataFileName === segment || child.dirFileName === segment);
        }

        return { parentNoteMeta, noteMeta };
    }

    /**
     * Determines the noteId of the note under which the entry with the given path should be placed.
     *
     * @param {string} filePath - path of the entry inside the archive
     * @param {object} [parentNoteMeta] - metadata record of the parent; when present it decides the parent
     * @return {Promise<string>} noteId of the parent note
     */
    async function getParentNoteId(filePath, parentNoteMeta) {
        if (parentNoteMeta) {
            return parentNoteMeta.isImportRoot ? importRootNote.noteId : getNewNoteId(parentNoteMeta.noteId);
        }

        const parentPath = path.dirname(filePath);

        if (parentPath === '.') {
            // entry lies directly in the root of the archive
            return importRootNote.noteId;
        }

        // own-property lookup: a directory called e.g. "constructor" or "toString" must not be mistaken
        // for an already created path because of members inherited from Object.prototype
        if (Object.prototype.hasOwnProperty.call(createdPaths, parentPath)) {
            return createdPaths[parentPath];
        }

        // tar allows creating out of order records - i.e. file in a directory can appear in the tar stream before actual directory
        // (out-of-order-directory-records.tar in test set)
        return await saveDirectory(parentPath);
    }

    /**
     * Picks the title for the imported note.
     *
     * @param {string} filePath - path of the entry inside the archive
     * @param {object} [noteMeta] - metadata record of the note
     * @return {string} title from the metadata when a record exists, otherwise the file name
     *         (with the .md / .html extension removed)
     */
    function getNoteTitle(filePath, noteMeta) {
        return noteMeta
            ? noteMeta.title
            : getTextFileWithoutExtension(path.basename(filePath));
    }

    /**
     * Returns the noteId assigned to the given archive path, assigning one on the first request.
     * The path is registered without the .md / .html extension.
     *
     * @param {object} [noteMeta] - metadata record of the note; when present the noteId is derived from it
     * @param {string} filePath - path of the entry inside the archive
     * @return {string} noteId registered for the path
     */
    function getNoteId(noteMeta, filePath) {
        const filePathNoExt = getTextFileWithoutExtension(filePath);

        // own-property lookup: paths like "constructor" or "toString" would otherwise be answered with
        // members inherited from Object.prototype instead of a noteId
        if (Object.prototype.hasOwnProperty.call(createdPaths, filePathNoExt)) {
            return createdPaths[filePathNoExt];
        }

        const noteId = noteMeta ? getNewNoteId(noteMeta.noteId) : utils.newEntityId();

        createdPaths[filePathNoExt] = noteId;

        return noteId;
    }

    /**
     * Derives the MIME type and the note type of an entry which has no metadata record.
     *
     * @param {TaskContext} taskContext - its `data` is handed over to the note type detection
     * @param {string} filePath - path of the entry inside the archive
     * @return {{mime: string, type: string}} "application/octet-stream" is used when no MIME type is detected
     */
    function detectFileTypeAndMime(taskContext, filePath) {
        const detectedMime = mimeService.getMime(filePath);
        const mime = detectedMime || "application/octet-stream";

        return {
            mime,
            type: mimeService.getType(taskContext.data, mime)
        };
    }

    /**
     * Queues the attributes listed in the note's metadata so that they can be saved once the whole archive
     * has been processed. Nothing is written to the database here - accepted attributes are only appended
     * to `attributes`.
     *
     * @param {Note} note - note the attributes belong to
     * @param {object} [noteMeta] - metadata record of the note; nothing is queued when it is missing
     *                              or carries no attribute list
     * @return {Promise<void>}
     */
    async function saveAttributes(note, noteMeta) {
        if (!noteMeta) {
            return;
        }

        // a metadata record without an attribute list simply means "no attributes"
        const metaAttributes = noteMeta.attributes || [];

        // these relations are created automatically and as such don't need to be duplicated in the import
        const autoCreatedRelations = ['internalLink', 'imageLink', 'relationMapLink', 'includeNoteLink'];

        for (const attr of metaAttributes) {
            attr.noteId = note.noteId;

            if (!attributeService.isAttributeType(attr.type)) {
                log.error("Unrecognized attribute type " + attr.type);
                continue;
            }

            if (attr.type === 'relation' && autoCreatedRelations.includes(attr.name)) {
                continue;
            }

            if (attr.type === 'relation') {
                // relation targets are noteIds from the archive, point them to the imported notes
                attr.value = getNewNoteId(attr.value);
            }

            if (taskContext.data.safeImport && attributeService.isAttributeDangerous(attr.type, attr.name)) {
                attr.name = 'disabled-' + attr.name;
            }

            attributes.push(attr);
        }
    }

    /**
     * Makes sure a note exists for the given directory path of the archive and returns its noteId.
     * When the note is not in the repository yet, it is created with empty content and its attributes
     * are queued for saving.
     *
     * @param {string} filePath - path of the directory inside the archive
     * @return {Promise<string>} noteId of the directory note, no matter whether it was created by this call
     *         or existed already (getParentNoteId() uses the returned value as the parent noteId)
     */
    async function saveDirectory(filePath) {
        const { parentNoteMeta, noteMeta } = getMeta(filePath);

        const noteId = getNoteId(noteMeta, filePath);
        const noteTitle = getNoteTitle(filePath, noteMeta);
        const parentNoteId = await getParentNoteId(filePath, parentNoteMeta);

        const existingNote = await repository.getNote(noteId);

        if (existingNote) {
            // nothing to create, but the caller still needs to learn the noteId
            return noteId;
        }

        const {note} = await noteService.createNewNote({
            parentNoteId: parentNoteId,
            title: noteTitle,
            content: '',
            noteId: noteId,
            type: noteMeta ? noteMeta.type : 'text',
            mime: noteMeta ? noteMeta.mime : 'text/html',
            prefix: noteMeta ? noteMeta.prefix : '',
            isExpanded: noteMeta ? noteMeta.isExpanded : false,
            isProtected: importRootNote.isProtected && protectedSessionService.isProtectedSessionAvailable(),
        });

        await saveAttributes(note, noteMeta);

        if (!firstNote) {
            firstNote = note;
        }

        return noteId;
    }

    /**
     * Removes the extension of markdown and HTML files, other paths are returned untouched.
     *
     * @param {string} filePath - file name or path
     * @return {string} the path without a trailing ".md" / ".html" (matched case-insensitively)
     */
    function getTextFileWithoutExtension(filePath) {
        const extension = path.extname(filePath).toLowerCase();
        const isTextExtension = extension === '.md' || extension === '.html';

        return isTextExtension
            ? filePath.slice(0, filePath.length - extension.length)
            : filePath;
    }

    /**
     * Resolves a relative URL found in a note's content to the noteId of the note it points to.
     * Leading "./" segments are dropped and every leading "../" moves one directory up from the directory
     * of the referencing file; the resulting archive path is then looked up (and registered if unknown).
     *
     * @param {string} url - relative URL as found in the content
     * @param {string} filePath - archive path of the file containing the URL
     * @return {string} noteId assigned to the resolved path
     */
    function getNoteIdFromRelativeUrl(url, filePath) {
        let relativePart = url;

        while (relativePart.startsWith("./")) {
            relativePart = relativePart.slice(2);
        }

        let baseDir = path.dirname(filePath);

        while (relativePart.startsWith("../")) {
            baseDir = path.dirname(baseDir);
            relativePart = relativePart.slice(3);
        }

        // '.' stands for the root of the archive, paths there carry no directory prefix
        const prefix = baseDir === '.' ? '' : baseDir;
        const targetPath = prefix.length > 0 ? prefix + '/' + relativePart : relativePart;

        const {noteMeta} = getMeta(targetPath);

        return getNoteId(noteMeta, targetPath);
    }

    /**
     * Imports one file entry of the archive.
     *
     * Depending on the metadata the entry is skipped (noImport), saved as an additional branch of an
     * existing note (isClone), or turned into a note. Markdown is rendered to HTML, and in text notes relative
     * src/href URLs are rewritten to reference the imported notes. When the note already exists in the
     * repository only its content is replaced.
     *
     * @param {string} filePath - normalized path of the entry inside the archive
     * @param {Buffer} content - raw content of the entry
     * @return {Promise<void>}
     */
    async function saveNote(filePath, content) {
        const {parentNoteMeta, noteMeta} = getMeta(filePath);

        if (noteMeta && noteMeta.noImport) {
            return;
        }

        const noteId = getNoteId(noteMeta, filePath);
        const parentNoteId = await getParentNoteId(filePath, parentNoteMeta);

        if (noteMeta && noteMeta.isClone) {
            // for a clone only a new branch is saved, no note is created
            await new Branch({
                noteId,
                parentNoteId,
                isExpanded: noteMeta.isExpanded,
                prefix: noteMeta.prefix,
                notePosition: noteMeta.notePosition
            }).save();

            return;
        }

        const {type, mime} = noteMeta ? noteMeta : detectFileTypeAndMime(taskContext, filePath);

        if (type !== 'file' && type !== 'image') {
            content = content.toString("UTF-8");
        }

        if ((noteMeta && noteMeta.format === 'markdown')
            || (!noteMeta && taskContext.data.textImportedAsText && ['text/markdown', 'text/x-markdown'].includes(mime))) {
            const parsed = mdReader.parse(content);
            content = mdWriter.render(parsed);
        }

        const noteTitle = getNoteTitle(filePath, noteMeta);

        if (type === 'text') {
            const isUrlAbsolute = url => /^(?:[a-z]+:)?\/\//i.test(url);

            // decodeURIComponent() throws URIError on malformed escapes (e.g. "100%"); such a value is reported
            // as null so that the attribute is kept as it is instead of aborting the import of the whole note
            const decodeUrl = rawUrl => {
                try {
                    return decodeURIComponent(rawUrl);
                }
                catch (e) {
                    log.error("Cannot decode URL '" + rawUrl + "' in '" + filePath + "', keeping it unchanged: " + e.message);

                    return null;
                }
            };

            content = content.replace(/<html.*<body[^>]*>/gis, "");
            content = content.replace(/<\/body>.*<\/html>/gis, "");

            content = content.replace(/src="([^"]*)"/g, (match, rawUrl) => {
                const url = decodeUrl(rawUrl);

                if (url === null || isUrlAbsolute(url) || url.startsWith("/")) {
                    return match;
                }

                const targetNoteId = getNoteIdFromRelativeUrl(url, filePath);

                return `src="api/images/${targetNoteId}/${path.basename(url)}"`;
            });

            content = content.replace(/href="([^"]*)"/g, (match, rawUrl) => {
                // a fragment-only link does not reference a file of the archive, resolving it as a relative
                // path would make it point to a freshly generated noteId of a note which doesn't exist
                if (rawUrl.startsWith("#")) {
                    return match;
                }

                const url = decodeUrl(rawUrl);

                if (url === null || isUrlAbsolute(url)) {
                    return match;
                }

                const targetNoteId = getNoteIdFromRelativeUrl(url, filePath);

                return `href="#root/${targetNoteId}"`;
            });

            // the title is shown separately, so a heading which only repeats it is removed (whole H1 tag)
            content = content.replace(/<h1>([^<]*)<\/h1>/gi, (match, text) =>
                noteTitle.trim() === text.trim() ? "" : match);
        }

        if (type === 'relation-map' && noteMeta) {
            const relationMapLinks = (noteMeta.attributes || [])
                .filter(attr => attr.type === 'relation' && attr.name === 'relationMapLink');

            // this will replace relation map links
            for (const link of relationMapLinks) {
                // no need to escape the regexp find string since it's a noteId which doesn't contain any special characters
                content = content.replace(new RegExp(link.value, "g"), getNewNoteId(link.value));
            }
        }

        let note = await repository.getNote(noteId);

        if (note) {
            await note.setContent(content);
        }
        else {
            ({note} = await noteService.createNewNote({
                parentNoteId: parentNoteId,
                title: noteTitle,
                content: content,
                noteId,
                type,
                mime,
                prefix: noteMeta ? noteMeta.prefix : '',
                isExpanded: noteMeta ? noteMeta.isExpanded : false,
                notePosition: noteMeta ? noteMeta.notePosition : false,
                isProtected: importRootNote.isProtected && protectedSessionService.isProtectedSessionAvailable(),
            }));

            await saveAttributes(note, noteMeta);

            if (!firstNote) {
                firstNote = note;
            }
        }

        if (!noteMeta && (type === 'file' || type === 'image')) {
            // without metadata the file name would be lost, so it is remembered in a label
            attributes.push({
                noteId,
                type: 'label',
                name: 'originalFileName',
                value: path.basename(filePath)
            });
        }
    }

    /**
     * Brings a path from the tar header into the form used as key throughout the import.
     *
     * @param {string} filePath - entry name as stored in the archive
     * @return {string} path with backslashes converted to forward slashes and with one leading and
     *         one trailing slash (if present) removed
     */
    function normalizeFilePath(filePath) {
        const unified = filePath.replace(/\\/g, "/");
        const withoutLeadingSlash = unified.startsWith("/") ? unified.slice(1) : unified;

        return withoutLeadingSlash.endsWith("/")
            ? withoutLeadingSlash.slice(0, -1)
            : withoutLeadingSlash;
    }

    // Handles one archive entry: its body is buffered in memory and, once fully read, the entry is imported
    // according to its name and type.
    extract.on('entry', function(header, entryStream, next) {
        const chunks = [];

        entryStream.on("data", function (chunk) {
            chunks.push(chunk);
        });

        // header is the tar header
        // entryStream is the content body (might be an empty stream)
        // call next when you are done with this entry

        entryStream.on('end', async function() {
            try {
                const filePath = normalizeFilePath(header.name);

                const content = Buffer.concat(chunks);

                if (filePath === '!!!meta.json') {
                    metaFile = JSON.parse(content.toString("UTF-8"));
                }
                else if (header.type === 'directory') {
                    await saveDirectory(filePath);
                }
                else if (header.type === 'file') {
                    await saveNote(filePath, content);
                }
                else {
                    log.info("Ignoring tar import entry with type " + header.type);
                }
            }
            catch (e) {
                // an exception escaping this async handler would leave next() uncalled - the extraction would
                // stall and the import would never finish - so the failed entry is logged and skipped
                log.error("Importing tar entry '" + header.name + "' failed: " + ((e && e.stack) || e));
            }

            taskContext.increaseProgressCount();

            next(); // ready for next entry
        });

        entryStream.resume(); // just auto drain the stream
    });

    return new Promise(resolve => {
        extract.on('finish', async function() {
            const createdNoteIds = {};

            for (const path in createdPaths) {
                const noteId = createdPaths[path];

                createdNoteIds[noteId] = true;
            }

            for (const noteId in createdNoteIds) { // now the noteIds are unique
                await noteService.scanForLinks(noteId);

                if (!metaFile) {
                    // if there's no meta file then the notes are created based on the order in that tar file but that
                    // is usually quite random so we sort the notes in the way they would appear in the file manager
                    await treeService.sortNotesAlphabetically(noteId, true);
                }

                taskContext.increaseProgressCount();
            }

            // we're saving attributes and links only now so that all relation and link target notes
            // are already in the database (we don't want to have "broken" relations, not even transitionally)
            for (const attr of attributes) {
                if (attr.type !== 'relation' || attr.value in createdNoteIds) {
                    await new Attribute(attr).save();
                }
                else {
                    log.info("Relation not imported since target note doesn't exist: " + JSON.stringify(attr));
                }
            }

            resolve(firstNote);
        });

        const bufferStream = new stream.PassThrough();
        bufferStream.end(fileBuffer);

        bufferStream.pipe(extract);
    });
}

// importTar is the only function of this module available to other modules
module.exports = {
    importTar
};