mirror of
https://github.com/zadam/trilium.git
synced 2025-10-20 07:08:55 +02:00
541 lines
18 KiB
TypeScript
541 lines
18 KiB
TypeScript
/**
 * Content Extraction Tool
 *
 * This tool allows the LLM to extract structured information from notes.
 */
|
|
|
|
import type { Tool, ToolHandler } from './tool_interfaces.js';
|
|
import log from '../../log.js';
|
|
import becca from '../../../becca/becca.js';
|
|
|
|
/**
 * Schema of the `extract_content` tool as it is advertised to the LLM.
 *
 * `noteId` and `extractionType` are mandatory; `format` and `query` are
 * optional refinements of the result.
 */
export const contentExtractionToolDefinition: Tool = {
    type: 'function',
    function: {
        name: 'extract_content',
        description: "Extract structured information from a note's content, such as lists, tables, or specific sections",
        parameters: {
            type: 'object',
            properties: {
                // Which note to read.
                noteId: {
                    type: 'string',
                    description: 'ID of the note to extract content from'
                },
                // Which kind of structure to pull out of the note.
                extractionType: {
                    type: 'string',
                    description: 'Type of content to extract',
                    enum: ['lists', 'tables', 'headings', 'codeBlocks', 'all']
                },
                // Shape of the returned result.
                format: {
                    type: 'string',
                    description: 'Format to return the extracted content in',
                    enum: ['json', 'markdown', 'text']
                },
                // Optional narrowing of the extracted items.
                query: {
                    type: 'string',
                    description: 'Optional search query to filter extracted content (e.g., "tasks related to finance")'
                }
            },
            required: ['noteId', 'extractionType']
        }
    }
};
|
|
|
|
/** One `<ul>`/`<ol>` list reduced to the plain text of its items. */
interface ExtractedList {
    type: string;
    items: string[];
}

/** One `<table>` reduced to header texts and rows of cell texts. */
interface ExtractedTable {
    headers: string[];
    rows: string[][];
}

/** One `<h1>`..`<h6>` heading. */
interface ExtractedHeading {
    level: number;
    text: string;
}

/** One `<pre>` block or inline `<code>` span. */
interface ExtractedCodeBlock {
    language?: string;
    code: string;
}

/** Result container; only the requested sections are present. */
interface ExtractedContent {
    lists?: ExtractedList[];
    tables?: ExtractedTable[];
    headings?: ExtractedHeading[];
    codeBlocks?: ExtractedCodeBlock[];
}

/**
 * Content extraction tool implementation.
 *
 * Looks a note up in becca, scans its HTML content with regular expressions
 * for lists, tables, headings and code blocks, optionally filters the result
 * by a case-insensitive substring query, and returns it as JSON, Markdown or
 * plain text.
 *
 * The extraction is regex based, not a real HTML parser: nested lists or
 * nested tables are not handled structurally.
 */
export class ContentExtractionTool implements ToolHandler {
    public definition: Tool = contentExtractionToolDefinition;

    /** Named entities understood by {@link decodeHtmlEntities}. */
    private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
        ['lt', '<'],
        ['gt', '>'],
        ['amp', '&'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', ' ']
    ]);

    /**
     * Execute the content extraction tool.
     *
     * @param args.noteId ID of the note to read from `becca.notes`.
     * @param args.extractionType which structures to extract (`all` extracts every kind).
     * @param args.format output format, defaults to `json`.
     * @param args.query optional case-insensitive substring filter applied to the extracted items.
     * @returns an `Error: ...` string when the note is missing or an exception occurs,
     *          a `{ success: false, message }` object when the note has no content,
     *          a string for the `markdown`/`text` formats, or a result object for `json`.
     */
    public async execute(args: {
        noteId: string,
        extractionType: 'lists' | 'tables' | 'headings' | 'codeBlocks' | 'all',
        format?: 'json' | 'markdown' | 'text',
        query?: string
    }): Promise<string | object> {
        try {
            const { noteId, extractionType, format = 'json', query } = args;

            log.info(`Executing extract_content tool - NoteID: "${noteId}", Type: ${extractionType}, Format: ${format}`);

            // Get the note from becca
            const note = becca.notes[noteId];

            if (!note) {
                log.info(`Note with ID ${noteId} not found - returning error`);
                return `Error: Note with ID ${noteId} not found`;
            }

            log.info(`Found note: "${note.title}" (Type: ${note.type})`);

            // Get the note content
            const rawContent = await note.getContent();
            if (!rawContent) {
                return {
                    success: false,
                    message: 'Note content is empty'
                };
            }

            log.info(`Retrieved note content, length: ${rawContent.length} chars`);

            // The content may be a non-string value (presumably a Buffer); normalise it once.
            const content = typeof rawContent === 'string' ? rawContent : rawContent.toString();
            const wantsAll = extractionType === 'all';
            const extractedContent: ExtractedContent = {};

            if (wantsAll || extractionType === 'lists') {
                extractedContent.lists = this.extractLists(content);
                log.info(`Extracted ${extractedContent.lists.length} lists`);
            }

            if (wantsAll || extractionType === 'tables') {
                extractedContent.tables = this.extractTables(content);
                log.info(`Extracted ${extractedContent.tables.length} tables`);
            }

            if (wantsAll || extractionType === 'headings') {
                extractedContent.headings = this.extractHeadings(content);
                log.info(`Extracted ${extractedContent.headings.length} headings`);
            }

            if (wantsAll || extractionType === 'codeBlocks') {
                extractedContent.codeBlocks = this.extractCodeBlocks(content);
                log.info(`Extracted ${extractedContent.codeBlocks.length} code blocks`);
            }

            // Filter by query if provided
            if (query) {
                log.info(`Filtering extracted content with query: "${query}"`);
                this.filterContentByQuery(extractedContent, query);
            }

            // Format the response based on requested format
            if (format === 'markdown') {
                return this.formatAsMarkdown(extractedContent, extractionType);
            }

            if (format === 'text') {
                return this.formatAsText(extractedContent, extractionType);
            }

            // Default to JSON format
            return {
                success: true,
                noteId: note.noteId,
                title: note.title,
                extractionType,
                content: extractedContent
            };
        } catch (error: unknown) {
            const message = (error instanceof Error && error.message) || String(error);
            log.error(`Error executing extract_content tool: ${message}`);
            return `Error: ${message}`;
        }
    }

    /**
     * Extract lists from HTML content.
     *
     * All unordered lists are returned first, followed by all ordered lists;
     * lists without any non-empty item are skipped.
     */
    private extractLists(content: string): Array<{ type: string, items: string[] }> {
        const lists: ExtractedList[] = [];

        const collect = (listRegex: RegExp, type: string): void => {
            let listMatch: RegExpExecArray | null;

            while ((listMatch = listRegex.exec(content)) !== null) {
                const items = this.extractListItems(listMatch[1]);

                if (items.length > 0) {
                    lists.push({ type, items });
                }
            }
        };

        // `\b` keeps the tag name exact (e.g. `<ul>` but not `<ulx>`).
        collect(/<ul\b[^>]*>([\s\S]*?)<\/ul>/gi, 'unordered');
        collect(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, 'ordered');

        return lists;
    }

    /**
     * Extract the non-empty plain-text items from the inner HTML of a list.
     */
    private extractListItems(listContent: string): string[] {
        const items: string[] = [];
        // `\b` prevents `<link ...>` from being mistaken for `<li ...>`.
        const itemRegex = /<li\b[^>]*>([\s\S]*?)<\/li>/gi;
        let itemMatch: RegExpExecArray | null;

        while ((itemMatch = itemRegex.exec(listContent)) !== null) {
            const itemText = this.toPlainText(itemMatch[1]);

            if (itemText) {
                items.push(itemText);
            }
        }

        return items;
    }

    /**
     * Extract tables from HTML content.
     *
     * Headers are every `<th>` of the table; rows are the `<tr>` elements that
     * contain at least one `<td>`. Empty header/cell texts are kept so that
     * column positions stay aligned.
     */
    private extractTables(content: string): Array<{ headers: string[], rows: string[][] }> {
        const tables: ExtractedTable[] = [];
        const tableRegex = /<table\b[^>]*>([\s\S]*?)<\/table>/gi;
        let tableMatch: RegExpExecArray | null;

        while ((tableMatch = tableRegex.exec(content)) !== null) {
            const tableContent = tableMatch[1];
            const headers: string[] = [];
            const rows: string[][] = [];

            // Extract table headers (`\b` keeps `<thead>` from matching as `<th>`).
            const headerRegex = /<th\b[^>]*>([\s\S]*?)<\/th>/gi;
            let headerMatch: RegExpExecArray | null;
            while ((headerMatch = headerRegex.exec(tableContent)) !== null) {
                headers.push(this.toPlainText(headerMatch[1]));
            }

            // Extract table rows
            const rowRegex = /<tr\b[^>]*>([\s\S]*?)<\/tr>/gi;
            let rowMatch: RegExpExecArray | null;
            while ((rowMatch = rowRegex.exec(tableContent)) !== null) {
                const cells: string[] = [];
                const cellRegex = /<td\b[^>]*>([\s\S]*?)<\/td>/gi;
                let cellMatch: RegExpExecArray | null;

                while ((cellMatch = cellRegex.exec(rowMatch[1])) !== null) {
                    cells.push(this.toPlainText(cellMatch[1]));
                }

                if (cells.length > 0) {
                    rows.push(cells);
                }
            }

            if (headers.length > 0 || rows.length > 0) {
                tables.push({ headers, rows });
            }
        }

        return tables;
    }

    /**
     * Extract headings from HTML content.
     *
     * Results are grouped by level (all `<h1>` first, then `<h2>`, ...), each
     * group in document order. Headings with empty text are skipped.
     */
    private extractHeadings(content: string): Array<{ level: number, text: string }> {
        const headings: ExtractedHeading[] = [];

        for (let level = 1; level <= 6; level++) {
            // Backslashes are doubled on purpose: inside a template literal a
            // single `\s` collapses to the letter "s" before RegExp sees it.
            const headingRegex = new RegExp(`<h${level}\\b[^>]*>([\\s\\S]*?)</h${level}>`, 'gi');
            let headingMatch: RegExpExecArray | null;

            while ((headingMatch = headingRegex.exec(content)) !== null) {
                const headingText = this.toPlainText(headingMatch[1]);

                if (headingText) {
                    headings.push({ level, text: headingText });
                }
            }
        }

        return headings;
    }

    /**
     * Extract code blocks from HTML content.
     *
     * `<pre>` blocks come first (with the language taken from a
     * `language-xxx` class on the `<pre>` or nested `<code>` tag when present),
     * followed by `<code>` elements that are not inside a `<pre>`.
     */
    private extractCodeBlocks(content: string): Array<{ language?: string, code: string }> {
        const codeBlocks: ExtractedCodeBlock[] = [];

        // Look for <pre> blocks, optionally wrapping a <code> element
        const preRegex = /<pre\b([^>]*)>([\s\S]*?)<\/pre>/gi;
        let preMatch: RegExpExecArray | null;

        while ((preMatch = preRegex.exec(content)) !== null) {
            const preAttributes = preMatch[1];
            const preContent = preMatch[2];
            const codeMatch = /<code\b([^>]*)>([\s\S]*?)<\/code>/i.exec(preContent);

            if (codeMatch) {
                // Only inspect the opening tags so that text inside the code
                // itself can never be mistaken for a language class.
                const classMatch = /class="[^"]*language-([^"\s]+)[^"]*"/i.exec(`${preAttributes} ${codeMatch[1]}`);
                codeBlocks.push({
                    language: classMatch ? classMatch[1] : undefined,
                    code: this.decodeHtmlEntities(codeMatch[2]).trim()
                });
            } else {
                // Just a <pre> without <code>
                codeBlocks.push({
                    code: this.decodeHtmlEntities(preContent).trim()
                });
            }
        }

        // Standalone <code> elements: drop every <pre> block first so that only
        // code outside of <pre> remains, wherever it appears in the document.
        const contentWithoutPre = content.replace(/<pre\b[^>]*>[\s\S]*?<\/pre>/gi, '');
        const standaloneCodeRegex = /<code\b[^>]*>([\s\S]*?)<\/code>/gi;
        let standaloneCodeMatch: RegExpExecArray | null;

        while ((standaloneCodeMatch = standaloneCodeRegex.exec(contentWithoutPre)) !== null) {
            codeBlocks.push({
                code: this.decodeHtmlEntities(standaloneCodeMatch[1]).trim()
            });
        }

        return codeBlocks;
    }

    /**
     * Filter the extracted content in place by a case-insensitive substring query.
     *
     * Lists keep only their matching items (lists without a match are dropped);
     * headings and code blocks are kept when their text matches; tables are kept
     * whole when any header or cell matches.
     */
    private filterContentByQuery(content: ExtractedContent, query: string): void {
        const lowerQuery = query.toLowerCase();
        const matches = (value: string): boolean => value.toLowerCase().includes(lowerQuery);

        if (content.lists) {
            content.lists = content.lists
                .map((list) => ({ ...list, items: list.items.filter(matches) }))
                .filter((list) => list.items.length > 0);
        }

        if (content.headings) {
            content.headings = content.headings.filter((heading) => matches(heading.text));
        }

        if (content.tables) {
            content.tables = content.tables.filter((table) =>
                table.headers.some(matches) || table.rows.some((row) => row.some(matches))
            );
        }

        if (content.codeBlocks) {
            content.codeBlocks = content.codeBlocks.filter((block) => matches(block.code));
        }
    }

    /**
     * Format extracted content as Markdown.
     *
     * Sections are emitted in the order lists, headings, tables, code blocks;
     * empty sections are omitted and the result is trimmed.
     */
    private formatAsMarkdown(content: ExtractedContent, extractionType: string): string {
        const wantsAll = extractionType === 'all';
        let markdown = '';

        if ((wantsAll || extractionType === 'lists') && content.lists && content.lists.length > 0) {
            markdown += '## Lists\n\n';

            content.lists.forEach((list, index) => {
                markdown += `### List ${index + 1} (${list.type})\n\n`;

                // Markdown renumbers ordered lists, so "1." is enough for every item.
                const bullet = list.type === 'unordered' ? '-' : '1.';
                for (const item of list.items) {
                    markdown += `${bullet} ${item}\n`;
                }

                markdown += '\n';
            });
        }

        if ((wantsAll || extractionType === 'headings') && content.headings && content.headings.length > 0) {
            markdown += '## Headings\n\n';

            for (const heading of content.headings) {
                markdown += `${'#'.repeat(heading.level)} ${heading.text}\n\n`;
            }
        }

        if ((wantsAll || extractionType === 'tables') && content.tables && content.tables.length > 0) {
            markdown += '## Tables\n\n';

            content.tables.forEach((table, index) => {
                markdown += `### Table ${index + 1}\n\n`;

                // Add headers
                if (table.headers.length > 0) {
                    markdown += '| ' + table.headers.join(' | ') + ' |\n';
                    markdown += '| ' + table.headers.map(() => '---').join(' | ') + ' |\n';
                }

                // Add rows
                for (const row of table.rows) {
                    markdown += '| ' + row.join(' | ') + ' |\n';
                }

                markdown += '\n';
            });
        }

        if ((wantsAll || extractionType === 'codeBlocks') && content.codeBlocks && content.codeBlocks.length > 0) {
            markdown += '## Code Blocks\n\n';

            content.codeBlocks.forEach((block, index) => {
                markdown += `### Code Block ${index + 1}\n\n`;
                markdown += '```' + (block.language ? block.language : '') + '\n';
                markdown += block.code + '\n';
                markdown += '```\n\n';
            });
        }

        return markdown.trim();
    }

    /**
     * Format extracted content as plain text.
     *
     * Sections are emitted in the order lists, headings, tables, code blocks;
     * empty sections are omitted and the result is trimmed.
     */
    private formatAsText(content: ExtractedContent, extractionType: string): string {
        const wantsAll = extractionType === 'all';
        let text = '';

        if ((wantsAll || extractionType === 'lists') && content.lists && content.lists.length > 0) {
            text += 'LISTS:\n\n';

            content.lists.forEach((list, index) => {
                text += `List ${index + 1} (${list.type}):\n\n`;

                list.items.forEach((item, itemIndex) => {
                    text += list.type === 'unordered'
                        ? `• ${item}\n`
                        : `${itemIndex + 1}. ${item}\n`;
                });

                text += '\n';
            });
        }

        if ((wantsAll || extractionType === 'headings') && content.headings && content.headings.length > 0) {
            text += 'HEADINGS:\n\n';

            for (const heading of content.headings) {
                text += `${heading.text} (Level ${heading.level})\n`;
            }

            text += '\n';
        }

        if ((wantsAll || extractionType === 'tables') && content.tables && content.tables.length > 0) {
            text += 'TABLES:\n\n';

            content.tables.forEach((table, index) => {
                text += `Table ${index + 1}:\n\n`;

                // Add headers
                if (table.headers.length > 0) {
                    text += table.headers.join(' | ') + '\n';
                    text += table.headers.map(() => '-----').join(' | ') + '\n';
                }

                // Add rows
                for (const row of table.rows) {
                    text += row.join(' | ') + '\n';
                }

                text += '\n';
            });
        }

        if ((wantsAll || extractionType === 'codeBlocks') && content.codeBlocks && content.codeBlocks.length > 0) {
            text += 'CODE BLOCKS:\n\n';

            content.codeBlocks.forEach((block, index) => {
                text += `Code Block ${index + 1}`;

                if (block.language) {
                    text += ` (${block.language})`;
                }

                text += ':\n\n';
                text += block.code + '\n\n';
            });
        }

        return text.trim();
    }

    /**
     * Convert an HTML fragment to trimmed plain text: tags are removed first,
     * then entities are decoded (so an encoded `&lt;b&gt;` stays literal text).
     */
    private toPlainText(html: string): string {
        return this.decodeHtmlEntities(this.stripHtml(html)).trim();
    }

    /**
     * Strip HTML tags from content.
     */
    private stripHtml(html: string): string {
        return html.replace(/<[^>]*>/g, '');
    }

    /**
     * Decode HTML entities.
     *
     * Handles `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&apos;`, `&nbsp;` (as a
     * regular space) and numeric references such as `&#39;` / `&#x27;`.
     * Unknown entities are left untouched. Decoding happens in a single pass,
     * so `&amp;lt;` becomes `&lt;` rather than `<`.
     */
    private decodeHtmlEntities(text: string): string {
        return text.replace(
            /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
            (entity: string, decimal: string | undefined, hex: string | undefined, name: string | undefined) => {
                if (name !== undefined) {
                    const named = ContentExtractionTool.NAMED_ENTITIES.get(name);
                    return named === undefined ? entity : named;
                }

                const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex as string, 16);
                // String.fromCodePoint throws for values beyond the Unicode range.
                return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
            }
        );
    }
}
|