mirror of
https://github.com/zadam/trilium.git
synced 2025-10-20 15:19:01 +02:00
204 lines
6.1 KiB
TypeScript
204 lines
6.1 KiB
TypeScript
/**
|
||
* Formatter Constants
|
||
*
|
||
* Constants related to message formatters for different LLM providers.
|
||
* This centralizes string formatting patterns, HTML cleaning options,
|
||
* and other formatter-specific constants that were previously hardcoded.
|
||
*/
|
||
|
||
/**
|
||
* HTML tag allowlists for different formatter strictness levels
|
||
*/
|
||
export const HTML_ALLOWED_TAGS = {
|
||
// Standard set used by most formatters
|
||
STANDARD: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
|
||
|
||
// Minimal set for providers with limited HTML support
|
||
MINIMAL: ['b', 'i', 'p', 'br', 'a'],
|
||
|
||
// Empty set for plain text only (Ollama)
|
||
NONE: []
|
||
};
|
||
|
||
/**
|
||
* HTML attribute allowlists
|
||
*/
|
||
export const HTML_ALLOWED_ATTRIBUTES = {
|
||
// Standard set of allowed attributes
|
||
STANDARD: {
|
||
'a': ['href']
|
||
},
|
||
|
||
// Empty set for plain text only
|
||
NONE: {}
|
||
};
|
||
|
||
/**
|
||
* HTML tag transformations
|
||
*/
|
||
export const HTML_TRANSFORMS = {
|
||
// Standard transformations
|
||
STANDARD: {
|
||
'h1': 'h2',
|
||
'h2': 'h3',
|
||
'div': 'p',
|
||
'span': 'span'
|
||
}
|
||
};
|
||
|
||
/**
|
||
* RegEx patterns for HTML to Markdown conversion
|
||
*/
|
||
export const HTML_TO_MARKDOWN_PATTERNS = {
|
||
// Headings
|
||
HEADING_1: { pattern: /<h1[^>]*>(.*?)<\/h1>/gi, replacement: '# $1\n' },
|
||
HEADING_2: { pattern: /<h2[^>]*>(.*?)<\/h2>/gi, replacement: '## $1\n' },
|
||
HEADING_3: { pattern: /<h3[^>]*>(.*?)<\/h3>/gi, replacement: '### $1\n' },
|
||
HEADING_4: { pattern: /<h4[^>]*>(.*?)<\/h4>/gi, replacement: '#### $1\n' },
|
||
HEADING_5: { pattern: /<h5[^>]*>(.*?)<\/h5>/gi, replacement: '##### $1\n' },
|
||
|
||
// Paragraph and line breaks
|
||
PARAGRAPH: { pattern: /<p[^>]*>(.*?)<\/p>/gi, replacement: '$1\n\n' },
|
||
BREAK: { pattern: /<br[^>]*>/gi, replacement: '\n' },
|
||
|
||
// Links and formatting
|
||
LINK: { pattern: /<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, replacement: '[$2]($1)' },
|
||
STRONG: { pattern: /<strong[^>]*>(.*?)<\/strong>/gi, replacement: '**$1**' },
|
||
BOLD: { pattern: /<b[^>]*>(.*?)<\/b>/gi, replacement: '**$1**' },
|
||
EMPHASIS: { pattern: /<em[^>]*>(.*?)<\/em>/gi, replacement: '*$1*' },
|
||
ITALIC: { pattern: /<i[^>]*>(.*?)<\/i>/gi, replacement: '*$1*' },
|
||
|
||
// Code
|
||
INLINE_CODE: { pattern: /<code[^>]*>(.*?)<\/code>/gi, replacement: '`$1`' },
|
||
CODE_BLOCK: { pattern: /<pre[^>]*>(.*?)<\/pre>/gi, replacement: '```\n$1\n```' },
|
||
|
||
// Clean up
|
||
ANY_REMAINING_TAG: { pattern: /<[^>]*>/g, replacement: '' },
|
||
EXCESSIVE_NEWLINES: { pattern: /\n{3,}/g, replacement: '\n\n' }
|
||
};
|
||
|
||
/**
|
||
* HTML entity replacements
|
||
*/
|
||
export const HTML_ENTITY_REPLACEMENTS = {
|
||
// Common HTML entities
|
||
NBSP: { pattern: / /g, replacement: ' ' },
|
||
LT: { pattern: /</g, replacement: '<' },
|
||
GT: { pattern: />/g, replacement: '>' },
|
||
AMP: { pattern: /&/g, replacement: '&' },
|
||
QUOT: { pattern: /"/g, replacement: '"' },
|
||
APOS: { pattern: /'/g, replacement: "'" },
|
||
LDQUO: { pattern: /“/g, replacement: '"' },
|
||
RDQUO: { pattern: /”/g, replacement: '"' },
|
||
LSQUO: { pattern: /‘/g, replacement: "'" },
|
||
RSQUO: { pattern: /’/g, replacement: "'" },
|
||
MDASH: { pattern: /—/g, replacement: '—' },
|
||
NDASH: { pattern: /–/g, replacement: '–' },
|
||
HELLIP: { pattern: /…/g, replacement: '…' }
|
||
};
|
||
|
||
/**
|
||
* Encoding issue fixes
|
||
*/
|
||
export const ENCODING_FIXES = {
|
||
// Common encoding issues
|
||
BROKEN_QUOTES: { pattern: /Γ\u00c2[\u00a3\u00a5]/g, replacement: '"' },
|
||
|
||
// Character replacements for Unicode
|
||
UNICODE_REPLACEMENTS: {
|
||
'\u00A0': ' ', // Non-breaking space
|
||
'\u2018': "'", // Left single quote
|
||
'\u2019': "'", // Right single quote
|
||
'\u201C': '"', // Left double quote
|
||
'\u201D': '"', // Right double quote
|
||
'\u2013': '-', // En dash
|
||
'\u2014': '--', // Em dash
|
||
'\u2022': '*', // Bullet
|
||
'\u2026': '...' // Ellipsis
|
||
}
|
||
};
|
||
|
||
/**
|
||
* Ollama-specific cleaning patterns
|
||
*/
|
||
export const OLLAMA_CLEANING = {
|
||
// Replace fancy quotes
|
||
QUOTES: { pattern: /[""]/g, replacement: '"' },
|
||
APOSTROPHES: { pattern: /['']/g, replacement: "'" },
|
||
|
||
// Replace other Unicode characters
|
||
DASHES: { pattern: /[–—]/g, replacement: '-' },
|
||
BULLETS: { pattern: /[•]/g, replacement: '*' },
|
||
ELLIPSES: { pattern: /[…]/g, replacement: '...' },
|
||
|
||
// Remove non-ASCII characters
|
||
NON_ASCII: { pattern: /[^\x00-\x7F]/g, replacement: '' },
|
||
|
||
// Normalize whitespace
|
||
WHITESPACE: { pattern: /\s+/g, replacement: ' ' },
|
||
NEWLINE_WHITESPACE: { pattern: /\n\s+/g, replacement: '\n' }
|
||
};
|
||
|
||
/**
|
||
* Console log messages for formatters
|
||
*/
|
||
export const FORMATTER_LOGS = {
|
||
ANTHROPIC: {
|
||
PROCESSED: (before: number, after: number) => `Anthropic formatter: ${before} messages → ${after} messages`
|
||
},
|
||
OPENAI: {
|
||
PROCESSED: (before: number, after: number) => `OpenAI formatter: ${before} messages → ${after} messages`
|
||
},
|
||
OLLAMA: {
|
||
PROCESSED: (before: number, after: number) => `Ollama formatter processed ${before} messages into ${after} messages`
|
||
},
|
||
ERROR: {
|
||
CONTEXT_CLEANING: (provider: string) => `Error cleaning content for ${provider}:`,
|
||
ENCODING: 'Error fixing encoding issues:'
|
||
}
|
||
};
|
||
|
||
/**
|
||
* Message formatter text templates
|
||
*/
|
||
export const MESSAGE_FORMATTER_TEMPLATES = {
|
||
/**
|
||
* OpenAI-specific message templates
|
||
*/
|
||
OPENAI: {
|
||
CONTEXT_INSTRUCTION: 'Please use the following context to respond to the user\'s messages:\n\n'
|
||
},
|
||
|
||
/**
|
||
* Anthropic-specific message templates
|
||
*/
|
||
ANTHROPIC: {
|
||
CONTEXT_START: '\n\n<context>\n',
|
||
CONTEXT_END: '\n</context>'
|
||
},
|
||
|
||
/**
|
||
* Ollama-specific message templates
|
||
*/
|
||
OLLAMA: {
|
||
REFERENCE_INFORMATION: '\n\nReference information:\n'
|
||
},
|
||
|
||
/**
|
||
* Default formatter message templates
|
||
*/
|
||
DEFAULT: {
|
||
CONTEXT_INSTRUCTION: 'Here is context to help you answer my questions: '
|
||
}
|
||
};
|
||
|
||
/**
|
||
* Provider identifier constants
|
||
*/
|
||
export const PROVIDER_IDENTIFIERS = {
|
||
OPENAI: 'openai',
|
||
ANTHROPIC: 'anthropic',
|
||
OLLAMA: 'ollama',
|
||
DEFAULT: 'default'
|
||
};
|