Changed the module to use puppeteer and html-to-docx

This commit is contained in:
MikeHughes-BIN
2025-12-14 18:14:16 +01:00
parent 7cd334645f
commit 271fe78b7b
5 changed files with 1675 additions and 414 deletions
+194
View File
@@ -0,0 +1,194 @@
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer');
const htmlToDocx = require('html-to-docx');
const { execSync, execFileSync } = require('child_process');
const os = require('os');
// Default destination for converted documents when no save dialog is used.
const outputDir = path.join(__dirname, "../../../storage/documents");

// Make sure the destination (including missing parent folders) exists at load time.
if (fs.existsSync(outputDir) === false) {
  fs.mkdirSync(outputDir, { recursive: true });
}
/**
 * Opens the platform's native "save as" dialog and returns the chosen path.
 *
 * All external tools are started with an argument vector (no shell), so the
 * suggested file name can never be interpreted as shell syntax.
 *
 * @param {string} defaultName - Suggested file name without extension.
 * @param {string} format - File extension without the leading dot.
 * @returns {Promise<string|null>} The selected path, or null when the user
 *   cancelled. On Linux/other platforms without a usable dialog tool the
 *   path `~/Downloads/<defaultName>.<format>` is returned as a fallback.
 * @throws {Error} When osascript or PowerShell fail for a reason other than
 *   the user cancelling.
 */
async function showSaveDialog(defaultName, format) {
  const suggestedName = `${defaultName}.${format}`;
  const platform = os.platform();

  if (platform === 'darwin') {
    // Escape for an AppleScript string literal (backslash and double quote).
    const escapedName = suggestedName.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    const script = [
      `set defaultName to "${escapedName}"`,
      'set theFile to choose file name with prompt "Dokument speichern als:" default name defaultName',
      'POSIX path of theFile',
    ].join('\n');
    try {
      const chosen = execFileSync('osascript', ['-e', script], { encoding: 'utf8' });
      return chosen.trim() || null;
    } catch (err) {
      if (err.status === 1) return null; // User canceled
      throw err;
    }
  }

  if (platform === 'win32') {
    // PowerShell single-quoted literal; typographic quotes also terminate
    // such a literal in PowerShell, so they are doubled as well.
    const psQuote = (value) =>
      `'${String(value).replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')}'`;
    const filter = `${format.toUpperCase()} Dateien (*.${format})|*.${format}|Alle Dateien (*.*)|*.*`;
    const script = [
      'Add-Type -AssemblyName System.Windows.Forms',
      '$dialog = New-Object System.Windows.Forms.SaveFileDialog',
      `$dialog.FileName = ${psQuote(suggestedName)}`,
      `$dialog.Filter = ${psQuote(filter)}`,
      `$dialog.Title = ${psQuote('Dokument speichern als')}`,
      '$result = $dialog.ShowDialog()',
      "if ($result -eq 'OK') { $dialog.FileName }",
    ].join('\n');
    // -EncodedCommand takes base64 of the UTF-16LE script, which avoids every
    // command-line quoting problem the script text could otherwise cause.
    const encoded = Buffer.from(script, 'utf16le').toString('base64');
    const chosen = execFileSync(
      'powershell',
      ['-NoProfile', '-EncodedCommand', encoded],
      { encoding: 'utf8' }
    );
    // No output means the dialog was dismissed without choosing a file.
    return chosen.trim() || null;
  }

  // Linux and others: try zenity, then kdialog. Without a display server a
  // dialog cannot be shown at all, so go straight to the fallback path.
  const hasDisplay = Boolean(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
  if (hasDisplay) {
    const dialogTools = [
      ['zenity', ['--file-selection', '--save', '--confirm-overwrite', `--filename=${suggestedName}`]],
      ['kdialog', ['--getsavefilename', '.', suggestedName]],
    ];
    for (const [tool, toolArgs] of dialogTools) {
      try {
        const chosen = execFileSync(tool, toolArgs, { encoding: 'utf8' });
        return chosen.trim() || null;
      } catch (err) {
        // ENOENT: tool is not installed -> try the next one.
        // Exit status 1: the user cancelled -> do not save anywhere.
        // Anything else: best effort, try the next tool.
        if (err.code !== 'ENOENT' && err.status === 1) return null;
      }
    }
  }

  // Fallback when no dialog tool is usable.
  return path.join(os.homedir(), 'Downloads', suggestedName);
}
/**
 * Converter module: turns an HTML file into PDF, DOCX, TXT or cleaned HTML.
 */
const module_exports = {
  name: "htmlDocumentConverter",
  type: "converter",
  displayname: "HTML Document Converter",
  description: "Converts LLM-generated HTML to PDF, DOCX, TXT, or HTML",

  /**
   * Main conversion function
   * @param {Object} options
   * @param {string} options.inputPath - Path to the HTML input
   * @param {string} options.format - 'pdf' | 'docx' | 'html' | 'txt' (case-insensitive)
   * @param {string} [options.outputName] - Optional output filename (without extension)
   * @param {boolean} [options.showDialog] - Show save dialog (default: false in module mode, true in CLI mode)
   * @returns {Promise<string|null>} Path of the written file, or null when the
   *   save dialog was cancelled.
   * @throws {Error} When the input file does not exist or the format is unsupported.
   */
  async convert({ inputPath, format = 'pdf', outputName, showDialog = false }) {
    if (!fs.existsSync(inputPath)) {
      throw new Error(`Input file not found: ${inputPath}`);
    }

    // Validate up front so the user is never asked for a target path (and no
    // file is read) for a conversion that cannot succeed.
    const targetFormat = String(format).toLowerCase();
    if (!['html', 'pdf', 'docx', 'txt'].includes(targetFormat)) {
      throw new Error(`Unsupported format: ${format}`);
    }

    // path.parse strips the extension regardless of its letter case.
    const baseName = outputName || path.parse(inputPath).name;

    let outputFile;
    if (showDialog) {
      // Show the native save dialog
      outputFile = await showSaveDialog(baseName, targetFormat);
      if (!outputFile) {
        console.log('Speichervorgang abgebrochen.');
        return null;
      }
    } else {
      // Use the default output directory
      outputFile = path.join(outputDir, `${baseName}.${targetFormat}`);
    }

    let htmlContent = fs.readFileSync(inputPath, 'utf8');
    // Remove <think> tags if present
    htmlContent = htmlContent.replace(/<think>[\s\S]*?<\/think>/gi, '');

    switch (targetFormat) {
      case 'html':
        fs.writeFileSync(outputFile, htmlContent, 'utf8');
        break;
      case 'pdf':
        await this.htmlToPDF(htmlContent, outputFile);
        break;
      case 'docx':
        await this.htmlToDOCX(htmlContent, outputFile);
        break;
      case 'txt':
        fs.writeFileSync(outputFile, this.htmlToTXT(htmlContent), 'utf8');
        break;
      default:
        throw new Error(`Unsupported format: ${format}`);
    }

    console.log(`✓ Erfolgreich gespeichert: ${outputFile}`);
    return outputFile;
  },

  /**
   * HTML → PDF: renders the markup in headless Chromium and prints it as A4.
   * @param {string} html - Markup to render.
   * @param {string} outputPath - Destination PDF path.
   * @returns {Promise<void>}
   */
  async htmlToPDF(html, outputPath) {
    // NOTE(review): the sandbox is disabled while rendering generated HTML;
    // confirm this is required by the deployment environment.
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    try {
      const page = await browser.newPage();
      await page.setContent(html, { waitUntil: 'networkidle0' });
      await page.pdf({
        path: outputPath,
        format: 'A4',
        printBackground: true,
        margin: { top: '20mm', right: '20mm', bottom: '20mm', left: '20mm' }
      });
    } finally {
      // Always shut the browser down, otherwise a failed render leaves a
      // Chromium process behind.
      await browser.close();
    }
  },

  /**
   * HTML → DOCX via html-to-docx.
   * @param {string} html - Markup to convert.
   * @param {string} outputPath - Destination DOCX path.
   * @returns {Promise<void>}
   */
  async htmlToDOCX(html, outputPath) {
    const buffer = await htmlToDocx(html);
    fs.writeFileSync(outputPath, buffer);
  },

  /**
   * HTML → TXT (rudimentary): drops script/style blocks and all tags, decodes
   * the most common character entities and trims trailing whitespace per line.
   * @param {string} html - Markup to flatten.
   * @returns {string} Plain text.
   */
  htmlToTXT(html) {
    const entities = {
      '&nbsp;': ' ',
      '&lt;': '<',
      '&gt;': '>',
      '&quot;': '"',
      '&#39;': "'",
      '&amp;': '&'
    };
    return html
      // Script and style bodies are code, not document text.
      .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
      .replace(/<[^>]*>/g, '')
      // Single pass, so "&amp;lt;" becomes "&lt;" and is not decoded twice.
      .replace(/&(?:nbsp|lt|gt|quot|#39|amp);/g, (entity) => entities[entity])
      .replace(/\s+\n/g, '\n')
      .trim();
  }
};
module.exports = module_exports;

// Command-line entry point: converts the given file and always asks for the
// destination through the native save dialog.
if (require.main === module) {
  const usageLines = [
    'Usage: node htmlDocumentConverter.js <input.html> [format]',
    'Formats: pdf (default), docx, html, txt',
    '',
    'Ein nativer "Speichern unter" Dialog wird automatisch geöffnet.',
  ];

  /**
   * Runs one conversion for the given CLI arguments.
   * Exits with status 1 on missing arguments or on a failed conversion.
   */
  const runCli = async (cliArgs) => {
    if (cliArgs.length === 0) {
      usageLines.forEach((line) => console.log(line));
      process.exit(1);
    }

    const [inputPath] = cliArgs;
    const format = cliArgs[1] || 'pdf';

    try {
      await module_exports.convert({ inputPath, format, showDialog: true });
    } catch (err) {
      console.error('✗ Konvertierung fehlgeschlagen:', err.message);
      process.exit(1);
    }
  };

  runCli(process.argv.slice(2));
}
-411
View File
@@ -1,411 +0,0 @@
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
const { marked } = require('marked');
const PDFDocument = require('pdfkit');
const docx = require('docx');
const { Document, Paragraph, TextRun, HeadingLevel } = docx;
// Command-line arguments: <input.md> [format]
const args = process.argv.slice(2);
if (args.length === 0) {
  [
    'Usage: node convert.js <input.md> [format]',
    'Formats: pdf (default), docx, html, txt',
    'Example: node convert.js document.md docx',
  ].forEach((line) => console.log(line));
  process.exit(1);
}

// Source file and (lower-cased) target format; PDF when none is given.
const inputFile = args[0];
const format = (args[1] || 'pdf').toLowerCase();

if (fs.existsSync(inputFile) === false) {
  console.error(`File not found: ${inputFile}`);
  process.exit(1);
}

// Reject anything that is not one of the supported output formats.
const validFormats = ['pdf', 'docx', 'html', 'txt'];
if (validFormats.indexOf(format) === -1) {
  console.error(`Invalid format: ${format}`);
  console.log('Valid formats: pdf, docx, html, txt');
  process.exit(1);
}

// Register a default marked renderer as the globally configured renderer.
const renderer = new marked.Renderer();
marked.setOptions({ renderer });
/**
 * Asks for the destination file through the macOS "choose file name" dialog.
 *
 * osascript is started with an argument vector (no shell) and the suggested
 * name is escaped for AppleScript, so quotes in the file name can neither
 * break the command nor inject anything into it.
 *
 * @param {string} defaultName - Suggested file name without extension.
 * @param {string} format - File extension without the leading dot.
 * @returns {Promise<string>} POSIX path chosen by the user. When the user
 *   cancels, the process exits with status 0 instead of returning.
 * @throws {Error} When osascript fails for any reason other than a cancel.
 */
async function showSaveDialog(defaultName, format) {
  const { execFileSync } = require('child_process');

  // Escape for an AppleScript string literal (backslash and double quote).
  const escapedName = `${defaultName}.${format}`
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"');
  const applescript = [
    `set defaultName to "${escapedName}"`,
    'set theFile to choose file name with prompt "Save converted file as:" default name defaultName',
    'POSIX path of theFile',
  ].join('\n');

  try {
    const result = execFileSync('osascript', ['-e', applescript], { encoding: 'utf8' });
    return result.trim();
  } catch (err) {
    // Exit status 1 is treated as "user cancelled the dialog".
    if (err.status === 1) {
      console.log('Save canceled.');
      process.exit(0);
    }
    throw err;
  }
}
/**
 * Prints a numbered menu on stdout and reads the user's choice from stdin.
 *
 * @returns {Promise<string>} The format mapped to the entered number; 'pdf'
 *   when the answer does not map to anything.
 */
async function promptForFormat() {
  const rl = require('readline').createInterface({
    input: process.stdin,
    output: process.stdout
  });

  const menuLines = [
    '\nChoose output format:',
    '1) PDF',
    '2) DOCX (Word)',
    '3) HTML',
    '4) TXT (Plain text)',
  ];
  const formats = { '1': 'pdf', '2': 'docx', '3': 'html', '4': 'txt' };

  return new Promise((resolve) => {
    for (const line of menuLines) {
      console.log(line);
    }
    rl.question('\nEnter choice (1-4): ', (answer) => {
      rl.close();
      resolve(formats[answer] || 'pdf');
    });
  });
}
/**
 * Script driver: reads the markdown input, asks for the destination file and
 * runs the converter matching the module-level `format`.
 *
 * Conversion errors are reported on stderr and end the process with status 1.
 *
 * @returns {Promise<void>}
 */
async function convertMarkdown() {
  const mdContent = fs.readFileSync(inputFile, 'utf8');
  const baseName = path.basename(inputFile, '.md');
  const outputFile = await showSaveDialog(baseName, format);

  // Separate source and target names; previously they were printed glued together.
  console.log(`\nConverting ${path.basename(inputFile)} → ${path.basename(outputFile)} ...`);

  try {
    switch (format) {
      case 'pdf':
        await convertToPDF(mdContent, outputFile);
        break;
      case 'docx':
        await convertToDOCX(mdContent, outputFile);
        break;
      case 'html':
        await convertToHTML(mdContent, outputFile);
        break;
      case 'txt':
        await convertToTXT(mdContent, outputFile);
        break;
      default:
        throw new Error(`Unsupported format: ${format}`);
    }
    console.log(`✓ Successfully saved: ${outputFile}\n`);
  } catch (err) {
    console.error('✗ Error during conversion:', err.message);
    process.exit(1);
  }
}
/**
 * Splits a line of text into styled segments.
 *
 * Recognised markers are `**…**` and `__…__` (style 'bold') and backticks
 * (style 'code'); everything in between becomes a 'normal' segment.
 *
 * @param {string} text - Inline markdown text.
 * @returns {Array<{text: string, style: string}>} Segments in source order;
 *   a single 'normal' segment holding the whole input when nothing else was produced.
 */
function parseInlineMarkdown(text) {
  const pattern = /(\*\*|__|`)(.*?)\1/g;
  const segments = [];
  let cursor = 0;

  for (const found of text.matchAll(pattern)) {
    const [whole, marker, inner] = found;

    // Plain text between the previous marker pair and this one.
    if (found.index > cursor) {
      segments.push({ text: text.slice(cursor, found.index), style: 'normal' });
    }

    // The pattern only admits '**', '__' and '`' as markers.
    segments.push({ text: inner, style: marker === '`' ? 'code' : 'bold' });
    cursor = found.index + whole.length;
  }

  // Trailing plain text after the last marker pair.
  if (cursor < text.length) {
    segments.push({ text: text.slice(cursor), style: 'normal' });
  }

  if (segments.length === 0) {
    return [{ text, style: 'normal' }];
  }
  return segments;
}
/**
 * Renders markdown into a PDF file with pdfkit.
 *
 * Handles heading, paragraph, list, code, blockquote and space tokens from
 * `marked.lexer`; other token types are skipped.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination PDF path.
 * @returns {Promise<void>} Settles when the output stream has finished or failed.
 */
async function convertToPDF(mdContent, outputFile) {
  const BODY_FONT_SIZE = 12;
  const CODE_FONT_SIZE = 11;

  const doc = new PDFDocument({ margin: 50 });
  const stream = fs.createWriteStream(outputFile);
  doc.pipe(stream);

  // Writes one line of inline markdown as a sequence of styled runs.
  // Font AND size are set for every run, so the smaller size of an inline
  // code run does not carry over into the runs that follow it.
  const writeInlineParts = (text) => {
    const parts = parseInlineMarkdown(text);
    parts.forEach((part, idx) => {
      if (part.style === 'bold') {
        doc.font('Helvetica-Bold').fontSize(BODY_FONT_SIZE);
      } else if (part.style === 'code') {
        doc.font('Courier').fontSize(CODE_FONT_SIZE);
      } else {
        doc.font('Helvetica').fontSize(BODY_FONT_SIZE);
      }
      // Keep the line open until the last run has been written.
      doc.text(part.text, { continued: idx < parts.length - 1 });
    });
  };

  const tokens = marked.lexer(mdContent);
  for (const token of tokens) {
    switch (token.type) {
      case 'heading': {
        // Heading size shrinks by 3pt per nesting level.
        const size = 28 - (token.depth * 3);
        doc.fontSize(size)
          .font('Helvetica-Bold')
          .text(token.text, { continued: false })
          .moveDown(0.5);
        break;
      }
      case 'paragraph':
        writeInlineParts(token.text);
        doc.moveDown(0.5);
        break;
      case 'list':
        token.items.forEach((item) => {
          doc.fontSize(BODY_FONT_SIZE);
          doc.font('Helvetica').text('• ', { indent: 20, continued: true });
          writeInlineParts(item.text);
          doc.moveDown(0.3);
        });
        doc.moveDown(0.5);
        break;
      case 'code':
        doc.fontSize(10)
          .font('Courier')
          .fillColor('#333333')
          .text(token.text, { indent: 20 })
          .fillColor('#000000')
          .moveDown();
        break;
      case 'blockquote':
        doc.fontSize(11)
          .font('Helvetica-Oblique')
          .fillColor('#666666')
          .text(token.text, { indent: 20 })
          .fillColor('#000000')
          .moveDown();
        break;
      case 'space':
        doc.moveDown(0.5);
        break;
    }
  }

  doc.end();

  return new Promise((resolve, reject) => {
    stream.on('finish', resolve);
    stream.on('error', reject);
  });
}
/**
 * Converts markdown into a Word document using the docx library.
 *
 * Handles heading, paragraph, list, code, blockquote and space tokens from
 * `marked.lexer`; other token types are skipped.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination DOCX path.
 * @returns {Promise<void>}
 */
async function convertToDOCX(mdContent, outputFile) {
  const headingLevels = [
    HeadingLevel.HEADING_1,
    HeadingLevel.HEADING_2,
    HeadingLevel.HEADING_3,
    HeadingLevel.HEADING_4,
    HeadingLevel.HEADING_5,
    HeadingLevel.HEADING_6
  ];

  // Builds the styled runs for one line of inline markdown.
  const toTextRuns = (text) =>
    parseInlineMarkdown(text).map((part) => new TextRun({
      text: part.text,
      bold: part.style === 'bold',
      font: part.style === 'code' ? 'Courier New' : undefined
    }));

  const tokens = marked.lexer(mdContent);
  const children = [];

  for (const token of tokens) {
    switch (token.type) {
      case 'heading':
        children.push(
          new Paragraph({
            text: token.text,
            heading: headingLevels[token.depth - 1] || HeadingLevel.HEADING_1
          })
        );
        break;
      case 'paragraph':
        children.push(new Paragraph({ children: toTextRuns(token.text) }));
        break;
      case 'list':
        token.items.forEach((item) => {
          children.push(new Paragraph({
            children: toTextRuns(item.text),
            bullet: { level: 0 }
          }));
        });
        break;
      case 'code':
        // This document defines no 'Code' paragraph style, so the monospace
        // font is also set on the run itself (same font as inline code).
        children.push(new Paragraph({
          children: [new TextRun({ text: token.text, font: 'Courier New' })],
          style: 'Code'
        }));
        break;
      case 'blockquote':
        // Italics is a run-level property; it has to be set on a TextRun,
        // not on the Paragraph options.
        children.push(new Paragraph({
          children: [new TextRun({ text: token.text, italics: true })],
          indent: { left: 720 }
        }));
        break;
      case 'space':
        children.push(new Paragraph({ text: '' }));
        break;
    }
  }

  const doc = new Document({
    sections: [{
      properties: {},
      children: children
    }]
  });

  const buffer = await docx.Packer.toBuffer(doc);
  fs.writeFileSync(outputFile, buffer);
}
/**
 * Renders markdown to a standalone, styled HTML page.
 *
 * The page title is the input file's base name (module-level `inputFile`),
 * HTML-escaped so that names containing `&`, `<`, `>` or quotes cannot break
 * the markup.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination HTML path.
 * @returns {Promise<void>}
 */
async function convertToHTML(mdContent, outputFile) {
  const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => value.replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  const html = marked.parse(mdContent);
  const title = escapeHtml(path.basename(inputFile, '.md'));
  const fullHTML = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>${title}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
max-width: 800px;
margin: 50px auto;
padding: 20px;
line-height: 1.6;
color: #333;
}
h1, h2, h3, h4, h5, h6 {
margin-top: 24px;
margin-bottom: 16px;
font-weight: 600;
line-height: 1.25;
}
h1 { font-size: 2em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
code {
background: #f6f8fa;
padding: 2px 6px;
border-radius: 3px;
font-family: 'Courier New', Courier, monospace;
font-size: 0.9em;
}
pre {
background: #f6f8fa;
padding: 16px;
border-radius: 6px;
overflow-x: auto;
line-height: 1.45;
}
pre code {
background: none;
padding: 0;
}
blockquote {
border-left: 4px solid #dfe2e5;
padding-left: 16px;
color: #6a737d;
margin: 16px 0;
}
a {
color: #0366d6;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
strong {
font-weight: 600;
}
</style>
</head>
<body>
${html}
</body>
</html>`;
  fs.writeFileSync(outputFile, fullHTML, 'utf8');
}
/**
 * Flattens markdown into plain text and writes it to disk.
 *
 * Headings are upper-cased and framed by `=` rules, bold markers are removed
 * from paragraphs and list items, list items get a bullet prefix, code blocks
 * are copied verbatim and blockquotes are prefixed with `>`. Other token
 * types produce no output.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination text file path.
 * @returns {Promise<void>}
 */
async function convertToTXT(mdContent, outputFile) {
  // Remove **bold** / __bold__ markers, keeping the enclosed text.
  const stripBold = (value) =>
    value.replace(/\*\*(.+?)\*\*/g, '$1').replace(/__(.+?)__/g, '$1');

  const chunks = [];
  for (const token of marked.lexer(mdContent)) {
    if (token.type === 'heading') {
      const rule = '='.repeat(token.text.length);
      chunks.push(`\n${rule}\n`, `${token.text.toUpperCase()}\n`, `${rule}\n\n`);
    } else if (token.type === 'paragraph') {
      chunks.push(`${stripBold(token.text)}\n\n`);
    } else if (token.type === 'list') {
      for (const item of token.items) {
        chunks.push(` • ${stripBold(item.text)}\n`);
      }
      chunks.push('\n');
    } else if (token.type === 'code') {
      chunks.push(`\n${token.text}\n\n`);
    } else if (token.type === 'blockquote') {
      chunks.push(` > ${token.text}\n\n`);
    }
  }

  fs.writeFileSync(outputFile, chunks.join(''), 'utf8');
}
// Start the conversion. Failures raised before the converter's own error
// handling (reading the input, the save dialog) are reported here instead of
// surfacing as an unhandled promise rejection.
convertMarkdown().catch((err) => {
  console.error('✗ Error during conversion:', err.message);
  process.exit(1);
});