Changed the module to use puppeteer and html-to-docx

This commit is contained in:
MikeHughes-BIN
2025-12-14 18:14:16 +01:00
parent 7cd334645f
commit 271fe78b7b
5 changed files with 1675 additions and 414 deletions
+1442 -2
View File
File diff suppressed because it is too large Load Diff
+3 -1
View File
@@ -10,9 +10,11 @@
"express": "^5.1.0",
"ffmpeg-static": "^5.2.0",
"fluent-ffmpeg": "^2.1.3",
"html-to-docx": "^1.8.0",
"marked": "^17.0.1",
"open": "^11.0.0",
"pdfkit": "^0.17.2"
"pdfkit": "^0.17.2",
"puppeteer": "^24.33.0"
},
"devDependencies": {
"@types/cli-progress": "^3.11.6",
+194
View File
@@ -0,0 +1,194 @@
const fs = require('fs');
const os = require('os');
const path = require('path');
const { execFileSync, execSync } = require('child_process');
const htmlToDocx = require('html-to-docx');
const puppeteer = require('puppeteer');
// Default destination for converted documents when no save dialog is shown.
const outputDir = path.join(__dirname, "../../../storage/documents");
// With `recursive: true`, mkdirSync creates missing parents and does not fail
// when the directory already exists, so a separate existsSync() check (and the
// check-then-create race it introduces) is unnecessary.
fs.mkdirSync(outputDir, { recursive: true });
/**
 * Opens the platform's native "Save as" dialog and returns the chosen path.
 *
 * All helper programs are started with execFileSync (argument vector, no
 * shell), so characters in the suggested file name can neither break the
 * command line nor be interpreted by a shell.
 *
 * @param {string} defaultName - Suggested file name without extension.
 * @param {string} format - File extension without the leading dot.
 * @returns {Promise<string|null>} The selected path, `null` when the user
 *   cancelled, or (Linux only) a path inside ~/Downloads when no dialog tool
 *   or graphical session is available.
 * @throws {Error} When the macOS or Windows dialog helper fails for a reason
 *   other than the user cancelling.
 */
async function showSaveDialog(defaultName, format) {
  const platform = os.platform();
  const suggestedName = `${defaultName}.${format}`;

  if (platform === 'darwin') {
    // Escape the name for use inside an AppleScript string literal.
    const quotedName = suggestedName
      .replace(/[\r\n]+/g, ' ')
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"');
    const applescript = [
      `set defaultName to "${quotedName}"`,
      'set theFile to choose file name with prompt "Dokument speichern als:" default name defaultName',
      'POSIX path of theFile',
    ].join('\n');
    try {
      const result = execFileSync('osascript', ['-e', applescript], { encoding: 'utf8' });
      return result.trim() || null;
    } catch (err) {
      if (err.status === 1) return null; // User canceled
      throw err;
    }
  }

  if (platform === 'win32') {
    // PowerShell single-quoted literal: embedded (straight or typographic)
    // single quotes are escaped by doubling them.
    const psQuote = (value) =>
      `'${String(value).replace(/['\u2018\u2019\u201a\u201b]/g, '$&$&')}'`;
    const filter = `${format.toUpperCase()} Dateien (*.${format})|*.${format}|Alle Dateien (*.*)|*.*`;
    const script = [
      'Add-Type -AssemblyName System.Windows.Forms',
      '$dialog = New-Object System.Windows.Forms.SaveFileDialog',
      `$dialog.FileName = ${psQuote(suggestedName)}`,
      `$dialog.Filter = ${psQuote(filter)}`,
      `$dialog.Title = ${psQuote('Dokument speichern als')}`,
      '$result = $dialog.ShowDialog()',
      "if ($result -eq 'OK') { $dialog.FileName }",
    ].join('\n');
    // -EncodedCommand takes the script as base64-encoded UTF-16LE, which
    // avoids every command-line quoting problem of `-Command "..."`.
    const encoded = Buffer.from(script, 'utf16le').toString('base64');
    const result = execFileSync(
      'powershell',
      ['-NoProfile', '-STA', '-EncodedCommand', encoded],
      { encoding: 'utf8' }
    );
    return result.trim() || null;
  }

  // Linux and others: try zenity, then kdialog, then fall back to ~/Downloads.
  const fallbackPath = path.join(os.homedir(), 'Downloads', suggestedName);
  if (!process.env.DISPLAY && !process.env.WAYLAND_DISPLAY) {
    // No graphical session: a dialog cannot be shown at all.
    return fallbackPath;
  }
  const dialogTools = [
    ['zenity', ['--file-selection', '--save', '--confirm-overwrite', `--filename=${suggestedName}`]],
    ['kdialog', ['--getsavefilename', '.', suggestedName]],
  ];
  for (const [command, args] of dialogTools) {
    try {
      const result = execFileSync(command, args, { encoding: 'utf8' });
      return result.trim() || null;
    } catch (err) {
      // Exit status 1 is how both tools report "cancelled"; honour it instead
      // of silently saving somewhere the user did not choose.
      if (err.status === 1) return null;
      // Any other failure (e.g. tool not installed): try the next tool.
    }
  }
  return fallbackPath;
}
const module_exports = {
  name: "htmlDocumentConverter",
  type: "converter",
  displayname: "HTML Document Converter",
  description: "Converts LLM-generated HTML to PDF, DOCX, TXT, or HTML",

  /**
   * Main conversion function.
   *
   * @param {Object} options
   * @param {string} options.inputPath - Path to the HTML input
   * @param {string} [options.format] - 'pdf' | 'docx' | 'html' | 'txt' (case-insensitive, default 'pdf')
   * @param {string} [options.outputName] - Optional output filename (without extension)
   * @param {boolean} [options.showDialog] - Show a native save dialog (default: false)
   * @returns {Promise<string|null>} Path of the written file, or `null` when
   *   the save dialog was cancelled.
   * @throws {Error} For an unsupported format or a missing input file.
   */
  async convert({ inputPath, format = 'pdf', outputName, showDialog = false }) {
    // Validate the format first so the user is never shown a save dialog for
    // a conversion that cannot succeed.
    const supportedFormats = ['html', 'pdf', 'docx', 'txt'];
    const targetFormat = String(format).toLowerCase();
    if (!supportedFormats.includes(targetFormat)) {
      throw new Error(`Unsupported format: ${format}`);
    }
    if (!fs.existsSync(inputPath)) {
      throw new Error(`Input file not found: ${inputPath}`);
    }

    // Use the extension exactly as written: path.basename() only strips a
    // suffix that matches, so a lower-cased copy would leave "Report.HTML"
    // untouched and yield "Report.HTML.pdf".
    const ext = path.extname(inputPath);
    const baseName = outputName || path.basename(inputPath, ext);

    let outputFile;
    if (showDialog) {
      // Ask the user for the destination via the native dialog.
      outputFile = await showSaveDialog(baseName, targetFormat);
      if (!outputFile) {
        console.log('Speichervorgang abgebrochen.');
        return null;
      }
    } else {
      // Use the default output directory.
      outputFile = path.join(outputDir, `${baseName}.${targetFormat}`);
    }

    let htmlContent = fs.readFileSync(inputPath, 'utf8');
    // Remove <think> tags if present
    htmlContent = htmlContent.replace(/<think>[\s\S]*?<\/think>/gi, '');

    switch (targetFormat) {
      case 'html':
        fs.writeFileSync(outputFile, htmlContent, 'utf8');
        break;
      case 'pdf':
        await this.htmlToPDF(htmlContent, outputFile);
        break;
      case 'docx':
        await this.htmlToDOCX(htmlContent, outputFile);
        break;
      case 'txt':
        fs.writeFileSync(outputFile, this.htmlToTXT(htmlContent), 'utf8');
        break;
      default:
        throw new Error(`Unsupported format: ${format}`);
    }
    console.log(`✓ Erfolgreich gespeichert: ${outputFile}`);
    return outputFile;
  },

  /**
   * HTML → PDF: renders the markup in headless Chromium and prints it as A4.
   *
   * @param {string} html - Markup to render.
   * @param {string} outputPath - Destination PDF path.
   * @returns {Promise<void>}
   */
  async htmlToPDF(html, outputPath) {
    // NOTE(review): the Chromium sandbox is disabled here while the rendered
    // HTML is LLM-generated; confirm this is required by the deployment
    // environment before keeping these flags.
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    try {
      const page = await browser.newPage();
      await page.setContent(html, { waitUntil: 'networkidle0' });
      await page.pdf({
        path: outputPath,
        format: 'A4',
        printBackground: true,
        margin: { top: '20mm', right: '20mm', bottom: '20mm', left: '20mm' }
      });
    } finally {
      // Always shut the browser down, even when rendering fails; otherwise a
      // Chromium process is left running for every failed conversion.
      await browser.close();
    }
  },

  /**
   * HTML → DOCX via html-to-docx.
   *
   * @param {string} html - Markup to convert.
   * @param {string} outputPath - Destination DOCX path.
   * @returns {Promise<void>}
   */
  async htmlToDOCX(html, outputPath) {
    const buffer = await htmlToDocx(html);
    fs.writeFileSync(outputPath, buffer);
  },

  /**
   * HTML → TXT (rudimentary): drops <script>/<style> blocks including their
   * contents, strips the remaining tags, decodes common character references
   * and removes whitespace that trails at line ends.
   *
   * @param {string} html - Markup to flatten.
   * @returns {string} Plain text.
   */
  htmlToTXT(html) {
    const namedEntities = new Map([
      ['amp', '&'],
      ['lt', '<'],
      ['gt', '>'],
      ['quot', '"'],
      ['apos', "'"],
      ['nbsp', ' '],
    ]);
    return html
      // CSS and JavaScript source is not document text.
      .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
      .replace(/<[^>]*>/g, '')
      // Decode after tag stripping so "&lt;b&gt;" survives as literal text.
      .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match, ref) => {
        if (ref[0] === '#') {
          const isHex = ref[1] === 'x' || ref[1] === 'X';
          const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
          return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
        }
        const decoded = namedEntities.get(ref.toLowerCase());
        return decoded === undefined ? match : decoded;
      })
      .replace(/\s+\n/g, '\n')
      .trim();
  }
};
module.exports = module_exports;

// CLI usage: when this file is executed directly, convert the given input and
// let the user pick the destination through the native save dialog.
if (require.main === module) {
  const runCli = async (argv) => {
    if (argv.length < 1) {
      [
        'Usage: node htmlDocumentConverter.js <input.html> [format]',
        'Formats: pdf (default), docx, html, txt',
        '',
        'Ein nativer "Speichern unter" Dialog wird automatisch geöffnet.',
      ].forEach((line) => console.log(line));
      process.exit(1);
    }

    const [inputPath] = argv;
    const format = argv[1] || 'pdf';
    try {
      await module_exports.convert({ inputPath, format, showDialog: true });
    } catch (err) {
      console.error('✗ Konvertierung fehlgeschlagen:', err.message);
      process.exit(1);
    }
  };

  runCli(process.argv.slice(2));
}
-411
View File
@@ -1,411 +0,0 @@
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
const { marked } = require('marked');
const PDFDocument = require('pdfkit');
const docx = require('docx');
const { Document, Paragraph, TextRun, HeadingLevel } = docx;
// Command line: <input.md> [format]. Print the usage text and exit with
// status 1 when no input file is given.
const args = process.argv.slice(2);
if (args.length === 0) {
  [
    'Usage: node convert.js <input.md> [format]',
    'Formats: pdf (default), docx, html, txt',
    'Example: node convert.js document.md docx',
  ].forEach((line) => console.log(line));
  process.exit(1);
}

const [inputFile] = args;
const format = (args[1] || 'pdf').toLowerCase();

// The input file must exist before anything else happens.
if (!fs.existsSync(inputFile)) {
  console.error(`File not found: ${inputFile}`);
  process.exit(1);
}

// Only these output formats have a converter below.
const validFormats = ['pdf', 'docx', 'html', 'txt'];
if (validFormats.indexOf(format) === -1) {
  console.error(`Invalid format: ${format}`);
  console.log('Valid formats: pdf, docx, html, txt');
  process.exit(1);
}

// Register a marked renderer instance (no overrides are defined on it here).
const renderer = new marked.Renderer();
marked.setOptions({ renderer });
/**
 * Shows the macOS "choose file name" dialog through osascript and returns the
 * selected POSIX path. Exits the process with status 0 when osascript reports
 * exit status 1 (treated as the user cancelling).
 *
 * @param {string} defaultName - Suggested file name without extension.
 * @param {string} format - File extension without the leading dot.
 * @returns {Promise<string>} The path chosen by the user.
 * @throws {Error} When osascript fails with any status other than 1.
 */
async function showSaveDialog(defaultName, format) {
  // Escape the suggested name for an AppleScript string literal. The script is
  // handed to osascript as a single argument (execFileSync, no shell), so
  // quotes in the file name can no longer break out of the command line.
  const quotedName = `${defaultName}.${format}`
    .replace(/[\r\n]+/g, ' ')
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"');
  const applescript = [
    `set defaultName to "${quotedName}"`,
    'set theFile to choose file name with prompt "Save converted file as:" default name defaultName',
    'POSIX path of theFile',
  ].join('\n');
  try {
    const { execFileSync } = require('child_process');
    const result = execFileSync('osascript', ['-e', applescript], { encoding: 'utf8' });
    return result.trim();
  } catch (err) {
    if (err.status === 1) {
      console.log('Save canceled.');
      process.exit(0);
    }
    throw err;
  }
}
/**
 * Prints a numbered menu on stdout and reads the user's choice from stdin.
 *
 * @returns {Promise<string>} The format mapped to the entered number, or
 *   'pdf' when the answer has no (truthy) entry in the lookup table.
 */
async function promptForFormat() {
  const rl = require('readline').createInterface({
    input: process.stdin,
    output: process.stdout
  });
  const menuLines = [
    '\nChoose output format:',
    '1) PDF',
    '2) DOCX (Word)',
    '3) HTML',
    '4) TXT (Plain text)',
  ];
  const formats = { '1': 'pdf', '2': 'docx', '3': 'html', '4': 'txt' };

  return new Promise((resolve) => {
    for (const line of menuLines) {
      console.log(line);
    }
    rl.question('\nEnter choice (1-4): ', (answer) => {
      // Release stdin before handing the result back.
      rl.close();
      resolve(formats[answer] || 'pdf');
    });
  });
}
/**
 * Reads the markdown input, asks for the destination and runs the converter
 * selected by the module-level `format`.
 *
 * Every failure (reading the input, the save dialog, the conversion itself)
 * is reported on stderr and ends the process with status 1, so the returned
 * promise never rejects with an unhandled error.
 *
 * @returns {Promise<void>}
 */
async function convertMarkdown() {
  try {
    const mdContent = fs.readFileSync(inputFile, 'utf8');
    const defaultName = path.basename(inputFile, '.md');
    const outputFile = await showSaveDialog(defaultName, format);
    // Separate source and destination names; without the arrow the two file
    // names run together in the progress message.
    console.log(`\nConverting ${path.basename(inputFile)} → ${path.basename(outputFile)} ...`);
    switch (format) {
      case 'pdf':
        await convertToPDF(mdContent, outputFile);
        break;
      case 'docx':
        await convertToDOCX(mdContent, outputFile);
        break;
      case 'html':
        await convertToHTML(mdContent, outputFile);
        break;
      case 'txt':
        await convertToTXT(mdContent, outputFile);
        break;
      default:
        throw new Error(`Unsupported format: ${format}`);
    }
    console.log(`✓ Successfully saved: ${outputFile}\n`);
  } catch (err) {
    console.error('✗ Error during conversion:', err.message);
    process.exit(1);
  }
}
/**
 * Splits a line of markdown into styled segments.
 *
 * Text wrapped in `**` or `__` becomes a 'bold' segment, text wrapped in
 * backticks becomes a 'code' segment, and everything in between is 'normal'.
 *
 * @param {string} text - Inline markdown source.
 * @returns {Array<{text: string, style: string}>} Segments in source order;
 *   a single 'normal' segment holding `text` when nothing was produced.
 */
function parseInlineMarkdown(text) {
  const segments = [];
  let cursor = 0;

  for (const hit of text.matchAll(/(\*\*|__|`)(.*?)\1/g)) {
    const [whole, delimiter, inner] = hit;
    // Plain text between the previous match and this one.
    if (hit.index > cursor) {
      segments.push({ text: text.slice(cursor, hit.index), style: 'normal' });
    }
    segments.push({ text: inner, style: delimiter === '`' ? 'code' : 'bold' });
    cursor = hit.index + whole.length;
  }

  // Trailing plain text after the last match.
  if (cursor < text.length) {
    segments.push({ text: text.slice(cursor), style: 'normal' });
  }

  if (segments.length === 0) {
    return [{ text, style: 'normal' }];
  }
  return segments;
}
// Writes the styled segments of one line to the PDF, keeping them on the same
// line via `continued`. Font AND size are set for every segment so that a
// 'code' segment (11pt Courier) cannot leak its size into the text after it.
function writeInlineParts(doc, parts) {
  parts.forEach((part, idx) => {
    if (part.style === 'bold') {
      doc.font('Helvetica-Bold').fontSize(12);
    } else if (part.style === 'code') {
      doc.font('Courier').fontSize(11);
    } else {
      doc.font('Helvetica').fontSize(12);
    }
    doc.text(part.text, { continued: idx < parts.length - 1 });
  });
}

/**
 * Renders markdown to a PDF file with pdfkit.
 *
 * Handles the marked token types heading, paragraph, list, code, blockquote
 * and space; any other token type is skipped.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination PDF path.
 * @returns {Promise<void>} Resolves when the output stream has finished,
 *   rejects when the stream reports an error.
 */
async function convertToPDF(mdContent, outputFile) {
  const doc = new PDFDocument({ margin: 50 });
  const stream = fs.createWriteStream(outputFile);
  doc.pipe(stream);
  const tokens = marked.lexer(mdContent);
  for (const token of tokens) {
    switch (token.type) {
      case 'heading': {
        // Deeper headings get smaller type: 25pt for h1 down to 10pt for h6.
        const size = 28 - (token.depth * 3);
        doc.fontSize(size)
          .font('Helvetica-Bold')
          .text(token.text, { continued: false })
          .moveDown(0.5);
        break;
      }
      case 'paragraph':
        doc.fontSize(12);
        writeInlineParts(doc, parseInlineMarkdown(token.text));
        doc.moveDown(0.5);
        break;
      case 'list':
        token.items.forEach((item) => {
          doc.fontSize(12);
          doc.font('Helvetica').text('• ', { indent: 20, continued: true });
          writeInlineParts(doc, parseInlineMarkdown(item.text));
          doc.moveDown(0.3);
        });
        doc.moveDown(0.5);
        break;
      case 'code':
        doc.fontSize(10)
          .font('Courier')
          .fillColor('#333333')
          .text(token.text, { indent: 20 })
          .fillColor('#000000')
          .moveDown();
        break;
      case 'blockquote':
        doc.fontSize(11)
          .font('Helvetica-Oblique')
          .fillColor('#666666')
          .text(token.text, { indent: 20 })
          .fillColor('#000000')
          .moveDown();
        break;
      case 'space':
        doc.moveDown(0.5);
        break;
    }
  }
  doc.end();
  return new Promise((resolve, reject) => {
    stream.on('finish', resolve);
    stream.on('error', reject);
  });
}
// Converts one line of inline markdown into docx TextRuns (bold / monospace).
function buildTextRuns(text) {
  return parseInlineMarkdown(text).map((part) => new TextRun({
    text: part.text,
    bold: part.style === 'bold',
    font: part.style === 'code' ? 'Courier New' : undefined
  }));
}

/**
 * Renders markdown to a DOCX file with the docx library.
 *
 * Handles the marked token types heading, paragraph, list, code, blockquote
 * and space; any other token type is skipped.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination DOCX path.
 * @returns {Promise<void>}
 */
async function convertToDOCX(mdContent, outputFile) {
  const headingLevels = [
    HeadingLevel.HEADING_1,
    HeadingLevel.HEADING_2,
    HeadingLevel.HEADING_3,
    HeadingLevel.HEADING_4,
    HeadingLevel.HEADING_5,
    HeadingLevel.HEADING_6
  ];
  const tokens = marked.lexer(mdContent);
  const children = [];
  for (const token of tokens) {
    switch (token.type) {
      case 'heading':
        children.push(
          new Paragraph({
            text: token.text,
            heading: headingLevels[token.depth - 1] || HeadingLevel.HEADING_1
          })
        );
        break;
      case 'paragraph':
        children.push(new Paragraph({ children: buildTextRuns(token.text) }));
        break;
      case 'list':
        token.items.forEach((item) => {
          children.push(new Paragraph({
            children: buildTextRuns(item.text),
            bullet: { level: 0 }
          }));
        });
        break;
      case 'code':
        // NOTE(review): no paragraph style named 'Code' is defined on the
        // Document created below — confirm it renders as intended.
        children.push(new Paragraph({
          text: token.text,
          style: 'Code'
        }));
        break;
      case 'blockquote':
        // Italics is a run-level property: set on the Paragraph options it
        // has no effect, so the quote text goes into an italic TextRun.
        children.push(new Paragraph({
          children: [new TextRun({ text: token.text, italics: true })],
          indent: { left: 720 }
        }));
        break;
      case 'space':
        children.push(new Paragraph({ text: '' }));
        break;
    }
  }
  const doc = new Document({
    sections: [{
      properties: {},
      children: children
    }]
  });
  const buffer = await docx.Packer.toBuffer(doc);
  fs.writeFileSync(outputFile, buffer);
}
/**
 * Renders markdown to a standalone, styled HTML page.
 *
 * The page title is the input file name without its `.md` suffix; it is
 * HTML-escaped because a file name may contain `&`, `<` or `>`.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination HTML path.
 * @returns {Promise<void>}
 */
async function convertToHTML(mdContent, outputFile) {
  const escapeHtml = (value) => value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const pageTitle = escapeHtml(path.basename(inputFile, '.md'));
  const html = marked.parse(mdContent);
  const fullHTML = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>${pageTitle}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
max-width: 800px;
margin: 50px auto;
padding: 20px;
line-height: 1.6;
color: #333;
}
h1, h2, h3, h4, h5, h6 {
margin-top: 24px;
margin-bottom: 16px;
font-weight: 600;
line-height: 1.25;
}
h1 { font-size: 2em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
code {
background: #f6f8fa;
padding: 2px 6px;
border-radius: 3px;
font-family: 'Courier New', Courier, monospace;
font-size: 0.9em;
}
pre {
background: #f6f8fa;
padding: 16px;
border-radius: 6px;
overflow-x: auto;
line-height: 1.45;
}
pre code {
background: none;
padding: 0;
}
blockquote {
border-left: 4px solid #dfe2e5;
padding-left: 16px;
color: #6a737d;
margin: 16px 0;
}
a {
color: #0366d6;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
strong {
font-weight: 600;
}
</style>
</head>
<body>
${html}
</body>
</html>`;
  fs.writeFileSync(outputFile, fullHTML, 'utf8');
}
/**
 * Renders markdown to plain text.
 *
 * Headings are upper-cased and framed by lines of '=', bold markers are
 * removed from paragraphs and list items, list items get a bullet, code is
 * emitted verbatim and blockquotes are prefixed with '>'. Other token types
 * are skipped.
 *
 * @param {string} mdContent - Markdown source.
 * @param {string} outputFile - Destination text file path.
 * @returns {Promise<void>}
 */
async function convertToTXT(mdContent, outputFile) {
  // Remove markdown bold markers (** and __) for plain text.
  const stripBold = (value) =>
    value.replace(/\*\*(.+?)\*\*/g, '$1').replace(/__(.+?)__/g, '$1');

  const chunks = [];
  for (const token of marked.lexer(mdContent)) {
    if (token.type === 'heading') {
      const rule = '='.repeat(token.text.length);
      chunks.push(`\n${rule}\n`, `${token.text.toUpperCase()}\n`, `${rule}\n\n`);
    } else if (token.type === 'paragraph') {
      chunks.push(`${stripBold(token.text)}\n\n`);
    } else if (token.type === 'list') {
      chunks.push(...token.items.map((item) => ` • ${stripBold(item.text)}\n`), '\n');
    } else if (token.type === 'code') {
      chunks.push(`\n${token.text}\n\n`);
    } else if (token.type === 'blockquote') {
      chunks.push(` > ${token.text}\n\n`);
    }
  }
  fs.writeFileSync(outputFile, chunks.join(''), 'utf8');
}
// Start the conversion. The promise is given a rejection handler so that a
// failure is reported and mapped to exit status 1 instead of surfacing as an
// unhandled promise rejection.
convertMarkdown().catch((err) => {
  console.error('✗ Error during conversion:', err.message);
  process.exit(1);
});
+36
View File
@@ -0,0 +1,36 @@
{
"SPEAKERS":"First Identify each speaker in the transcript and give the first time snippet where they speak for the first time. Use labels like Speaker 1, Speaker 2, etc. If no speakers are identified, use 'Speaker X'.",
"FORMAT": "HTML",
"GOAL":"Generate a structured meeting report (HTML). **Output ONLY:** final .html. No meta.",
"STRUCTURE": {
"titlepage": ["title","date","start","end","duration","location","host","participants"],
"toc": "[section](#anchor) — HH:MM:SS",
"section": {
"h2": "<topic> — HH:MM:SS",
"summary": "1 sentence",
"key_points": "<=5 bullets, quotes optional",
"decisions": "list: text | owner | due",
"actions": "table: id | task | owner | due | status"
},
"exec_summary": "3 short sentences",
"consolidated": ["decisions", "actions"],
"appendix": "optional"
},
"STYLE": {
"tone": "neutral, concise",
"ts_format": "HH:MM:SS",
"no_meta": true
},
"PROCESS": {
"timestamps": "use if present; else estimate minimal",
"speakers": "use labels; else Speaker X",
"long_transcripts": "chunk → summarize → merge",
"unclear": "UNKNOWN:<reason>"
},
"JSON_OUTPUT_OPTIONAL": true,
"PROMPT_SNIPPET": "Generate meeting report in HTML using STRUCTURE and STYLE. Output only the report."
}