Files
video2document/services/modules/convert/convert.js
T

237 lines
7.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const os = require("os");
const path = require("path");

const htmlToDocx = require("html-to-docx");
const puppeteer = require("puppeteer");
// Default destination for converted documents when no save dialog is used.
const outputDir = path.join(__dirname, "../../../storage/documents");

// Create the directory (and any missing parents) the first time the module is loaded.
if (fs.existsSync(outputDir) === false) {
  fs.mkdirSync(outputDir, { recursive: true });
}
/**
 * Opens a native "save as" dialog and returns the path the user picked.
 *
 * The dialog helpers are started without a shell (execFileSync with an
 * argument array), so characters in `defaultName` such as quotes, `$()` or
 * backticks can neither break the command nor inject additional commands.
 *
 * Platform behaviour:
 * - macOS:   `osascript` running an AppleScript `choose file name`.
 * - Windows: PowerShell with `System.Windows.Forms.SaveFileDialog`.
 * - other:   `zenity`, then `kdialog`; if both fail, a path inside
 *            `~/Downloads` is returned without asking the user.
 *
 * @param {string} defaultName - Suggested file name without extension.
 * @param {string} format - File extension without the leading dot.
 * @returns {Promise<string|null>} The chosen path, or null when the dialog
 *   helper exited with status 1 (macOS/Windows) or printed nothing (Windows).
 * @throws {Error} When the macOS or Windows helper fails with any other status.
 */
async function showSaveDialog(defaultName, format) {
  const platform = os.platform();

  if (platform === "darwin") {
    // Escape backslashes and double quotes for an AppleScript string literal.
    const quotedName = `${defaultName}.${format}`.replace(/[\\"]/g, "\\$&");
    const applescript = [
      `set defaultName to "${quotedName}"`,
      'set theFile to choose file name with prompt "Dokument speichern als:" default name defaultName',
      "POSIX path of theFile",
    ].join("\n");

    try {
      const result = execFileSync("osascript", ["-e", applescript], {
        encoding: "utf8",
      });
      return result.trim();
    } catch (err) {
      if (err.status === 1) return null; // User cancelled
      throw err;
    }
  }

  if (platform === "win32") {
    // A stray "%" (e.g. "100%") makes decodeURIComponent throw; in that case
    // the name was not URI-encoded, so use it unchanged.
    let decodedName;
    try {
      decodedName = decodeURIComponent(defaultName);
    } catch (err) {
      if (!(err instanceof URIError)) throw err;
      decodedName = defaultName;
    }

    // Inside a PowerShell single-quoted string every quote character must be
    // doubled; PowerShell also treats typographic single quotes as delimiters.
    const psQuote = (value) =>
      String(value).replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
    const safeName = psQuote(decodedName);
    const safeFormat = psQuote(format);

    const powershell = [
      "Add-Type -AssemblyName System.Windows.Forms;",
      "$dialog = New-Object System.Windows.Forms.SaveFileDialog;",
      `$dialog.FileName = '${safeName}.${safeFormat}';`,
      `$dialog.Filter = '${safeFormat.toUpperCase()} Dateien (*.${safeFormat})|*.${safeFormat}|Alle Dateien (*.*)|*.*';`,
      "$dialog.Title = 'Dokument speichern als';",
      "$result = $dialog.ShowDialog();",
      "if ($result -eq 'OK') { $dialog.FileName }",
    ]
      .join(" ")
      // The script is passed as one -Command argument; keep it on one line.
      .replace(/\r?\n/g, " ");

    try {
      const result = execFileSync(
        "powershell",
        ["-NoProfile", "-Command", powershell],
        { encoding: "utf8" },
      );
      // Nothing is printed unless the dialog was confirmed with OK.
      return result.trim() || null;
    } catch (err) {
      if (err.status === 1) return null; // User cancelled
      throw new Error(`Save dialog failed: ${err.message}`, { cause: err });
    }
  }

  // Linux and other platforms: zenity first, kdialog second.
  const suggestedName = `${defaultName}.${format}`;
  try {
    const result = execFileSync(
      "zenity",
      [
        "--file-selection",
        "--save",
        "--confirm-overwrite",
        `--filename=${suggestedName}`,
      ],
      { encoding: "utf8" },
    );
    return result.trim();
  } catch (zenityErr) {
    // NOTE(review): any zenity failure lands here, including the user
    // pressing Cancel, so a cancelled dialog still falls through to kdialog
    // and finally to the Downloads fallback. Confirm whether that is intended.
    try {
      const result = execFileSync(
        "kdialog",
        ["--getsavefilename", ".", suggestedName],
        { encoding: "utf8" },
      );
      return result.trim();
    } catch (kdialogErr) {
      // Best-effort fallback when no dialog helper could be used.
      return path.join(os.homedir(), "Downloads", suggestedName);
    }
  }
}
/**
 * Converter module: turns an HTML file into a PDF, DOCX, TXT or HTML document.
 *
 * `convert` is the entry point; the `htmlTo*` methods implement the individual
 * target formats and can also be called directly.
 */
const module_exports = {
  name: "htmlDocumentConverter",
  type: "converter",
  displayname: "HTML Document Converter",
  description: "Converts LLM-generated HTML to PDF, DOCX, TXT, or HTML",

  /**
   * Main conversion function.
   *
   * Reads the HTML file, removes any `<think>…</think>` sections and writes
   * the result in the requested format.
   *
   * @param {Object} options
   * @param {string} options.inputPath - Path to the HTML input
   * @param {string} [options.format] - 'pdf' | 'docx' | 'html' | 'txt'
   *   (case-insensitive, a "." is tolerated; default: 'pdf')
   * @param {string} [options.outputName] - Optional output filename (without extension)
   * @param {boolean} [options.showDialog] - Show the native save dialog
   *   (default: false; the CLI entry point passes true)
   * @returns {Promise<string|null>} Path of the written file, or null when
   *   the save dialog was cancelled.
   * @throws {Error} On an unsupported format, a missing input file, or a
   *   failed conversion.
   */
  async convert({ inputPath, format = "pdf", outputName, showDialog = false }) {
    // Accept spellings such as ".pdf" or "PDF".
    const targetFormat = format.toLowerCase().replace(".", "");

    if (!["pdf", "docx", "html", "txt"].includes(targetFormat)) {
      throw new Error(`Unsupported format: ${targetFormat}`);
    }
    if (!fs.existsSync(inputPath)) {
      throw new Error(`Input file not found: ${inputPath}`);
    }

    // path.parse strips the extension regardless of its case, so "Doc.HTML"
    // yields "Doc" (a case-sensitive suffix match would keep ".HTML").
    const baseName = outputName || path.parse(inputPath).name;

    let outputFile;
    if (showDialog) {
      // Let the user pick the destination in a native dialog.
      outputFile = await showSaveDialog(baseName, targetFormat);
      if (!outputFile) {
        console.log("Speichervorgang abgebrochen.");
        return null;
      }
    } else {
      // Use the default output directory.
      outputFile = path.join(outputDir, `${baseName}.${targetFormat}`);
    }

    // Remove <think> sections if present.
    const htmlContent = fs
      .readFileSync(inputPath, "utf8")
      .replace(/<think>[\s\S]*?<\/think>/gi, "");

    // targetFormat was validated above, so exactly one case matches.
    switch (targetFormat) {
      case "html":
        fs.writeFileSync(outputFile, htmlContent, "utf8");
        break;
      case "pdf":
        await this.htmlToPDF(htmlContent, outputFile);
        break;
      case "docx":
        await this.htmlToDOCX(htmlContent, outputFile);
        break;
      case "txt":
        fs.writeFileSync(outputFile, this.htmlToTXT(htmlContent), "utf8");
        break;
    }

    console.log(`Erfolgreich gespeichert: ${outputFile}`);
    return outputFile;
  },

  /**
   * HTML → PDF: renders the markup in headless Chromium (puppeteer) and
   * prints it as an A4 page with 20 mm margins, including backgrounds.
   *
   * @param {string} html - HTML markup to render.
   * @param {string} outputPath - Destination path of the PDF file.
   * @returns {Promise<void>}
   */
  async htmlToPDF(html, outputPath) {
    let browser;
    try {
      browser = await puppeteer.launch({
        headless: true,
        args: ["--no-sandbox", "--disable-setuid-sandbox"],
      });
      const page = await browser.newPage();
      // Wait until the page has no pending network requests before printing.
      await page.setContent(html, { waitUntil: "networkidle0" });
      await page.pdf({
        path: outputPath,
        format: "A4",
        printBackground: true,
        margin: {
          top: "20mm",
          right: "20mm",
          bottom: "20mm",
          left: "20mm",
        },
      });
    } finally {
      // Always shut the browser down, even when rendering failed.
      if (browser) {
        await browser.close();
      }
    }
  },

  /**
   * HTML → DOCX: converts the markup with `html-to-docx` and writes the
   * resulting buffer to disk.
   *
   * @param {string} html - HTML markup to convert.
   * @param {string} outputPath - Destination path of the .docx file.
   * @returns {Promise<void>}
   * @throws {Error} "DOCX conversion failed: …" wrapping the original error.
   */
  async htmlToDOCX(html, outputPath) {
    try {
      // Call shape: htmlToDocx(htmlString, headerHTMLString, documentOptions);
      // no header is passed here.
      const buffer = await htmlToDocx(html, null, {
        table: { row: { cantSplit: true } },
      });
      fs.writeFileSync(outputPath, buffer);
    } catch (err) {
      throw new Error(`DOCX conversion failed: ${err.message}`, { cause: err });
    }
  },

  /**
   * HTML → TXT: reduces markup to a single line of plain text.
   *
   * - comments and the contents of `<script>` / `<style>` are dropped,
   * - block-level tags and `<br>` act as word separators, so
   *   `<h1>A</h1><p>B</p>` becomes "A B" rather than "AB",
   * - all other tags are removed without inserting a separator,
   * - the entities amp, lt, gt, quot, apos, nbsp and numeric references are decoded,
   * - runs of whitespace are collapsed to one space and the result is trimmed.
   *
   * For richer extraction consider a dedicated library such as `html-to-text`.
   *
   * @param {string} html - HTML markup.
   * @returns {string} Plain text.
   */
  htmlToTXT(html) {
    const namedEntities = {
      amp: "&",
      lt: "<",
      gt: ">",
      quot: '"',
      apos: "'",
      nbsp: " ",
    };

    // Decodes one entity; unknown or out-of-range references stay untouched.
    const decodeEntity = (match, body) => {
      if (body[0] !== "#") {
        return namedEntities[body.toLowerCase()];
      }
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(
        body.slice(isHex ? 2 : 1),
        isHex ? 16 : 10,
      );
      const isValid = codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : match;
    };

    return (
      html
        // Comments may contain ">" and would otherwise leak fragments.
        .replace(/<!--[\s\S]*?-->/g, " ")
        // Script and style bodies are code, not document text.
        .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
        // Block-level boundaries separate words.
        .replace(
          /<\/?(?:address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|figure|footer|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|tbody|td|tfoot|th|thead|title|tr|ul)\b[^>]*>/gi,
          " ",
        )
        // Remove all remaining (inline) tags.
        .replace(/<[^>]+>/g, "")
        // Decode in a single pass so "&amp;lt;" yields "&lt;", not "<".
        .replace(/&(#\d+|#x[0-9a-f]+|amp|lt|gt|quot|apos|nbsp);/gi, decodeEntity)
        // Convert multiple whitespace into single spaces.
        .replace(/\s+/g, " ")
        .trim()
    );
  },
};
module.exports = module_exports;

// CLI usage: `node <this file> <input.html> [format]`.
// Runs only when the file is executed directly and always opens the native
// save dialog to choose the output location.
if (require.main === module) {
  (async () => {
    const args = process.argv.slice(2);

    if (args.length < 1) {
      // Print the real script name so the hint stays correct if the file is renamed.
      console.log(
        `Usage: node ${path.basename(__filename)} <input.html> [format]`,
      );
      console.log("Formats: pdf (default), docx, html, txt");
      console.log("");
      console.log(
        'Ein nativer "Speichern unter" Dialog wird automatisch geöffnet.',
      );
      process.exit(1);
    }

    const inputPath = args[0];
    // An empty second argument also falls back to the default format.
    const format = args[1] || "pdf";

    try {
      await module_exports.convert({
        inputPath,
        format,
        showDialog: true,
      });
    } catch (err) {
      console.error("Konvertierung fehlgeschlagen:", err.message);
      process.exit(1);
    }
  })();
}