Files
video2document/services/modules/jsonTools/transcriptionSummarizer.js
T

149 lines
5.9 KiB
JavaScript

// Prepare the output directory (always storage/transcriptionSummaries under the project root).
// `recursive: true` creates missing parent directories and succeeds silently when the
// directory already exists, so no separate existence check (with its check-then-create
// race) is needed.
const outputDir = `${__dirname}/../../../storage/transcriptionSummaries`;
fs.mkdirSync(outputDir, { recursive: true });
//Speaker, ALL-Sentences, Start, End
module.exports = {
name: "summarize-transcription", // Unique name for our function that will later be used to get the function from the map via "mapFunctions.get("example").function()"
type: "summarizer", // value used to differentiate each module to order them in the UI
displayname: "Summarizer", // The displayname used within the UI
async function(args) {
return new Promise(async (resolve, reject) => {
let inputJson = args.json;
//JSON Path
if (args.jsonPath) {
try {
const raw = fs.readFileSync(args.jsonPath, "utf-8");
inputJson = JSON.parse(raw);
} catch (e) {
console.error("Failed to load JSON from file:", e);
return { error: "Could not read JSON from file path." };
}
}
// JSON parsen
if (typeof args === "string") {
try {
await new Promise((res) => {
fs.readFile(args, 'utf8', function (err, data) {
if (err) throw err;
inputJson = JSON.parse(data);
res()
});
})
} catch (e) {
console.log("Invalid JSON in summarize-transcription");
console.log(e)
return { error: "Invalid JSON" };
}
}
const words = inputJson.words;
if (!Array.isArray(words)) {
return { error: "No words Array found" };
}
const ENDINGS = [".", "!", "?"]; // '...' auch als Satzende ?
const ABBREVIATIONS = new Set(["z.B.", "bzw.", "u.a.", "Dr.", "Mr.", "Mrs.", "Prof.", "etc."]); //TODO weitere Ergaenzen
const result = [];
let currentSentence = "";
let currentSpeaker = null;
let startTime = null;
let endTime = null;
for (const w of words) {
if (!currentSpeaker) currentSpeaker = w.speaker;
if (startTime === null) startTime = w.start;
endTime = w.end;
//speaker changing
if (currentSpeaker !== w.speaker && currentSentence) {
const lastEntry = result[result.length - 1];
if (lastEntry && lastEntry.speaker === currentSpeaker) {
lastEntry.sentence += " " + currentSentence;
lastEntry.end = endTime;
} else {
result.push({
speaker: currentSpeaker,
sentence: currentSentence,
start: startTime,
end: endTime
});
}
currentSentence = "";
startTime = w.start;
}
currentSpeaker = w.speaker;
currentSentence += (currentSentence ? " " : "") + w.text; //sentence beginning or not
const lastWord = w.text.trim();
const lastChar = lastWord.slice(-1);
const isAbbreviation = ABBREVIATIONS.has(lastWord);
//sentence ending
if (ENDINGS.includes(lastChar) && !isAbbreviation) {
const lastEntry = result[result.length - 1];
if (lastEntry && lastEntry.speaker === currentSpeaker) {
lastEntry.sentence += " " + currentSentence;
lastEntry.end = endTime;
} else {
result.push({
speaker: currentSpeaker,
sentence: currentSentence,
start: startTime,
end: endTime
});
}
currentSentence = "";
startTime = null;
endTime = null;
currentSpeaker = null;
}
}
// safe last sentence
if (currentSentence) {
const lastEntry = result[result.length - 1];
if (lastEntry && lastEntry.speaker === currentSpeaker) {
lastEntry.sentence += " " + currentSentence;
lastEntry.end = endTime;
} else {
result.push({
speaker: currentSpeaker,
sentence: currentSentence,
start: startTime,
end: endTime
});
}
}
// Output as Text
const output = result.map(r =>
`Sprecher ${r.speaker} [${r.start.toFixed(2)} - ${r.end.toFixed(2)}]: ${r.sentence}`
);
// Output on cosole
//console.log("\n------------\nMerged Transcription Result:\n", output, "\n------------\n");
try {
const jsonPath = path.join(outputDir, "transcription_result.json");
fs.writeFileSync(jsonPath, JSON.stringify(result, null, 2), "utf-8");
const txtPath = path.join(outputDir, "transcription_result.txt");
fs.writeFileSync(txtPath, output.join("\n"), "utf-8");
console.log(`Summary successfully saved:\n- ${jsonPath}\n- ${txtPath}`);
resolve(jsonPath);
} catch (err) {
console.error("Error saving Summary:", err);
reject(err);
}
})
}
}