mirror of
https://gitlab.rlp.net/proj-wise2526-video2document/video2document.git
synced 2026-06-15 18:01:52 +02:00
faee605f12
Added mocha-based unit tests for each module. Did a bit of cleanup in the modules to remove debug console.log calls. Removed the progress bar in the extractor and the library requirement. Promisified the gemini module to make sure it returns the path as a promise instead of just on the CLI. Fixed gitignore so that it now only ignores the content in the storage directories, and not the whole directories. Added neetingReport.json for the LLMs to use.
150 lines
5.4 KiB
JavaScript
150 lines
5.4 KiB
JavaScript
|
|
// Prepare the output directory: storage/transcriptionSummaries, resolved three
// levels above the directory that contains this module.
const outputDir = `${__dirname}/../../../storage/transcriptionSummaries`;

// `recursive: true` creates any missing parent directories and succeeds
// silently when the directory already exists, so no separate existence check
// is needed.
fs.mkdirSync(outputDir, { recursive: true });
|
|
|
|
/**
 * Derives a session identifier from a file path or an absolute URL: the last
 * path segment without its final extension.
 *
 * Input that `URL` can parse is treated as a URL, so query string and fragment
 * are ignored and percent-escapes in the name are decoded
 * (`.../my%20call.json` -> `my call`). Everything else is treated as a plain
 * file-system path.
 *
 * @param {string} inputPath - File path or absolute URL of the input.
 * @returns {string} Base name of the input without its extension.
 */
function getSessionId(inputPath) {
  let parsed;
  try {
    parsed = new URL(inputPath);
  } catch {
    // Not an absolute URL: handle it as a regular file-system path.
    return path.basename(inputPath, path.extname(inputPath));
  }

  const encodedName = path.basename(parsed.pathname).replace(/\.[^.]+$/, '');

  let decodedName;
  try {
    decodedName = decodeURIComponent(encodedName);
  } catch {
    // Malformed percent-escape (e.g. a lone "%"): keep the raw name.
    return encodedName;
  }

  // An escaped separator (%2F, %5C) must not turn the id into a nested or
  // parent-relative path once it is used to build a file name.
  return decodedName.replace(/[\\/]/g, '_');
}
|
|
|
|
// Each merged sentence record has the fields: speaker, sentence, start, end
|
|
|
|
module.exports = {
|
|
name: "summarize-transcription2", // Unique name for our function that will later be used to get the function from the map via "mapFunctions.get("example").function()"
|
|
type: "summarizer", // value used to differentiate each module to order them in the UI
|
|
displayname: "Summarizer", // The displayname used within the UI
|
|
async function(args) {
|
|
return new Promise(async (resolve, reject) => {
|
|
let inputJson = args.json;
|
|
|
|
//JSON Path
|
|
if (args.jsonPath) {
|
|
try {
|
|
const raw = fs.readFileSync(args.jsonPath, "utf-8");
|
|
inputJson = JSON.parse(raw);
|
|
} catch (e) {
|
|
console.error("Failed to load JSON from file:", e);
|
|
reject("Could not read JSON from file path.")
|
|
return
|
|
}
|
|
}
|
|
// JSON parsen
|
|
if (typeof args === "string") {
|
|
try {
|
|
await new Promise((res, rej) => {
|
|
fs.readFile(args, 'utf8', function (err, data) {
|
|
if (err){
|
|
rej(err)
|
|
return
|
|
}
|
|
inputJson = JSON.parse(data);
|
|
res()
|
|
});
|
|
})
|
|
} catch (e) {
|
|
// console.log("Invalid JSON in summarize-transcription");
|
|
// console.log(e)
|
|
reject(e)
|
|
return
|
|
}
|
|
}
|
|
|
|
const words = inputJson.words;
|
|
if (!Array.isArray(words)) {
|
|
reject("No words Array found")
|
|
return;
|
|
}
|
|
|
|
const ENDINGS = [".", "!", "?"]; // '...' auch als Satzende ?
|
|
const ABBREVIATIONS = new Set(["z.B.", "bzw.", "u.a.", "Dr.", "Mr.", "Mrs.", "Prof.", "etc."]); //TODO weitere Ergaenzen
|
|
|
|
const result = [];
|
|
let currentSentence = "";
|
|
let currentSpeaker = null;
|
|
let startTime = null;
|
|
let endTime = null;
|
|
|
|
for (const w of words) {
|
|
if (!currentSpeaker) currentSpeaker = w.speaker;
|
|
if (startTime === null) startTime = w.start;
|
|
endTime = w.end;
|
|
|
|
//speaker changing
|
|
if (currentSpeaker !== w.speaker && currentSentence) {
|
|
result.push({
|
|
speaker: currentSpeaker,
|
|
sentence: currentSentence,
|
|
start: startTime,
|
|
end: endTime
|
|
});
|
|
currentSentence = "";
|
|
startTime = w.start;
|
|
}
|
|
currentSpeaker = w.speaker;
|
|
currentSentence += (currentSentence ? " " : "") + w.text; //sentence beginning or not
|
|
const lastWord = w.text.trim();
|
|
const lastChar = lastWord.slice(-1);
|
|
const isAbbreviation = ABBREVIATIONS.has(lastWord);
|
|
|
|
//sentence ending
|
|
if (ENDINGS.includes(lastChar) && !isAbbreviation) {
|
|
result.push({
|
|
speaker: currentSpeaker,
|
|
sentence: currentSentence,
|
|
start: startTime,
|
|
end: endTime
|
|
});
|
|
currentSentence = "";
|
|
startTime = null;
|
|
endTime = null;
|
|
currentSpeaker = null;
|
|
}
|
|
}
|
|
|
|
// safe last sentence
|
|
if (currentSentence) {
|
|
result.push({
|
|
speaker: currentSpeaker,
|
|
sentence: currentSentence,
|
|
start: startTime,
|
|
end: endTime
|
|
});
|
|
}
|
|
|
|
|
|
// Output as Text
|
|
const output = result.map(r =>
|
|
`Sprecher ${r.speaker} [${r.start.toFixed(2)} - ${r.end.toFixed(2)}]: ${r.sentence}`
|
|
);
|
|
|
|
// Output on cosole
|
|
//console.log("\n------------\nMerged Transcription Result:\n", output, "\n------------\n");
|
|
|
|
try {
|
|
|
|
let filename = getSessionId(args);
|
|
|
|
const jsonPath = path.join(outputDir, `${filename}-${new Date().getTime()}.json`);
|
|
fs.writeFileSync(jsonPath, JSON.stringify(result, null, 2), "utf-8");
|
|
|
|
const txtPath = path.join(outputDir, `${filename}-${new Date().getTime()}.txt`);
|
|
fs.writeFileSync(txtPath, output.join("\n"), "utf-8");
|
|
|
|
// console.log(`Summary successfully saved:\n- ${jsonPath}\n- ${txtPath}`);
|
|
resolve(jsonPath);
|
|
} catch (err) {
|
|
// console.error("Error saving Summary:", err);
|
|
reject(err);
|
|
}
|
|
})
|
|
}
|
|
}
|