// import-transcript.js — Transcript converter.
//
// Converts SubRip (.srt) subtitle content into transcript entries of the form
//   ==START==TEXT==END==
// separated by blank lines, optionally bolding known speaker names that
// prefix a caption ("Name: text" becomes "**Name**: text").

/**
 * Escapes every character that has a special meaning inside a regular
 * expression so that `text` can be embedded in a pattern and matched literally.
 *
 * @param {string} text - Arbitrary text (here: a speaker name).
 * @returns {string} `text` with regex metacharacters backslash-escaped.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/**
 * Converts the content of an .srt file into transcript entries.
 *
 * Each SRT cue (index line, timing line, one or more text lines) becomes
 * `==START==TEXT==END==`; multi-line captions are joined with single spaces.
 * Cues with fewer than three lines or without a `START --> END` timing line
 * are skipped. If a caption starts with `<name>:` for one of the given
 * speaker names (compared case-insensitively), that prefix is rewritten to
 * `**<name>**:` using the spelling from `names`.
 *
 * A warning is printed via `console.warn` for every non-empty speaker name
 * that never matched a caption.
 *
 * @param {string} srtContent - Raw text of the .srt file (LF, CRLF or CR line endings).
 * @param {string[]} [names=[]] - Speaker names; surrounding whitespace is
 *   trimmed and empty names are ignored.
 * @returns {string} The converted entries joined by blank lines ('' if none).
 */
function importTranscript(srtContent, names = []) {
  // Strip a leading BOM and normalise line endings: .srt files are frequently
  // saved with CRLF, which would otherwise never match the blank-line split.
  const normalized = srtContent.replace(/^\uFEFF/, '').replace(/\r\n?/g, '\n')
  const entries = normalized.trim().split(/\n(?:[ \t]*\n)+/)

  // Drop empty names (e.g. from splitting an empty --speakers value) so they
  // neither match captions starting with ':' nor trigger bogus warnings.
  const nameSet = new Set(names.map(name => name.trim()).filter(Boolean))

  // Build each pattern once; names are escaped so characters such as '.' or
  // '(' are matched literally instead of being interpreted as regex syntax.
  const speakerPatterns = Array.from(nameSet, name => ({
    name,
    pattern: new RegExp(`^${escapeRegExp(name)}:`, 'i'),
  }))

  const foundSpeakers = new Set()

  const mdEntries = entries
    .map(entry => {
      const lines = entry.split('\n')
      if (lines.length < 3) return null

      const timeParts = lines[1].trim().split(' --> ')
      if (timeParts.length !== 2) return null

      let content = lines.slice(2).join(' ').trim()
      for (const { name, pattern } of speakerPatterns) {
        if (pattern.test(content)) {
          // Replace exactly the matched prefix (whatever its letter case).
          // A replacer function keeps '$' sequences in the name literal.
          content = content.replace(pattern, () => `**${name}**:`)
          foundSpeakers.add(name)
          break
        }
      }

      return `==${timeParts[0]}==${content}==${timeParts[1]}==`
    })
    .filter(Boolean)

  nameSet.forEach(name => {
    if (!foundSpeakers.has(name)) {
      console.warn(`Warning: Speaker ${name} wasn't found. Did you misspell their name?`)
    }
  })

  return mdEntries.join('\n\n')
}

const USAGE = [
  'This utility converts valid .srt files to NDC compatible transcripts.',
  "Usage: node import-transcript.js --input INPUT.srt [--output OUTPUT.md] [--speakers 'Name1,Name2,...']",
].join('\n')

/**
 * Parses the command-line flags understood by this tool.
 *
 * @param {string[]} argv - Full argument vector (`process.argv` layout: the
 *   first two elements are skipped).
 * @returns {{srtFileName: (string|undefined), mdFileName: (string|undefined), names: string}}
 *   `names` is the raw comma-separated speaker list ('' when not given).
 */
function parseArgs(argv) {
  const options = { srtFileName: undefined, mdFileName: undefined, names: '' }

  for (let i = 2; i < argv.length; i++) {
    switch (argv[i]) {
      case '--input':
      case '-i':
        options.srtFileName = argv[++i]
        break
      case '--output':
      case '-o':
        options.mdFileName = argv[++i]
        break
      case '--speakers':
        // A trailing '--speakers' without a value must not leave `names` undefined.
        options.names = argv[++i] || ''
        break
    }
  }

  return options
}

/**
 * Command-line entry point: reads the input file, converts it and either
 * writes the result to the output file or prints it to stdout. Prints the
 * usage text and exits with status 1 when no input file was given.
 *
 * @param {string[]} argv - Argument vector in `process.argv` layout.
 */
function main(argv) {
  // Loaded here so the conversion logic above has no file-system dependency.
  const fs = require('fs')
  const { srtFileName, mdFileName, names } = parseArgs(argv)

  if (!srtFileName) {
    console.log(USAGE)
    process.exit(1)
  }

  const srtContent = fs.readFileSync(srtFileName, 'utf8')
  const mdContent = importTranscript(srtContent, names.split(','))

  if (mdFileName) {
    fs.writeFileSync(mdFileName, mdContent, 'utf8')
    console.log(`Converted content written to ${mdFileName}`)
  } else {
    console.log(mdContent)
  }
}

// Run the CLI only when executed directly (`node import-transcript.js ...`);
// when loaded from another module, just expose the converter.
if (typeof module !== 'undefined' && typeof require !== 'undefined') {
  module.exports = { importTranscript }
  if (require.main === module) {
    main(process.argv)
  }
}