diff --git a/biome.json b/biome.json
index 6fa66c8..94f5c60 100644
--- a/biome.json
+++ b/biome.json
@@ -13,7 +13,10 @@
   },
   "linter": {
     "rules": {
-      "all": true
+      "all": true,
+      "nursery": {
+        "noNodejsModules": "off"
+      }
     }
   },
   "json": {
diff --git a/bun.lockb b/bun.lockb
index e629a9e..e73658a 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/cspell.config.cjs b/cspell.config.cjs
index b0951c1..7675c5a 100644
--- a/cspell.config.cjs
+++ b/cspell.config.cjs
@@ -21,5 +21,9 @@ module.exports = {
     "knip",
     "commitlint",
     "automerge",
+    "openai",
+    "consola",
+    "gdrive",
+    "ffprobe",
   ],
 };
diff --git a/knip.config.ts b/knip.config.ts
index 6647d2e..fb734a2 100644
--- a/knip.config.ts
+++ b/knip.config.ts
@@ -2,10 +2,12 @@ import type { KnipConfig } from "knip";
 
 const config: KnipConfig = {
   ignoreDependencies: [
+    "bun",
     // @commitlint/cli cannot be detected because its binary is named "commitlint"
     // ref: https://knip.dev/guides/handling-issues/#example
     "@commitlint/cli",
   ],
+  ignoreBinaries: ["screen"],
 };
 
 // biome-ignore lint/style/noDefaultExport:
diff --git a/package.json b/package.json
index 1660226..12fe563 100644
--- a/package.json
+++ b/package.json
@@ -3,6 +3,8 @@
   "name": "interview-transcriber",
   "private": true,
   "scripts": {
+    "start": "bun src/main.ts",
+    "start:screen": "screen -DRS transcriber bun start",
     "commit": "git-cz",
     "check": "npm-run-all check:*",
     "check:biome": "biome check --apply-unsafe .",
@@ -12,6 +14,17 @@
     "ignore-sync": "ignore-sync .",
     "prepare": "husky install"
   },
+  "dependencies": {
+    "@google/generative-ai": "0.1.3",
+    "@googleapis/drive": "8.5.0",
+    "consola": "3.2.3",
+    "csv-parse": "5.5.3",
+    "discord.js": "14.14.1",
+    "fluent-ffmpeg": "2.1.2",
+    "mime": "4.0.1",
+    "openai": "4.24.1",
+    "unique-string": "3.0.0"
+  },
   "devDependencies": {
     "@biomejs/biome": "1.5.2",
     "@commitlint/cli": "18.4.4",
@@ -21,6 +34,8 @@
     "@cspell/cspell-types": "8.3.2",
     "@tsconfig/bun": "1.0.1",
     "@tsconfig/strictest": "2.0.2",
+    "@types/fluent-ffmpeg": "2.1.24",
+    "@types/node": "20.10.8",
     "bun-types": "1.0.22",
     "commitizen": "4.3.0",
     "cspell": "8.3.2",
diff --git a/src/.gitkeep b/src/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/src/ai.ts b/src/ai.ts
new file mode 100644
index 0000000..893c40d
--- /dev/null
+++ b/src/ai.ts
@@ -0,0 +1,115 @@
+import { createReadStream } from "node:fs";
+import { GoogleGenerativeAI } from "@google/generative-ai";
+import { env } from "bun";
+import openAi from "openai";
+import type { SupportedLanguages } from "./transcribe";
+
+/**
+ * OpenAI API client.
+ */
+export const openaiClient = new openAi({
+  apiKey: env.OPENAI_API_KEY,
+});
+
+/**
+ * Maximum file size for Whisper API.
+ * @see https://platform.openai.com/docs/api-reference/speech-to-text
+ */
+export const whisperMaxFileSize = 25 * 1000 * 1000;
+
+/**
+ * Gemini API client.
+ */
+export const geminiClient = new GoogleGenerativeAI(env.GEMINI_API_KEY);
+
+/**
+ * Transcribe an audio file.
+ * @param audioFilePath Path to the audio file
+ * @param language Language of the audio file
+ * @returns Transcribed text segments
+ */
+export const transcribeAudioFile = async (
+  audioFilePath: string,
+  language: SupportedLanguages,
+): Promise<string[]> => {
+  const response = (await openaiClient.audio.transcriptions.create({
+    file: createReadStream(audioFilePath),
+    model: "whisper-1",
+    language,
+    prompt:
+      language === "en"
+        ? "Hello. This is an interview, and you transcribe it."
+        : "こんにちは。これはインタビューの録音で、文字起こしをします。",
+    // biome-ignore lint/style/useNamingConvention: library's naming convention
+    response_format: "verbose_json",
+  })) as openAi.Audio.Transcriptions.Transcription & {
+    segments: {
+      text: string;
+    }[];
+  }; // cast since the library doesn't support verbose_json
+
+  return response.segments.map((segment) => segment.text);
+};
+
+/**
+ * Proofread a transcription.
+ * @param transcription Transcription to proofread
+ * @param language Language of the transcription
+ * @param model AI model to use
+ * @returns Model used, system prompt sent to the model, and proofread text
+ * @throws Error if the model returns an empty response
+ */
+export const proofreadTranscription = async <M extends "gpt-4" | "gemini-pro">(
+  transcription: string,
+  language: SupportedLanguages,
+  model: M,
+): Promise<{ model: M; prompt: string; response: string }> => {
+  const systemPrompt = `You are a web media proofreader.
+The text ${model === "gpt-4" ? "entered by the user" : "below"} is a transcription of the interview.
+Follow the guide below and improve it.
+- Remove redundant or repeating expressions.
+- Remove fillers.
+- Correct grammar errors.
+- Replace unnatural or difficult wordings.
+- Shorten sentences.
+The output style should be the style of an interview, like \`interviewer: \` or \`interviewee\`.
+${
+  language === "en"
+    ? "The response must not include markdown syntax."
+    : "The response must be in Japanese without markdown syntax."
+}`;
+
+  let result = "";
+  if (model === "gpt-4") {
+    const response = await openaiClient.chat.completions.create({
+      messages: [
+        {
+          role: "system",
+          content: systemPrompt,
+        },
+        {
+          role: "user",
+          content: transcription,
+        },
+      ],
+      model,
+    });
+    result = response.choices[0]?.message.content ?? "";
+  } else {
+    const response = await geminiClient
+      .getGenerativeModel({
+        model,
+      })
+      .generateContent(`${systemPrompt}\n\n---\n\n${transcription}`);
+    result = response.response.text();
+  }
+  if (!result) {
+    throw new Error("The response is empty.");
+  }
+
+  return {
+    model,
+    prompt: systemPrompt,
+    response: result,
+  };
+};
diff --git a/src/commands.ts b/src/commands.ts
new file mode 100644
index 0000000..9992361
--- /dev/null
+++ b/src/commands.ts
@@ -0,0 +1,269 @@
+import { env } from "bun";
+import consola from "consola";
+import {
+  ApplicationCommandType,
+  type ChatInputCommandInteraction,
+  type Client,
+  DiscordAPIError,
+  EmbedBuilder,
+  type Interaction,
+  type MessageContextMenuCommandInteraction,
+  OAuth2Scopes,
+  RESTJSONErrorCodes,
+  type RESTPostAPIChatInputApplicationCommandsJSONBody,
+  type RESTPostAPIContextMenuApplicationCommandsJSONBody,
+  type RESTPutAPIApplicationGuildCommandsJSONBody,
+  Routes,
+  SlashCommandBuilder,
+  type UserContextMenuCommandInteraction,
+} from "discord.js";
+import { extractFileId } from "./gdrive";
+import { transcribe } from "./transcribe";
+
+type ExecutableCommand =
+  | {
+      type: ApplicationCommandType.ChatInput;
+      data: RESTPostAPIChatInputApplicationCommandsJSONBody;
+      execute: (interaction: ChatInputCommandInteraction) => Promise<void>;
+    }
+  | {
+      type: ApplicationCommandType.Message;
+      data: RESTPostAPIContextMenuApplicationCommandsJSONBody;
+      execute: (
+        interaction: MessageContextMenuCommandInteraction,
+      ) => Promise<void>;
+    }
+  | {
+      type: ApplicationCommandType.User;
+      data: RESTPostAPIContextMenuApplicationCommandsJSONBody;
+      execute: (
+        interaction: UserContextMenuCommandInteraction,
+      ) => Promise<void>;
+    };
+
+/**
+ * Application commands registered to the bot.
+ */
+const commands: ExecutableCommand[] = [
+  {
+    type: ApplicationCommandType.ChatInput,
+    data: new SlashCommandBuilder()
+      .setName("transcribe")
+      .setDescription("Transcribe an interview from a Google Drive file.")
+      .setDescriptionLocalization(
+        "ja",
+        "Google ドライブのファイルからインタビューを書き起こします",
+      )
+      .addStringOption((option) =>
+        option
+          .setName("video_url")
+          .setDescription("The Google Drive URL of the video to transcribe.")
+          .setDescriptionLocalization(
+            "ja",
+            "書き起こす動画の Google ドライブ URL",
+          )
+          .setRequired(true),
+      )
+      .addStringOption((option) =>
+        option
+          .setName("proofread_model")
+          .setDescription("The AI model to use for proofreading.")
+          .setDescriptionLocalization("ja", "校正に使用する AI モデル")
+          .setChoices(
+            { name: "GPT-4", value: "gpt-4" },
+            { name: "Gemini Pro", value: "gemini-pro" },
+          ),
+      )
+      .toJSON(),
+    execute: async (interaction) => {
+      const videoFileId = extractFileId(
+        interaction.options.getString("video_url", true),
+      );
+      if (!videoFileId) {
+        await interaction.reply({
+          content: "Invalid video URL.",
+          ephemeral: true,
+        });
+        return;
+      }
+
+      // Japanese for guilds in Japanese locale and English for the others
+      // determine the language here instead of relying on the default of
+      // transcribe() so that the labels of the result below are always shown
+      // in the same language as the transcription
+      const language = interaction.guildLocale?.startsWith("ja") ? "ja" : "en";
+
+      const proofreadModel = interaction.options.getString("proofread_model") as
+        | "gpt-4"
+        | "gemini-pro"
+        | null;
+
+      await interaction.deferReply();
+      try {
+        const { video, parent, audio, transcription, proofreadTranscription } =
+          await transcribe(videoFileId, language, proofreadModel ?? undefined);
+        await interaction.editReply({
+          embeds: [
+            new EmbedBuilder()
+              .setTitle(video.name)
+              .setURL(video.webViewLink)
+              .setFields(
+                [
+                  ...(parent
+                    ? [
+                        {
+                          keyEn: "Folder",
+                          keyJa: "フォルダー",
+                          file: parent,
+                        },
+                      ]
+                    : []),
+                  {
+                    keyEn: "Audio",
+                    keyJa: "音声",
+                    file: audio,
+                  },
+                  {
+                    keyEn: "Transcription",
+                    keyJa: "文字起こし",
+                    file: transcription,
+                  },
+                  {
+                    keyEn: "Proofread",
+                    keyJa: "校正",
+                    file: proofreadTranscription,
+                  },
+                ].map(({ keyEn, keyJa, file: { name, webViewLink } }) => ({
+                  name: language === "en" ? keyEn : keyJa,
+                  value: `[${name}](${webViewLink})`,
+                  inline: true,
+                })),
+              )
+              .setColor("Green")
+              .toJSON(),
+          ],
+        });
+      } catch (error) {
+        const message =
+          error instanceof Error ? error.message : JSON.stringify(error);
+        await interaction.editReply({
+          embeds: [
+            new EmbedBuilder()
+              .setTitle("Error")
+              .setDescription(message)
+              .setColor("Red")
+              .toJSON(),
+          ],
+        });
+        console.error(error);
+      }
+    },
+  },
+];
+
+/**
+ * Register application commands of the bot to Discord. Exits the process on failure.
+ * @param client client used to register commands
+ */
+export const registerCommands = async (client: Client<true>) => {
+  consola.start("Registering application commands...");
+  try {
+    const body: RESTPutAPIApplicationGuildCommandsJSONBody = commands.map(
+      (command) => command.data,
+    );
+    await client.rest.put(
+      // register as guild commands to avoid accessing data from DMs or other guilds
+      Routes.applicationGuildCommands(
+        client.application.id,
+        env.DISCORD_GUILD_ID,
+      ),
+      { body },
+    );
+
+    consola.success(
+      `Successfully registered application commands: ${commands
+        .map((command) => command.data.name)
+        .join(", ")}`,
+    );
+  } catch (error) {
+    consola.error("Failed to register application commands.");
+    // give a hint if the error looks like the bot is not in the target guild
+    // cspell:ignore restjson
+    if (
+      error instanceof DiscordAPIError &&
+      error.code === RESTJSONErrorCodes.MissingAccess
+    ) {
+      consola.error(
+        `Bot may not be in the target guild ${env.DISCORD_GUILD_ID}.`,
+      );
+      const application = await client.application.fetch();
+      if (!application.botRequireCodeGrant) {
+        const authorizationUrl = new URL(
+          "https://discord.com/api/oauth2/authorize",
+        );
+        authorizationUrl.searchParams.append("client_id", client.user.id);
+        authorizationUrl.searchParams.append(
+          "scope",
+          OAuth2Scopes.ApplicationsCommands,
+        );
+        consola.info(
+          `Follow this link to add the bot to the guild: ${authorizationUrl}`,
+        );
+      }
+    }
+    // do not use consola#error to throw Error since it cannot handle line numbers correctly
+    console.error(error);
+    process.exit(1);
+  }
+};
+
+/**
+ * Listener for application command interactions.
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: if-else statements are necessary here
+export const commandsListener = async (interaction: Interaction) => {
+  if (!interaction.isCommand()) {
+    return;
+  }
+
+  // ignore commands from unauthorized guilds or DMs
+  if (interaction.guildId !== env.DISCORD_GUILD_ID) {
+    consola.warn(
+      `Command ${interaction.commandName} was triggered in ${
+        interaction.inGuild() ? "an unauthorized guild" : "DM"
+      }.`,
+    );
+    return;
+  }
+
+  for (const command of commands) {
+    if (command.data.name !== interaction.commandName) {
+      continue;
+    }
+
+    // do not use switch-case here because the types are not narrowed
+    if (
+      interaction.isChatInputCommand() &&
+      command.type === ApplicationCommandType.ChatInput
+    ) {
+      await command.execute(interaction);
+      return;
+    }
+    if (
+      interaction.isMessageContextMenuCommand() &&
+      command.type === ApplicationCommandType.Message
+    ) {
+      await command.execute(interaction);
+      return;
+    }
+    if (
+      interaction.isUserContextMenuCommand() &&
+      command.type === ApplicationCommandType.User
+    ) {
+      await command.execute(interaction);
+      return;
+    }
+  }
+
+  consola.error(`Command ${interaction.commandName} not found.`);
+};
diff --git a/src/env.d.ts b/src/env.d.ts
new file mode 100644
index 0000000..a48d301
--- /dev/null
+++ b/src/env.d.ts
@@ -0,0 +1,41 @@
+declare module "bun" {
+  interface Env {
+    /**
+     * Token of the Discord bot.
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    DISCORD_BOT_TOKEN: string;
+
+    /**
+     * ID of the Discord guild where the bot is used.
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    DISCORD_GUILD_ID: string;
+
+    /**
+     * Email of the Google Cloud service account.
+     * (`client_email` in the JSON file)
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    GOOGLE_SERVICE_ACCOUNT_EMAIL: string;
+
+    /**
+     * Private key of the Google Cloud service account.
+     * (`private_key` in the JSON file)
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    GOOGLE_SERVICE_ACCOUNT_KEY: string;
+
+    /**
+     * API key of the OpenAI API.
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    OPENAI_API_KEY: string;
+
+    /**
+     * API key of the Gemini API.
+     */
+    // biome-ignore lint/style/useNamingConvention: should be SCREAMING_SNAKE_CASE
+    GEMINI_API_KEY: string;
+  }
+}
diff --git a/src/ffmpeg.ts b/src/ffmpeg.ts
new file mode 100644
index 0000000..96a1a07
--- /dev/null
+++ b/src/ffmpeg.ts
@@ -0,0 +1,93 @@
+import { basename, dirname, extname, join } from "node:path";
+import { promisify } from "node:util";
+import { file } from "bun";
+import { parse } from "csv-parse/sync";
+import Ffmpeg from "fluent-ffmpeg";
+
+/**
+ * Extract audio from a video file.
+ * The audio is saved as an MP3 file in the same directory as the video file.
+ * @param videoFilePath Path to the video file
+ * @returns Path to the extracted audio file
+ */
+export const extractAudio = async (videoFilePath: string) => {
+  const audioFilePath = join(
+    dirname(videoFilePath),
+    `${basename(videoFilePath, extname(videoFilePath))}.mp3`,
+  );
+
+  return new Promise<string>((resolve, reject) => {
+    // saveToFile() starts ffmpeg, so the listeners have to be registered
+    // before calling it not to miss any event
+    Ffmpeg(videoFilePath)
+      .noVideo()
+      .on("end", () => resolve(audioFilePath))
+      .on("error", reject)
+      .saveToFile(audioFilePath);
+  });
+};
+
+type AudioSegment = {
+  path: string;
+  startTime: number;
+  endTime: number;
+};
+
+/**
+ * Split an audio file into multiple files with a maximum size.
+ * @param sourcePath Path to the audio file
+ * @param maxFileSize Maximum size of each file
+ * @returns Array of audio segments, each of which has a path to the audio file, start time, and end time
+ */
+export const splitAudio = async (
+  sourcePath: string,
+  maxFileSize: number,
+): Promise<AudioSegment[]> =>
+  promisify<string, Ffmpeg.FfprobeData>(Ffmpeg.ffprobe)(sourcePath).then(
+    ({ format: { duration, size } }) => {
+      if (!(duration && size)) {
+        throw new Error("Failed to get file metadata from ffprobe.");
+      }
+
+      if (size <= maxFileSize) {
+        return [
+          {
+            path: sourcePath,
+            startTime: 0,
+            endTime: duration,
+          },
+        ];
+      }
+
+      const dir = dirname(sourcePath);
+      const name = basename(sourcePath, extname(sourcePath));
+      const audioFilePath = join(dir, `${name}%03d${extname(sourcePath)}`);
+      const listFilePath = join(dir, `${name}.csv`);
+
+      return new Promise<string>((resolve, reject) => {
+        // saveToFile() starts ffmpeg, so the listeners have to be registered
+        // before calling it not to miss any event
+        Ffmpeg(sourcePath)
+          .outputOptions([
+            "-f segment",
+            `-segment_time ${Math.floor((duration * maxFileSize) / size)}`,
+            `-segment_list ${listFilePath}`,
+          ])
+          .on("end", () => resolve(listFilePath))
+          .on("error", reject)
+          .saveToFile(audioFilePath);
+      }).then(async (listFilePath) => {
+        const csv = await file(listFilePath).text();
+        return (parse(csv) as string[][]).map((row) => {
+          if (!(row[0] && row[1] && row[2])) {
+            throw new Error("Failed to parse CSV file.");
+          }
+          return {
+            path: join(dirname(listFilePath), row[0]),
+            startTime: Number(row[1]),
+            endTime: Number(row[2]),
+          };
+        });
+      });
+    },
+  );
diff --git a/src/gdrive.ts b/src/gdrive.ts
new file mode 100644
index 0000000..5e87fe0
--- /dev/null
+++ b/src/gdrive.ts
@@ -0,0 +1,146 @@
+import { createReadStream, createWriteStream } from "node:fs";
+import { basename, extname } from "node:path";
+import { auth, drive_v3 } from "@googleapis/drive";
+import { env } from "bun";
+import mime from "mime";
+
+/**
+ * Google Drive API client with scopes of `drive.readonly` and `drive.file`.
+ */
+export const driveClient = new drive_v3.Drive({
+  auth: new auth.GoogleAuth({
+    credentials: {
+      // biome-ignore lint/style/useNamingConvention: library's naming convention
+      client_email: env.GOOGLE_SERVICE_ACCOUNT_EMAIL,
+      // replace \n with actual newlines
+      // biome-ignore lint/style/useNamingConvention: library's naming convention
+      private_key: env.GOOGLE_SERVICE_ACCOUNT_KEY.replace(/\\n/g, "\n"),
+    },
+    // ref: https://developers.google.com/identity/protocols/oauth2/scopes#drive
+    scopes: [
+      // required to download files
+      "https://www.googleapis.com/auth/drive.readonly",
+      // required to upload files
+      "https://www.googleapis.com/auth/drive.file",
+    ],
+  }),
+});
+
+/**
+ * Extract Google Drive file ID from a URL.
+ * @param url Google Drive URL
+ * @returns Google Drive file ID
+ */
+export const extractFileId = (url: string): string | undefined => {
+  // file ID is the path segment after d (files), e (forms), or folders
+  // ref: https://github.com/spamscanner/url-regex-safe/blob/6c1e2c3b5557709633a2cc971d599469ea395061/src/index.js#L80
+  // ref: https://stackoverflow.com/questions/16840038/easiest-way-to-get-file-id-from-url-on-google-apps-script
+  const regex =
+    /^https?:\/\/(?:drive|docs)\.google\.com\/[^\s'"\)]+\/(?:d|e|folders)\/([-\w]{25,})(?:\/[^\s'"\)]*[^\s"\)'.?!])?$/g;
+  return regex.exec(url)?.[1];
+};
+
+/**
+ * Google Drive file metadata only with required fields.
+ */
+type FileMetadata<K extends keyof drive_v3.Schema$File> = {
+  [P in K]: NonNullable<drive_v3.Schema$File[P]>;
+};
+
+/**
+ * Get metadata of a file from Google Drive.
+ * @param fileId Google Drive file ID
+ * @param fields selector for the fields to get
+ * @see https://developers.google.com/drive/api/guides/fields-parameter
+ * @returns Google Drive file metadata
+ */
+export const getFileMetadata = async <
+  F extends string | (keyof drive_v3.Schema$File)[],
+>(
+  fileId: string,
+  fields?: F,
+) =>
+  driveClient.files
+    .get({
+      fileId: fileId,
+      ...(fields
+        ? { fields: Array.isArray(fields) ? fields.join(",") : fields }
+        : undefined),
+    })
+    .then(({ data }) => {
+      if (Array.isArray(fields) && fields.some((field) => !data[field])) {
+        throw new Error("Failed to get file metadata.");
+      }
+      return data as F extends (keyof drive_v3.Schema$File)[]
+        ? FileMetadata<F[number]>
+        : drive_v3.Schema$File;
+    });
+
+/**
+ * Download a file from Google Drive.
+ * @param fileId Google Drive file ID
+ * @param path path to save the file
+ * @returns path to the downloaded file
+ */
+export const downloadFile = async (fileId: string, path: string) =>
+  driveClient.files
+    .get(
+      {
+        fileId: fileId,
+        alt: "media",
+      },
+      {
+        responseType: "stream",
+      },
+    )
+    .then(
+      ({ data }) =>
+        new Promise<string>((resolve, reject) => {
+          data
+            .on("error", reject)
+            .pipe(createWriteStream(path))
+            .on("error", reject)
+            // resolve after the file is flushed, not when the download ends
+            .on("finish", () => resolve(path));
+        }),
+    );
+
+/**
+ * Upload a file to Google Drive.
+ * @param path path to the file
+ * @param fileBasename basename of the file to upload (without extension)
+ * @param parentFolderId Google Drive folder ID to upload the file to
+ * @param convertTo Google Docs MIME type to convert the file to
+ * @returns Google Drive file metadata of the uploaded file
+ */
+export const uploadFile = async (
+  path: string,
+  fileBasename?: string,
+  parentFolderId?: string,
+  convertTo?: `application/vnd.google-apps.${
+    | "document"
+    | "spreadsheet"
+    | "presentation"}`,
+) =>
+  driveClient.files
+    .create({
+      fields: "name,webViewLink",
+      requestBody: {
+        // remove extension if converting
+        name:
+          (fileBasename ?? basename(path, extname(path))) +
+          (convertTo ? "" : extname(path)),
+        ...(parentFolderId ? { parents: [parentFolderId] } : {}),
+        ...(convertTo ? { mimeType: convertTo } : {}),
+      },
+      media: {
+        ...(convertTo ? { mimeType: mime.getType(path) ?? "text/plain" } : {}),
+        body: createReadStream(path),
+      },
+    })
+    .then(({ data }) => {
+      if (!(data.name && data.webViewLink)) {
+        throw new Error("Failed to upload file.");
+      }
+      return data as FileMetadata<"name" | "webViewLink">;
+    });
diff --git a/src/main.ts b/src/main.ts
new file mode 100644
index 0000000..f949e67
--- /dev/null
+++ b/src/main.ts
@@ -0,0 +1,97 @@
+import { promisify } from "node:util";
+import { env } from "bun";
+import { consola } from "consola";
+import { Client, Events } from "discord.js";
+import Ffmpeg from "fluent-ffmpeg";
+import { geminiClient, openaiClient } from "./ai";
+import { commandsListener, registerCommands } from "./commands";
+import { driveClient } from "./gdrive";
+
+consola.start("interview-transcriber is starting...");
+
+// check if all required environment variables are set
+// need to sync with env.d.ts
+const requiredEnvs = [
+  "DISCORD_BOT_TOKEN",
+  "DISCORD_GUILD_ID",
+  "GOOGLE_SERVICE_ACCOUNT_EMAIL",
+  "GOOGLE_SERVICE_ACCOUNT_KEY",
+  "OPENAI_API_KEY",
+  "GEMINI_API_KEY",
+];
+const missingEnv = requiredEnvs.filter((name) => !env[name]);
+if (missingEnv.length) {
+  consola.error(
+    `Environment variables ${missingEnv.join(
+      ", ",
+    )} are not set. Follow the instructions in README.md and set them in .env.`,
+  );
+  process.exit(1);
+}
+
+// test if the client is working with valid credentials to fail fast
+
+consola.start("Checking ffmpeg installation...");
+await promisify(Ffmpeg.getAvailableFormats)();
+consola.ready("ffmpeg is installed!");
+
+consola.start("Initializing OpenAI API client...");
+await openaiClient.models.list();
+consola.ready("OpenAI API client is now ready!");
+
+consola.start("Initializing Gemini API client...");
+const result = await geminiClient
+  .getGenerativeModel({
+    model: "gemini-pro",
+  })
+  .generateContent("Ping! Say something to me!");
+consola.info(`Gemini: ${result.response.text()}`);
+consola.ready("Gemini API client is now ready!");
+
+consola.start("Initializing Google Drive API client...");
+consola.info(`Service account email: ${env.GOOGLE_SERVICE_ACCOUNT_EMAIL}`);
+const files = await driveClient.files.list({
+  fields: "files(owners)",
+});
+// warn if the service account has access to no files
+// exclude files owned by the service account itself
+// only legacy files have multiple owners, so we do not support them
+// ref: https://developers.google.com/drive/api/reference/rest/v3/files#File.FIELDS.owners
+if (
+  !files.data.files?.filter(({ owners }) => owners?.[0] && !owners?.[0]?.me)
+    .length
+) {
+  consola.warn(
+    "No files are shared to the service account in Google Drive. Share some files to the service account.",
+  );
+}
+consola.ready("Google Drive API client is now ready!");
+
+consola.start("Starting Discord bot...");
+const discordClient = new Client({ intents: [] });
+
+discordClient.once(Events.ClientReady, async (client) => {
+  consola.ready("Discord bot is now ready!");
+  consola.info(`Logged in as ${client.user.tag}.`);
+
+  const application = await client.application.fetch();
+  const botSettingsUrl = `https://discord.com/developers/applications/${application.id}/bot`;
+  if (application.botPublic) {
+    consola.warn(
+      `Bot is public (can be added by anyone). Consider making it private from ${botSettingsUrl}.`,
+    );
+  }
+  if (application.botRequireCodeGrant) {
+    consola.warn(
+      `Bot requires OAuth2 code grant. It is unnecessary for this bot. Consider disabling it from ${botSettingsUrl}.`,
+    );
+  }
+
+  await registerCommands(client);
+
+  consola.ready("interview-transcriber is successfully started!");
+});
+
+discordClient.on(Events.InteractionCreate, commandsListener);
+
+await discordClient.login(env.DISCORD_BOT_TOKEN);
diff --git a/src/transcribe.ts b/src/transcribe.ts
new file mode 100644
index 0000000..c889d28
--- /dev/null
+++ b/src/transcribe.ts
@@ -0,0 +1,161 @@
+import { mkdtemp, rmdir } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { basename, extname, join } from "node:path";
+import { write } from "bun";
+import consola from "consola";
+import uniqueString from "unique-string";
+import {
+  proofreadTranscription,
+  transcribeAudioFile,
+  whisperMaxFileSize,
+} from "./ai";
+import { extractAudio, splitAudio } from "./ffmpeg";
+import { downloadFile, getFileMetadata, uploadFile } from "./gdrive";
+
+/**
+ * Supported languages.
+ */
+export type SupportedLanguages = "en" | "ja";
+
+/**
+ * Transcribe a video file.
+ * @param videoFileId Google Drive file ID of the video file.
+ * @param language Language of the video file.
+ * @param proofreadModel AI model to use for proofreading.
+ * @returns Google Drive file metadata of the video, its parent folder (if any), and the uploaded files (audio, transcription, proofread transcription).
+ */
+export const transcribe = async (
+  videoFileId: string,
+  language: SupportedLanguages = "en",
+  proofreadModel: Parameters<typeof proofreadTranscription>[2] = "gemini-pro",
+) => {
+  consola.info(`Transcribing ${videoFileId}...`);
+  const videoFile = await getFileMetadata(videoFileId, [
+    "name",
+    "webViewLink",
+    "mimeType",
+    "parents",
+  ]);
+  if (!videoFile.mimeType.startsWith("video/")) {
+    throw new Error("Specified file is not a video.");
+  }
+  const videoBasename = basename(videoFile.name, extname(videoFile.name));
+  consola.info(`File: ${videoFile.name} (${videoFile.webViewLink})`);
+  const parentFolderId = videoFile.parents[0];
+
+  const tempDir = await mkdtemp(join(tmpdir(), "interview-transcriber-"));
+
+  try {
+    const videoFilePath = await downloadFile(
+      videoFileId,
+      // use random string to avoid non-ASCII characters in the file name which causes an error in whisper
+      join(tempDir, uniqueString() + extname(videoFile.name)),
+    );
+    consola.info(`Downloaded to ${videoFilePath}`);
+
+    // keep the parent folder apart from the uploads so that the positions in
+    // results do not shift when the video file is not in a folder
+    const parentFolderPromise = parentFolderId
+      ? getFileMetadata(parentFolderId, ["name", "webViewLink"]).then(
+          (data) => {
+            consola.info(`Parent folder: ${data.name} (${data.webViewLink})`);
+            return data;
+          },
+        )
+      : undefined;
+    const results: ReturnType<typeof uploadFile>[] = [];
+
+    const audioFilePath = await extractAudio(videoFilePath);
+    consola.info(`Extracted audio to ${audioFilePath}`);
+    results.push(
+      uploadFile(audioFilePath, videoBasename, parentFolderId).then((data) => {
+        consola.info(`Uploaded audio to ${data.webViewLink}`);
+        return data;
+      }),
+    );
+
+    const audioSegments = await splitAudio(
+      audioFilePath,
+      whisperMaxFileSize * 0.95,
+    );
+    consola.info(
+      `Split audio into ${audioSegments.length} files (total ${
+        audioSegments.at(-1)?.endTime
+      } seconds)`,
+    );
+
+    const segmenter = new Intl.Segmenter(language);
+
+    const transcriptions = await Promise.all(
+      audioSegments.map(({ path }) => transcribeAudioFile(path, language)),
+    );
+    const transcribedText = transcriptions.flat().join("\n");
+    const transcriptionFilePath = join(
+      tempDir,
+      `${basename(videoFilePath, extname(videoFilePath))}_transcription.txt`,
+    );
+    await write(transcriptionFilePath, transcribedText);
+    consola.info(
+      `Transcribed audio to ${transcriptionFilePath} (${
+        [...segmenter.segment(transcribedText)].length
+      } characters)`,
+    );
+    results.push(
+      uploadFile(
+        transcriptionFilePath,
+        videoBasename,
+        parentFolderId,
+        "application/vnd.google-apps.document",
+      ).then((data) => {
+        consola.info(`Uploaded transcription to ${data.webViewLink}`);
+        return data;
+      }),
+    );
+
+    const proofreadText = await proofreadTranscription(
+      transcribedText,
+      language,
+      proofreadModel,
+    );
+    const proofreadFilePath = join(
+      tempDir,
+      `${basename(videoFilePath, extname(videoFilePath))}_proofread.txt`,
+    );
+    await write(
+      proofreadFilePath,
+      `model: ${proofreadText.model}\nprompt:\n${proofreadText.prompt}\n\n---\n\n${proofreadText.response}`,
+    );
+    consola.info(
+      `Proofread transcription to ${proofreadFilePath} (${
+        [...segmenter.segment(proofreadText.response)].length
+      } characters)`,
+    );
+    results.push(
+      uploadFile(
+        proofreadFilePath,
+        videoBasename,
+        parentFolderId,
+        "application/vnd.google-apps.document",
+      ).then((data) => {
+        consola.info(`Uploaded proofread transcription to ${data.webViewLink}`);
+        return data;
+      }),
+    );
+
+    const [parentFolder, [audioFile, transcriptionFile, proofreadFile]] =
+      await Promise.all([parentFolderPromise, Promise.all(results)]);
+    if (!(audioFile && transcriptionFile && proofreadFile)) {
+      // parentFolder is undefined if the video file is not in a folder
+      throw new Error("Failed to upload files.");
+    }
+    return {
+      video: videoFile,
+      parent: parentFolder,
+      audio: audioFile,
+      transcription: transcriptionFile,
+      proofreadTranscription: proofreadFile,
+    };
+  } finally {
+    await rmdir(tempDir, { recursive: true });
+  }
+};