Files
KfChatDotNet/KfChatDotNetBot/Services/WhisperTranscription.cs
IfYouComplainImDFEingAgain 3d269716e8 Add Whisper transcription for BossmanJack Discord voice messages (#107)
* Add Whisper transcription for BossmanJack Discord voice messages

Detect Discord voice message attachments (audio with IS_VOICE_MESSAGE flag)
from the monitored user and transcribe them via OpenAI Whisper API before
relaying to chat. Reuses the existing OpenAi.ApiKey setting. Feature is
disabled by default via Whisper.Enabled setting.

* Use separate API key setting for Whisper transcription

* Switch to local Whisper and post-then-edit transcription flow

Voice messages are now relayed immediately with a "transcribing..." placeholder,
then transcribed locally via the whisper CLI and the message is edited to append
the result. Removes OpenAI API dependency in favor of a local whisper binary.

Settings: Whisper.BinaryPath, Whisper.Model, Whisper.Enabled

---------

Co-authored-by: DFE <dfe@dfe.com>
Co-authored-by: barelyprofessional <150058423+barelyprofessional@users.noreply.github.com>
2026-04-17 04:14:58 +02:00

145 lines
5.1 KiB
C#

using System.Diagnostics;
using System.Net;
using KfChatDotNetBot.Settings;
using NLog;
namespace KfChatDotNetBot.Services;
/// <summary>
/// Local OpenAI Whisper integration for voice message transcription.
///
/// Downloads audio from a URL and runs it through a local Whisper binary
/// (openai-whisper CLI) for speech-to-text transcription.
///
/// Requires: pip install openai-whisper (or faster-whisper)
///
/// Configuration:
/// - Whisper.Enabled: Feature toggle (default: false)
/// - Whisper.BinaryPath: Path to the whisper binary (default: whisper)
/// - Whisper.Model: Model size - tiny, base, small, medium, large (default: base)
/// - Proxy: Global proxy setting (optional, used for downloading)
/// </summary>
public static class WhisperTranscription
{
    private static readonly Logger Logger = LogManager.GetCurrentClassLogger();

    // Fallbacks used when the corresponding setting is null or blank.
    private const string DefaultBinary = "whisper";
    private const string DefaultModel = "base";

    // Extension assumed when the supplied file name has none (or an unusable one).
    private const string DefaultExtension = ".ogg";

    // Base name of the temp audio file; the output file is expected to share it.
    private const string AudioBaseName = "voice";
    private const string OutputExtension = ".txt";

    // Upper bound on an accepted extension, including the leading dot.
    private const int MaxExtensionLength = 10;

    /// <summary>
    /// Downloads audio from a URL and transcribes it using local Whisper.
    ///
    /// Flow:
    /// 1. Check if feature is enabled
    /// 2. Download audio from URL to a temp file
    /// 3. Run whisper CLI on the temp file with --output_format txt
    /// 4. Read the output text file and return the transcription
    /// 5. Clean up temp files
    /// </summary>
    /// <param name="url">URL the audio is downloaded from.</param>
    /// <param name="fileName">
    /// Original file name of the audio. Only its extension is used (to name the temp file);
    /// it is validated before use because it may come from an untrusted source.
    /// </param>
    /// <param name="ct">
    /// Cancels the download and the transcription. If the Whisper process is running
    /// when cancellation is requested, it is killed.
    /// </param>
    /// <returns>
    /// The trimmed transcription text, or null when the feature is disabled, the operation
    /// is cancelled, or any step fails. Failures are logged via NLog.
    /// </returns>
    public static async Task<string?> TranscribeFromUrlAsync(string url, string fileName, CancellationToken ct = default)
    {
        var settings = await SettingsProvider.GetMultipleValuesAsync([
            BuiltIn.Keys.WhisperEnabled,
            BuiltIn.Keys.WhisperBinaryPath,
            BuiltIn.Keys.WhisperModel,
            BuiltIn.Keys.Proxy
        ]);

        if (!settings[BuiltIn.Keys.WhisperEnabled].ToBoolean())
        {
            Logger.Debug("Whisper transcription is disabled");
            return null;
        }

        var whisperBinary = settings[BuiltIn.Keys.WhisperBinaryPath].Value;
        if (string.IsNullOrWhiteSpace(whisperBinary)) whisperBinary = DefaultBinary;

        var model = settings[BuiltIn.Keys.WhisperModel].Value;
        if (string.IsNullOrWhiteSpace(model)) model = DefaultModel;

        var proxy = settings[BuiltIn.Keys.Proxy].Value;

        var tempDir = Path.Combine(Path.GetTempPath(), $"whisper_{Guid.NewGuid()}");
        try
        {
            // Everything that can throw lives inside the try so the
            // "null on failure" contract holds and the finally block always cleans up.
            Directory.CreateDirectory(tempDir);
            var tempAudioPath = Path.Combine(tempDir, AudioBaseName + SanitizeExtension(fileName));

            // Download the audio file
            Logger.Info($"Downloading voice message from {url}");
            using var client = CreateHttpClient(proxy);
            var audioBytes = await client.GetByteArrayAsync(url, ct);
            await File.WriteAllBytesAsync(tempAudioPath, audioBytes, ct);
            Logger.Info($"Downloaded {audioBytes.Length} bytes to {tempAudioPath}");

            // Run local whisper
            if (!await RunWhisperAsync(whisperBinary, model, tempAudioPath, tempDir, ct))
            {
                return null;
            }

            // Whisper outputs a .txt file with the same base name as the input
            var outputPath = Path.Combine(tempDir, AudioBaseName + OutputExtension);
            if (!File.Exists(outputPath))
            {
                Logger.Error($"Whisper output file not found at {outputPath}");
                return null;
            }

            var transcription = (await File.ReadAllTextAsync(outputPath, ct)).Trim();
            if (string.IsNullOrWhiteSpace(transcription))
            {
                Logger.Warn("Whisper returned empty transcription");
                return null;
            }

            Logger.Info($"Transcription received: {transcription.Length} characters");
            return transcription;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation is not an error condition.
            Logger.Info("Whisper transcription was cancelled");
            return null;
        }
        catch (Exception ex)
        {
            Logger.Error(ex, "Error during Whisper transcription");
            return null;
        }
        finally
        {
            try
            {
                if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
            }
            catch (Exception ex)
            {
                Logger.Warn(ex, "Failed to cleanup Whisper temp files");
            }
        }
    }

    /// <summary>
    /// Builds the HttpClient used for the download, routing through <paramref name="proxy"/> when one is set.
    /// </summary>
    private static HttpClient CreateHttpClient(string? proxy)
    {
        var handler = new HttpClientHandler { AutomaticDecompression = DecompressionMethods.All };
        try
        {
            if (!string.IsNullOrWhiteSpace(proxy))
            {
                handler.UseProxy = true;
                handler.Proxy = new WebProxy(proxy);
            }

            // The client owns the handler and disposes it together with itself.
            return new HttpClient(handler);
        }
        catch
        {
            handler.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Returns the extension of <paramref name="fileName"/> (with leading dot) if it is short and
    /// purely ASCII-alphanumeric; otherwise returns the default ".ogg".
    /// </summary>
    private static string SanitizeExtension(string? fileName)
    {
        var ext = Path.GetExtension(fileName);
        if (string.IsNullOrEmpty(ext) || ext.Length < 2 || ext.Length > MaxExtensionLength)
        {
            return DefaultExtension;
        }

        // Index 0 is the dot. Anything other than letters/digits after it (quotes, spaces,
        // separators) has no business in a temp file name handed to an external process.
        for (var i = 1; i < ext.Length; i++)
        {
            if (!char.IsAsciiLetterOrDigit(ext[i])) return DefaultExtension;
        }

        // An input named voice.txt would occupy the very path the transcript is read from.
        if (ext.Equals(OutputExtension, StringComparison.OrdinalIgnoreCase))
        {
            return DefaultExtension;
        }

        return ext;
    }

    /// <summary>
    /// Runs the Whisper CLI on <paramref name="audioPath"/>, writing a txt transcript into
    /// <paramref name="outputDir"/>. Returns true only when the process exits with code 0.
    /// </summary>
    private static async Task<bool> RunWhisperAsync(string binary, string model, string audioPath, string outputDir, CancellationToken ct)
    {
        var processInfo = new ProcessStartInfo
        {
            FileName = binary,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        // ArgumentList passes each value as exactly one argument, so no manual quoting is
        // needed and a value can never be split into additional options.
        processInfo.ArgumentList.Add(audioPath);
        processInfo.ArgumentList.Add("--model");
        processInfo.ArgumentList.Add(model);
        processInfo.ArgumentList.Add("--output_format");
        processInfo.ArgumentList.Add("txt");
        processInfo.ArgumentList.Add("--output_dir");
        processInfo.ArgumentList.Add(outputDir);

        Logger.Info($"Running: {binary} {string.Join(' ', processInfo.ArgumentList)}");

        using var process = Process.Start(processInfo);
        if (process == null)
        {
            Logger.Error("Failed to start Whisper process");
            return false;
        }

        // Without this, cancelling would only abandon the wait and leave whisper running.
        using var killOnCancel = ct.Register(() => TryKill(process));

        // Both redirected pipes must be drained while the process runs: if either pipe
        // buffer fills up, the child blocks on write and would never exit.
        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);
        await Task.WhenAll(process.WaitForExitAsync(ct), stdoutTask, stderrTask);

        if (process.ExitCode != 0)
        {
            var stderr = await stderrTask;
            Logger.Error($"Whisper exited with code {process.ExitCode}: {stderr}");
            return false;
        }

        return true;
    }

    /// <summary>
    /// Best-effort kill of the Whisper process and its children; failures are logged, never thrown.
    /// </summary>
    private static void TryKill(Process process)
    {
        try
        {
            if (!process.HasExited) process.Kill(entireProcessTree: true);
        }
        catch (Exception ex)
        {
            // The process may exit between the check and the kill; nothing more to do.
            Logger.Warn(ex, "Failed to kill Whisper process");
        }
    }
}