Add Whisper transcription for BossmanJack Discord voice messages (#107)

* Add Whisper transcription for BossmanJack Discord voice messages

Detect Discord voice message attachments (audio with the IS_VOICE_MESSAGE flag)
from the monitored user and transcribe them via the OpenAI Whisper API before
relaying them to chat. Reuses the existing OpenAi.ApiKey setting. The feature is
disabled by default via the Whisper.Enabled setting.

* Use separate API key setting for Whisper transcription

* Switch to local Whisper and post-then-edit transcription flow

Voice messages are now relayed immediately with a "transcribing..." placeholder,
then transcribed locally via the whisper CLI and the message is edited to append
the result. Removes OpenAI API dependency in favor of a local whisper binary.

Settings: Whisper.BinaryPath, Whisper.Model, Whisper.Enabled

---------

Co-authored-by: DFE <dfe@dfe.com>
Co-authored-by: barelyprofessional <150058423+barelyprofessional@users.noreply.github.com>
This commit is contained in:
IfYouComplainImDFEingAgain
2026-04-16 22:14:58 -04:00
committed by GitHub
parent 1a49fe1976
commit 3d269716e8
3 changed files with 212 additions and 3 deletions

View File

@@ -944,13 +944,72 @@ public class BotServices
}
var result = $"[img]{settings[BuiltIn.Keys.DiscordIcon].Value}[/img] {message.Author.GlobalName ?? message.Author.Username}: {message.Content?.Replace("", ":feels:")}";
var voiceMessages = new List<(string Url, string Filename)>();
foreach (var attachment in message.Attachments ?? [])
{
result += $"[br]Attachment: {attachment.GetProperty("filename").GetString()} {attachment.GetProperty("url").GetString()}";
var filename = attachment.GetProperty("filename").GetString() ?? "unknown";
var url = attachment.GetProperty("url").GetString() ?? "";
// Discord voice messages have content_type audio/ogg and the IS_VOICE_MESSAGE flag (1 << 13)
if (attachment.TryGetProperty("content_type", out var contentTypeProp) &&
contentTypeProp.GetString()?.StartsWith("audio/") == true &&
attachment.TryGetProperty("flags", out var flagsProp) &&
flagsProp.TryGetInt32(out var flags) &&
(flags & (1 << 13)) != 0)
{
result += "[br]🎤 Voice message (transcribing...)";
voiceMessages.Add((url, filename));
}
else
{
result += $"[br]Attachment: {filename} {url}";
}
}
_chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
var sentMsg = _chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
UpdateBossmanLastSighting("talking in Discord").Wait(_cancellationToken);
// Transcribe voice messages in the background, then edit the sent message
if (voiceMessages.Count > 0)
{
_ = Task.Run(async () =>
{
try
{
// Wait for the message to be echoed so we have a UUID to edit
if (!await _chatBot.WaitForChatMessageAsync(sentMsg, TimeSpan.FromSeconds(10), _cancellationToken))
{
_logger.Warn("Voice message never got echoed, can't edit with transcription");
return;
}
var edited = result;
foreach (var (url, filename) in voiceMessages)
{
var transcription = await WhisperTranscription.TranscribeFromUrlAsync(url, filename, _cancellationToken);
if (transcription != null)
{
edited = edited.Replace("🎤 Voice message (transcribing...)",
$"🎤 Voice message: [i]{transcription}[/i]");
}
else
{
edited = edited.Replace("🎤 Voice message (transcribing...)",
"🎤 Voice message (transcription unavailable)");
}
}
if (sentMsg.ChatMessageUuid != null)
{
await _chatBot.KfClient.EditMessageAsync(sentMsg.ChatMessageUuid, edited);
}
}
catch (Exception ex)
{
_logger.Error(ex, "Failed to transcribe Discord voice message");
}
}, _cancellationToken);
}
}
private async Task DiscordFlashText(SentMessageTrackerModel msg)

View File

@@ -0,0 +1,144 @@
using System.Diagnostics;
using System.Net;
using KfChatDotNetBot.Settings;
using NLog;
namespace KfChatDotNetBot.Services;
/// <summary>
/// Local OpenAI Whisper integration for voice message transcription.
///
/// Downloads audio from a URL and runs it through a local Whisper binary
/// (openai-whisper CLI) for speech-to-text transcription.
///
/// Requires: pip install openai-whisper (or faster-whisper)
///
/// Configuration:
/// - Whisper.Enabled: Feature toggle (default: false)
/// - Whisper.BinaryPath: Path to the whisper binary (default: whisper)
/// - Whisper.Model: Model size - tiny, base, small, medium, large (default: base)
/// - Proxy: Global proxy setting (optional, used for downloading)
/// </summary>
public static class WhisperTranscription
{
    /// <summary>Extension used when the attachment name has none or an unsafe one.</summary>
    private const string DefaultExtension = ".ogg";

    /// <summary>Base name of the temp audio file; Whisper names its .txt output after it.</summary>
    private const string AudioBaseName = "voice";

    private static readonly Logger Logger = LogManager.GetCurrentClassLogger();

    /// <summary>
    /// Downloads audio from a URL and transcribes it using local Whisper.
    ///
    /// Flow:
    /// 1. Check if feature is enabled
    /// 2. Download audio from URL to a temp file
    /// 3. Run whisper CLI on the temp file with --output_format txt
    /// 4. Read the output text file and return the transcription
    /// 5. Clean up temp files
    ///
    /// Returns null on any failure (including cancellation) once settings have
    /// been loaded. All errors are logged via NLog.
    /// </summary>
    /// <param name="url">URL the audio attachment is downloaded from.</param>
    /// <param name="fileName">Original attachment name; only its extension is used.</param>
    /// <param name="ct">Cancels the download, the Whisper process and file I/O.</param>
    /// <returns>The trimmed transcription, or null when disabled, empty or failed.</returns>
    public static async Task<string?> TranscribeFromUrlAsync(string url, string fileName, CancellationToken ct = default)
    {
        var settings = await SettingsProvider.GetMultipleValuesAsync([
            BuiltIn.Keys.WhisperEnabled,
            BuiltIn.Keys.WhisperBinaryPath,
            BuiltIn.Keys.WhisperModel,
            BuiltIn.Keys.Proxy
        ]);
        if (!settings[BuiltIn.Keys.WhisperEnabled].ToBoolean())
        {
            Logger.Debug("Whisper transcription is disabled");
            return null;
        }

        var whisperBinary = settings[BuiltIn.Keys.WhisperBinaryPath].Value ?? "whisper";
        var model = settings[BuiltIn.Keys.WhisperModel].Value ?? "base";
        var proxy = settings[BuiltIn.Keys.Proxy].Value;

        var tempDir = Path.Combine(Path.GetTempPath(), $"whisper_{Guid.NewGuid()}");
        try
        {
            // Created inside the try so a failure here is logged and reported as null
            // like every other failure, and so the finally block owns the cleanup.
            Directory.CreateDirectory(tempDir);
            var tempAudioPath = Path.Combine(tempDir, AudioBaseName + GetSafeExtension(fileName));

            await DownloadAsync(url, tempAudioPath, proxy, ct);

            if (!await RunWhisperAsync(whisperBinary, model, tempAudioPath, tempDir, ct))
            {
                return null;
            }

            // Whisper outputs a .txt file with the same base name as the input
            var outputPath = Path.Combine(tempDir, AudioBaseName + ".txt");
            if (!File.Exists(outputPath))
            {
                Logger.Error($"Whisper output file not found at {outputPath}");
                return null;
            }

            var transcription = (await File.ReadAllTextAsync(outputPath, ct)).Trim();
            if (string.IsNullOrWhiteSpace(transcription))
            {
                Logger.Warn("Whisper returned empty transcription");
                return null;
            }

            Logger.Info($"Transcription received: {transcription.Length} characters");
            return transcription;
        }
        catch (Exception ex)
        {
            Logger.Error(ex, "Error during Whisper transcription");
            return null;
        }
        finally
        {
            try
            {
                if (Directory.Exists(tempDir))
                {
                    Directory.Delete(tempDir, true);
                }
            }
            catch (Exception ex)
            {
                Logger.Warn(ex, "Failed to cleanup Whisper temp files");
            }
        }
    }

    /// <summary>
    /// Returns the extension of <paramref name="fileName"/> when it is short and purely
    /// alphanumeric, otherwise <see cref="DefaultExtension"/>. The name comes from a
    /// remote attachment, so it must not be able to smuggle quotes, spaces or other
    /// special characters into the temp path handed to the Whisper process.
    /// </summary>
    private static string GetSafeExtension(string fileName)
    {
        var ext = Path.GetExtension(fileName);
        if (string.IsNullOrEmpty(ext) || ext.Length < 2 || ext.Length > 10)
        {
            return DefaultExtension;
        }

        // ".txt" would make the audio file collide with Whisper's own output file.
        if (ext.Equals(".txt", StringComparison.OrdinalIgnoreCase))
        {
            return DefaultExtension;
        }

        foreach (var c in ext.AsSpan(1))
        {
            if (!char.IsLetterOrDigit(c))
            {
                return DefaultExtension;
            }
        }

        return ext;
    }

    /// <summary>Downloads <paramref name="url"/> to <paramref name="destinationPath"/>, optionally through a proxy.</summary>
    private static async Task DownloadAsync(string url, string destinationPath, string? proxy, CancellationToken ct)
    {
        Logger.Info($"Downloading voice message from {url}");

        // The handler is disposed here explicitly so it is released even when the
        // proxy URI is malformed and the HttpClient is never constructed.
        using var handler = new HttpClientHandler { AutomaticDecompression = DecompressionMethods.All };
        if (proxy != null)
        {
            handler.UseProxy = true;
            handler.Proxy = new WebProxy(proxy);
        }

        using var client = new HttpClient(handler, disposeHandler: false);
        var audioBytes = await client.GetByteArrayAsync(url, ct);
        await File.WriteAllBytesAsync(destinationPath, audioBytes, ct);
        Logger.Info($"Downloaded {audioBytes.Length} bytes to {destinationPath}");
    }

    /// <summary>
    /// Runs the Whisper CLI on <paramref name="audioPath"/>, writing a .txt transcript into
    /// <paramref name="outputDir"/>. Returns true only when the process exits with code 0.
    /// </summary>
    private static async Task<bool> RunWhisperAsync(string whisperBinary, string model, string audioPath,
        string outputDir, CancellationToken ct)
    {
        var processInfo = new ProcessStartInfo
        {
            FileName = whisperBinary,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };
        // ArgumentList escapes each argument individually, so paths with spaces or
        // quotes cannot break out into extra command-line arguments.
        processInfo.ArgumentList.Add(audioPath);
        processInfo.ArgumentList.Add("--model");
        processInfo.ArgumentList.Add(model);
        processInfo.ArgumentList.Add("--output_format");
        processInfo.ArgumentList.Add("txt");
        processInfo.ArgumentList.Add("--output_dir");
        processInfo.ArgumentList.Add(outputDir);

        Logger.Info($"Running: {whisperBinary} {string.Join(' ', processInfo.ArgumentList)}");
        using var process = Process.Start(processInfo);
        if (process == null)
        {
            Logger.Error("Failed to start Whisper process");
            return false;
        }

        // Both redirected pipes must be drained while the process runs: once a pipe
        // buffer fills up the child blocks on write and would never exit.
        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);
        string stderr;
        try
        {
            await process.WaitForExitAsync(ct);
            await stdoutTask;
            stderr = await stderrTask;
        }
        catch
        {
            // Don't leave Whisper running (e.g. after cancellation) while the caller
            // deletes the temp directory it is working in.
            KillQuietly(process);
            throw;
        }

        if (process.ExitCode != 0)
        {
            Logger.Error($"Whisper exited with code {process.ExitCode}: {stderr}");
            return false;
        }

        return true;
    }

    /// <summary>Best-effort termination of the Whisper process tree; failures are only logged.</summary>
    private static void KillQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (Exception ex)
        {
            Logger.Warn(ex, "Failed to kill Whisper process");
        }
    }
}

View File

@@ -574,6 +574,12 @@ public static class BuiltIn
public static string KasinoKrashEnabled = "Kasino.Krash.Enabled";
[BuiltInSetting("Delay in milliseconds before cleaning up krash", SettingValueType.Text, "10000", WholeNumberRegex)]
public static string KasinoKrashCleanupDelay = "Kasino.Krash.CleanupDelay";
// Settings for local Whisper transcription of voice messages; all three are read
// together by WhisperTranscription.TranscribeFromUrlAsync.
[BuiltInSetting("Whether Whisper transcription is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
public static string WhisperEnabled = "Whisper.Enabled";
// Executable that is launched for transcription.
[BuiltInSetting("Path of the Whisper binary", SettingValueType.Text, "whisper")]
public static string WhisperBinaryPath = "Whisper.BinaryPath";
// Passed to the Whisper binary as its --model argument.
[BuiltInSetting("Whisper model name (e.g. tiny, base, small, medium, large)", SettingValueType.Text, "base")]
public static string WhisperModel = "Whisper.Model";
[BuiltInSetting("Whether Winna is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
public static string WinnaEnabled = "Winna.Enabled";
[BuiltInSetting("BossmanJack's Winna username", SettingValueType.Text, "ImBossmanJack")]