diff --git a/KfChatDotNetBot/Services/BotServices.cs b/KfChatDotNetBot/Services/BotServices.cs
index f98d495..424c531 100644
--- a/KfChatDotNetBot/Services/BotServices.cs
+++ b/KfChatDotNetBot/Services/BotServices.cs
@@ -944,13 +944,77 @@ public class BotServices
         }
 
         var result = $"[img]{settings[BuiltIn.Keys.DiscordIcon].Value}[/img] {message.Author.GlobalName ?? message.Author.Username}: {message.Content?.Replace("❤️", ":feels:")}";
+        // Placeholder shown while a voice message is transcribed; the background task below swaps it out
+        const string voicePlaceholder = "🎤 Voice message (transcribing...)";
+        // Attachment lines are appended after this offset, so text the user typed is never taken for a placeholder
+        var attachmentsStart = result.Length;
+        var voiceMessages = new List<(string Url, string Filename)>();
         foreach (var attachment in message.Attachments ?? [])
         {
-            result += $"[br]Attachment: {attachment.GetProperty("filename").GetString()} {attachment.GetProperty("url").GetString()}";
+            var filename = attachment.GetProperty("filename").GetString() ?? "unknown";
+            var url = attachment.GetProperty("url").GetString() ?? "";
+
+            // Discord voice messages have content_type audio/ogg and the IS_VOICE_MESSAGE flag (1 << 13)
+            if (attachment.TryGetProperty("content_type", out var contentTypeProp) &&
+                contentTypeProp.GetString()?.StartsWith("audio/") == true &&
+                attachment.TryGetProperty("flags", out var flagsProp) &&
+                flagsProp.TryGetInt32(out var flags) &&
+                (flags & (1 << 13)) != 0)
+            {
+                result += $"[br]{voicePlaceholder}";
+                voiceMessages.Add((url, filename));
+            }
+            else
+            {
+                result += $"[br]Attachment: {filename} {url}";
+            }
         }
-        
-        _chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
+
+        var sentMsg = _chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
         UpdateBossmanLastSighting("talking in Discord").Wait(_cancellationToken);
+
+        // Transcribe voice messages in the background, then edit the sent message
+        if (voiceMessages.Count > 0)
+        {
+            _ = Task.Run(async () =>
+            {
+                try
+                {
+                    // Wait for the message to be echoed so we have a UUID to edit
+                    if (!await _chatBot.WaitForChatMessageAsync(sentMsg, TimeSpan.FromSeconds(10), _cancellationToken))
+                    {
+                        _logger.Warn("Voice message never got echoed, can't edit with transcription");
+                        return;
+                    }
+
+                    var edited = result;
+                    var searchFrom = attachmentsStart;
+                    foreach (var (url, filename) in voiceMessages)
+                    {
+                        var transcription = await WhisperTranscription.TranscribeFromUrlAsync(url, filename, _cancellationToken);
+                        var replacement = transcription != null
+                            ? $"🎤 Voice message: [i]{transcription}[/i]"
+                            : "🎤 Voice message (transcription unavailable)";
+
+                        // string.Replace would rewrite every placeholder at once, giving all voice messages the
+                        // first transcription. Swap only the next placeholder so each attachment keeps its own text.
+                        var placeholderAt = edited.IndexOf(voicePlaceholder, searchFrom, StringComparison.Ordinal);
+                        if (placeholderAt < 0) break;
+                        edited = edited.Remove(placeholderAt, voicePlaceholder.Length).Insert(placeholderAt, replacement);
+                        searchFrom = placeholderAt + replacement.Length;
+                    }
+
+                    if (sentMsg.ChatMessageUuid != null)
+                    {
+                        await _chatBot.KfClient.EditMessageAsync(sentMsg.ChatMessageUuid, edited);
+                    }
+                }
+                catch (Exception ex)
+                {
+                    _logger.Error(ex, "Failed to transcribe Discord voice message");
+                }
+            }, _cancellationToken);
+        }
     }
 
     private async Task DiscordFlashText(SentMessageTrackerModel msg)
diff --git a/KfChatDotNetBot/Services/WhisperTranscription.cs b/KfChatDotNetBot/Services/WhisperTranscription.cs
new file mode 100644
index 0000000..916e085
--- /dev/null
+++ b/KfChatDotNetBot/Services/WhisperTranscription.cs
@@ -0,0 +1,189 @@
+using System.Diagnostics;
+using System.Net;
+using KfChatDotNetBot.Settings;
+using NLog;
+
+namespace KfChatDotNetBot.Services;
+
+/// <summary>
+/// Local OpenAI Whisper integration for voice message transcription.
+///
+/// Downloads audio from a URL and runs it through a local Whisper binary
+/// (openai-whisper CLI) for speech-to-text transcription.
+///
+/// Requires: pip install openai-whisper (or faster-whisper)
+///
+/// Configuration:
+/// - Whisper.Enabled: Feature toggle (default: false)
+/// - Whisper.BinaryPath: Path to the whisper binary (default: whisper)
+/// - Whisper.Model: Model size - tiny, base, small, medium, large (default: base)
+/// - Proxy: Global proxy setting (optional, used for downloading)
+/// </summary>
+public static class WhisperTranscription
+{
+    private static readonly Logger Logger = LogManager.GetCurrentClassLogger();
+
+    /// <summary>
+    /// Downloads audio from a URL and transcribes it using local Whisper.
+    ///
+    /// Flow:
+    /// 1. Check if feature is enabled
+    /// 2. Download audio from URL to a temp file
+    /// 3. Run whisper CLI on the temp file with --output_format txt
+    /// 4. Read the output text file and return the transcription
+    /// 5. Clean up temp files
+    ///
+    /// Returns null on any failure, including cancellation. All errors are logged via NLog.
+    /// </summary>
+    /// <param name="url">Location of the audio file to download.</param>
+    /// <param name="fileName">Original attachment name. Only its extension is used, and only after sanitizing.</param>
+    /// <param name="ct">Cancels the download and the Whisper run; a running Whisper process is killed.</param>
+    /// <returns>The trimmed transcription, or null when the feature is disabled or any step fails.</returns>
+    public static async Task<string?> TranscribeFromUrlAsync(string url, string fileName, CancellationToken ct = default)
+    {
+        var tempDir = Path.Combine(Path.GetTempPath(), $"whisper_{Guid.NewGuid()}");
+
+        try
+        {
+            var settings = await SettingsProvider.GetMultipleValuesAsync([
+                BuiltIn.Keys.WhisperEnabled,
+                BuiltIn.Keys.WhisperBinaryPath,
+                BuiltIn.Keys.WhisperModel,
+                BuiltIn.Keys.Proxy
+            ]);
+
+            if (!settings[BuiltIn.Keys.WhisperEnabled].ToBoolean())
+            {
+                Logger.Debug("Whisper transcription is disabled");
+                return null;
+            }
+
+            var whisperBinary = settings[BuiltIn.Keys.WhisperBinaryPath].Value ?? "whisper";
+            var model = settings[BuiltIn.Keys.WhisperModel].Value ?? "base";
+
+            Directory.CreateDirectory(tempDir);
+            var tempAudioPath = Path.Combine(tempDir, $"voice{GetSafeExtension(fileName)}");
+
+            // The handler is built per call because the proxy is a runtime setting
+            using var handler = new HttpClientHandler { AutomaticDecompression = DecompressionMethods.All };
+            if (settings[BuiltIn.Keys.Proxy].Value != null)
+            {
+                handler.UseProxy = true;
+                handler.Proxy = new WebProxy(settings[BuiltIn.Keys.Proxy].Value);
+            }
+
+            // Download the audio file
+            Logger.Info($"Downloading voice message from {url}");
+            using var client = new HttpClient(handler, disposeHandler: false);
+            var audioBytes = await client.GetByteArrayAsync(url, ct);
+            await File.WriteAllBytesAsync(tempAudioPath, audioBytes, ct);
+            Logger.Info($"Downloaded {audioBytes.Length} bytes to {tempAudioPath}");
+
+            // Run local whisper. ArgumentList hands each value over as its own argument, so a model name
+            // or path containing spaces or quotes cannot be split into extra command-line options.
+            var processInfo = new ProcessStartInfo
+            {
+                FileName = whisperBinary,
+                RedirectStandardOutput = true,
+                RedirectStandardError = true,
+                UseShellExecute = false,
+                CreateNoWindow = true,
+                ArgumentList = { tempAudioPath, "--model", model, "--output_format", "txt", "--output_dir", tempDir }
+            };
+            Logger.Info($"Running: {whisperBinary} {string.Join(' ', processInfo.ArgumentList)}");
+
+            using var process = Process.Start(processInfo);
+            if (process == null)
+            {
+                Logger.Error("Failed to start Whisper process");
+                return null;
+            }
+
+            // Drain both redirected pipes while Whisper runs. Waiting for exit without reading them
+            // can hang forever once the child has filled a pipe buffer with progress output.
+            var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
+            var stderrTask = process.StandardError.ReadToEndAsync(ct);
+
+            try
+            {
+                await process.WaitForExitAsync(ct);
+            }
+            catch (OperationCanceledException)
+            {
+                // Cancelling the wait does not stop the child, so terminate it before giving up
+                KillQuietly(process);
+                throw;
+            }
+
+            await stdoutTask;
+            var stderr = await stderrTask;
+
+            if (process.ExitCode != 0)
+            {
+                Logger.Error($"Whisper exited with code {process.ExitCode}: {stderr}");
+                return null;
+            }
+
+            // Whisper outputs a .txt file with the same base name as the input
+            var outputPath = Path.Combine(tempDir, "voice.txt");
+            if (!File.Exists(outputPath))
+            {
+                Logger.Error($"Whisper output file not found at {outputPath}");
+                return null;
+            }
+
+            var transcription = (await File.ReadAllTextAsync(outputPath, ct)).Trim();
+
+            if (string.IsNullOrWhiteSpace(transcription))
+            {
+                Logger.Warn("Whisper returned empty transcription");
+                return null;
+            }
+
+            Logger.Info($"Transcription received: {transcription.Length} characters");
+            return transcription;
+        }
+        catch (Exception ex)
+        {
+            Logger.Error(ex, "Error during Whisper transcription");
+            return null;
+        }
+        finally
+        {
+            try
+            {
+                if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
+            }
+            catch (Exception ex)
+            {
+                Logger.Warn(ex, "Failed to cleanup Whisper temp files");
+            }
+        }
+    }
+
+    /// <summary>
+    /// Returns the extension of <paramref name="fileName"/> when it is a short run of ASCII letters and
+    /// digits, otherwise ".ogg". The name comes from an attachment, so anything else is discarded.
+    /// </summary>
+    private static string GetSafeExtension(string fileName)
+    {
+        var ext = Path.GetExtension(fileName);
+        var isSafe = ext.Length is > 1 and <= 9 && ext.Skip(1).All(char.IsAsciiLetterOrDigit);
+        return isSafe ? ext : ".ogg";
+    }
+
+    /// <summary>
+    /// Best-effort termination of the Whisper process and any children it started.
+    /// </summary>
+    private static void KillQuietly(Process process)
+    {
+        try
+        {
+            if (!process.HasExited) process.Kill(entireProcessTree: true);
+        }
+        catch (Exception ex)
+        {
+            Logger.Warn(ex, "Failed to kill Whisper process after cancellation");
+        }
+    }
+}
diff --git a/KfChatDotNetBot/Settings/BuiltIn.cs b/KfChatDotNetBot/Settings/BuiltIn.cs
index 024c60c..d31fd27 100644
--- a/KfChatDotNetBot/Settings/BuiltIn.cs
+++ b/KfChatDotNetBot/Settings/BuiltIn.cs
@@ -574,6 +574,12 @@ public static class BuiltIn
         public static string KasinoKrashEnabled = "Kasino.Krash.Enabled";
         [BuiltInSetting("Delay in milliseconds before cleaning up krash", SettingValueType.Text, "10000", WholeNumberRegex)]
         public static string KasinoKrashCleanupDelay = "Kasino.Krash.CleanupDelay";
+        [BuiltInSetting("Whether Whisper transcription is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
+        public static string WhisperEnabled = "Whisper.Enabled";
+        [BuiltInSetting("Path of the Whisper binary", SettingValueType.Text, "whisper")]
+        public static string WhisperBinaryPath = "Whisper.BinaryPath";
+        [BuiltInSetting("Whisper model name (e.g. tiny, base, small, medium, large)", SettingValueType.Text, "base")]
+        public static string WhisperModel = "Whisper.Model";
         [BuiltInSetting("Whether Winna is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
         public static string WinnaEnabled = "Winna.Enabled";
         [BuiltInSetting("BossmanJack's Winna username", SettingValueType.Text, "ImBossmanJack")]