mirror of
https://github.com/barelyprofessional/KfChatDotNet.git
synced 2026-04-30 03:22:04 -04:00
Add Whisper transcription for BossmanJack Discord voice messages (#107)
* Add Whisper transcription for BossmanJack Discord voice messages Detect Discord voice message attachments (audio with IS_VOICE_MESSAGE flag) from the monitored user and transcribe them via OpenAI Whisper API before relaying to chat. Reuses the existing OpenAi.ApiKey setting. Feature is disabled by default via Whisper.Enabled setting. * Use separate API key setting for Whisper transcription * Switch to local Whisper and post-then-edit transcription flow Voice messages are now relayed immediately with a "transcribing..." placeholder, then transcribed locally via the whisper CLI and the message is edited to append the result. Removes OpenAI API dependency in favor of a local whisper binary. Settings: Whisper.BinaryPath, Whisper.Model, Whisper.Enabled --------- Co-authored-by: DFE <dfe@dfe.com> Co-authored-by: barelyprofessional <150058423+barelyprofessional@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1a49fe1976
commit
3d269716e8
@@ -944,13 +944,72 @@ public class BotServices
|
||||
}
|
||||
|
||||
var result = $"[img]{settings[BuiltIn.Keys.DiscordIcon].Value}[/img] {message.Author.GlobalName ?? message.Author.Username}: {message.Content?.Replace("❤️", ":feels:")}";
|
||||
var voiceMessages = new List<(string Url, string Filename)>();
|
||||
foreach (var attachment in message.Attachments ?? [])
|
||||
{
|
||||
result += $"[br]Attachment: {attachment.GetProperty("filename").GetString()} {attachment.GetProperty("url").GetString()}";
|
||||
var filename = attachment.GetProperty("filename").GetString() ?? "unknown";
|
||||
var url = attachment.GetProperty("url").GetString() ?? "";
|
||||
|
||||
// Discord voice messages have content_type audio/ogg and the IS_VOICE_MESSAGE flag (1 << 13)
|
||||
if (attachment.TryGetProperty("content_type", out var contentTypeProp) &&
|
||||
contentTypeProp.GetString()?.StartsWith("audio/") == true &&
|
||||
attachment.TryGetProperty("flags", out var flagsProp) &&
|
||||
flagsProp.TryGetInt32(out var flags) &&
|
||||
(flags & (1 << 13)) != 0)
|
||||
{
|
||||
result += "[br]🎤 Voice message (transcribing...)";
|
||||
voiceMessages.Add((url, filename));
|
||||
}
|
||||
else
|
||||
{
|
||||
result += $"[br]Attachment: {filename} {url}";
|
||||
}
|
||||
}
|
||||
|
||||
_chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
|
||||
|
||||
var sentMsg = _chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
|
||||
UpdateBossmanLastSighting("talking in Discord").Wait(_cancellationToken);
|
||||
|
||||
// Transcribe voice messages in the background, then edit the sent message
|
||||
if (voiceMessages.Count > 0)
|
||||
{
|
||||
_ = Task.Run(async () =>
|
||||
{
|
||||
try
|
||||
{
|
||||
// Wait for the message to be echoed so we have a UUID to edit
|
||||
if (!await _chatBot.WaitForChatMessageAsync(sentMsg, TimeSpan.FromSeconds(10), _cancellationToken))
|
||||
{
|
||||
_logger.Warn("Voice message never got echoed, can't edit with transcription");
|
||||
return;
|
||||
}
|
||||
|
||||
var edited = result;
|
||||
foreach (var (url, filename) in voiceMessages)
|
||||
{
|
||||
var transcription = await WhisperTranscription.TranscribeFromUrlAsync(url, filename, _cancellationToken);
|
||||
if (transcription != null)
|
||||
{
|
||||
edited = edited.Replace("🎤 Voice message (transcribing...)",
|
||||
$"🎤 Voice message: [i]{transcription}[/i]");
|
||||
}
|
||||
else
|
||||
{
|
||||
edited = edited.Replace("🎤 Voice message (transcribing...)",
|
||||
"🎤 Voice message (transcription unavailable)");
|
||||
}
|
||||
}
|
||||
|
||||
if (sentMsg.ChatMessageUuid != null)
|
||||
{
|
||||
await _chatBot.KfClient.EditMessageAsync(sentMsg.ChatMessageUuid, edited);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.Error(ex, "Failed to transcribe Discord voice message");
|
||||
}
|
||||
}, _cancellationToken);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task DiscordFlashText(SentMessageTrackerModel msg)
|
||||
|
||||
144
KfChatDotNetBot/Services/WhisperTranscription.cs
Normal file
144
KfChatDotNetBot/Services/WhisperTranscription.cs
Normal file
@@ -0,0 +1,144 @@
|
||||
using System.Diagnostics;
|
||||
using System.Net;
|
||||
using KfChatDotNetBot.Settings;
|
||||
using NLog;
|
||||
|
||||
namespace KfChatDotNetBot.Services;
|
||||
|
||||
/// <summary>
/// Local OpenAI Whisper integration for voice message transcription.
///
/// Downloads audio from a URL and runs it through a local Whisper binary
/// (openai-whisper CLI) for speech-to-text transcription.
///
/// Requires: pip install openai-whisper (or faster-whisper)
///
/// Configuration:
/// - Whisper.Enabled: Feature toggle (default: false)
/// - Whisper.BinaryPath: Path to the whisper binary (default: whisper)
/// - Whisper.Model: Model size - tiny, base, small, medium, large (default: base)
/// - Proxy: Global proxy setting (optional, used for downloading)
/// </summary>
public static class WhisperTranscription
{
    private static readonly Logger Logger = LogManager.GetCurrentClassLogger();

    // Extension used when the attachment's filename has none, or one we don't want on disk
    private const string DefaultAudioExtension = ".ogg";

    /// <summary>
    /// Downloads audio from a URL and transcribes it using local Whisper.
    ///
    /// Flow:
    /// 1. Check if feature is enabled
    /// 2. Download audio from URL to a temp file
    /// 3. Run whisper CLI on the temp file with --output_format txt
    /// 4. Read the output text file and return the transcription
    /// 5. Clean up temp files
    ///
    /// Returns null when the feature is disabled or when any step after the settings
    /// lookup fails (including cancellation). Those errors are logged via NLog.
    /// </summary>
    /// <param name="url">Absolute http(s) URL of the audio attachment</param>
    /// <param name="fileName">Original attachment filename; only its extension is used</param>
    /// <param name="ct">Cancels the download and the Whisper run; the Whisper process is killed on cancellation</param>
    /// <returns>The trimmed transcription text, or null if it could not be produced</returns>
    public static async Task<string?> TranscribeFromUrlAsync(string url, string fileName, CancellationToken ct = default)
    {
        var settings = await SettingsProvider.GetMultipleValuesAsync([
            BuiltIn.Keys.WhisperEnabled,
            BuiltIn.Keys.WhisperBinaryPath,
            BuiltIn.Keys.WhisperModel,
            BuiltIn.Keys.Proxy
        ]);

        if (!settings[BuiltIn.Keys.WhisperEnabled].ToBoolean())
        {
            Logger.Debug("Whisper transcription is disabled");
            return null;
        }

        // Reject missing / non-web URLs up front instead of failing deep inside HttpClient
        if (!Uri.TryCreate(url, UriKind.Absolute, out var audioUri) ||
            (audioUri.Scheme != Uri.UriSchemeHttp && audioUri.Scheme != Uri.UriSchemeHttps))
        {
            Logger.Warn($"Cannot transcribe voice message, invalid URL '{url}'");
            return null;
        }

        var whisperBinary = settings[BuiltIn.Keys.WhisperBinaryPath].Value ?? "whisper";
        var model = settings[BuiltIn.Keys.WhisperModel].Value ?? "base";
        var proxy = settings[BuiltIn.Keys.Proxy].Value;

        var tempDir = Path.Combine(Path.GetTempPath(), $"whisper_{Guid.NewGuid()}");
        var tempAudioPath = Path.Combine(tempDir, $"voice{GetSafeExtension(fileName)}");

        try
        {
            // Everything that can fail lives inside the try so the "null on failure" contract holds
            Directory.CreateDirectory(tempDir);
            await DownloadAudioAsync(url, tempAudioPath, proxy, ct);

            if (!await RunWhisperAsync(whisperBinary, model, tempAudioPath, tempDir, ct))
            {
                return null;
            }

            // Whisper outputs a .txt file with the same base name as the input
            var outputPath = Path.Combine(tempDir, "voice.txt");
            if (!File.Exists(outputPath))
            {
                Logger.Error($"Whisper output file not found at {outputPath}");
                return null;
            }

            var transcription = (await File.ReadAllTextAsync(outputPath, ct)).Trim();
            if (string.IsNullOrWhiteSpace(transcription))
            {
                Logger.Warn("Whisper returned empty transcription");
                return null;
            }

            Logger.Info($"Transcription received: {transcription.Length} characters");
            return transcription;
        }
        catch (Exception ex)
        {
            Logger.Error(ex, "Error during Whisper transcription");
            return null;
        }
        finally
        {
            try
            {
                if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
            }
            catch (Exception ex)
            {
                Logger.Warn(ex, "Failed to cleanup Whisper temp files");
            }
        }
    }

    // Returns the filename's extension if it is plain ASCII alphanumerics, otherwise ".ogg".
    // The filename comes from a remote attachment, so it is not trusted to be path/CLI friendly.
    private static string GetSafeExtension(string fileName)
    {
        var ext = Path.GetExtension(fileName);
        if (string.IsNullOrEmpty(ext) || ext.Length < 2)
        {
            return DefaultAudioExtension;
        }

        foreach (var c in ext.AsSpan(1))
        {
            if (!char.IsAsciiLetterOrDigit(c))
            {
                return DefaultAudioExtension;
            }
        }

        return ext;
    }

    // Downloads the audio at url to destinationPath, going through the proxy when one is configured.
    private static async Task DownloadAudioAsync(string url, string destinationPath, string? proxy, CancellationToken ct)
    {
        // Handler is disposed explicitly so it is released even if proxy setup throws
        using var handler = new HttpClientHandler { AutomaticDecompression = DecompressionMethods.All };
        if (proxy != null)
        {
            handler.UseProxy = true;
            handler.Proxy = new WebProxy(proxy);
        }

        Logger.Info($"Downloading voice message from {url}");
        using var client = new HttpClient(handler, disposeHandler: false);
        var audioBytes = await client.GetByteArrayAsync(url, ct);
        await File.WriteAllBytesAsync(destinationPath, audioBytes, ct);
        Logger.Info($"Downloaded {audioBytes.Length} bytes to {destinationPath}");
    }

    // Runs the whisper CLI on audioPath, writing a .txt into outputDir. Returns true on exit code 0.
    private static async Task<bool> RunWhisperAsync(string whisperBinary, string model, string audioPath,
        string outputDir, CancellationToken ct)
    {
        var processInfo = new ProcessStartInfo
        {
            FileName = whisperBinary,
            // ArgumentList escapes each argument, so odd characters in paths or the model
            // setting cannot split into or inject extra command line arguments
            ArgumentList = { audioPath, "--model", model, "--output_format", "txt", "--output_dir", outputDir },
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };
        Logger.Info($"Running: {whisperBinary} {string.Join(' ', processInfo.ArgumentList)}");

        using var process = Process.Start(processInfo);
        if (process == null)
        {
            Logger.Error("Failed to start Whisper process");
            return false;
        }

        // Kill whisper on cancellation so it doesn't keep running against a deleted temp dir;
        // this also closes the pipes, which unblocks the reads below
        using var killOnCancel = ct.Register(() => KillQuietly(process));

        // Both pipes must be drained while the process runs. If nobody reads them, the child
        // blocks once the OS pipe buffer fills up and WaitForExitAsync never completes.
        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);
        try
        {
            await Task.WhenAll(process.WaitForExitAsync(ct), stdoutTask, stderrTask);
        }
        catch
        {
            KillQuietly(process);
            throw;
        }

        if (process.ExitCode != 0)
        {
            var stderr = await stderrTask;
            Logger.Error($"Whisper exited with code {process.ExitCode}: {stderr}");
            return false;
        }

        return true;
    }

    // Best-effort kill of the whisper process tree; never throws.
    private static void KillQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (Exception ex)
        {
            Logger.Warn(ex, "Failed to kill Whisper process");
        }
    }
}
|
||||
@@ -574,6 +574,12 @@ public static class BuiltIn
|
||||
public static string KasinoKrashEnabled = "Kasino.Krash.Enabled";
|
||||
[BuiltInSetting("Delay in milliseconds before cleaning up krash", SettingValueType.Text, "10000", WholeNumberRegex)]
|
||||
public static string KasinoKrashCleanupDelay = "Kasino.Krash.CleanupDelay";
|
||||
[BuiltInSetting("Whether Whisper transcription is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
|
||||
public static string WhisperEnabled = "Whisper.Enabled";
|
||||
[BuiltInSetting("Path of the Whisper binary", SettingValueType.Text, "whisper")]
|
||||
public static string WhisperBinaryPath = "Whisper.BinaryPath";
|
||||
[BuiltInSetting("Whisper model name (e.g. tiny, base, small, medium, large)", SettingValueType.Text, "base")]
|
||||
public static string WhisperModel = "Whisper.Model";
|
||||
[BuiltInSetting("Whether Winna is enabled", SettingValueType.Boolean, "false", BooleanRegex)]
|
||||
public static string WinnaEnabled = "Winna.Enabled";
|
||||
[BuiltInSetting("BossmanJack's Winna username", SettingValueType.Text, "ImBossmanJack")]
|
||||
|
||||
Reference in New Issue
Block a user