Add Whisper transcription for BossmanJack Discord voice messages (#107)

* Add Whisper transcription for BossmanJack Discord voice messages

Detect Discord voice message attachments (audio with the IS_VOICE_MESSAGE flag)
from the monitored user and transcribe them via the OpenAI Whisper API before
relaying to chat. Reuses the existing OpenAi.ApiKey setting. The feature is
disabled by default via the Whisper.Enabled setting.

* Use separate API key setting for Whisper transcription

* Switch to local Whisper and post-then-edit transcription flow

Voice messages are now relayed immediately with a "transcribing..." placeholder,
then transcribed locally via the whisper CLI, and the sent message is edited to
replace the placeholder with the result. This removes the OpenAI API dependency
in favor of a local whisper binary.

Settings: Whisper.BinaryPath, Whisper.Model, Whisper.Enabled

---------

Co-authored-by: DFE <dfe@dfe.com>
Co-authored-by: barelyprofessional <150058423+barelyprofessional@users.noreply.github.com>
This commit is contained in:
IfYouComplainImDFEingAgain
2026-04-16 22:14:58 -04:00
committed by GitHub
parent 1a49fe1976
commit 3d269716e8
3 changed files with 212 additions and 3 deletions

View File

@@ -944,13 +944,72 @@ public class BotServices
}
var result = $"[img]{settings[BuiltIn.Keys.DiscordIcon].Value}[/img] {message.Author.GlobalName ?? message.Author.Username}: {message.Content?.Replace("", ":feels:")}";
var voiceMessages = new List<(string Url, string Filename)>();
foreach (var attachment in message.Attachments ?? [])
{
result += $"[br]Attachment: {attachment.GetProperty("filename").GetString()} {attachment.GetProperty("url").GetString()}";
var filename = attachment.GetProperty("filename").GetString() ?? "unknown";
var url = attachment.GetProperty("url").GetString() ?? "";
// Discord voice messages have content_type audio/ogg and the IS_VOICE_MESSAGE flag (1 << 13)
if (attachment.TryGetProperty("content_type", out var contentTypeProp) &&
contentTypeProp.GetString()?.StartsWith("audio/") == true &&
attachment.TryGetProperty("flags", out var flagsProp) &&
flagsProp.TryGetInt32(out var flags) &&
(flags & (1 << 13)) != 0)
{
result += "[br]🎤 Voice message (transcribing...)";
voiceMessages.Add((url, filename));
}
else
{
result += $"[br]Attachment: {filename} {url}";
}
}
_chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
var sentMsg = _chatBot.SendChatMessage(result, TemporarilyBypassGambaSeshForDiscord);
UpdateBossmanLastSighting("talking in Discord").Wait(_cancellationToken);
// Transcribe voice messages in the background, then edit the sent message
if (voiceMessages.Count > 0)
{
_ = Task.Run(async () =>
{
try
{
// Wait for the message to be echoed so we have a UUID to edit
if (!await _chatBot.WaitForChatMessageAsync(sentMsg, TimeSpan.FromSeconds(10), _cancellationToken))
{
_logger.Warn("Voice message never got echoed, can't edit with transcription");
return;
}
var edited = result;
foreach (var (url, filename) in voiceMessages)
{
var transcription = await WhisperTranscription.TranscribeFromUrlAsync(url, filename, _cancellationToken);
if (transcription != null)
{
edited = edited.Replace("🎤 Voice message (transcribing...)",
$"🎤 Voice message: [i]{transcription}[/i]");
}
else
{
edited = edited.Replace("🎤 Voice message (transcribing...)",
"🎤 Voice message (transcription unavailable)");
}
}
if (sentMsg.ChatMessageUuid != null)
{
await _chatBot.KfClient.EditMessageAsync(sentMsg.ChatMessageUuid, edited);
}
}
catch (Exception ex)
{
_logger.Error(ex, "Failed to transcribe Discord voice message");
}
}, _cancellationToken);
}
}
private async Task DiscordFlashText(SentMessageTrackerModel msg)