Added Whisper.net x LLamaSharp examples for Speech Detection and Speech Chat

This commit is contained in:
Lyrcaxis 2024-04-07 18:50:22 +03:00
parent be980edcb0
commit 9e513204db
4 changed files with 291 additions and 0 deletions

LLama.Examples/ExampleRunner.cs
@@ -29,6 +29,8 @@ public class ExampleRunner
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
{ "Batched Executor: Guidance", BatchedExecutorGuidance.Run },
{ "Whisper.net: Speech Recognition", SpeechTranscription.Run },
{ "Whisper.net: Speech Chat", SpeechChat.Run },
{ "Exit", () => { Environment.Exit(0); return Task.CompletedTask; } }
};

LLama.Examples/Examples/SpeechChat.cs
@@ -0,0 +1,87 @@
using LLama.Common;
using static LLama.Examples.Examples.SpeechTranscription;
namespace LLama.Examples.Examples
{
public class SpeechChat
{
public static async Task Run()
{
if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; }
bool loadFinished = false;
var loading = ConsoleStyleHelpers.LoadPrint("Loading transcription model...", () => loadFinished);
using var audioServer = new AudioServer(model);
loadFinished = true; await loading;
Console.WriteLine("Audio model loaded. Insert path for language model.");
using var llamaServer = new LlamaServer(audioServer);
await ConsoleStyleHelpers.WaitUntilExit();
}
class LlamaServer : IAudioServiceUser, IDisposable
{
bool isModelResponding;
AudioServer audioServer;
LLamaWeights model;
LLamaContext context;
InteractiveExecutor executor;
string fullPrompt = "";
bool canceled;
public LlamaServer(AudioServer server)
{
var parameters = new ModelParams(UserSettings.GetModelPath()) { ContextSize = 1024, Seed = 1337, GpuLayerCount = 99 };
model = LLamaWeights.LoadFromFile(parameters);
context = model.CreateContext(parameters);
executor = new InteractiveExecutor(context);
(audioServer = server).ServiceUsers.Add(this);
}
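// While idle, every transcription is of interest; while responding, only 'stop' commands are.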
bool IAudioServiceUser.IsOfInterest(string AudioTranscription) => !isModelResponding || AudioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase);
void IAudioServiceUser.ProcessText(string AudioTranscription)
{
if (isModelResponding && AudioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) { canceled = true; }
else if (!isModelResponding) { _ = SendMessage(AudioTranscription); }
}
async Task SendMessage(string newMessage)
{
// While a response is being generated we only want to detect short phrases/commands like 'stop',
audioServer.detectionSettings = (1, 1); // ..so we lower the min Speech Detection time.
isModelResponding = true;
AddToPrompt($"\n{newMessage}\n", ConsoleColor.Blue);
await foreach (var token in executor.InferAsync(newMessage)) // The executor is stateful, so feed only the new message rather than the accumulated prompt.
{
AddToPrompt(token, ConsoleColor.Yellow);
if (canceled) { AddToPrompt("[...stopped]", ConsoleColor.Red); break; }
}
audioServer.detectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives.
(isModelResponding, canceled) = (false, false); // Reset the state variables to their default.
}
void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow)
{
fullPrompt += msg;
Console.ForegroundColor = color;
Console.Write(msg);
Console.ForegroundColor = ConsoleColor.White;
}
void IDisposable.Dispose()
{
audioServer.ServiceUsers.Remove(this); // Detach from the audio server before tearing down.
context.Dispose(); // Dispose the context before the weights it was created from.
model.Dispose();
}
}
}
}

LLama.Examples/Examples/SpeechTranscription.cs
@@ -0,0 +1,196 @@
using NAudio.Wave;
using Whisper.net;
namespace LLama.Examples.Examples
{
public class SpeechTranscription
{
public static async Task Run()
{
if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; }
bool loadFinished = false;
var loading = ConsoleStyleHelpers.LoadPrint("Loading model...", () => loadFinished);
using var audioServer = new AudioServer(model);
audioServer.ServiceUsers.Add(new AudioEchoer());
loadFinished = true; await loading;
await ConsoleStyleHelpers.WaitUntilExit();
}
class AudioEchoer : IAudioServiceUser
{
bool IAudioServiceUser.IsOfInterest(string AudioTranscription)
{
if (AudioTranscription.Contains("Artificial Intelligence", StringComparison.CurrentCultureIgnoreCase)) {
Console.ForegroundColor = ConsoleColor.DarkRed;
Console.WriteLine($"Skipped text because it's not of interest {AudioTranscription}");
Console.ForegroundColor = ConsoleColor.White;
return false;
}
else { return true; }
}
void IAudioServiceUser.ProcessText(string AudioTranscription)
{
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine(AudioTranscription);
Console.ForegroundColor = ConsoleColor.White;
}
}
public interface IAudioServiceUser
{
bool IsOfInterest(string AudioTranscription);
void ProcessText(string AudioTranscription);
}
public class AudioServer : IDisposable
{
const int clipLength = 250; // ms
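// Peak amplitude (on a 0..1 scale) above which a clip is considered to contain speech.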
const float voiceDetectionThreshold = 0.02f;
readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"];
WaveInEvent waveIn;
WaveFormat waveFormat = new(16000, 16, 1); // 16 kHz, 16-bit, mono
List<byte> recordedBytes = [];
WhisperFactory? whisperFactory;
WhisperProcessor? processor;
string whisperPrompt =
"""
The short audio comes from a non-native English speaker/user talking to an LLM in real time.
Transcribe knowing this as a fact, and that multiple phrases or questions might appear together.
If there are pauses, form paragraphs that keep related parts together and split the rest onto new lines.
""";
// Tracked stats for Speech Recognition, Parsing, and Serving.
int currentBlankClips; // Ideally these would be tracked in milliseconds,
int totalNonBlankClips; // ..but for the example's sake they count whole
int nonIdleTime; // ..clips as the unit of time (1 = clipLength ms).
// Default detection settings: speech longer than 750ms (more than 3x250ms non-blank clips), followed by a ~500ms pause (2x250ms blank clips).
public (int minBlanksPerSeparation, int minNonBlanksForValidMessages) detectionSettings = (2, 3);
public HashSet<IAudioServiceUser> ServiceUsers = [];
public AudioServer(string modelPath)
{
whisperFactory = WhisperFactory.FromPath(modelPath);
var builder = whisperFactory.CreateBuilder().WithThreads(16).WithPrompt(whisperPrompt).WithSingleSegment().WithLanguage("en");
(builder.WithBeamSearchSamplingStrategy() as BeamSearchSamplingStrategyBuilder)!.WithPatience(0.2f).WithBeamSize(5);
processor = builder.Build();
waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat };
waveIn.DataAvailable += WaveIn_DataAvailable;
waveIn.StartRecording();
}
void WaveIn_DataAvailable(object? sender, WaveInEventArgs e)
{
// Cache the recorded bytes
recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]);
if (recordedBytes.Count > 110_000_000) { recordedBytes.RemoveRange(0, 50_000_000); } // Cap the cache at ~110MB by dropping the oldest ~50MB.
// Get the max volume contained inside the clip
var maxVolume = 0f; // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values
for (int i = 0; i < e.BytesRecorded; i += 2) { maxVolume = Math.Max(maxVolume, Math.Abs((short) ((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); }
// Compare the volume with the threshold and act accordingly.
// Once an interesting and 'full' set of clips pops up, serve it.
if (maxVolume > voiceDetectionThreshold) {
currentBlankClips = 0;
totalNonBlankClips++;
nonIdleTime++;
}
else if (++currentBlankClips < detectionSettings.minBlanksPerSeparation) { nonIdleTime++; }
else {
if (totalNonBlankClips > detectionSettings.minNonBlanksForValidMessages) { SendTranscription(); }
else if (totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything.
(currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0);
}
async void SendTranscription()
{
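// For 16 kHz, 16-bit mono audio one 250ms clip is 16000 samples/s * 2 bytes * 0.25s = 8000 bytes;
// note that BitsPerSample * clipLength * 2 below only coincidentally yields the same 8000 here.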
var bytesPerClip = waveFormat.BitsPerSample * clipLength * 2;
var capturedClipBytes = recordedBytes.TakeLast(bytesPerClip * (nonIdleTime + 2)).ToArray();
var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); // Save to temporary file.
if (knownFalsePositives.Contains(transcribedText)) { return; } // False positive.. yikes!
foreach (var user in ServiceUsers.Where(x => x.IsOfInterest(transcribedText))) { user.ProcessText(transcribedText); }
}
}
/// <summary> Requests a transcription and responds with the text. Whisper.net currently doesn't work well with parallelism. </summary>
async Task<string> ProcessAudio(byte[] bytes, string tempWavFilePath)
{
var wavStream = new MemoryStream();
using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); }
using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); }
wavStream.Seek(0, SeekOrigin.Begin);
return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(result => result.Text).ToListAsync()).Trim();
}
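// A minimal in-memory alternative sketch, assuming NAudio's IgnoreDisposeStream utility to keep
// WaveFileWriter from closing the MemoryStream on dispose -- it would avoid the temp-file roundtrip above:
// async Task<string> ProcessAudioInMemory(byte[] bytes)
// {
// var wavStream = new MemoryStream();
// using (var writer = new WaveFileWriter(new NAudio.Utils.IgnoreDisposeStream(wavStream), waveFormat)) { writer.Write(bytes, 0, bytes.Length); }
// wavStream.Seek(0, SeekOrigin.Begin);
// return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(s => s.Text).ToListAsync()).Trim();
// }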
void IDisposable.Dispose()
{
waveIn.Dispose();
processor?.Dispose();
whisperFactory?.Dispose(); // The factory holds the native model, so it needs disposing too.
}
}
public static class ConsoleStyleHelpers
{
public static string? SelectAudioModel()
{
var models = Directory.GetFiles("Assets", "*bin");
if (models.Length == 1) { return models[0]; }
else if (models.Length != 0) {
for (int i = 0; i < models.Length; i++) {
Console.ForegroundColor = ConsoleColor.Blue;
Console.Write($"{i}: ");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine(models[i]["Assets\\".Length..]);
}
while (true) {
Console.ForegroundColor = ConsoleColor.DarkCyan;
Console.Write($"Please choose a model (1-{models.Length}): ");
if (!int.TryParse(Console.ReadKey().KeyChar.ToString(), out var i) || i < 1 || i > models.Length) { Console.WriteLine(); continue; }
Console.WriteLine();
Console.ForegroundColor = ConsoleColor.White;
return models[i - 1];
}
}
else
{
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine($"Download a non-quantized model and place it in the executing directory:");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine($"\t{Environment.CurrentDirectory}\\Assets");
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ");
Console.ForegroundColor = ConsoleColor.Blue;
Console.WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main");
Console.ForegroundColor = ConsoleColor.White;
return null;
}
}
public static async Task LoadPrint(string startText, Func<bool> hasFinished)
{
var startTime = DateTime.Now;
Console.Write(startText);
while (!hasFinished()) { Console.Write("."); await Task.Delay(100); }
Console.WriteLine($" Completed in {(DateTime.Now - startTime).TotalSeconds:f2}s.");
}
public static async Task WaitUntilExit()
{
Console.ForegroundColor = ConsoleColor.Green;
Console.WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.");
Console.ForegroundColor = ConsoleColor.White;
await Task.Delay(1000);
Console.ReadKey();
}
}
}
}
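
For illustration, any consumer can plug into the transcription loop by implementing the two members of IAudioServiceUser. The sketch below (a hypothetical TranscriptLogger, not part of this commit, assuming a `using static LLama.Examples.Examples.SpeechTranscription;` like SpeechChat.cs uses) appends every accepted transcription to a log file:

class TranscriptLogger : IAudioServiceUser
{
// Accept anything non-empty; AudioServer already filters its known false positives.
bool IAudioServiceUser.IsOfInterest(string AudioTranscription) => !string.IsNullOrWhiteSpace(AudioTranscription);
void IAudioServiceUser.ProcessText(string AudioTranscription)
=> File.AppendAllText("transcript.log", $"{DateTime.Now:HH:mm:ss} {AudioTranscription}{Environment.NewLine}");
}
// Usage: audioServer.ServiceUsers.Add(new TranscriptLogger());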

LLama.Examples/LLama.Examples.csproj
@@ -18,8 +18,14 @@
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.34.240313.1" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.6.2" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="Spectre.Console" Version="0.48.0" />
<PackageReference Include="Spectre.Console.ImageSharp" Version="0.48.0" />
<PackageReference Include="Whisper.net" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Clblast" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.CoreML" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Cublas" Version="1.5.0" />
</ItemGroup>
<ItemGroup>