Added Whisper.net x LLamaSharp examples for Speech Detection and Speech Chat
This commit is contained in:
parent
be980edcb0
commit
9e513204db
|
@ -29,6 +29,8 @@ public class ExampleRunner
|
|||
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
|
||||
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
|
||||
{ "Batched Executor: Guidance", BatchedExecutorGuidance.Run },
|
||||
{ "Whisper.net: Speech Recognition", SpeechTranscription.Run },
|
||||
{ "Whisper.net: Speech Chat", SpeechChat.Run },
|
||||
{ "Exit", () => { Environment.Exit(0); return Task.CompletedTask; } }
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
using LLama.Common;
|
||||
|
||||
using static LLama.Examples.Examples.SpeechTranscription;
|
||||
|
||||
namespace LLama.Examples.Examples
|
||||
{
|
||||
public class SpeechChat
|
||||
{
|
||||
public static async Task Run()
{
    // Let the user pick a whisper model; nothing to do when none is available.
    var modelPath = ConsoleStyleHelpers.SelectAudioModel();
    if (modelPath is null) { return; }

    var modelReady = false;
    var progress = ConsoleStyleHelpers.LoadPrint("Loading transcription model...", () => modelReady);

    using var transcriber = new AudioServer(modelPath);
    modelReady = true;
    progress.Wait();

    Console.WriteLine("Audio model loaded. Insert path for language model.");
    using var chatServer = new LlamaServer(transcriber);

    await ConsoleStyleHelpers.WaitUntilExit();
}
|
||||
|
||||
|
||||
/// <summary>
/// Subscribes to an <see cref="AudioServer"/> and streams LLM responses to the console
/// for each transcription it receives. Saying 'stop' interrupts an in-flight response.
/// </summary>
class LlamaServer : IAudioServiceUser, IDisposable
{
    bool isModelResponding;   // True while tokens are being streamed for the current message.
    AudioServer audioServer;

    LLamaWeights model;
    LLamaContext context;
    InteractiveExecutor executor;

    string fullPrompt = "";   // Accumulated conversation transcript fed to the executor.
    bool canceled;            // Set when the user says 'stop' mid-response.

    public LlamaServer(AudioServer server)
    {
        var parameters = new ModelParams(UserSettings.GetModelPath()) { ContextSize = 1024, Seed = 1337, GpuLayerCount = 99 };
        model = LLamaWeights.LoadFromFile(parameters);
        context = model.CreateContext(parameters);
        executor = new InteractiveExecutor(context);
        (audioServer = server).ServiceUsers.Add(this);
    }

    // While idle any transcription is of interest; while responding, only 'stop' commands are.
    bool IAudioServiceUser.IsOfInterest(string AudioTranscription) => !isModelResponding || AudioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase);

    void IAudioServiceUser.ProcessText(string AudioTranscription)
    {
        if (isModelResponding && AudioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) { canceled = true; }
        else if (!isModelResponding) { _ = SendMessage(AudioTranscription); }
    }

    async Task SendMessage(string newMessage)
    {
        // While a response is queried, we want to detect short phrases/commands like 'stop',
        audioServer.detectionSettings = (1, 1); // ..so we lower the min Speech Detection time.

        isModelResponding = true;
        AddToPrompt($"\n{newMessage}\n", ConsoleColor.Blue);
        await foreach (var token in executor.InferAsync(fullPrompt))
        {
            AddToPrompt(token, ConsoleColor.Yellow);
            if (canceled) { AddToPrompt("[...stopped]", ConsoleColor.Red); break; }
        }
        audioServer.detectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives.
        (isModelResponding, canceled) = (false, false); // Reset the state variables to their default.
    }

    // Appends to the running transcript and echoes the text to the console in the given color.
    void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow)
    {
        fullPrompt += msg;

        Console.ForegroundColor = color;
        Console.Write(msg);
        Console.ForegroundColor = ConsoleColor.White;
    }

    void IDisposable.Dispose()
    {
        // Unsubscribe first, otherwise the audio thread could deliver a transcription
        // into a disposed executor/context.
        audioServer.ServiceUsers.Remove(this);
        // Dispose the context before the weights it was created from.
        context.Dispose();
        model.Dispose();
    }
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,196 @@
|
|||
using NAudio.Wave;
|
||||
|
||||
using Whisper.net;
|
||||
|
||||
namespace LLama.Examples.Examples
|
||||
{
|
||||
public class SpeechTranscription
|
||||
{
|
||||
public static async Task Run()
{
    // Ask the user to pick a whisper model; bail out if none exists.
    var modelPath = ConsoleStyleHelpers.SelectAudioModel();
    if (modelPath is null) { return; }

    var ready = false;
    var spinner = ConsoleStyleHelpers.LoadPrint("Loading model...", () => ready);

    using var server = new AudioServer(modelPath);
    server.ServiceUsers.Add(new AudioEchoer());

    ready = true;
    spinner.Wait();
    await ConsoleStyleHelpers.WaitUntilExit();
}
|
||||
|
||||
/// <summary> Demo consumer that echoes every transcription it accepts to the console. </summary>
class AudioEchoer : IAudioServiceUser
{
    bool IAudioServiceUser.IsOfInterest(string AudioTranscription)
    {
        // Demo of selective handling: transcriptions mentioning this phrase are rejected.
        if (!AudioTranscription.Contains("Artificial Intelligence", StringComparison.CurrentCultureIgnoreCase)) { return true; }

        Console.ForegroundColor = ConsoleColor.DarkRed;
        Console.WriteLine($"Skipped text because it's not of interest {AudioTranscription}");
        Console.ForegroundColor = ConsoleColor.White;
        return false;
    }

    void IAudioServiceUser.ProcessText(string AudioTranscription)
    {
        Console.ForegroundColor = ConsoleColor.Yellow;
        Console.WriteLine(AudioTranscription);
        Console.ForegroundColor = ConsoleColor.White;
    }
}
|
||||
|
||||
/// <summary> Contract for consumers of <see cref="AudioServer"/> transcriptions. </summary>
public interface IAudioServiceUser
{
    /// <summary> Returns whether this user wants to receive the given transcription. </summary>
    bool IsOfInterest(string AudioTranscription);
    /// <summary> Handles a transcription this user accepted via <see cref="IsOfInterest"/>. </summary>
    void ProcessText(string AudioTranscription);
}
|
||||
|
||||
/// <summary>
/// Captures microphone audio with NAudio in fixed-length clips, runs simple volume-based
/// voice-activity detection, transcribes detected speech with Whisper.net, and fans the
/// text out to the registered <see cref="IAudioServiceUser"/>s.
/// </summary>
public class AudioServer : IDisposable
{
    const int clipLength = 250; // ms
    const float voiceDetectionThreshold = 0.02f; // Peak amplitude (0..1) above which a clip counts as speech.
    readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"];

    WaveInEvent waveIn;
    WaveFormat waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel
    List<byte> recordedBytes = [];

    WhisperFactory? whisperFactory;
    WhisperProcessor? processor;
    string whisperPrompt =
        """
        The short audio comes from a non-native-english speaker/user that talks to an LLM in real time.
        Transcribe knowing this as a fact, and that multiple phrases or questions might appear together.
        If there are pauses, form paragraphs that leaves related parts together, and splits the next in new lines.
        """;

    // Tracked stats for Speech Recognition, Parsing, and Serving.
    int currentBlankClips;   // Ideally would work with milliseconds,
    int totalNonBlankClips;  // ..but for example's sake they work on a
    int nonIdleTime;         // ..clip-based quant-length (1 = clipLength).
    // Default detection settings: A speech of 750ms, followed by pause of 500ms. (2x250ms)
    public (int minBlanksPerSeperation, int minNonBlanksForValidMessages) detectionSettings = (2, 3);

    public HashSet<IAudioServiceUser> ServiceUsers = [];

    /// <summary> Loads the whisper model at <paramref name="modelPath"/> and starts recording immediately. </summary>
    public AudioServer(string modelPath)
    {
        whisperFactory = WhisperFactory.FromPath(modelPath);
        var builder = whisperFactory.CreateBuilder().WithThreads(16).WithPrompt(whisperPrompt).WithSingleSegment().WithLanguage("en");
        (builder.WithBeamSearchSamplingStrategy() as BeamSearchSamplingStrategyBuilder)!.WithPatience(0.2f).WithBeamSize(5);
        processor = builder.Build();

        waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat };
        waveIn.DataAvailable += WaveIn_DataAvailable;
        waveIn.StartRecording();
    }

    void WaveIn_DataAvailable(object? sender, WaveInEventArgs e)
    {
        // Cache the recorded bytes
        recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]);
        if (recordedBytes.Count > 110000000) { recordedBytes.RemoveRange(0, 50000000); } // Cap the rolling buffer's memory.

        // Get the max volume contained inside the clip
        var maxVolume = 0f; // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values
        for (int i = 0; i < e.BytesRecorded; i += 2) { maxVolume = Math.Max(maxVolume, Math.Abs((short) ((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); }

        // Compare the volume with the threshold and act accordingly.
        // Once an interesting and 'full' set of clips pops up, serve it.
        if (maxVolume > voiceDetectionThreshold) {
            currentBlankClips = 0;
            totalNonBlankClips++;
            nonIdleTime++;
        }
        else if (++currentBlankClips < detectionSettings.minBlanksPerSeperation) { nonIdleTime++; }
        else {
            if (totalNonBlankClips > detectionSettings.minNonBlanksForValidMessages) { _ = SendTranscription(); }
            else if (totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything.
            (currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0);
        }

        // Fire-and-forget on purpose: transcription must not block the audio callback.
        // (async Task instead of async void, so faults surface on the returned task instead of crashing the process.)
        async Task SendTranscription()
        {
            var bytesPerClip = waveFormat.BitsPerSample * clipLength * 2; // == 8000 bytes per 250ms clip at 16KHz/16-bit mono.
            var capturedClipBytes = recordedBytes.TakeLast(bytesPerClip * (nonIdleTime + 2)).ToArray();
            var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); // Save to temporary file.
            if (knownFalsePositives.Contains(transcribedText)) { return; } // False positive.. yikes!
            foreach (var user in ServiceUsers.Where(x => x.IsOfInterest(transcribedText))) { user.ProcessText(transcribedText); }
        }
    }

    /// <summary> Requests a transcription and responds with the text. Whisper.net currently doesn't work well with parallelism. </summary>
    async Task<string> ProcessAudio(byte[] bytes, string tempWavFilePath)
    {
        using var wavStream = new MemoryStream();
        using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); }
        using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); }
        wavStream.Seek(0, SeekOrigin.Begin);

        return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(result => result.Text).ToListAsync()).Trim();
    }

    void IDisposable.Dispose()
    {
        waveIn.Dispose();
        processor?.Dispose();
        whisperFactory?.Dispose(); // Previously leaked; the factory owns the native model memory.
    }
}
|
||||
|
||||
/// <summary> Console helpers shared by the speech examples: model selection, load spinner, exit prompt. </summary>
public static class ConsoleStyleHelpers
{
    /// <summary>
    /// Lets the user choose a whisper model (*.bin) from the Assets folder.
    /// Returns the chosen path, or null (after printing download instructions) when none exists.
    /// </summary>
    public static string? SelectAudioModel()
    {
        var models = Directory.GetFiles("Assets", "*bin");
        if (models.Length == 1) { return models[0]; }
        else if (models.Length != 0) {
            for (int i = 0; i < models.Length; i++) {
                Console.ForegroundColor = ConsoleColor.Blue;
                Console.Write($"{i + 1}: "); // 1-based labels, matching the '(1-N)' prompt and the index math below.
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine(models[i]["Assets\\".Length..]);
            }
            while (true) {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.Write($"Please choose a model (1-{models.Length}): ");
                // Reject non-digits and out-of-range picks. (Previously '0' slipped through and crashed via models[-1].)
                if (!int.TryParse(Console.ReadKey().KeyChar.ToString(), out var i) || i < 1 || i > models.Length) { Console.WriteLine(); continue; }
                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.White;
                return models[i - 1];
            }
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"Download a non-quantized model and place it in the executing directory:");
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine($"\t{Environment.CurrentDirectory}\\Assets");
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ");
            Console.ForegroundColor = ConsoleColor.Blue;
            Console.WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main");
            Console.ForegroundColor = ConsoleColor.White;
            return null;
        }
    }

    /// <summary> Prints <paramref name="startText"/> then a dot every 100ms until <paramref name="ShouldContinue"/> returns true, then the elapsed time. </summary>
    public static async Task LoadPrint(string startText, Func<bool> ShouldContinue)
    {
        var startTime = DateTime.Now;
        Console.Write(startText);
        while (!ShouldContinue()) { Console.Write("."); await Task.Delay(100); }
        Console.WriteLine($" Completed in {(DateTime.Now - startTime).TotalSeconds:f2}s.");
    }

    /// <summary> Announces that the voice loop is live, then blocks until any key is pressed. </summary>
    public async static Task WaitUntilExit()
    {
        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.");
        Console.ForegroundColor = ConsoleColor.White;
        await Task.Delay(1000);
        Console.ReadKey();
    }
}
|
||||
}
|
||||
}
|
|
@ -18,8 +18,14 @@
|
|||
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.34.240313.1" />
|
||||
<PackageReference Include="Microsoft.SemanticKernel" Version="1.6.2" />
|
||||
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
|
||||
<PackageReference Include="NAudio" Version="2.2.1" />
|
||||
<PackageReference Include="Spectre.Console" Version="0.48.0" />
|
||||
<PackageReference Include="Spectre.Console.ImageSharp" Version="0.48.0" />
|
||||
<PackageReference Include="Whisper.net" Version="1.5.0" />
|
||||
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
|
||||
<PackageReference Include="Whisper.net.Runtime.Clblast" Version="1.5.0" />
|
||||
<PackageReference Include="Whisper.net.Runtime.CoreML" Version="1.5.0" />
|
||||
<PackageReference Include="Whisper.net.Runtime.Cublas" Version="1.5.0" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
|
Loading…
Reference in New Issue