Revert "Merge branch 'pr/268' into RuntimeDetection"

This reverts commit 091b8d58b3, reversing
changes made to 9b2ca9cf8e.
SignalRT 2023-11-09 22:13:18 +01:00
parent 200011e186
commit 5fe721bdbe
18 changed files with 224 additions and 216 deletions

View File

@@ -30,7 +30,6 @@
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="7.0.0" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta4" />
<PackageReference Include="Spectre.Console" Version="0.47.0" />
</ItemGroup>
<ItemGroup>

View File

@@ -4,7 +4,7 @@ namespace LLama.Examples.NewVersion
{
public class GetEmbeddings
{
public static Task Run()
public static void Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();
@@ -23,7 +23,6 @@ namespace LLama.Examples.NewVersion
Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text)));
Console.WriteLine();
}
return Task.CompletedTask;
}
}
}

View File

@@ -2,7 +2,7 @@
{
public class QuantizeModel
{
public static Task Run()
public static void Run()
{
Console.Write("Please input your original model path: ");
var inputPath = Console.ReadLine();
@@ -21,8 +21,6 @@
{
Console.WriteLine("Quantization failed!");
}
return Task.CompletedTask;
}
}
}

View File

@@ -1,54 +1,109 @@
using System.Linq.Expressions;
using Spectre.Console;
namespace LLama.Examples.NewVersion
namespace LLama.Examples.NewVersion
{
public class NewVersionTestRunner
{
static Dictionary<string, Func<Task>> Examples = new Dictionary<string, Func<Task>>
{
{"Run a chat session without stripping the role names.", () => ChatSessionWithRoleName.Run()},
{"Run a chat session with the role names stripped.",()=> ChatSessionStripRoleName.Run()},
{"Interactive mode chat by using executor.",()=> InteractiveModeExecute.Run()},
{"Instruct mode chat by using executor.",()=> InstructModeExecute.Run()},
{"Stateless mode chat by using executor.",()=> StatelessModeExecute.Run()},
{"Load and save chat session.",()=> SaveAndLoadSession.Run()},
{"Load and save state of model and executor.",()=> LoadAndSaveState.Run()},
{"Get embeddings from LLama model.",()=> GetEmbeddings.Run()},
{"Quantize the model.",()=> QuantizeModel.Run()},
{"Automatic conversation.",()=> TalkToYourself.Run()},
{"Constrain response to json format using grammar.",()=> GrammarJsonResponse.Run()},
{"Semantic Kernel Prompt.",()=> SemanticKernelPrompt.Run()},
{"Semantic Kernel Chat.",()=> SemanticKernelChat.Run()},
{"Semantic Kernel Memory.",()=> SemanticKernelMemory.Run()},
{"Coding Assistant.",()=> CodingAssistant.Run()},
{"Batch Decoding.",()=> BatchedDecoding.Run()},
{"SK Kernel Memory.",()=> KernelMemory.Run()},
{"Exit", ()=> Task.CompletedTask}
};
public static async Task Run()
{
AnsiConsole.Write(new Rule("LLamaSharp Examples"));
Console.WriteLine("================LLamaSharp Examples (New Version)==================\n");
Console.WriteLine("Please input a number to choose an example to run:");
Console.WriteLine("0: Run a chat session without stripping the role names.");
Console.WriteLine("1: Run a chat session with the role names stripped.");
Console.WriteLine("2: Interactive mode chat by using executor.");
Console.WriteLine("3: Instruct mode chat by using executor.");
Console.WriteLine("4: Stateless mode chat by using executor.");
Console.WriteLine("5: Load and save chat session.");
Console.WriteLine("6: Load and save state of model and executor.");
Console.WriteLine("7: Get embeddings from LLama model.");
Console.WriteLine("8: Quantize the model.");
Console.WriteLine("9: Automatic conversation.");
Console.WriteLine("10: Constrain response to json format using grammar.");
Console.WriteLine("11: Semantic Kernel Prompt.");
Console.WriteLine("12: Semantic Kernel Chat.");
Console.WriteLine("13: Semantic Kernel Memory.");
Console.WriteLine("14: Coding Assistant.");
Console.WriteLine("15: Batch Decoding.");
Console.WriteLine("16: SK Kernel Memory.");
while (true)
{
var choice = AnsiConsole.Prompt(
new SelectionPrompt<string>()
.Title("Please choose[green] an example[/] to run: ")
.AddChoices(Examples.Keys));
Console.Write("\nYour choice: ");
int choice = int.Parse(Console.ReadLine());
if (Examples.TryGetValue(choice, out var example))
if (choice == 0)
{
if (choice == "Exit")
{
break;
}
AnsiConsole.Write(new Rule(choice));
await example();
await ChatSessionWithRoleName.Run();
}
AnsiConsole.Clear();
else if (choice == 1)
{
await ChatSessionStripRoleName.Run();
}
else if (choice == 2)
{
await InteractiveModeExecute.Run();
}
else if (choice == 3)
{
await InstructModeExecute.Run();
}
else if (choice == 4)
{
await StatelessModeExecute.Run();
}
else if (choice == 5)
{
await SaveAndLoadSession.Run();
}
else if (choice == 6)
{
await LoadAndSaveState.Run();
}
else if (choice == 7)
{
GetEmbeddings.Run();
}
else if (choice == 8)
{
QuantizeModel.Run();
}
else if (choice == 9)
{
await TalkToYourself.Run();
}
else if (choice == 10)
{
await GrammarJsonResponse.Run();
}
else if (choice == 11)
{
await SemanticKernelPrompt.Run();
}
else if (choice == 12)
{
await SemanticKernelChat.Run();
}
else if (choice == 13)
{
await SemanticKernelMemory.Run();
}
else if (choice == 14)
{
await CodingAssistant.Run();
}
else if (choice == 15)
{
await BatchedDecoding.Run();
}
else if (choice == 16)
{
await KernelMemory.Run();
}
else
{
Console.WriteLine("Cannot parse your choice. Please select again.");
continue;
}
break;
}
}
}

View File

@@ -17,9 +17,9 @@ namespace LLama.Web.Common
public int MaxInstances { get; set; }
/// <summary>
/// Model context size (n_ctx). Null to use value from model.
/// Model context size (n_ctx)
/// </summary>
public uint? ContextSize { get; set; }
public uint ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors

View File

@@ -8,9 +8,9 @@ namespace LLama.Abstractions;
public interface IContextParams
{
/// <summary>
/// Model context size (n_ctx). Null to use value from model file.
/// Model context size (n_ctx)
/// </summary>
uint? ContextSize { get; set; }
uint ContextSize { get; set; }
/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)

View File

@@ -43,7 +43,7 @@ namespace LLama.Common
/// <param name="data"></param>
public FixedSizeQueue(int size, IEnumerable<T> data)
{
#if NET6_0_OR_GREATER
#if !NETSTANDARD2_0
// Try to check the size without enumerating the entire IEnumerable. This may not be able to get the count,
// in which case we'll have to check later
if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size)
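For context on the hunk above: TryGetNonEnumeratedCount (added in .NET 6) only succeeds when the count is already known without iterating, e.g. for arrays and ICollection<T>; lazy sequences return false, which is why a constructor like this still has to verify the size after copying. A minimal self-contained sketch of the same pattern, with a hypothetical class name and exception message rather than the repository's code:

using System;
using System.Collections.Generic;
using System.Linq;

internal sealed class BoundedQueue<T>
{
    private readonly Queue<T> _items;

    public BoundedQueue(int size, IEnumerable<T> data)
    {
        // Cheap path (.NET 6+): the count is already known (arrays, ICollection<T>, ...),
        // so oversized input can be rejected without enumerating it.
        if (data.TryGetNonEnumeratedCount(out var count) && count > size)
            throw new ArgumentException("data is larger than size", nameof(data));

        // Fallback: enumerate once into the queue and check the size afterwards.
        _items = new Queue<T>(data);
        if (_items.Count > size)
            throw new ArgumentException("data is larger than size", nameof(data));
    }
}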

View File

@@ -12,68 +12,105 @@ namespace LLama.Common
public record ModelParams
: ILLamaParams
{
/// <inheritdoc />
public uint? ContextSize { get; set; }
/// <inheritdoc />
/// <summary>
/// Model context size (n_ctx)
/// </summary>
public uint ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
public int MainGpu { get; set; } = 0;
/// <inheritdoc />
/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
public int GpuLayerCount { get; set; } = 20;
/// <inheritdoc />
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
public uint Seed { get; set; } = 0xFFFFFFFF;
/// <inheritdoc />
/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
public bool UseFp16Memory { get; set; } = true;
/// <inheritdoc />
/// <summary>
/// Use mmap for faster loads (use_mmap)
/// </summary>
public bool UseMemorymap { get; set; } = true;
/// <inheritdoc />
/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
public bool UseMemoryLock { get; set; }
/// <inheritdoc />
/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>
public bool Perplexity { get; set; }
/// <inheritdoc />
/// <summary>
/// Model path (model)
/// </summary>
public string ModelPath { get; set; }
/// <inheritdoc />
/// <summary>
/// List of LoRAs to apply
/// </summary>
public AdapterCollection LoraAdapters { get; set; } = new();
/// <inheritdoc />
/// <summary>
/// base model path for the lora adapter (lora_base)
/// </summary>
public string LoraBase { get; set; } = string.Empty;
/// <inheritdoc />
/// <summary>
/// Number of threads (null = autodetect) (n_threads)
/// </summary>
public uint? Threads { get; set; }
/// <inheritdoc />
/// <summary>
/// Number of threads to use for batch processing (null = autodetect) (n_threads)
/// </summary>
public uint? BatchThreads { get; set; }
/// <inheritdoc />
/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
public uint BatchSize { get; set; } = 512;
/// <inheritdoc />
/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
public bool EmbeddingMode { get; set; }
/// <inheritdoc />
/// <summary>
/// how split tensors should be distributed across GPUs.
/// </summary>
/// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
[JsonConverter(typeof(TensorSplitsCollectionConverter))]
public TensorSplitsCollection TensorSplits { get; set; } = new();
/// <inheritdoc />
public float? RopeFrequencyBase { get; set; }
/// <summary>
/// RoPE base frequency
/// </summary>
public float? RopeFrequencyBase { get; set; }
/// <inheritdoc />
public float? RopeFrequencyScale { get; set; }
/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float? RopeFrequencyScale { get; set; }
/// <inheritdoc />
public bool MulMatQ { get; set; }
/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
public bool MulMatQ { get; set; }
/// <inheritdoc />
/// <summary>
/// Load vocab only (no weights)
/// </summary>
public bool VocabOnly { get; set; }
/// <inheritdoc />
/// <summary>
/// The encoding to use to convert text for the model
/// </summary>
[JsonConverter(typeof(EncodingConverter))]
public Encoding Encoding { get; set; } = Encoding.UTF8;

View File

@@ -9,8 +9,6 @@ namespace LLama.Extensions
{
return GetValueOrDefaultImpl(dictionary, key, defaultValue);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif
internal static TValue GetValueOrDefaultImpl<TKey, TValue>(IReadOnlyDictionary<TKey, TValue> dictionary, TKey key, TValue defaultValue)

View File

@@ -15,8 +15,6 @@ internal static class EncodingExtensions
{
return GetCharCountImpl(encoding, bytes);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif
internal static int GetCharsImpl(Encoding encoding, ReadOnlySpan<byte> bytes, Span<char> output)

View File

@@ -21,7 +21,7 @@ namespace LLama.Extensions
public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
{
result = NativeApi.llama_context_default_params();
result.n_ctx = @params.ContextSize ?? 0;
result.n_ctx = @params.ContextSize;
result.n_batch = @params.BatchSize;
result.seed = @params.Seed;
result.f16_kv = @params.UseFp16Memory;
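
Note on the line being removed here: llama.cpp treats n_ctx == 0 as "use the context size stored in the model file" (the llama_context_params hunk further down documents the same convention), so the nullable property mapped naturally onto the native field. A minimal sketch of that mapping, with a hypothetical helper name:

internal static class ContextSizeMapping
{
    // null on the managed side becomes 0 on the native side, which llama.cpp
    // interprets as "take the context size from the model file".
    public static uint ToNative(uint? contextSize) => contextSize ?? 0;
}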

View File

@@ -10,8 +10,6 @@ namespace LLama.Extensions
{
return TakeLastImpl(source, count);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif
internal static IEnumerable<T> TakeLastImpl<T>(IEnumerable<T> source, int count)

View File

@@ -19,7 +19,5 @@ internal static class KeyValuePairExtensions
first = pair.Key;
second = pair.Value;
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif
}

View File

@@ -5,7 +5,7 @@ namespace LLama.Extensions
{
internal static class ListExtensions
{
#if !NET6_0_OR_GREATER
#if NETSTANDARD2_0
public static void EnsureCapacity<T>(this List<T> list, int capacity)
{
if (list.Capacity < capacity)
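
The hunk above is truncated mid-method; a polyfill of this shape typically just grows List<T>.Capacity, since EnsureCapacity only exists on newer target frameworks. A minimal sketch with a hypothetical class name (the method body below is assumed rather than taken from the repository, and grows exactly to the requested capacity instead of geometrically like the BCL):

using System.Collections.Generic;

internal static class ListPolyfills
{
    // On netstandard2.0 List<T>.EnsureCapacity does not exist, so grow the backing array by hand.
    public static void EnsureCapacity<T>(this List<T> list, int capacity)
    {
        if (list.Capacity < capacity)
            list.Capacity = capacity;
    }
}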

View File

@@ -47,7 +47,6 @@
</ItemGroup>
<ItemGroup>
<PackageReference Include="ManagedCuda" Version="10.0.0" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.1" />
</ItemGroup>

View File

@@ -22,7 +22,7 @@ namespace LLama.Native
public uint seed;
/// <summary>
/// text context, 0 = from model
/// text context
/// </summary>
public uint n_ctx;

View File

@@ -1,13 +1,8 @@
using System;
using System.Buffers;
using System.Reflection;
using System.Runtime.InteropServices;
using System.Text;
using LLama.Exceptions;
using ManagedCuda;
#if NET6_0_OR_GREATER
using System.Runtime.Intrinsics.X86;
#endif
#pragma warning disable IDE1006 // Naming Styles
@@ -29,9 +24,8 @@ namespace LLama.Native
{
static NativeApi()
{
#if NET6_0_OR_GREATER
NativeLibrary.SetDllImportResolver(typeof(NativeApi).Assembly, LLamaImportResolver);
#endif
// Try to load a preferred library, based on CPU feature detection
TryLoadLibrary();
try
{
@@ -50,120 +44,63 @@ namespace LLama.Native
}
/// <summary>
/// Get the cuda version if possible.
/// Try to load libllama, using CPU feature detection to try and load a more specialised DLL if possible
/// </summary>
/// <returns> -1 for no cuda</returns>
private static int GetCudaVersion()
/// <returns>The library handle to unload later, or IntPtr.Zero if no library was loaded</returns>
private static IntPtr TryLoadLibrary()
{
int deviceCount = CudaContext.GetDeviceCount();
for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++)
{
using (CudaContext ctx = new CudaContext(deviceIndex))
{
var version = ctx.GetAPIVersionOfCurrentContext();
return version.Major;
}
}
return -1;
}
/// <summary>
/// Get the xla flag for native library name.
/// </summary>
/// <returns></returns>
private static string GetAvxFlag()
{
AvxLevel level = AvxLevel.None;
#if NET6_0_OR_GREATER
if (Avx.IsSupported) level = AvxLevel.Avx;
if (Avx2.IsSupported) level = AvxLevel.Avx2;
#if NET8_0_OR_GREATER
if(Avx512F.IsSupported) level = AvxLevel.Avx512;
#endif
return level switch
{
AvxLevel.None => "",
AvxLevel.Avx => "-avx",
AvxLevel.Avx2 => "-avx2",
AvxLevel.Avx512 => "-avx512",
};
#else
return string.Empty;
#endif
}
#if NET6_0_OR_GREATER
private static IntPtr LLamaImportResolver(string name, Assembly assembly, DllImportSearchPath? searchPath)
{
IntPtr handle = IntPtr.Zero;
if(!name.Equals(libraryName))
{
return NativeLibrary.Load(name, assembly, searchPath);
}
string libraryPath = string.Empty;
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
var avxFlag = GetAvxFlag();
// check cuda
var cudaVersion = GetCudaVersion();
if(cudaVersion == 11)
{
libraryPath = $"runtimes/win-x64/native/libllama-cuda11{avxFlag}.dll";
}
else if (cudaVersion == 12)
{
libraryPath = $"runtimes/win-x64/native/libllama-cuda12{avxFlag}.dll";
}
else if(cudaVersion == -1) // cpu version
{
libraryPath = $"runtimes/win-x64/native/libllama{avxFlag}.dll";
}
else
{
throw new NotImplementedException($"Cuda version {cudaVersion} has not been supported, please compile dll yourself or open an issue in LLamaSharp.");
}
}
else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
var avxFlag = GetAvxFlag();
// check cuda
var cudaVersion = GetCudaVersion();
if (cudaVersion == 11)
{
libraryPath = $"runtimes/linux-x64/native/libllama-cuda11{avxFlag}.so";
}
else if (cudaVersion == 12)
{
libraryPath = $"runtimes/linux-x64/native/libllama-cuda12{avxFlag}.so";
}
else if (cudaVersion == -1) // cpu version
{
libraryPath = $"runtimes/linux-x64/native/libllama{avxFlag}.so";
}
else
{
throw new NotImplementedException($"Cuda version {cudaVersion} has not been supported, please compile dll yourself or open an issue in LLamaSharp.");
}
}
else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
{
if (System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
{
libraryPath = $"runtimes/osx-arm64/native/libllama.dylib";
}
else
{
libraryPath = $"runtimes/osx-x64/native/libllama.dylib";
}
// All of the Windows libraries, in order of preference
return TryLoad("cu12.1.0/libllama.dll")
?? TryLoad("cu11.7.1/libllama.dll")
#if NET8_0_OR_GREATER
?? TryLoad("avx512/libllama.dll", System.Runtime.Intrinsics.X86.Avx512.IsSupported)
#endif
?? TryLoad("avx2/libllama.dll", System.Runtime.Intrinsics.X86.Avx2.IsSupported)
?? TryLoad("avx/libllama.dll", System.Runtime.Intrinsics.X86.Avx.IsSupported)
?? IntPtr.Zero;
}
NativeLibrary.TryLoad(libraryPath, assembly, searchPath, out handle);
return handle;
}
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
// All of the Linux libraries, in order of preference
return TryLoad("cu12.1.0/libllama.so")
?? TryLoad("cu11.7.1/libllama.so")
#if NET8_0_OR_GREATER
?? TryLoad("avx512/libllama.so", System.Runtime.Intrinsics.X86.Avx512.IsSupported)
#endif
?? TryLoad("avx2/libllama.so", System.Runtime.Intrinsics.X86.Avx2.IsSupported)
?? TryLoad("avx/libllama.so", System.Runtime.Intrinsics.X86.Avx.IsSupported)
?? IntPtr.Zero;
}
if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
{
return TryLoad("runtimes/macos-arm64/libllama.dylib", System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
?? TryLoad("runtimes/macos-x86_64/libllama.dylib")
?? IntPtr.Zero;
}
#endif
return IntPtr.Zero;
#if NET6_0_OR_GREATER
// Try to load a DLL from the path if supported. Returns null if nothing is loaded.
static IntPtr? TryLoad(string path, bool supported = true)
{
if (!supported)
return null;
if (NativeLibrary.TryLoad(path, out var handle))
return handle;
return null;
}
#endif
}
private const string libraryName = "libllama";
@@ -637,13 +574,5 @@ namespace LLama.Native
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern int llama_set_n_threads(SafeLLamaContextHandle ctx, uint n_threads, uint n_threads_batch);
private enum AvxLevel
{
None = 0,
Avx = 1,
Avx2 = 2,
Avx512 = 3
}
}
}
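
For context on the two approaches shown in this file: the removed code hooks NativeLibrary.SetDllImportResolver so every [DllImport] of "libllama" in the assembly is redirected to a binary chosen at runtime (CUDA version plus AVX level), while the restored TryLoadLibrary simply pre-loads a preferred binary in order of preference and lets the default loader take over afterwards. A minimal sketch of the resolver pattern, with a hypothetical class name and a placeholder path rather than the repository's layout:

#if NET6_0_OR_GREATER
using System;
using System.Reflection;
using System.Runtime.InteropServices;

internal static class ResolverSketch
{
    public static void Install()
    {
        // Redirect every DllImport("libllama") in this assembly to a file picked at runtime.
        NativeLibrary.SetDllImportResolver(typeof(ResolverSketch).Assembly, Resolve);
    }

    private static IntPtr Resolve(string name, Assembly assembly, DllImportSearchPath? searchPath)
    {
        // Returning IntPtr.Zero hands resolution back to the default behaviour.
        if (name != "libllama")
            return IntPtr.Zero;

        // Placeholder path: a real resolver would select a CUDA/AVX-specific build here.
        NativeLibrary.TryLoad("runtimes/placeholder/libllama", assembly, searchPath, out var handle);
        return handle;
    }
}
#endif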

View File

@@ -11,7 +11,7 @@
**The C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides higher-level APIs to inference the LLaMA Models and deploy it on local device with C#/.NET. It works on
both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enough GPU memory, you can still apply LLaMA models well with this repo. 🤗**
both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enought GPU memory, you can still apply LLaMA models well with this repo. 🤗**
**Furthermore, it provides integrations with other projects such as [semantic-kernel](https://github.com/microsoft/semantic-kernel), [kernel-memory](https://github.com/microsoft/kernel-memory) and [BotSharp](https://github.com/SciSharp/BotSharp) to provide higher-level applications.**