Initial changes required for GGUF support

Martin Evans 2023-08-25 15:47:54 +01:00
parent 964f497c50
commit 2056078aef
19 changed files with 76 additions and 2043 deletions

View File

@@ -52,6 +52,30 @@
 <None Update="Assets\reason-act.txt">
 <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
 </None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
 </ItemGroup>
 </Project>
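
Note that the paths in the <None Update> items added above are absolute paths from the author's machine, so the Never copy rule only takes effect in that one checkout. A portable form of the same exclusions would presumably use paths relative to the project file; a sketch only, not part of the commit (the relative prefix is assumed):

    <ItemGroup>
      <!-- Sketch: relative paths assumed; the commit itself uses absolute paths. -->
      <None Update="..\LLama\runtimes\libllama.so">
        <CopyToOutputDirectory>Never</CopyToOutputDirectory>
      </None>
      <None Update="..\LLama\runtimes\libllama.dylib">
        <CopyToOutputDirectory>Never</CopyToOutputDirectory>
      </None>
    </ItemGroup>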

View File

@@ -37,6 +37,30 @@
 </ItemGroup>
 <ItemGroup>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
+<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+<CopyToOutputDirectory>Never</CopyToOutputDirectory>
+</None>
 <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
 <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
 </None>

View File

@@ -14,7 +14,6 @@ namespace LLama.Unittest
 BatchSize = 17,
 ContextSize = 42,
 LoraAdapter = "adapter",
-GroupedQueryAttention = 7,
 Seed = 42,
 GpuLayerCount = 111
 };
@@ -33,7 +32,6 @@ namespace LLama.Unittest
 BatchSize = 17,
 ContextSize = 42,
 LoraAdapter = "adapter",
-GroupedQueryAttention = 7,
 Seed = 42,
 GpuLayerCount = 111
 };

View File

@@ -88,16 +88,6 @@ namespace LLama.Web.Common
 /// </summary>
 public float[] TensorSplits { get; set; }
-/// <summary>
-/// Grouped-Query Attention
-/// </summary>
-public int GroupedQueryAttention { get; set; } = 1;
-/// <summary>
-/// RMS Norm Epsilon
-/// </summary>
-public float RmsNormEpsilon { get; set; } = 5e-6f;
 /// <summary>
 /// RoPE base frequency
 /// </summary>

View File

@@ -98,16 +98,6 @@ namespace LLama.Abstractions
 /// </summary>
 float[]? TensorSplits { get; set; }
-/// <summary>
-/// Grouped-Query Attention
-/// </summary>
-int GroupedQueryAttention { get; set; }
-/// <summary>
-/// RMS Norm Epsilon
-/// </summary>
-float RmsNormEpsilon { get; set; }
 /// <summary>
 /// RoPE base frequency
 /// </summary>

View File

@@ -89,16 +89,6 @@ namespace LLama.Common
 /// </summary>
 public float[]? TensorSplits { get; set; }
-/// <summary>
-/// Grouped-Query Attention
-/// </summary>
-public int GroupedQueryAttention { get; set; } = 1;
-/// <summary>
-/// RMS Norm Epsilon
-/// </summary>
-public float RmsNormEpsilon { get; set; } = 5e-6f;
 /// <summary>
 /// RoPE base frequency
 /// </summary>
@@ -153,8 +143,6 @@ namespace LLama.Common
 /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
 /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
 /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
-/// <param name="groupedQueryAttention">Grouped-Query Attention</param>
-/// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
 /// <param name="ropeFrequencyBase">RoPE base frequency.</param>
 /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
 /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
@@ -165,7 +153,7 @@ namespace LLama.Common
 bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
 string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
 bool convertEosToNewLine = false, bool embeddingMode = false,
-int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
+float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
 string encoding = "UTF-8")
 {
 ContextSize = contextSize;
@@ -182,8 +170,6 @@ namespace LLama.Common
 BatchSize = batchSize;
 ConvertEosToNewLine = convertEosToNewLine;
 EmbeddingMode = embeddingMode;
-GroupedQueryAttention = groupedQueryAttention;
-RmsNormEpsilon = rmsNormEpsilon;
 RopeFrequencyBase = ropeFrequencyBase;
 RopeFrequencyScale = ropeFrequencyScale;
 MulMatQ = mulMatQ;
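
With groupedQueryAttention and rmsNormEpsilon gone from the constructor (GGUF stores those hyperparameters in the model file itself), call sites simply drop the two arguments. A minimal sketch, assuming the named parameters shown in the hunk above and a hypothetical model path:

    using LLama.Common;

    // Before this commit (no longer compiles):
    // var p = new ModelParams("model.bin", groupedQueryAttention: 7, rmsNormEpsilon: 5e-6f);

    // After: only the surviving knobs are passed explicitly; GQA and the
    // RMS-norm epsilon now come from the GGUF model's own hparams.
    var modelParams = new ModelParams(
        "llama-2-7b-chat.gguf",      // hypothetical model path
        contextSize: 1024,
        batchSize: 512,
        ropeFrequencyBase: 10000.0f,
        ropeFrequencyScale: 1f);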

View File

@@ -39,8 +39,6 @@ namespace LLama.Extensions
 result.logits_all = @params.Perplexity;
 result.embedding = @params.EmbeddingMode;
 result.low_vram = @params.LowVram;
-result.n_gqa = @params.GroupedQueryAttention;
-result.rms_norm_eps = @params.RmsNormEpsilon;
 result.rope_freq_base = @params.RopeFrequencyBase;
 result.rope_freq_scale = @params.RopeFrequencyScale;
 result.mul_mat_q = @params.MulMatQ;
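
For orientation, the surrounding extension method (its name and signature are not visible in this hunk, so both are assumed here) copies the managed parameters onto the native struct roughly like this, with the two removed assignments gone:

    // Sketch under assumptions: the method name and out-parameter shape are
    // guesses; only the field assignments are taken from the diff.
    public static void ToLlamaContextParams(this IModelParams @params, out LLamaContextParams result)
    {
        result = NativeApi.llama_context_default_params();
        result.logits_all = @params.Perplexity;
        result.embedding = @params.EmbeddingMode;
        result.low_vram = @params.LowVram;
        // n_gqa and rms_norm_eps are no longer set here: GGUF models carry
        // these hyperparameters in the file, so the context no longer needs them.
        result.rope_freq_base = @params.RopeFrequencyBase;
        result.rope_freq_scale = @params.RopeFrequencyScale;
        result.mul_mat_q = @params.MulMatQ;
    }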

View File

@@ -31,16 +31,6 @@ namespace LLama.Native
 /// </summary>
 public int n_batch;
-/// <summary>
-/// grouped-query attention (TEMP - will be moved to model hparams)
-/// </summary>
-public int n_gqa;
-/// <summary>
-/// rms norm epsilon (TEMP - will be moved to model hparams)
-/// </summary>
-public float rms_norm_eps;
 /// <summary>
 /// number of layers to store in VRAM
 /// </summary>
@@ -82,8 +72,8 @@ namespace LLama.Native
 /// if true, reduce VRAM usage at the cost of performance
 /// </summary>
 public bool low_vram
-{
-get => Convert.ToBoolean(_low_vram);
+{
+readonly get => Convert.ToBoolean(_low_vram);
 set => _low_vram = Convert.ToSByte(value);
 }
 private sbyte _low_vram;
@@ -92,8 +82,8 @@ namespace LLama.Native
 /// if true, use experimental mul_mat_q kernels
 /// </summary>
 public bool mul_mat_q
-{
-get => Convert.ToBoolean(_mul_mat_q);
+{
+readonly get => Convert.ToBoolean(_mul_mat_q);
 set => _mul_mat_q = Convert.ToSByte(value);
 }
 private sbyte _mul_mat_q;
@@ -102,8 +92,8 @@ namespace LLama.Native
 /// use fp16 for KV cache
 /// </summary>
 public bool f16_kv
-{
-get => Convert.ToBoolean(_f16_kv);
+{
+readonly get => Convert.ToBoolean(_f16_kv);
 set => _f16_kv = Convert.ToSByte(value);
 }
 private sbyte _f16_kv;
@@ -112,8 +102,8 @@ namespace LLama.Native
 /// the llama_eval() call computes all logits, not just the last one
 /// </summary>
 public bool logits_all
-{
-get => Convert.ToBoolean(_logits_all);
+{
+readonly get => Convert.ToBoolean(_logits_all);
 set => _logits_all = Convert.ToSByte(value);
 }
 private sbyte _logits_all;
@@ -122,8 +112,8 @@ namespace LLama.Native
 /// only load the vocabulary, no weights
 /// </summary>
 public bool vocab_only
-{
-get => Convert.ToBoolean(_vocab_only);
+{
+readonly get => Convert.ToBoolean(_vocab_only);
 set => _vocab_only = Convert.ToSByte(value);
 }
 private sbyte _vocab_only;
@@ -132,8 +122,8 @@ namespace LLama.Native
 /// use mmap if possible
 /// </summary>
 public bool use_mmap
-{
-get => Convert.ToBoolean(_use_mmap);
+{
+readonly get => Convert.ToBoolean(_use_mmap);
 set => _use_mmap = Convert.ToSByte(value);
 }
 private sbyte _use_mmap;
@@ -142,8 +132,8 @@ namespace LLama.Native
 /// force system to keep model in RAM
 /// </summary>
 public bool use_mlock
-{
-get => Convert.ToBoolean(_use_mlock);
+{
+readonly get => Convert.ToBoolean(_use_mlock);
 set => _use_mlock = Convert.ToSByte(value);
 }
 private sbyte _use_mlock;
@@ -152,8 +142,8 @@ namespace LLama.Native
 /// embedding mode only
 /// </summary>
 public bool embedding
-{
-get => Convert.ToBoolean(_embedding);
+{
+readonly get => Convert.ToBoolean(_embedding);
 set => _embedding = Convert.ToSByte(value);
 }
 private sbyte _embedding;
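
The pattern repeated above is worth spelling out: the native struct stores each flag in a single byte, so the C# side keeps a private sbyte backing field (which keeps the struct blittable for P/Invoke) and exposes it through a bool property. The only change in this hunk is marking each getter readonly, which promises the accessor does not mutate the struct and so avoids defensive copies when the struct is accessed through an 'in' or readonly reference. A minimal self-contained sketch of the pattern:

    using System;

    struct NativeFlags
    {
        // One-byte backing field keeps the struct blittable across P/Invoke.
        private sbyte _f16_kv;

        public bool f16_kv
        {
            // 'readonly' marks the getter as non-mutating, so reading the
            // property through a readonly reference needs no defensive copy.
            readonly get => Convert.ToBoolean(_f16_kv);
            set => _f16_kv = Convert.ToSByte(value);
        }
    }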

View File

@@ -105,5 +105,10 @@
 /// </summary>
 /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
 LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+/// <summary>
+/// File type was not specified
+/// </summary>
+LLAMA_FTYPE_GUESSED = 1024
 }
 }
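
The new member gives callers an explicit "unspecified" value rather than overloading an existing quantisation type. A hedged usage sketch (the enum's type name does not appear in this hunk; LLamaFtype is assumed, and the descriptions are illustrative):

    // Illustrative only: map a file type to a short human-readable description.
    static string DescribeFtype(LLamaFtype ftype) => ftype switch
    {
        LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K => "Q6_K (benchmark @7B: 5.15GB, +0.0044 ppl)",
        LLamaFtype.LLAMA_FTYPE_GUESSED => "not specified in the file; guessed",
        _ => ftype.ToString(),
    };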

View File

@@ -377,7 +377,7 @@ namespace LLama.Native
 /// <param name="model"></param>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
+public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);
 /// <summary>
 /// Get the size of the context window for the model
@@ -385,7 +385,7 @@ namespace LLama.Native
 /// <param name="model"></param>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
+public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);
 /// <summary>
 /// Get the dimension of embedding vectors from this model
@@ -393,7 +393,7 @@ namespace LLama.Native
 /// <param name="model"></param>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
+public static extern int llama_model_n_embd(SafeLlamaModelHandle model);
 /// <summary>
 /// Convert a single token into text
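
These three renames follow upstream llama.cpp, which moved the per-model queries to a llama_model_* prefix for GGUF. A quick sketch of calling the renamed bindings directly ('model' is assumed to be a valid, loaded SafeLlamaModelHandle; normally the handle wraps these calls itself, as the next file shows):

    using System;
    using LLama.Native;

    // Assumes 'model' is a loaded SafeLlamaModelHandle.
    int nVocab = NativeApi.llama_model_n_vocab(model);
    int nCtx = NativeApi.llama_model_n_ctx(model);
    int nEmbd = NativeApi.llama_model_n_embd(model);
    Console.WriteLine($"n_vocab={nVocab} n_ctx={nCtx} n_embd={nEmbd}");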

View File

@@ -28,9 +28,9 @@ namespace LLama.Native
 internal SafeLlamaModelHandle(IntPtr handle)
 : base(handle)
 {
-VocabCount = NativeApi.llama_n_vocab_from_model(this);
-ContextSize = NativeApi.llama_n_ctx_from_model(this);
-EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
+VocabCount = NativeApi.llama_model_n_vocab(this);
+ContextSize = NativeApi.llama_model_n_ctx(this);
+EmbeddingSize = NativeApi.llama_model_n_embd(this);
 }
 /// <inheritdoc />

File diff suppressed because it is too large.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.