Initial changes required for GGUF support
This commit is contained in:
parent
964f497c50
commit
2056078aef
|
@ -52,6 +52,30 @@
|
|||
<None Update="Assets\reason-act.txt">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
|
|
@ -37,6 +37,30 @@
|
|||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
|
||||
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
|
|
|
@ -14,7 +14,6 @@ namespace LLama.Unittest
|
|||
BatchSize = 17,
|
||||
ContextSize = 42,
|
||||
LoraAdapter = "adapter",
|
||||
GroupedQueryAttention = 7,
|
||||
Seed = 42,
|
||||
GpuLayerCount = 111
|
||||
};
|
||||
|
@ -33,7 +32,6 @@ namespace LLama.Unittest
|
|||
BatchSize = 17,
|
||||
ContextSize = 42,
|
||||
LoraAdapter = "adapter",
|
||||
GroupedQueryAttention = 7,
|
||||
Seed = 42,
|
||||
GpuLayerCount = 111
|
||||
};
|
||||
|
|
|
@ -88,16 +88,6 @@ namespace LLama.Web.Common
|
|||
/// </summary>
|
||||
public float[] TensorSplits { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Grouped-Query Attention
|
||||
/// </summary>
|
||||
public int GroupedQueryAttention { get; set; } = 1;
|
||||
|
||||
/// <summary>
|
||||
/// RMS Norm Epsilon
|
||||
/// </summary>
|
||||
public float RmsNormEpsilon { get; set; } = 5e-6f;
|
||||
|
||||
/// <summary>
|
||||
/// RoPE base frequency
|
||||
/// </summary>
|
||||
|
|
|
@ -98,16 +98,6 @@ namespace LLama.Abstractions
|
|||
/// </summary>
|
||||
float[]? TensorSplits { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Grouped-Query Attention
|
||||
/// </summary>
|
||||
int GroupedQueryAttention { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// RMS Norm Epsilon
|
||||
/// </summary>
|
||||
float RmsNormEpsilon { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// RoPE base frequency
|
||||
/// </summary>
|
||||
|
|
|
@ -89,16 +89,6 @@ namespace LLama.Common
|
|||
/// </summary>
|
||||
public float[]? TensorSplits { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Grouped-Query Attention
|
||||
/// </summary>
|
||||
public int GroupedQueryAttention { get; set; } = 1;
|
||||
|
||||
/// <summary>
|
||||
/// RMS Norm Epsilon
|
||||
/// </summary>
|
||||
public float RmsNormEpsilon { get; set; } = 5e-6f;
|
||||
|
||||
/// <summary>
|
||||
/// RoPE base frequency
|
||||
/// </summary>
|
||||
|
@ -153,8 +143,6 @@ namespace LLama.Common
|
|||
/// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
|
||||
/// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
|
||||
/// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
|
||||
/// <param name="groupedQueryAttention">Grouped-Query Attention</param>
|
||||
/// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
|
||||
/// <param name="ropeFrequencyBase">RoPE base frequency.</param>
|
||||
/// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
|
||||
/// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
|
||||
|
@ -165,7 +153,7 @@ namespace LLama.Common
|
|||
bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
|
||||
string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
|
||||
bool convertEosToNewLine = false, bool embeddingMode = false,
|
||||
int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
|
||||
float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
|
||||
string encoding = "UTF-8")
|
||||
{
|
||||
ContextSize = contextSize;
|
||||
|
@ -182,8 +170,6 @@ namespace LLama.Common
|
|||
BatchSize = batchSize;
|
||||
ConvertEosToNewLine = convertEosToNewLine;
|
||||
EmbeddingMode = embeddingMode;
|
||||
GroupedQueryAttention = groupedQueryAttention;
|
||||
RmsNormEpsilon = rmsNormEpsilon;
|
||||
RopeFrequencyBase = ropeFrequencyBase;
|
||||
RopeFrequencyScale = ropeFrequencyScale;
|
||||
MulMatQ = mulMatQ;
|
||||
|
|
|
@ -39,8 +39,6 @@ namespace LLama.Extensions
|
|||
result.logits_all = @params.Perplexity;
|
||||
result.embedding = @params.EmbeddingMode;
|
||||
result.low_vram = @params.LowVram;
|
||||
result.n_gqa = @params.GroupedQueryAttention;
|
||||
result.rms_norm_eps = @params.RmsNormEpsilon;
|
||||
result.rope_freq_base = @params.RopeFrequencyBase;
|
||||
result.rope_freq_scale = @params.RopeFrequencyScale;
|
||||
result.mul_mat_q = @params.MulMatQ;
|
||||
|
|
|
@ -31,16 +31,6 @@ namespace LLama.Native
|
|||
/// </summary>
|
||||
public int n_batch;
|
||||
|
||||
/// <summary>
|
||||
/// grouped-query attention (TEMP - will be moved to model hparams)
|
||||
/// </summary>
|
||||
public int n_gqa;
|
||||
|
||||
/// <summary>
|
||||
/// rms norm epsilon (TEMP - will be moved to model hparams)
|
||||
/// </summary>
|
||||
public float rms_norm_eps;
|
||||
|
||||
/// <summary>
|
||||
/// number of layers to store in VRAM
|
||||
/// </summary>
|
||||
|
@ -82,8 +72,8 @@ namespace LLama.Native
|
|||
/// if true, reduce VRAM usage at the cost of performance
|
||||
/// </summary>
|
||||
public bool low_vram
|
||||
{
|
||||
get => Convert.ToBoolean(_low_vram);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_low_vram);
|
||||
set => _low_vram = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _low_vram;
|
||||
|
@ -92,8 +82,8 @@ namespace LLama.Native
|
|||
/// if true, use experimental mul_mat_q kernels
|
||||
/// </summary>
|
||||
public bool mul_mat_q
|
||||
{
|
||||
get => Convert.ToBoolean(_mul_mat_q);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_mul_mat_q);
|
||||
set => _mul_mat_q = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _mul_mat_q;
|
||||
|
@ -102,8 +92,8 @@ namespace LLama.Native
|
|||
/// use fp16 for KV cache
|
||||
/// </summary>
|
||||
public bool f16_kv
|
||||
{
|
||||
get => Convert.ToBoolean(_f16_kv);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_f16_kv);
|
||||
set => _f16_kv = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _f16_kv;
|
||||
|
@ -112,8 +102,8 @@ namespace LLama.Native
|
|||
/// the llama_eval() call computes all logits, not just the last one
|
||||
/// </summary>
|
||||
public bool logits_all
|
||||
{
|
||||
get => Convert.ToBoolean(_logits_all);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_logits_all);
|
||||
set => _logits_all = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _logits_all;
|
||||
|
@ -122,8 +112,8 @@ namespace LLama.Native
|
|||
/// only load the vocabulary, no weights
|
||||
/// </summary>
|
||||
public bool vocab_only
|
||||
{
|
||||
get => Convert.ToBoolean(_vocab_only);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_vocab_only);
|
||||
set => _vocab_only = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _vocab_only;
|
||||
|
@ -132,8 +122,8 @@ namespace LLama.Native
|
|||
/// use mmap if possible
|
||||
/// </summary>
|
||||
public bool use_mmap
|
||||
{
|
||||
get => Convert.ToBoolean(_use_mmap);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_use_mmap);
|
||||
set => _use_mmap = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _use_mmap;
|
||||
|
@ -142,8 +132,8 @@ namespace LLama.Native
|
|||
/// force system to keep model in RAM
|
||||
/// </summary>
|
||||
public bool use_mlock
|
||||
{
|
||||
get => Convert.ToBoolean(_use_mlock);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_use_mlock);
|
||||
set => _use_mlock = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _use_mlock;
|
||||
|
@ -152,8 +142,8 @@ namespace LLama.Native
|
|||
/// embedding mode only
|
||||
/// </summary>
|
||||
public bool embedding
|
||||
{
|
||||
get => Convert.ToBoolean(_embedding);
|
||||
{
|
||||
readonly get => Convert.ToBoolean(_embedding);
|
||||
set => _embedding = Convert.ToSByte(value);
|
||||
}
|
||||
private sbyte _embedding;
|
||||
|
|
|
@ -105,5 +105,10 @@
|
|||
/// </summary>
|
||||
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18,
|
||||
|
||||
/// <summary>
|
||||
/// File type was not specified
|
||||
/// </summary>
|
||||
LLAMA_FTYPE_GUESSED = 1024
|
||||
}
|
||||
}
|
||||
|
|
|
@ -377,7 +377,7 @@ namespace LLama.Native
|
|||
/// <param name="model"></param>
|
||||
/// <returns></returns>
|
||||
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
|
||||
public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
|
||||
public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);
|
||||
|
||||
/// <summary>
|
||||
/// Get the size of the context window for the model
|
||||
|
@ -385,7 +385,7 @@ namespace LLama.Native
|
|||
/// <param name="model"></param>
|
||||
/// <returns></returns>
|
||||
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
|
||||
public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
|
||||
public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);
|
||||
|
||||
/// <summary>
|
||||
/// Get the dimension of embedding vectors from this model
|
||||
|
@ -393,7 +393,7 @@ namespace LLama.Native
|
|||
/// <param name="model"></param>
|
||||
/// <returns></returns>
|
||||
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
|
||||
public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
|
||||
public static extern int llama_model_n_embd(SafeLlamaModelHandle model);
|
||||
|
||||
/// <summary>
|
||||
/// Convert a single token into text
|
||||
|
|
|
@ -28,9 +28,9 @@ namespace LLama.Native
|
|||
internal SafeLlamaModelHandle(IntPtr handle)
|
||||
: base(handle)
|
||||
{
|
||||
VocabCount = NativeApi.llama_n_vocab_from_model(this);
|
||||
ContextSize = NativeApi.llama_n_ctx_from_model(this);
|
||||
EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
|
||||
VocabCount = NativeApi.llama_model_n_vocab(this);
|
||||
ContextSize = NativeApi.llama_model_n_ctx(this);
|
||||
EmbeddingSize = NativeApi.llama_model_n_embd(this);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue