Merge pull request #96 from martindevans/minor_quantizer_improvements
Minor quantizer improvements
This commit is contained in:
commit
841cf88e3b
|
@ -1,8 +1,6 @@
|
|||
using LLama.Native;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
|
||||
namespace LLama
|
||||
{
|
||||
|
@ -36,8 +34,7 @@ namespace LLama
|
|||
quantizeParams.nthread = nthread;
|
||||
quantizeParams.allow_requantize = allowRequantize;
|
||||
quantizeParams.quantize_output_tensor = quantizeOutputTensor;
|
||||
LLamaModelQuantizeParams* p = &quantizeParams;
|
||||
return NativeApi.llama_model_quantize(srcFileName, dstFilename, p) == 0;
|
||||
return NativeApi.llama_model_quantize(srcFileName, dstFilename, &quantizeParams) == 0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -57,42 +54,71 @@ namespace LLama
|
|||
return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
|
||||
}
|
||||
|
||||
private static bool ValidateFtype(string ftype)
|
||||
{
|
||||
return new string[] { "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" }.Contains(ftype);
|
||||
}
|
||||
|
||||
private static bool ValidateFtype(LLamaFtype ftype)
|
||||
{
|
||||
return ftype is LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0;
|
||||
}
|
||||
// Validation copies from here:
|
||||
// https://github.com/ggerganov/llama.cpp/blob/e59fcb2bc129881f4a269fee748fb38bce0a64de/llama.cpp#L2960
|
||||
|
||||
private static string FtypeToString(LLamaFtype ftype)
|
||||
{
|
||||
return ftype switch
|
||||
switch (ftype)
|
||||
{
|
||||
LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 => "q4_0",
|
||||
LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1 => "q4_1",
|
||||
LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 => "q5_0",
|
||||
LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 => "q5_1",
|
||||
LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0 => "q8_0",
|
||||
_ => throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " +
|
||||
$"to perform quantization.")
|
||||
};
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
|
||||
case LLamaFtype.LLAMA_FTYPE_ALL_F32:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:
|
||||
return true;
|
||||
|
||||
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Parse a string into a LLamaFtype. This is a "relaxed" parsing, which allows any string which is contained within
|
||||
/// the enum name to be used.
|
||||
///
|
||||
/// For example "Q5_K_M" will convert to "LLAMA_FTYPE_MOSTLY_Q5_K_M"
|
||||
/// </summary>
|
||||
/// <param name="str"></param>
|
||||
/// <returns></returns>
|
||||
/// <exception cref="ArgumentException"></exception>
|
||||
private static LLamaFtype StringToFtype(string str)
|
||||
{
|
||||
return str switch
|
||||
// Find all variants which contain the input string
|
||||
var matches = new List<LLamaFtype>();
|
||||
foreach (LLamaFtype ftype in Enum.GetValues(typeof(LLamaFtype)))
|
||||
{
|
||||
"q4_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0,
|
||||
"q4_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1,
|
||||
"q5_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0,
|
||||
"q5_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
"q8_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
_ => throw new ArgumentException($"Invalid ftype {str} to quantize.")
|
||||
};
|
||||
var name = Enum.GetName(typeof(LLamaFtype), ftype);
|
||||
|
||||
// Note: this is using "IndexOf" instead of "Contains" to be compatible with netstandard2.0
|
||||
#pragma warning disable CA2249
|
||||
if (name != null && name.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0)
|
||||
matches.Add(ftype);
|
||||
#pragma warning restore CA2249
|
||||
}
|
||||
|
||||
// If there was just one match, success!
|
||||
if (matches.Count == 1)
|
||||
return matches[0];
|
||||
|
||||
// If none matched throw a generic error
|
||||
if (matches.Count == 0)
|
||||
throw new ArgumentException($"Unknown ftype \"{str}\" for quantization.");
|
||||
|
||||
// There were several matches, throw an error asking the user to be more specific
|
||||
throw new ArgumentException($"\"{str}\" matches multiple potential ftypes: {string.Join(",", matches)}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Native
{
    /// <summary>
    /// Supported model file types
    /// </summary>
    public enum LLamaFtype
    {
        /// <summary>
        /// All f32
        /// </summary>
        /// <remarks>Benchmark@7B: 26GB</remarks>
        LLAMA_FTYPE_ALL_F32 = 0,

        /// <summary>
        /// Mostly f16
        /// </summary>
        /// <remarks>Benchmark@7B: 13GB</remarks>
        LLAMA_FTYPE_MOSTLY_F16 = 1,

        /// <summary>
        /// Mostly 8 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 6.7GB, +0.0004ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,

        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.50GB, +0.2499 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,

        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.90GB, +0.1846 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,

        /// <summary>
        /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
        /// </summary>
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,

        // Note: values 5 (LLAMA_FTYPE_MOSTLY_Q4_2) and 6 (LLAMA_FTYPE_MOSTLY_Q4_3)
        // were removed from llama.cpp and are deliberately skipped here.

        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.30GB @ 7B tokens, +0.0796 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,

        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.70GB, +0.0415 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,

        /// <summary>
        /// K-Quant 2 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 2.67GB @ 7N parameters, +0.8698 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q2_K = 10,

        /// <summary>
        /// K-Quant 3 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 2.75GB, +0.5505 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,

        /// <summary>
        /// K-Quant 3 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.06GB, +0.2437 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,

        /// <summary>
        /// K-Quant 3 bit (Large)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.35GB, +0.1803 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,

        /// <summary>
        /// K-Quant 4 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.56GB, +0.1149 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,

        /// <summary>
        /// K-Quant 4 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.80GB, +0.0535 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,

        /// <summary>
        /// K-Quant 5 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.33GB, +0.0353 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,

        /// <summary>
        /// K-Quant 5 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.45GB, +0.0142 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,

        /// <summary>
        /// K-Quant 6 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q6_K = 18,
    }
}
|
||||
|
|
|
@ -1,25 +1,28 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace LLama.Native
|
||||
{
|
||||
/// <summary>
|
||||
/// Quantizer parameters used in the native API
|
||||
/// </summary>
|
||||
public struct LLamaModelQuantizeParams
|
||||
{
|
||||
/// <summary>
|
||||
/// number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
/// </summary>
|
||||
public int nthread;
|
||||
|
||||
/// <summary>
|
||||
/// quantize to this llama_ftype
|
||||
/// </summary>
|
||||
public LLamaFtype ftype;
|
||||
|
||||
/// <summary>
|
||||
/// allow quantizing non-f32/f16 tensors
|
||||
/// </summary>
|
||||
[MarshalAs(UnmanagedType.I1)]
|
||||
public bool allow_requantize;
|
||||
|
||||
/// <summary>
|
||||
/// quantize output.weight
|
||||
/// </summary>
|
||||
|
|
Loading…
Reference in New Issue