2023-05-11 17:38:28 +08:00
|
|
|
|
using LLama.Native;
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Linq;
|
|
|
|
|
using System.Text;
|
|
|
|
|
|
|
|
|
|
namespace LLama
|
|
|
|
|
{
|
2023-06-20 02:38:57 +08:00
|
|
|
|
/// <summary>
/// The quantizer to quantize the model.
/// </summary>
public static class LLamaQuantizer
{
    /// <summary>
    /// Quantize the model.
    /// </summary>
    /// <param name="srcFileName">The model file to be quantized.</param>
    /// <param name="dstFilename">The path to save the quantized model.</param>
    /// <param name="ftype">The type of quantization.</param>
    /// <param name="nthread">Thread to be used during the quantization. By default it's the physical core number.</param>
    /// <param name="allowRequantize">Value forwarded to the native <c>allow_requantize</c> quantization parameter.</param>
    /// <param name="quantizeOutputTensor">Value forwarded to the native <c>quantize_output_tensor</c> quantization parameter.</param>
    /// <returns>Whether the quantization is successful, i.e. the native call returned 0.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="ftype"/> is not one of the supported quantization types (Q4_0, Q4_1, Q5_0, Q5_1, Q8_0).
    /// </exception>
    public static unsafe bool Quantize(string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1, bool allowRequantize = true,
        bool quantizeOutputTensor = false)
    {
        if (!ValidateFtype(ftype))
        {
            // Interpolate the enum value itself rather than Enum.GetName(...): GetName returns
            // null for values that are not declared on the enum, which left the type blank in
            // the message. The value's own string form falls back to the number in that case.
            throw new ArgumentException($"The type {ftype} is not a valid type to perform quantization.");
        }

        // Start from the native defaults and override only the options exposed by this method.
        var quantizeParams = NativeApi.llama_model_quantize_default_params();
        quantizeParams.ftype = ftype;
        quantizeParams.nthread = nthread;
        quantizeParams.allow_requantize = allowRequantize;
        quantizeParams.quantize_output_tensor = quantizeOutputTensor;

        // The parameter struct is a local, so its address can be handed to the native call directly.
        return NativeApi.llama_model_quantize(srcFileName, dstFilename, &quantizeParams) == 0;
    }

    /// <summary>
    /// Quantize the model.
    /// </summary>
    /// <param name="srcFileName">The model file to be quantized.</param>
    /// <param name="dstFilename">The path to save the quantized model.</param>
    /// <param name="ftype">The type of quantization; one of "q4_0", "q4_1", "q5_0", "q5_1" or "q8_0".</param>
    /// <param name="nthread">Thread to be used during the quantization. By default it's the physical core number.</param>
    /// <param name="allowRequantize">Value forwarded to the native <c>allow_requantize</c> quantization parameter.</param>
    /// <param name="quantizeOutputTensor">Value forwarded to the native <c>quantize_output_tensor</c> quantization parameter.</param>
    /// <returns>Whether the quantization is successful.</returns>
    /// <exception cref="ArgumentException"><paramref name="ftype"/> is not a recognized quantization type name.</exception>
    public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1, bool allowRequantize = true,
        bool quantizeOutputTensor = false)
    {
        return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
    }

    /// <summary>
    /// Checks whether <paramref name="ftype"/> is one of the quantization types this class accepts.
    /// </summary>
    /// <param name="ftype">The file type to check.</param>
    /// <returns><c>true</c> for Q4_0, Q4_1, Q5_0, Q5_1 and Q8_0; otherwise <c>false</c>.</returns>
    private static bool ValidateFtype(LLamaFtype ftype)
    {
        return ftype is LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1
            or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0;
    }

    /// <summary>
    /// Maps a short quantization type name to its <see cref="LLamaFtype"/> value.
    /// </summary>
    /// <param name="str">The type name, e.g. "q4_0". Matching is exact and case-sensitive.</param>
    /// <returns>The corresponding <see cref="LLamaFtype"/> value.</returns>
    /// <exception cref="ArgumentException"><paramref name="str"/> is not a recognized name (including <c>null</c>).</exception>
    private static LLamaFtype StringToFtype(string str)
    {
        return str switch
        {
            "q4_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0,
            "q4_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1,
            "q5_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0,
            "q5_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1,
            "q8_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0,
            _ => throw new ArgumentException($"Invalid ftype {str} to quantize.")
        };
    }
}
|
|
|
|
|
}
|