LLamaSharp/LLama/LLamaQuantizer.cs

using LLama.Native;
using System;
using System.Collections.Generic;

namespace LLama
{
    /// <summary>
    /// A static helper for quantizing models.
    /// </summary>
    public static class LLamaQuantizer
    {
        /// <summary>
        /// Quantize the model.
        /// </summary>
        /// <param name="srcFileName">The model file to be quantized.</param>
        /// <param name="dstFilename">The path to save the quantized model to.</param>
        /// <param name="ftype">The type of quantization.</param>
        /// <param name="nthread">The number of threads to use during quantization. Defaults to the number of physical cores.</param>
/// <param name="allowRequantize"></param>
/// <param name="quantizeOutputTensor"></param>
/// <returns>Whether the quantization is successful.</returns>
/// <exception cref="ArgumentException"></exception>
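        /// <example>
        /// A minimal usage sketch (the file names are hypothetical placeholders):
        /// <code>
        /// var ok = LLamaQuantizer.Quantize("model-f16.gguf", "model-q4_k_m.gguf", LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M);
        /// if (!ok)
        ///     Console.WriteLine("Quantization failed.");
        /// </code>
        /// </example>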
        public static bool Quantize(
            string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1, bool allowRequantize = true, bool quantizeOutputTensor = false)
        {
            if (!ValidateFtype(ftype))
            {
                throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " +
                                            $"to perform quantization.");
            }

            var quantizeParams = LLamaModelQuantizeParams.Default();
            quantizeParams.ftype = ftype;
            quantizeParams.nthread = nthread;
            quantizeParams.allow_requantize = allowRequantize;
            quantizeParams.quantize_output_tensor = quantizeOutputTensor;

            // todo: fill in other quantize params fields.
            // This method could probably do with a redesign - passing in a config object (maybe directly
            // expose `LLamaModelQuantizeParams`) instead of an ever growing list of method parameters!
            return NativeApi.llama_model_quantize(srcFileName, dstFilename, ref quantizeParams) == 0;
        }

        /// <summary>
        /// Quantize the model, specifying the target format by name.
        /// </summary>
        /// <param name="srcFileName">The model file to be quantized.</param>
        /// <param name="dstFilename">The path to save the quantized model to.</param>
        /// <param name="ftype">The type of quantization, parsed from a string (e.g. "Q5_K_M").</param>
        /// <param name="nthread">The number of threads to use during quantization. Defaults to the number of physical cores.</param>
/// <param name="allowRequantize"></param>
/// <param name="quantizeOutputTensor"></param>
/// <returns>Whether the quantization is successful.</returns>
/// <exception cref="ArgumentException"></exception>
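        /// <example>
        /// A minimal usage sketch (the file names are hypothetical placeholders); any string which unambiguously
        /// identifies a single <see cref="LLamaFtype"/>, such as "Q4_K_M", is accepted:
        /// <code>
        /// var ok = LLamaQuantizer.Quantize("model-f16.gguf", "model-q4_k_m.gguf", "Q4_K_M");
        /// </code>
        /// </example>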
        public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1, bool allowRequantize = true,
                                        bool quantizeOutputTensor = false)
        {
            return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
        }
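
        /// <summary>
        /// Check whether the given <see cref="LLamaFtype"/> is a valid target format for quantization.
        /// </summary>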
        private static bool ValidateFtype(LLamaFtype ftype)
        {
            // Validation copied from here:
            // https://github.com/ggerganov/llama.cpp/blob/f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7/llama.cpp#L13450
            switch (ftype)
            {
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
                case LLamaFtype.LLAMA_FTYPE_ALL_F32:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_K_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XXS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_M:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_NL:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_M:
                    return true;

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                case LLamaFtype.LLAMA_FTYPE_GUESSED:
                default:
                    return false;
            }
        }

        /// <summary>
        /// Parse a string into a <see cref="LLamaFtype"/>. This is a "relaxed" parse: any string which is contained
        /// within an enum name (ignoring case) is accepted, as long as it identifies exactly one value.
        ///
        /// For example "Q5_K_M" will convert to <see cref="LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M"/>.
        /// </summary>
        /// <param name="str">The string to parse.</param>
        /// <returns>The single <see cref="LLamaFtype"/> matched by the input string.</returns>
        /// <exception cref="ArgumentException">Thrown if the string matches zero or multiple ftypes.</exception>
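        /// <example>
        /// A sketch of the matching behaviour, following directly from the logic below:
        /// <code>
        /// StringToFtype("Q5_K_M"); // unique match: LLAMA_FTYPE_MOSTLY_Q5_K_M
        /// StringToFtype("Q5_K");   // throws: matches both Q5_K_S and Q5_K_M
        /// </code>
        /// </example>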
        private static LLamaFtype StringToFtype(string str)
        {
            // Find all variants which contain the input string
            var matches = new List<LLamaFtype>();
            foreach (LLamaFtype ftype in Enum.GetValues(typeof(LLamaFtype)))
            {
                var name = Enum.GetName(typeof(LLamaFtype), ftype);

                // Note: this is using "IndexOf" instead of "Contains" to be compatible with netstandard2.0
#pragma warning disable CA2249
                if (name != null && name.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0)
                    matches.Add(ftype);
#pragma warning restore CA2249
            }

            // If there was just one match, success!
            if (matches.Count == 1)
                return matches[0];

            // If none matched, throw a generic error
            if (matches.Count == 0)
                throw new ArgumentException($"Unknown ftype \"{str}\" for quantization.");

            // There were several matches, throw an error asking the user to be more specific
            throw new ArgumentException($"\"{str}\" matches multiple potential ftypes: {string.Join(",", matches)}");
        }
    }
}