LLamaSharp/LLama/LLamaQuantizer.cs

using LLama.Native;
using System;
using System.Collections.Generic;

namespace LLama
{
    /// <summary>
    /// A static helper for quantizing models.
    /// </summary>
    public static class LLamaQuantizer
    {
        /// <summary>
        /// Quantize the model.
        /// </summary>
        /// <param name="srcFileName">The model file to be quantized.</param>
        /// <param name="dstFilename">The path to save the quantized model to.</param>
        /// <param name="ftype">The type of quantization.</param>
        /// <param name="nthread">The number of threads to use during quantization. Defaults to the number of physical cores.</param>
/// <param name="allowRequantize"></param>
/// <param name="quantizeOutputTensor"></param>
/// <returns>Whether the quantization is successful.</returns>
/// <exception cref="ArgumentException"></exception>
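        /// <example>
        /// A minimal usage sketch (the file names are hypothetical placeholders):
        /// <code>
        /// var ok = LLamaQuantizer.Quantize("model-f16.gguf", "model-q4_k_m.gguf", LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M);
        /// if (!ok)
        ///     Console.WriteLine("Quantization failed.");
        /// </code>
        /// </example>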
        public static bool Quantize(
            string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1, bool allowRequantize = true, bool quantizeOutputTensor = false)
        {
            if (!ValidateFtype(ftype))
            {
                throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " +
                                            $"to perform quantization.");
            }

            var quantizeParams = LLamaModelQuantizeParams.Default();
            quantizeParams.ftype = ftype;
            quantizeParams.nthread = nthread;
            quantizeParams.allow_requantize = allowRequantize;
            quantizeParams.quantize_output_tensor = quantizeOutputTensor;

            // todo: fill in other quantize params fields.
            // This method could probably do with a redesign - passing in a config object (maybe directly
            // expose `LLamaModelQuantizeParams`) instead of an ever growing list of method parameters!
            return NativeApi.llama_model_quantize(srcFileName, dstFilename, ref quantizeParams) == 0;
        }

        /// <summary>
        /// Quantize the model, specifying the target format by name.
        /// </summary>
        /// <param name="srcFileName">The model file to be quantized.</param>
        /// <param name="dstFilename">The path to save the quantized model to.</param>
        /// <param name="ftype">The type of quantization, parsed from a string (e.g. "Q5_K_M").</param>
        /// <param name="nthread">The number of threads to use during quantization. Defaults to the number of physical cores.</param>
/// <param name="allowRequantize"></param>
/// <param name="quantizeOutputTensor"></param>
/// <returns>Whether the quantization is successful.</returns>
/// <exception cref="ArgumentException"></exception>
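        /// <example>
        /// A minimal usage sketch (the file names are hypothetical placeholders); any string which unambiguously
        /// identifies a single <see cref="LLamaFtype"/>, such as "Q4_K_M", is accepted:
        /// <code>
        /// var ok = LLamaQuantizer.Quantize("model-f16.gguf", "model-q4_k_m.gguf", "Q4_K_M");
        /// </code>
        /// </example>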
        public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1, bool allowRequantize = true,
                                        bool quantizeOutputTensor = false)
        {
            return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
        }
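
        /// <summary>
        /// Check whether the given <see cref="LLamaFtype"/> is a valid target format for quantization.
        /// </summary>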
        private static bool ValidateFtype(LLamaFtype ftype)
        {
            // Validation copied from here:
            // https://github.com/ggerganov/llama.cpp/blob/f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7/llama.cpp#L13450
            switch (ftype)
            {
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
                case LLamaFtype.LLAMA_FTYPE_ALL_F32:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_K_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XXS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_M:

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_M:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_NL:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_XS:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_S:
                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_M:
                    return true;

                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                case LLamaFtype.LLAMA_FTYPE_GUESSED:
                default:
                    return false;
            }
        }

        /// <summary>
        /// Parse a string into a <see cref="LLamaFtype"/>. This is a "relaxed" parse: any string which is contained
        /// within an enum name (ignoring case) is accepted, as long as it identifies exactly one value.
        ///
        /// For example "Q5_K_M" will convert to <see cref="LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M"/>.
        /// </summary>
        /// <param name="str">The string to parse.</param>
        /// <returns>The single <see cref="LLamaFtype"/> matched by the input string.</returns>
        /// <exception cref="ArgumentException">Thrown if the string matches zero or multiple ftypes.</exception>
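        /// <example>
        /// A sketch of the matching behaviour, following directly from the logic below:
        /// <code>
        /// StringToFtype("Q5_K_M"); // unique match: LLAMA_FTYPE_MOSTLY_Q5_K_M
        /// StringToFtype("Q5_K");   // throws: matches both Q5_K_S and Q5_K_M
        /// </code>
        /// </example>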
        private static LLamaFtype StringToFtype(string str)
        {
            // Find all variants which contain the input string
            var matches = new List<LLamaFtype>();
            foreach (LLamaFtype ftype in Enum.GetValues(typeof(LLamaFtype)))
            {
                var name = Enum.GetName(typeof(LLamaFtype), ftype);

                // Note: this is using "IndexOf" instead of "Contains" to be compatible with netstandard2.0
#pragma warning disable CA2249
                if (name != null && name.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0)
                    matches.Add(ftype);
#pragma warning restore CA2249
            }

            // If there was just one match, success!
            if (matches.Count == 1)
                return matches[0];

            // If none matched, throw a generic error
            if (matches.Count == 0)
                throw new ArgumentException($"Unknown ftype \"{str}\" for quantization.");

            // There were several matches, throw an error asking the user to be more specific
            throw new ArgumentException($"\"{str}\" matches multiple potential ftypes: {string.Join(",", matches)}");
        }
    }
}