Merge pull request #263 from martindevans/context_autodetect

Context Size Autodetect
2023-11-08 14:58:02 +00:00 · 2023-11-08 14:58:02 +00:00 · 74a8ca5f83
parent d993bebe18 db1bc741b0
commit 74a8ca5f83
5 changed files with 38 additions and 75 deletions
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -17,9 +17,9 @@ namespace LLama.Web.Common
        public int MaxInstances { get; set; }

        /// <summary>
-        /// Model context size (n_ctx)
+        /// Model context size (n_ctx). Null to use value from model.
        /// </summary>
-        public uint ContextSize { get; set; } = 512;
+        public uint? ContextSize { get; set; }

        /// <summary>
        /// the GPU that is used for scratch and small tensors
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -8,9 +8,9 @@ namespace LLama.Abstractions;
 public interface IContextParams
 {
    /// <summary>
-    /// Model context size (n_ctx)
+    /// Model context size (n_ctx). Null to use value from model file.
    /// </summary>
-    uint ContextSize { get; set; }
+    uint? ContextSize { get; set; }

    /// <summary>
    /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -12,105 +12,68 @@ namespace LLama.Common
    public record ModelParams
        : ILLamaParams
    {
-        /// <summary>
-        /// Model context size (n_ctx)
-        /// </summary>
-        public uint ContextSize { get; set; } = 512;
-        /// <summary>
-        /// the GPU that is used for scratch and small tensors
-        /// </summary>
+        /// <inheritdoc />
+        public uint? ContextSize { get; set; }
+
+        /// <inheritdoc />
        public int MainGpu { get; set; } = 0;

-        /// <summary>
-        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
-        /// </summary>
+        /// <inheritdoc />
        public int GpuLayerCount { get; set; } = 20;
-        /// <summary>
-        /// Seed for the random number generator (seed)
-        /// </summary>
+
+        /// <inheritdoc />
        public uint Seed { get; set; } = 0xFFFFFFFF;
-        /// <summary>
-        /// Use f16 instead of f32 for memory kv (memory_f16)
-        /// </summary>
+
+        /// <inheritdoc />
        public bool UseFp16Memory { get; set; } = true;
-        /// <summary>
-        /// Use mmap for faster loads (use_mmap)
-        /// </summary>
+
+        /// <inheritdoc />
        public bool UseMemorymap { get; set; } = true;
-        /// <summary>
-        /// Use mlock to keep model in memory (use_mlock)
-        /// </summary>
+
+        /// <inheritdoc />
        public bool UseMemoryLock { get; set; }
-        /// <summary>
-        /// Compute perplexity over the prompt (perplexity)
-        /// </summary>
+
+        /// <inheritdoc />
        public bool Perplexity { get; set; }
-        /// <summary>
-        /// Model path (model)
-        /// </summary>
+
+        /// <inheritdoc />
        public string ModelPath { get; set; }

-        /// <summary>
-        /// List of LoRAs to apply
-        /// </summary>
+        /// <inheritdoc />
        public AdapterCollection LoraAdapters { get; set; } = new();

-        /// <summary>
-        /// base model path for the lora adapter (lora_base)
-        /// </summary>
+        /// <inheritdoc />
        public string LoraBase { get; set; } = string.Empty;

-        /// <summary>
-        /// Number of threads (null = autodetect) (n_threads)
-        /// </summary>
+        /// <inheritdoc />
        public uint? Threads { get; set; }

-        /// <summary>
-        /// Number of threads to use for batch processing (null = autodetect) (n_threads)
-        /// </summary>
+        /// <inheritdoc />
        public uint? BatchThreads { get; set; }

-        /// <summary>
-        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
-        /// </summary>
+        /// <inheritdoc />
        public uint BatchSize { get; set; } = 512;

-        /// <summary>
-        /// Whether to use embedding mode. (embedding) Note that if this is set to true, 
-        /// The LLamaModel won't produce text response anymore.
-        /// </summary>
+        /// <inheritdoc />
        public bool EmbeddingMode { get; set; }

-        /// <summary>
-        /// how split tensors should be distributed across GPUs.
-        /// </summary>
-        /// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
+        /// <inheritdoc />
        [JsonConverter(typeof(TensorSplitsCollectionConverter))]
        public TensorSplitsCollection TensorSplits { get; set; } = new();

-		/// <summary>
-		/// RoPE base frequency
-		/// </summary>
+        /// <inheritdoc />
        public float? RopeFrequencyBase { get; set; }

-		/// <summary>
-		/// RoPE frequency scaling factor
-		/// </summary>
+        /// <inheritdoc />
        public float? RopeFrequencyScale { get; set; }

-		/// <summary>
-		/// Use experimental mul_mat_q kernels
-		/// </summary>
+        /// <inheritdoc />
        public bool MulMatQ { get; set; }

-        /// <summary>
-        /// Load vocab only (no weights)
-        /// </summary>
+        /// <inheritdoc />
        public bool VocabOnly { get; set; }

-        /// <summary>
-        /// The encoding to use to convert text for the model
-        /// </summary>
+        /// <inheritdoc />
        [JsonConverter(typeof(EncodingConverter))]
        public Encoding Encoding { get; set; } = Encoding.UTF8;

--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -21,7 +21,7 @@ namespace LLama.Extensions
        public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
        {
            result = NativeApi.llama_context_default_params();
-            result.n_ctx = @params.ContextSize;
+            result.n_ctx = @params.ContextSize ?? 0;
            result.n_batch = @params.BatchSize;
            result.seed = @params.Seed;
            result.f16_kv = @params.UseFp16Memory;
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -22,7 +22,7 @@ namespace LLama.Native
        public uint seed;

        /// <summary>
-        /// text context
+        /// text context, 0 = from model
        /// </summary>
        public uint n_ctx;