Unable to load the model onto multiple GPUs (#617)

2024-03-20 07:03:12 +00:00 · 2024-03-20 07:03:12 +00:00 · 9e4109f774
parent 6216197196
commit 9e4109f774
4 changed files with 46 additions and 4 deletions
--- a/LLama.KernelMemory/BuilderExtensions.cs
+++ b/LLama.KernelMemory/BuilderExtensions.cs
@ -8,6 +8,7 @@ using LLama;
 using LLama.Common;
 using Microsoft.KernelMemory.AI;
 using Microsoft.SemanticKernel.AI.Embeddings;
+using LLama.Native;

 namespace LLamaSharp.KernelMemory
 {
@ -81,7 +82,9 @@ namespace LLamaSharp.KernelMemory
                ContextSize = config?.ContextSize ?? 2048,
                Seed = config?.Seed ?? 0,
                GpuLayerCount = config?.GpuLayerCount ?? 20,
-                EmbeddingMode = true
+                EmbeddingMode = true,
+                MainGpu = config?.MainGpu ?? 0,
+                SplitMode = config?.SplitMode ?? GPUSplitMode.None
            };
            var weights = LLamaWeights.LoadFromFile(parameters);
            var context = weights.CreateContext(parameters);
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@ -27,7 +27,12 @@ namespace LLamaSharp.KernelMemory
        public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
        {
            this._config = config;
-            var @params = new ModelParams(_config.ModelPath) { EmbeddingMode = true };
+            var @params = new ModelParams(_config.ModelPath)
+            {
+                EmbeddingMode = true,
+                MainGpu = _config.MainGpu,
+                SplitMode = _config.SplitMode
+            };
            _weights = LLamaWeights.LoadFromFile(@params);
            _embedder = new LLamaEmbedder(_weights, @params);
            _ownsWeights = true;
@ -42,7 +47,12 @@ namespace LLamaSharp.KernelMemory
        public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights weights)
        {
            this._config = config;
-            var @params = new ModelParams(_config.ModelPath) { EmbeddingMode = true };
+            var @params = new ModelParams(_config.ModelPath)
+            {
+                EmbeddingMode = true,
+                MainGpu = _config.MainGpu,
+                SplitMode = _config.SplitMode
+            };
            _weights = weights;
            _embedder = new LLamaEmbedder(_weights, @params);
            _ownsEmbedder = true;
--- a/LLama.KernelMemory/LlamaSharpConfig.cs
+++ b/LLama.KernelMemory/LlamaSharpConfig.cs
@ -1,4 +1,5 @@
 using LLama.Common;
+using LLama.Native;
 using System;
 using System.Collections.Generic;
 using System.Linq;
@ -41,6 +42,31 @@ namespace LLamaSharp.KernelMemory
        /// </summary>
        public int? GpuLayerCount { get; set; }

+        /// <summary>
+        /// main_gpu interpretation depends on split_mode:
+        /// <list type="bullet">
+        ///     <item>
+        ///         <term>None</term>
+        ///         <description>The GPU that is used for the entire model.</description>
+        ///     </item>
+        ///     <item>
+        ///         <term>Row</term>
+        ///         <description>The GPU that is used for small tensors and intermediate results.</description>
+        ///     </item>
+        ///     <item>
+        ///         <term>Layer</term>
+        ///         <description>Ignored.</description>
+        ///     </item>
+        /// </list>
+        /// </summary>
+        /// <value></value>
+        public int MainGpu { get; set; } = 0;
+
+        /// <summary>
+        /// How to split the model across multiple GPUs
+        /// </summary>
+        /// <value></value>
+        public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

        /// <summary>
        /// Set the default inference parameters.
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@ -1,6 +1,7 @@
 using LLama;
 using LLama.Abstractions;
 using LLama.Common;
+using LLama.Native;
 using Microsoft.KernelMemory.AI;
 using System;
 using System.Collections.Generic;
@ -34,7 +35,9 @@ namespace LLamaSharp.KernelMemory
            {
                ContextSize = config?.ContextSize ?? 2048,
                Seed = config?.Seed ?? 0,
-                GpuLayerCount = config?.GpuLayerCount ?? 20
+                GpuLayerCount = config?.GpuLayerCount ?? 20,
+                MainGpu = config?.MainGpu ?? 0,
+                SplitMode = config?.SplitMode ?? GPUSplitMode.None
            };
            _weights = LLamaWeights.LoadFromFile(parameters);
            _context = _weights.CreateContext(parameters);