ci: add benchmark test. (#720)

* ci: add benchmark test.
This commit is contained in:
Rinne 2024-05-08 23:39:49 +08:00 committed by GitHub
parent 05100184f4
commit 6f9097f25b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 5696 additions and 2 deletions

4
.github/_typos.toml vendored
View File

@ -10,5 +10,7 @@ extend-exclude = [
"_typos.toml",
"docs/xmldocs/",
"LLama.Web/wwwroot/",
"LLama/runtimes/deps/"
"LLama/runtimes/deps/",
"LLama.Benchmark/Assets/",
"LLama.Examples/Assets/"
]

20
.github/download_models.py vendored Normal file
View File

@ -0,0 +1,20 @@
from huggingface_hub import hf_hub_download
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    # Each line of the model list is "<repo_id>,<filename>".
    # FIX: the original only called f.readline() once, so every entry after the
    # first was silently ignored, and the un-stripped line left a trailing
    # newline on the filename passed to the hub.
    with open(args.model_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines / trailing newline at EOF
            repo_id, filename = line.split(',', 1)
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=args.model_dir,
                local_dir_use_symlinks=False,
                endpoint=args.endpoint
            )

74
.github/workflows/benchmark.yml vendored Normal file
View File

@ -0,0 +1,74 @@
name: Benchmark Test

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true

jobs:
  linux-benchmark-cuda:
    # Only run when the PR carries the 'benchmark' label — self-hosted GPU time is scarce.
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          # - build: cuda12
          #   image: nvidia/cuda:12.1.1-runtime-ubuntu22.04
    container:
      image: ${{ matrix.image }}
      env:
        # Read by LLama.Benchmark's Constants.ModelDir at runtime.
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia
    steps:
      - uses: actions/checkout@v4
      - name: Install libraries
        run: |
          apt update
          apt install -y curl libicu-dev
          # FIX: -y is required — without it apt-get prompts for confirmation
          # and the non-interactive CI step hangs or aborts.
          apt-get install -y wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0
      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*

View File

@ -1,4 +1,4 @@
name: CI
name: Unit Test
on:
push:
branches: [master]

2
.gitignore vendored
View File

@ -346,3 +346,5 @@ site/
/LLama.Unittest/Models/*.bin
/LLama.Unittest/Models/*.gguf
/LLama.Benchmark/Models/*.bin
/LLama.Benchmark/Models/*.gguf

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

View File

@ -0,0 +1 @@
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf

10
LLama.Benchmark/Common.cs Normal file
View File

@ -0,0 +1,10 @@
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which LLama executor implementation a benchmark constructs
    /// (dispatched in <c>PrefillBenchmark.InitializeParamsAndModel</c>).
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Maps to <c>InteractiveExecutor</c>.</summary>
        Interactive = 0,

        /// <summary>Maps to <c>InstructExecutor</c>.</summary>
        Instruct = 1,

        /// <summary>Maps to <c>StatelessExecutor</c>.</summary>
        Stateless = 2
    }
}

View File

@ -0,0 +1,23 @@
namespace LLama.Benchmark
{
    /// <summary>
    /// Well-known file locations used by the benchmark suite.
    /// </summary>
    internal static class Constants
    {
        /// <summary>
        /// Root directory containing the benchmark models. Supplied by CI via the
        /// BENCHMARK_MODEL_DIR environment variable; empty string (current
        /// directory) when unset.
        /// </summary>
        public static string ModelDir
        {
            get
            {
                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
            }
        }

        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

        // FIX: these two previously omitted ModelDir (a single-argument
        // Path.Combine is a no-op), so the llava files were resolved relative to
        // the working directory, unlike every other model path in this class.
        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");
        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");

        // Repository-relative assets, copied to the output directory at build time.
        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
    }
}

View File

@ -0,0 +1,30 @@
<Project Sdk="Microsoft.NET.Sdk">
  <Import Project="..\LLama\LLamaSharp.Runtime.targets" />

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <Configuration>Release</Configuration>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
  </ItemGroup>

  <ItemGroup>
    <None Update="Assets\TextCompletionPrompts.txt">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
    <!-- FIX: the image lives under Assets\ (Constants.LLavaImage references
         "Assets/extreme-ironing-taxi-610x427.jpg"); the previous Models\ path
         matched no file, so it was never copied to the output directory. -->
    <None Update="Assets\extreme-ironing-taxi-610x427.jpg">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
</Project>

View File

@ -0,0 +1,126 @@
#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    /// <summary>
    /// Measures the prefill (prompt-processing) phase of the LLama executors:
    /// the prompt is fed and only a single token is generated (MaxTokens = 1),
    /// so the measured time is dominated by prefill.
    /// </summary>
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length in characters, context length in tokens)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            // FIX: Constants.Generative7BModelPath already includes Constants.ModelDir;
            // combining it with ModelDir again produced "dir/dir/file" whenever
            // ModelDir was a relative path.
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        // Builds model/inference params, loads the weights and constructs the
        // executor selected by ExecutorType for the current parameter combination.
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            // NOTE(review): the prompt is truncated by CHARACTER count, not token
            // count, so the actual token count fed to the model is smaller.
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        // Runs after EVERY iteration (despite the "Global" name): clears the KV
        // cache so each iteration prefills from an empty context.
        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Feeds the prompt and collects the (single-token) output. With
        /// MaxTokens = 1 this effectively times the prefill phase.
        /// </summary>
        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}

View File

@ -0,0 +1,13 @@
using BenchmarkDotNet.Running;

namespace LLama.Benchmark
{
    /// <summary>
    /// Entry point: hands the command-line arguments to BenchmarkDotNet's
    /// switcher, which discovers and runs the benchmarks in this assembly.
    /// </summary>
    public class Program
    {
        public static void Main(string[] args)
        {
            var switcher = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly);
            var summary = switcher.Run(args);
            Console.WriteLine(summary);
        }
    }
}

View File

@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel",
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -111,6 +113,18 @@ Global
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE