ci: add benchmark test. (#720)

* ci: add benchmark test.
This commit is contained in:
Rinne 2024-05-08 23:39:49 +08:00 committed by GitHub
parent 05100184f4
commit 6f9097f25b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 5696 additions and 2 deletions

4
.github/_typos.toml vendored
View File

@ -10,5 +10,7 @@ extend-exclude = [
"_typos.toml",
"docs/xmldocs/",
"LLama.Web/wwwroot/",
"LLama/runtimes/deps/"
"LLama/runtimes/deps/",
"LLama.Benchmark/Assets/",
"LLama.Examples/Assets/"
]

20
.github/download_models.py vendored Normal file
View File

@ -0,0 +1,20 @@
from huggingface_hub import hf_hub_download
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    # Each line of the model list is "<repo_id>,<filename>".
    # FIX: the original only called f.readline() once, so every entry after the
    # first was silently ignored, and the un-stripped line left a trailing
    # newline on the filename passed to the hub.
    with open(args.model_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines / trailing newline at EOF
            repo_id, filename = line.split(',', 1)
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=args.model_dir,
                local_dir_use_symlinks=False,
                endpoint=args.endpoint
            )

74
.github/workflows/benchmark.yml vendored Normal file
View File

@ -0,0 +1,74 @@
name: Benchmark Test

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true

jobs:
  linux-benchmark-cuda:
    # Only run when the PR carries the 'benchmark' label — self-hosted GPU time is scarce.
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          # - build: cuda12
          #   image: nvidia/cuda:12.1.1-runtime-ubuntu22.04
    container:
      image: ${{ matrix.image }}
      env:
        # Read by LLama.Benchmark's Constants.ModelDir at runtime.
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia
    steps:
      - uses: actions/checkout@v4
      - name: Install libraries
        run: |
          apt update
          apt install -y curl libicu-dev
          # FIX: -y is required — without it apt-get prompts for confirmation
          # and the non-interactive CI step hangs or aborts.
          apt-get install -y wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0
      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*

View File

@ -1,4 +1,4 @@
name: CI
name: Unit Test
on:
push:
branches: [master]

2
.gitignore vendored
View File

@ -346,3 +346,5 @@ site/
/LLama.Unittest/Models/*.bin
/LLama.Unittest/Models/*.gguf
/LLama.Benchmark/Models/*.bin
/LLama.Benchmark/Models/*.gguf

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

View File

@ -0,0 +1 @@
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf

10
LLama.Benchmark/Common.cs Normal file
View File

@ -0,0 +1,10 @@
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which LLama executor implementation a benchmark constructs
    /// (dispatched in <c>PrefillBenchmark.InitializeParamsAndModel</c>).
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Maps to <c>InteractiveExecutor</c>.</summary>
        Interactive = 0,

        /// <summary>Maps to <c>InstructExecutor</c>.</summary>
        Instruct = 1,

        /// <summary>Maps to <c>StatelessExecutor</c>.</summary>
        Stateless = 2
    }
}

View File

@ -0,0 +1,23 @@
namespace LLama.Benchmark
{
    /// <summary>
    /// Well-known file locations used by the benchmark suite.
    /// </summary>
    internal static class Constants
    {
        /// <summary>
        /// Root directory containing the benchmark models. Supplied by CI via the
        /// BENCHMARK_MODEL_DIR environment variable; empty string (current
        /// directory) when unset.
        /// </summary>
        public static string ModelDir
        {
            get
            {
                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
            }
        }

        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

        // FIX: these two previously omitted ModelDir (a single-argument
        // Path.Combine is a no-op), so the llava files were resolved relative to
        // the working directory, unlike every other model path in this class.
        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");
        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");

        // Repository-relative assets, copied to the output directory at build time.
        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
    }
}

View File

@ -0,0 +1,30 @@
<Project Sdk="Microsoft.NET.Sdk">
  <Import Project="..\LLama\LLamaSharp.Runtime.targets" />

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <Configuration>Release</Configuration>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
  </ItemGroup>

  <ItemGroup>
    <None Update="Assets\TextCompletionPrompts.txt">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
    <!-- FIX: the image lives under Assets\ (Constants.LLavaImage references
         "Assets/extreme-ironing-taxi-610x427.jpg"); the previous Models\ path
         matched no file, so it was never copied to the output directory. -->
    <None Update="Assets\extreme-ironing-taxi-610x427.jpg">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
</Project>

View File

@ -0,0 +1,126 @@
#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    /// <summary>
    /// Measures the prefill (prompt-processing) phase of the LLama executors:
    /// the prompt is fed and only a single token is generated (MaxTokens = 1),
    /// so the measured time is dominated by prefill.
    /// </summary>
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length in characters, context length in tokens)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            // FIX: Constants.Generative7BModelPath already includes Constants.ModelDir;
            // combining it with ModelDir again produced "dir/dir/file" whenever
            // ModelDir was a relative path.
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        // Builds model/inference params, loads the weights and constructs the
        // executor selected by ExecutorType for the current parameter combination.
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            // NOTE(review): the prompt is truncated by CHARACTER count, not token
            // count, so the actual token count fed to the model is smaller.
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        // Runs after EVERY iteration (despite the "Global" name): clears the KV
        // cache so each iteration prefills from an empty context.
        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Feeds the prompt and collects the (single-token) output. With
        /// MaxTokens = 1 this effectively times the prefill phase.
        /// </summary>
        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}

View File

@ -0,0 +1,13 @@
using BenchmarkDotNet.Running;

namespace LLama.Benchmark
{
    /// <summary>
    /// Entry point: hands the command-line arguments to BenchmarkDotNet's
    /// switcher, which discovers and runs the benchmarks in this assembly.
    /// </summary>
    public class Program
    {
        public static void Main(string[] args)
        {
            var switcher = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly);
            var summary = switcher.Run(args);
            Console.WriteLine(summary);
        }
    }
}

View File

@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel",
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -111,6 +113,18 @@ Global
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE