parent
05100184f4
commit
6f9097f25b
|
@ -10,5 +10,7 @@ extend-exclude = [
|
|||
"_typos.toml",
|
||||
"docs/xmldocs/",
|
||||
"LLama.Web/wwwroot/",
|
||||
"LLama/runtimes/deps/"
|
||||
"LLama/runtimes/deps/",
|
||||
"LLama.Benchmark/Assets/",
|
||||
"LLama.Examples/Assets/"
|
||||
]
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
from huggingface_hub import hf_hub_download
import argparse


if __name__ == '__main__':
    # CLI tool used by CI to fetch benchmark model files from the HF hub
    # (or a mirror, via --endpoint).
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    # Each non-empty line of the model list is "repo_id,filename".
    # FIX: the previous version used f.readline().split(','), which (a) left a
    # trailing newline inside `filename`, breaking the hub lookup, and (b)
    # silently ignored every line after the first.
    with open(args.model_list, 'r') as f:
        entries = [line.strip() for line in f if line.strip()]

    for entry in entries:
        repo_id, filename = entry.split(',')
        hf_hub_download(
            repo_id=repo_id.strip(),
            filename=filename.strip(),
            local_dir=args.model_dir,
            local_dir_use_symlinks=False,
            endpoint=args.endpoint
        )
|
|
@ -0,0 +1,74 @@
|
|||
name: Benchmark Test

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true

jobs:
  linux-benchmark-cuda:
    # Only runs when a PR carries the 'benchmark' label (GPU runner time is scarce).
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]

    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          # - build: cuda12
          #   image: nvidia/cuda:12.1.1-runtime-ubuntu22.04

    container:
      image: ${{ matrix.image }}
      env:
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia

    steps:
      - uses: actions/checkout@v4

      - name: Install libraries
        run: |
          apt update
          apt install -y curl libicu-dev
          # FIX: -y keeps apt-get non-interactive; without it the install
          # waits for confirmation and fails in CI.
          apt-get install -y wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0

      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com

      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*
|
|
@ -1,4 +1,4 @@
|
|||
name: CI
|
||||
name: Unit Test
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
|
|
|
@ -346,3 +346,5 @@ site/
|
|||
/LLama.Unittest/Models/*.bin
|
||||
/LLama.Unittest/Models/*.gguf
|
||||
|
||||
/LLama.Benchmark/Models/*.bin
|
||||
/LLama.Benchmark/Models/*.gguf
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
After Width: | Height: | Size: 90 KiB |
|
@ -0,0 +1 @@
|
|||
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf
|
|
@ -0,0 +1,10 @@
|
|||
|
||||

namespace LLama.Benchmark
{
    /// <summary>
    /// Identifies which LLama executor implementation a benchmark should exercise.
    /// </summary>
    public enum ExecutorType
    {
        Interactive,
        Instruct,
        Stateless
    }
}
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
namespace LLama.Benchmark
|
||||
{
|
||||
internal static class Constants
|
||||
{
|
||||
public static string ModelDir
|
||||
{
|
||||
get
|
||||
{
|
||||
return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
|
||||
}
|
||||
}
|
||||
|
||||
public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
|
||||
public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");
|
||||
|
||||
public static string LLavaModelPath => Path.Combine("llava-v1.6-mistral-7b.Q3_K_XS.gguf");
|
||||
public static string LLavaMmpPath => Path.Combine("mmproj-model-f16.gguf");
|
||||
public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
|
||||
|
||||
public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
  <Import Project="..\LLama\LLamaSharp.Runtime.targets" />

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <Configuration>Release</Configuration>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
  </ItemGroup>

  <ItemGroup>
    <None Update="Assets\TextCompletionPrompts.txt">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
    <!-- FIX: the image lives under Assets\ (the code loads
         "Assets/extreme-ironing-taxi-610x427.jpg"); the previous
         Models\ path never matched, so the file was not copied. -->
    <None Update="Assets\extreme-ironing-taxi-610x427.jpg">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>

</Project>
|
|
@ -0,0 +1,126 @@
|
|||
#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// FIX: Constants.Generative7BModelPath already includes
        /// Constants.ModelDir; the previous Path.Combine(ModelDir, ...)
        /// applied the directory twice, which mis-resolves when ModelDir
        /// is a relative path.
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        // Builds model/inference params from the current benchmark parameters,
        // loads the weights and constructs the executor under test.
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
|
@ -0,0 +1,13 @@
|
|||
using BenchmarkDotNet.Running;

namespace LLama.Benchmark
{
    /// <summary>
    /// Entry point: hands the command line to BenchmarkDotNet's switcher,
    /// which picks and runs the benchmarks defined in this assembly.
    /// </summary>
    public class Program
    {
        public static void Main(string[] args)
        {
            var benchmarkAssembly = typeof(Program).Assembly;
            var summary = BenchmarkSwitcher.FromAssembly(benchmarkAssembly).Run(args);
            Console.WriteLine(summary);
        }
    }
}
|
|
@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel",
|
|||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
@ -111,6 +113,18 @@ Global
|
|||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
|
Loading…
Reference in New Issue