parent
05100184f4
commit
6f9097f25b
|
@ -10,5 +10,7 @@ extend-exclude = [
|
||||||
"_typos.toml",
|
"_typos.toml",
|
||||||
"docs/xmldocs/",
|
"docs/xmldocs/",
|
||||||
"LLama.Web/wwwroot/",
|
"LLama.Web/wwwroot/",
|
||||||
"LLama/runtimes/deps/"
|
"LLama/runtimes/deps/",
|
||||||
|
"LLama.Benchmark/Assets/",
|
||||||
|
"LLama.Examples/Assets/"
|
||||||
]
|
]
|
||||||
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
from huggingface_hub import hf_hub_download
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def parse_model_list(lines) -> list[tuple[str, str]]:
    """Parse ``repo_id,filename`` entries from an iterable of text lines.

    Surrounding whitespace (including the trailing newline) is stripped from
    both fields, and blank lines are skipped.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A list of ``(repo_id, filename)`` tuples in input order.

    Raises:
        ValueError: If a non-blank line is not exactly two non-empty,
            comma-separated fields.
    """
    entries = []
    for line_no, raw in enumerate(lines, start=1):
        line = raw.strip()
        if not line:
            continue
        parts = [part.strip() for part in line.split(',')]
        if len(parts) != 2 or not all(parts):
            raise ValueError(
                f"line {line_no}: expected 'repo_id,filename', got {line!r}"
            )
        entries.append((parts[0], parts[1]))
    return entries


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    with open(args.model_list, 'r', encoding='utf-8') as f:
        models = parse_model_list(f)

    # Download every listed model, not just the first line of the list.
    for repo_id, filename in models:
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=args.model_dir,
            local_dir_use_symlinks=False,
            endpoint=args.endpoint,
        )
|
|
@ -0,0 +1,74 @@
|
||||||
|
# Runs the LLama.Benchmark project on a self-hosted GPU runner.
name: Benchmark Test
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true

jobs:
  linux-benchmark-cuda:
    # Only runs for pull requests carrying the 'benchmark' label.
    # NOTE(review): on push events there is no pull_request payload, so this
    # condition is false and the job is skipped - confirm that is intended.
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]

    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          # - build: cuda12
          #   image: nvidia/cuda:12.1.1-runtime-ubuntu22.04

    container:
      image: ${{ matrix.image }}
      env:
        # Read by LLama.Benchmark to locate the downloaded model files.
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia

    steps:
      - uses: actions/checkout@v4

      - name: Install libraries
        run: |
          apt-get update
          # -y everywhere: the container has no TTY to answer apt's confirmation prompt.
          apt-get install -y curl libicu-dev
          apt-get install -y wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0

      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com

      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        # NOTE(review): upload-artifact@v3 is deprecated on github.com; consider v4.
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*
|
|
@ -1,4 +1,4 @@
|
||||||
name: CI
|
name: Unit Test
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: [master]
|
branches: [master]
|
||||||
|
|
|
@ -346,3 +346,5 @@ site/
|
||||||
/LLama.Unittest/Models/*.bin
|
/LLama.Unittest/Models/*.bin
|
||||||
/LLama.Unittest/Models/*.gguf
|
/LLama.Unittest/Models/*.gguf
|
||||||
|
|
||||||
|
/LLama.Benchmark/Models/*.bin
|
||||||
|
/LLama.Benchmark/Models/*.gguf
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
After Width: | Height: | Size: 90 KiB |
|
@ -0,0 +1 @@
|
||||||
|
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf
|
|
@ -0,0 +1,10 @@
|
||||||
|
|
||||||
|
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which kind of executor a benchmark should construct.
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Interactive executor.</summary>
        Interactive = 0,

        /// <summary>Instruct executor.</summary>
        Instruct = 1,

        /// <summary>Stateless executor.</summary>
        Stateless = 2,
    }
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
|
||||||
|
namespace LLama.Benchmark
{
    /// <summary>
    /// File-system locations of the models and assets used by the benchmarks.
    /// </summary>
    internal static class Constants
    {
        /// <summary>
        /// Directory containing the model files, read from the
        /// <c>BENCHMARK_MODEL_DIR</c> environment variable on every access.
        /// Falls back to an empty string when the variable is not set.
        /// </summary>
        public static string ModelDir
        {
            get
            {
                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
            }
        }

        /// <summary>Path of the generative 7B model, already rooted at <see cref="ModelDir"/>.</summary>
        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");

        /// <summary>Path of the embedding model, already rooted at <see cref="ModelDir"/>.</summary>
        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

        // The LLava paths previously called Path.Combine with a single argument,
        // which silently ignored ModelDir; they now resolve like the other models.

        /// <summary>Path of the LLava model, rooted at <see cref="ModelDir"/>.</summary>
        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");

        /// <summary>Path of the LLava multi-modal projector file, rooted at <see cref="ModelDir"/>.</summary>
        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");

        /// <summary>Sample image path, relative to the working directory.</summary>
        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";

        /// <summary>Text-completion prompt file path, relative to the working directory.</summary>
        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
    }
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
  <Import Project="..\LLama\LLamaSharp.Runtime.targets" />

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <Configuration>Release</Configuration>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
  </ItemGroup>

  <!-- Assets the benchmarks load at run time via paths relative to the output directory. -->
  <ItemGroup>
    <None Update="Assets\TextCompletionPrompts.txt">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
    <!-- Path fixed from Models\ to Assets\ so it matches the "Assets/..." path the code reads. -->
    <None Update="Assets\extreme-ironing-taxi-610x427.jpg">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>

</Project>
|
|
@ -0,0 +1,126 @@
|
||||||
|
#pragma warning disable CS8618
|
||||||
|
|
||||||
|
using System.Text;
|
||||||
|
using BenchmarkDotNet.Attributes;
|
||||||
|
using BenchmarkDotNet.Engines;
|
||||||
|
using BenchmarkDotNet.Jobs;
|
||||||
|
using LLama.Abstractions;
|
||||||
|
using LLama.Common;
|
||||||
|
|
||||||
|
namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    /// <summary>
    /// Benchmarks prompt prefill: each run feeds a prompt to an executor and
    /// stops after at most one generated token (<c>MaxTokens = 1</c>).
    /// </summary>
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// The prompt length is a character count: it is applied with
        /// <see cref="string.Substring(int, int)"/> to the prompt file content.
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            // Constants.Generative7BModelPath already includes Constants.ModelDir;
            // combining it with ModelDir again broke relative model directories.
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        /// <summary>
        /// Executor kinds to benchmark.
        /// </summary>
        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        /// <summary>
        /// Executor under test, created in <see cref="GlobalSetup"/>.
        /// </summary>
        public ILLamaExecutor Executor { get; set; }

        /// <summary>
        /// Builds the model/inference parameters and the prompt from the current
        /// benchmark parameters, loads the weights and creates the executor.
        /// </summary>
        /// <exception cref="NotSupportedException">
        /// <see cref="ExecutorType"/> is not a known executor type.
        /// </exception>
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            // Substring throws if the prompt file holds fewer characters than requested.
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        /// <summary>
        /// One-time setup for <see cref="Basic"/>.
        /// </summary>
        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        /// <summary>
        /// Despite its name, this is registered as an iteration cleanup: it runs
        /// after each iteration of <see cref="Basic"/> and clears the KV cache of
        /// executors that keep their context.
        /// </summary>
        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Runs inference on <see cref="Prompt"/> and returns the concatenated output text.
        /// </summary>
        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
using BenchmarkDotNet.Running;
|
||||||
|
|
||||||
|
namespace LLama.Benchmark
{
    /// <summary>
    /// Console entry point of the benchmark project.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Passes the command-line arguments to a BenchmarkDotNet switcher built
        /// from this assembly and writes the returned summaries to the console.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to BenchmarkDotNet.</param>
        public static void Main(string[] args)
        {
            var benchmarkAssembly = typeof(Program).Assembly;
            var switcher = BenchmarkSwitcher.FromAssembly(benchmarkAssembly);
            var summaries = switcher.Run(args);
            Console.WriteLine(summaries);
        }
    }
}
|
|
@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel",
|
||||||
EndProject
|
EndProject
|
||||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
|
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Any CPU = Debug|Any CPU
|
Debug|Any CPU = Debug|Any CPU
|
||||||
|
@ -111,6 +113,18 @@ Global
|
||||||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
|
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
|
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
|
{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(SolutionProperties) = preSolution
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
HideSolutionNode = FALSE
|
HideSolutionNode = FALSE
|
||||||
|
|
Loading…
Reference in New Issue