Compare commits

..

1 Commit

Author SHA1 Message Date
dependabot[bot] 96bf214427
build(deps): bump Microsoft.SemanticKernel.Abstractions
Bumps [Microsoft.SemanticKernel.Abstractions](https://github.com/microsoft/semantic-kernel) from 1.6.2 to 1.6.3.
- [Release notes](https://github.com/microsoft/semantic-kernel/releases)
- [Commits](https://github.com/microsoft/semantic-kernel/compare/dotnet-1.6.2...dotnet-1.6.3)

---
updated-dependencies:
- dependency-name: Microsoft.SemanticKernel.Abstractions
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-01 06:26:17 +00:00
189 changed files with 2617 additions and 10421 deletions
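
The bump itself is a one-attribute change: a semver-patch update only moves the Version of the affected PackageReference. A minimal sketch of how that edit would look in a project file is shown below; the actual .csproj touched is not part of this compare, so the surrounding markup and placement are assumptions.

  <!-- before: pinned to the previous patch release -->
  <PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.2" />

  <!-- after: dependabot applies the semver-patch bump -->
  <PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.3" />

Because the dependency is classified as direct:production, the reference lives in a shipped project rather than a test-only one.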

View File

@ -1,86 +0,0 @@
[*]
charset = utf-8
end_of_line = lf
trim_trailing_whitespace = false
insert_final_newline = false
indent_style = space
indent_size = 4
# Microsoft .NET properties
csharp_new_line_before_members_in_object_initializers = false
csharp_preferred_modifier_order = public, private, protected, internal, file, new, static, abstract, virtual, sealed, readonly, override, extern, unsafe, volatile, async, required:suggestion
csharp_style_prefer_utf8_string_literals = true:suggestion
csharp_style_var_elsewhere = true:suggestion
csharp_style_var_for_built_in_types = true:suggestion
csharp_style_var_when_type_is_apparent = true:suggestion
dotnet_naming_rule.enum_member_rule.import_to_resharper = True
dotnet_naming_rule.enum_member_rule.resharper_description = Enum members
dotnet_naming_rule.enum_member_rule.resharper_guid = 8b8504e3-f0be-4c14-9103-c732f2bddc15
dotnet_naming_rule.enum_member_rule.resharper_style = AA_BB, AaBb
dotnet_naming_rule.enum_member_rule.severity = warning
dotnet_naming_rule.enum_member_rule.style = all_upper_style
dotnet_naming_rule.enum_member_rule.symbols = enum_member_symbols
dotnet_naming_rule.unity_serialized_field_rule.import_to_resharper = True
dotnet_naming_rule.unity_serialized_field_rule.resharper_description = Unity serialized field
dotnet_naming_rule.unity_serialized_field_rule.resharper_guid = 5f0fdb63-c892-4d2c-9324-15c80b22a7ef
dotnet_naming_rule.unity_serialized_field_rule.severity = warning
dotnet_naming_rule.unity_serialized_field_rule.style = lower_camel_case_style
dotnet_naming_rule.unity_serialized_field_rule.symbols = unity_serialized_field_symbols
dotnet_naming_style.all_upper_style.capitalization = all_upper
dotnet_naming_style.all_upper_style.word_separator = _
dotnet_naming_style.lower_camel_case_style.capitalization = camel_case
dotnet_naming_symbols.enum_member_symbols.applicable_accessibilities = *
dotnet_naming_symbols.enum_member_symbols.applicable_kinds =
dotnet_naming_symbols.enum_member_symbols.resharper_applicable_kinds = enum_member
dotnet_naming_symbols.enum_member_symbols.resharper_required_modifiers = any
dotnet_naming_symbols.unity_serialized_field_symbols.applicable_accessibilities = *
dotnet_naming_symbols.unity_serialized_field_symbols.applicable_kinds =
dotnet_naming_symbols.unity_serialized_field_symbols.resharper_applicable_kinds = unity_serialised_field
dotnet_naming_symbols.unity_serialized_field_symbols.resharper_required_modifiers = instance
dotnet_style_parentheses_in_arithmetic_binary_operators = never_if_unnecessary:none
dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:none
dotnet_style_parentheses_in_relational_binary_operators = never_if_unnecessary:none
dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
dotnet_style_predefined_type_for_member_access = true:suggestion
dotnet_style_qualification_for_event = false:suggestion
dotnet_style_qualification_for_field = false:suggestion
dotnet_style_qualification_for_method = false:suggestion
dotnet_style_qualification_for_property = false:suggestion
dotnet_style_require_accessibility_modifiers = for_non_interface_members:suggestion
# ReSharper properties
resharper_autodetect_indent_settings = true
resharper_formatter_off_tag = @formatter:off
resharper_formatter_on_tag = @formatter:on
resharper_formatter_tags_enabled = true
resharper_use_indent_from_vs = false
# ReSharper inspection severities
resharper_arrange_redundant_parentheses_highlighting = hint
resharper_arrange_this_qualifier_highlighting = hint
resharper_arrange_type_member_modifiers_highlighting = hint
resharper_arrange_type_modifiers_highlighting = hint
resharper_built_in_type_reference_style_for_member_access_highlighting = hint
resharper_built_in_type_reference_style_highlighting = hint
resharper_razor_assembly_not_resolved_highlighting = warning
resharper_redundant_base_qualifier_highlighting = warning
resharper_suggest_var_or_type_built_in_types_highlighting = hint
resharper_suggest_var_or_type_elsewhere_highlighting = hint
resharper_suggest_var_or_type_simple_types_highlighting = hint
resharper_web_config_module_not_resolved_highlighting = warning
resharper_web_config_type_not_resolved_highlighting = warning
resharper_web_config_wrong_module_highlighting = warning
[{*.har,*.jsb2,*.jsb3,*.json,*.jsonc,*.postman_collection,*.postman_collection.json,*.postman_environment,*.postman_environment.json,.babelrc,.eslintrc,.prettierrc,.stylelintrc,bowerrc,jest.config}]
indent_style = space
indent_size = 2
[*.map]
indent_style = space
indent_size = 2
[*.{appxmanifest,asax,ascx,aspx,axaml,build,c,c++,c++m,cc,ccm,cginc,compute,cp,cpp,cppm,cs,cshtml,cu,cuh,cxx,cxxm,dtd,fs,fsi,fsscript,fsx,fx,fxh,h,hh,hlsl,hlsli,hlslinc,hpp,hxx,inc,inl,ino,ipp,ixx,master,ml,mli,mpp,mq4,mq5,mqh,mxx,nuspec,paml,razor,resw,resx,shader,skin,tpp,usf,ush,uxml,vb,xaml,xamlx,xoml,xsd}]
indent_style = space
indent_size = 4
tab_width = 4

View File

@ -1,12 +0,0 @@
name: Blank Issue
description: Submit any other kind of issue.
labels: [Blank Issue]
body:
  - type: textarea
    id: description
    attributes:
      label: Description
      description: Please describe the issue here.
      placeholder: Description
    validations:
      required: false

View File

@ -1,52 +0,0 @@
name: BUG Report
description: Report a bug in LLamaSharp.
title: "[BUG]: "
labels: [bug-report]
body:
  - type: markdown
    attributes:
      value: |
        To help us fix your problem more quickly, please check the following steps first.
        - [ ] I have read the related documents.
        - [ ] I have searched the keywords in the issues.
  - type: textarea
    id: background
    attributes:
      label: Description
      description: Please share a clear description of the problem.
      placeholder: Description
    validations:
      required: true
  - type: textarea
    id: repro-steps
    attributes:
      label: Reproduction Steps
      description: |
        Please describe how to reproduce the problem here. A minimal code example is best.
      placeholder: Reproduction Steps
    validations:
      required: true
  - type: textarea
    id: configuration
    attributes:
      label: Environment & Configuration
      description: |
        Please provide information about your environment and configuration.
      placeholder: Environment & Configuration
      value: |
        - Operating system:
        - .NET runtime version:
        - LLamaSharp version:
        - CUDA version (if you are using the cuda backend):
        - CPU & GPU device:
    validations:
      required: true
  - type: textarea
    id: known-workarounds
    attributes:
      label: Known Workarounds
      description: |
        Please describe any known workarounds.
      placeholder: Known Workarounds
    validations:
      required: false

View File

@ -1,35 +0,0 @@
name: Feature Request
description: Request/Propose a new feature in LLamaSharp.
title: "[Feature]: "
labels: [feature-request]
body:
  - type: markdown
    attributes:
      value: |
        Feature proposals/requests are always welcome!
  - type: textarea
    id: background
    attributes:
      label: Background & Description
      description: Please describe the purpose and value of the new feature here.
      placeholder: Background & Description
    validations:
      required: true
  - type: textarea
    id: api-proposal
    attributes:
      label: API & Usage
      description: |
        Please tell us about any new APIs related to the feature, and describe when and how they are used.
      placeholder: API & Usage
    validations:
      required: false
  - type: textarea
    id: implementation
    attributes:
      label: How to implement
      description: |
        Please describe how you think the feature should be implemented. It's okay to leave this blank.
      placeholder: How to implement
    validations:
      required: false

16
.github/_typos.toml vendored
View File

@ -1,16 +0,0 @@
# Typos configuration file
#
# Info: https://github.com/marketplace/actions/typos-action
# Install: brew install typos-cli
# Install: conda install typos
# Run: typos -c .github/_typos.toml
[files]
extend-exclude = [
"_typos.toml",
"docs/xmldocs/",
"LLama.Web/wwwroot/",
"LLama/runtimes/deps/",
"LLama.Benchmark/Assets/",
"LLama.Examples/Assets/"
]

View File

@ -1,20 +0,0 @@
from huggingface_hub import hf_hub_download
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()
    with open(args.model_list, 'r') as f:
        repo_id, filename = f.readline().split(',')
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=args.model_dir,
            local_dir_use_symlinks=False,
            endpoint=args.endpoint
        )

View File

@ -1,117 +0,0 @@
name: Benchmark Test
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true
jobs:
  linux-benchmark-cuda:
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11, cuda12]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          - build: cuda12
            image: nvidia/cuda:12.1.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
    container:
      image: ${{ matrix.image }}
      env:
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia
    steps:
      - uses: actions/checkout@v4
      - name: Install libraries
        run: |
          apt update
          apt install -y curl libicu-dev
          apt-get install wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0
      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*
  windows-benchmark-cuda:
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, windows, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            modeldir: F:\Models\LLamaSharpBenchmark
    env:
      AGENT_TOOLSDIRECTORY: D:\Libs\github\runner-cache
      BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
    steps:
      - name: Settings
        run: |
          set http_proxy=127.0.0.1:7891
          set https_proxy=127.0.0.1:7891
      - uses: actions/checkout@v4
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*

View File

@ -1,26 +0,0 @@
name: .NET code format check
on:
  # Currently we don't trigger this workflow.
  # It's only used to show how the format check should be used
  # and may be enabled in the future.
  push:
    branches: [ "PLACEHOLDER" ]
  pull_request:
    branches: [ "PLACEHOLDER" ]
jobs:
  dotnet-format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Setup .NET
        uses: actions/setup-dotnet@v3
        with:
          dotnet-version: 8.0.x
      - name: Restore dependencies
        run: dotnet restore
      - name: Format
        run: dotnet format --verify-no-changes --verbosity diagnostic

View File

@ -48,12 +48,12 @@ jobs:
cd build
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- uses: actions/upload-artifact@v4
- uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
name: llama-bin-linux-${{ matrix.build }}-x64.so
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.so
name: llava-bin-linux-${{ matrix.build }}-x64.so
@ -89,13 +89,13 @@ jobs:
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-x64.dll
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-x64.dll
@ -121,7 +121,6 @@ jobs:
uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
ref: '${{ github.event.inputs.llama_cpp_commit }}'
- name: Download dependencies - Linux
if: ${{ matrix.os == 'ubuntu-22.04' }}
run: |
@ -170,7 +169,7 @@ jobs:
ls -R
- name: Upload artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
.\build\bin\Release\llama.dll
@ -178,14 +177,14 @@ jobs:
name: llama-bin-win-clblast-x64.dll
- name: Upload llava artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
.\build\bin\Release\llava_shared.dll
name: llava-bin-win-clblast-x64.dll
- name: Upload artifacts (linux)
if: ${{ matrix.os == 'ubuntu-22.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
./build/libllama.so
@ -193,7 +192,7 @@ jobs:
name: llama-bin-linux-clblast-x64.so
- name: Upload llava artifacts (linux)
if: ${{ matrix.os == 'ubuntu-22.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
./build/examples/llava/libllava_shared.so
@ -244,25 +243,25 @@ jobs:
- name: Upload artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-cublas-cu${{ matrix.cuda }}-x64.dll
- name: Upload llava artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-cublas-cu${{ matrix.cuda }}-x64.dll
- name: Upload artifacts (Linux)
if: ${{ matrix.os == 'ubuntu-20.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
name: llama-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so
- name: Upload llava artifacts (Linux)
if: ${{ matrix.os == 'ubuntu-20.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.so
name: llava-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so
@ -275,7 +274,7 @@ jobs:
matrix:
include:
- build: 'arm64'
defines: '-DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL_EMBED_LIBRARY=ON'
defines: '-DCMAKE_OSX_ARCHITECTURES=arm64'
- build: 'x64'
defines: '-DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON'
runs-on: macos-latest
@ -297,18 +296,18 @@ jobs:
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/libllama.dylib
name: llama-bin-osx-${{ matrix.build }}.dylib
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.dylib
name: llava-bin-osx-${{ matrix.build }}.dylib
- name: Upload Metal
if: ${{ matrix.build != 'x64' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/bin/ggml-metal.metal
name: ggml-metal.metal
@ -371,7 +370,7 @@ jobs:
cp artifacts/llava-bin-win-cublas-cu11.7.1-x64.dll/llava_shared.dll deps/cu11.7.1/llava_shared.dll
cp artifacts/llama-bin-linux-cublas-cu11.7.1-x64.so/libllama.so deps/cu11.7.1/libllama.so
cp artifacts/llava-bin-linux-cublas-cu11.7.1-x64.so/libllava_shared.so deps/cu11.7.1/libllava_shared.so
cp artifacts/llava-bin-linux-cublas-cu11.7.1-x64.so/libllava_shared.so deps/cu11.7.1/libllama_shared.so
cp artifacts/llama-bin-win-cublas-cu12.1.0-x64.dll/llama.dll deps/cu12.1.0/llama.dll
cp artifacts/llava-bin-win-cublas-cu12.1.0-x64.dll/llava_shared.dll deps/cu12.1.0/llava_shared.dll
@ -380,20 +379,19 @@ jobs:
cp artifacts/llava-bin-linux-cublas-cu12.1.0-x64.so/libllava_shared.so deps/cu12.1.0/libllava_shared.so
cp artifacts/llama-bin-win-clblast-x64.dll/{llama,clblast}.dll deps/clblast/
cp artifacts/llava-bin-win-clblast-x64.dll/llava_shared.dll deps/clblast/llava_shared.dll
cp artifacts/llama-bin-linux-clblast-x64.so/libllama.so deps/clblast/
cp artifacts/llava-bin-linux-clblast-x64.so/libllava_shared.so deps/clblast/libllava_shared.so
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: deps/
name: deps
- name: Remove Artifacts
uses: geekyeggo/delete-artifact@v5
uses: geekyeggo/delete-artifact@v2
with:
name: |
llama-*

View File

@ -1,4 +1,4 @@
name: Unit Test
name: CI
on:
push:
branches: [master]
@ -13,6 +13,7 @@ jobs:
name: Test
runs-on: ${{ matrix.os }}
strategy:
max-parallel: 2
fail-fast: false
matrix:
build: [linux-release, windows-release, osx-release]
@ -20,9 +21,9 @@ jobs:
- build: linux-release
os: ubuntu-latest
config: release
- build: osx-release
os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
config: release
- build: osx-release
os: macos-latest
config: release
- build: windows-release
os: windows-2019
config: release
@ -30,7 +31,8 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-dotnet@v4
with:
dotnet-version: |
dotnet-version: |
7.0.x
8.0.x
- name: Cache Packages
uses: actions/cache@v4
@ -45,7 +47,7 @@ jobs:
- name: Build
run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
- name: Test
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
- name: Upload artifacts
if: always()
uses: actions/upload-artifact@v3

View File

@ -1,31 +0,0 @@
# Check pull requests for typos.
#
# Configuration: .github/_typos.toml
#
# Info: https://github.com/marketplace/actions/typos-action
# Local install: brew install typos-cli
# Local install: conda install typos
# Local run: typos -c .github/_typos.toml
name: Spell Check
on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]
jobs:
  run:
    name: Spell check
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v3
      - name: Use custom config file
        uses: crate-ci/typos@master
        with:
          config: .github/_typos.toml
          write_changes: false
          quiet: true

2
.gitignore vendored
View File

@ -346,5 +346,3 @@ site/
/LLama.Unittest/Models/*.bin
/LLama.Unittest/Models/*.gguf
/LLama.Benchmark/Models/*.bin
/LLama.Benchmark/Models/*.gguf

File diff suppressed because it is too large

Binary file not shown.


View File

@ -1 +0,0 @@
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf

View File

@ -1,10 +0,0 @@
namespace LLama.Benchmark
{
public enum ExecutorType
{
Interactive,
Instruct,
Stateless
}
}

View File

@ -1,23 +0,0 @@
namespace LLama.Benchmark
{
internal static class Constants
{
public static string ModelDir
{
get
{
return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
}
}
public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");
public static string LLavaModelPath => Path.Combine("llava-v1.6-mistral-7b.Q3_K_XS.gguf");
public static string LLavaMmpPath => Path.Combine("mmproj-model-f16.gguf");
public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
}
}

View File

@ -1,30 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<Import Project="..\LLama\LLamaSharp.Runtime.targets" />
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Configuration>Release</Configuration>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
<PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\LLama\LLamaSharp.csproj" />
</ItemGroup>
<ItemGroup>
<None Update="Assets\TextCompletionPrompts.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Models\extreme-ironing-taxi-610x427.jpg">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@ -1,138 +0,0 @@
#pragma warning disable CS8618
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
[BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
[BenchmarkCategory("Executor", "LLama")]
[SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
[MemoryDiagnoser]
[MinIterationCount(1)]
[MaxIterationCount(16)]
[RPlotExporter]
public class PrefillBenchmark
{
/// <summary>
/// (prompt length, context length)
/// </summary>
public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
{
(512, 2048),
(2024, 2048)
};
/// <summary>
/// (model path, gpu layer count)
/// </summary>
public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
// TODO: specify the native library to load here to test cpu case better.
{
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 0),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 10),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 20)
};
public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
{
ExecutorType.Interactive,
ExecutorType.Stateless
};
[ParamsSource(nameof(PromptAndContextLengths))]
public (int, uint) PromptAndContextLength { get; set; }
[ParamsSource(nameof(ModelAndGpuLayerCounts))]
public (string, int) ModelAndGpuLayerCount { get; set; }
[ParamsSource(nameof(ExecutorTypes))]
public ExecutorType ExecutorType { get; set; }
/// <summary>
/// Params used to create a model.
/// </summary>
public ModelParams ModelParams { get; set; }
/// <summary>
/// Params used in inference.
/// </summary>
public InferenceParams InferenceParams { get; set; }
/// <summary>
/// Prompt used to run text generation.
/// </summary>
public string Prompt { get; set; }
public ILLamaExecutor Executor { get; set; }
private void InitializeParamsAndModel()
{
ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
{
ContextSize = PromptAndContextLength.Item2,
GpuLayerCount = ModelAndGpuLayerCount.Item2
};
Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
InferenceParams = new InferenceParams()
{
Temperature = 0.6f,
MaxTokens = 1 // Only prefill, no generation here.
};
LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
LLamaContext context = weights.CreateContext(ModelParams);
Executor = ExecutorType switch
{
ExecutorType.Interactive => new InteractiveExecutor(context),
ExecutorType.Instruct => new InstructExecutor(context),
ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
_ => throw new NotSupportedException()
};
}
[GlobalSetup(Targets = [nameof(Basic)])]
public void GlobalSetup()
{
var showLLamaCppLogs = true;
NativeLibraryConfig
.Instance
.WithLogCallback((level, message) =>
{
if (showLLamaCppLogs)
Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
}).WithCuda().SkipCheck().WithAutoFallback(false);
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
InitializeParamsAndModel();
}
[IterationCleanup(Targets = [nameof(Basic)])]
public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
Executor.Context.NativeHandle.KvCacheClear();
}
}
[Benchmark]
public async Task<string> Basic()
{
StringBuilder sb = new();
await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
{
sb.Append(text);
}
return sb.ToString();
}
}
}

View File

@ -1,13 +0,0 @@
using BenchmarkDotNet.Running;
namespace LLama.Benchmark
{
public class Program
{
public static void Main(string[] args)
{
var summary = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
Console.WriteLine(summary);
}
}
}

View File

@ -5,7 +5,6 @@ public class ExampleRunner
{
private static readonly Dictionary<string, Func<Task>> Examples = new()
{
{ "Chat Session: LLama3", LLama3ChatSession.Run },
{ "Chat Session: History", ChatSessionWithHistory.Run },
{ "Chat Session: Role names", ChatSessionWithRoleName.Run },
{ "Chat Session: Role names stripped", ChatSessionStripRoleName.Run },
@ -27,11 +26,9 @@ public class ExampleRunner
{ "Semantic Kernel: Prompt", SemanticKernelPrompt.Run },
{ "Semantic Kernel: Chat", SemanticKernelChat.Run },
{ "Semantic Kernel: Store", SemanticKernelMemory.Run },
{ "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
{ "Batched Executor: Guidance", BatchedExecutorGuidance.Run },
{ "Speech Chat: Integration with Whisper.net", SpeechChat.Run },
{ "Exit", () => { Environment.Exit(0); return Task.CompletedTask; } }
};

View File

@ -19,7 +19,7 @@ public class BatchedExecutorFork
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
@ -32,7 +32,7 @@ public class BatchedExecutorFork
// Evaluate the initial prompt to create one conversation
using var start = executor.Create();
start.Prompt(executor.Context.Tokenize(prompt));
start.Prompt(prompt);
await executor.Infer();
// Create the root node of the tree

View File

@ -19,7 +19,7 @@ public class BatchedExecutorGuidance
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
@ -34,9 +34,9 @@ public class BatchedExecutorGuidance
// Load the two prompts into two conversations
using var guided = executor.Create();
guided.Prompt(executor.Context.Tokenize(positivePrompt));
guided.Prompt(positivePrompt);
using var guidance = executor.Create();
guidance.Prompt(executor.Context.Tokenize(negativePrompt));
guidance.Prompt(negativePrompt);
// Run inference to evaluate prompts
await AnsiConsole
@ -79,7 +79,7 @@ public class BatchedExecutorGuidance
guidance.Prompt(g);
// Early exit if we reach the natural end of the guided sentence
if (g == model.Tokens.EOS)
if (g == model.EndOfSentenceToken)
break;
// Update progress bar

View File

@ -20,7 +20,7 @@ public class BatchedExecutorRewind
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
@ -33,7 +33,7 @@ public class BatchedExecutorRewind
// Evaluate the initial prompt to create one conversation
using var conversation = executor.Create();
conversation.Prompt(executor.Context.Tokenize(prompt));
conversation.Prompt(prompt);
// Create the start node wrapping the conversation
var node = new Node(executor.Context);

View File

@ -1,108 +0,0 @@
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;
namespace LLama.Examples.Examples;
/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorSaveAndLoad
{
private const int n_len = 18;
public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
// Create an executor that can evaluate a batch of conversations together
using var executor = new BatchedExecutor(model, parameters);
// Print some info
var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
Console.WriteLine($"Created executor with model: {name}");
// Create a conversation
var conversation = executor.Create();
conversation.Prompt(executor.Context.Tokenize(prompt));
// Run inference loop
var decoder = new StreamingTokenDecoder(executor.Context);
var sampler = new DefaultSamplingPipeline();
var lastToken = await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Can't save a conversation while RequiresInference is true
if (conversation.RequiresInference)
await executor.Infer();
// Save this conversation to a file and dispose it
conversation.Save("demo_conversation.state");
conversation.Dispose();
AnsiConsole.WriteLine($"Saved state: {new FileInfo("demo_conversation.state").Length} bytes");
// Now create a new conversation by loading that state
conversation = executor.Load("demo_conversation.state");
AnsiConsole.WriteLine("Loaded state");
// Prompt it again with the last token, so we can continue generating
conversation.Rewind(1);
conversation.Prompt(lastToken);
// Continue generating text
lastToken = await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Can't save a conversation while RequiresInference is true
if (conversation.RequiresInference)
await executor.Infer();
// Save the conversation again, this time into system memory
using (var state = conversation.Save())
{
conversation.Dispose();
AnsiConsole.WriteLine($"Saved state to memory: {state.Size} bytes");
// Now create a new conversation by loading that state
conversation = executor.Load("demo_conversation.state");
AnsiConsole.WriteLine("Loaded state");
}
// Prompt it again with the last token, so we can continue generating
conversation.Rewind(1);
conversation.Prompt(lastToken);
// Continue generating text
await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Display final output
AnsiConsole.MarkupLine($"[red]{prompt}{decoder.Read()}[/]");
}
private static async Task<LLamaToken> GenerateTokens(BatchedExecutor executor, Conversation conversation, ISamplingPipeline sampler, StreamingTokenDecoder decoder, int count = 15)
{
var token = (LLamaToken)0;
for (var i = 0; i < count; i++)
{
// Run inference
await executor.Infer();
// Use sampling pipeline to pick a token
token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), ReadOnlySpan<LLamaToken>.Empty);
// Add it to the decoder, so it can be converted into text later
decoder.Add(token);
// Prompt the conversation with the token
conversation.Prompt(token);
}
return token;
}
}

View File

@ -27,11 +27,12 @@ public class ChatChineseGB2312
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5,
Encoding = Encoding.UTF8
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

View File

@ -12,14 +12,15 @@ public class ChatSessionStripRoleName
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);

View File

@ -10,10 +10,11 @@ public class ChatSessionWithHistory
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

View File

@ -10,14 +10,15 @@ public class ChatSessionWithRestart
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession prototypeSession =
await ChatSession.InitializeSessionFromHistoryAsync(executor, chatHistory);

View File

@ -10,14 +10,15 @@ public class ChatSessionWithRoleName
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);

View File

@ -29,7 +29,7 @@
{
ContextSize = 4096
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InstructExecutor(context, InstructionPrefix, InstructionSuffix, null);

View File

@ -9,7 +9,7 @@ namespace LLama.Examples.Examples
string modelPath = UserSettings.GetModelPath();
Console.ForegroundColor = ConsoleColor.DarkGray;
var @params = new ModelParams(modelPath) { Embeddings = true };
var @params = new ModelParams(modelPath) { EmbeddingMode = true };
using var weights = LLamaWeights.LoadFromFile(@params);
var embedder = new LLamaEmbedder(weights, @params);

View File

@ -9,15 +9,16 @@ namespace LLama.Examples.Examples
{
string modelPath = UserSettings.GetModelPath();
var gbnf = (await File.ReadAllTextAsync("Assets/json.gbnf")).Trim();
var gbnf = File.ReadAllText("Assets/json.gbnf").Trim();
var grammar = Grammar.Parse(gbnf, "root");
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
Console.ForegroundColor = ConsoleColor.Yellow;

View File

@ -9,14 +9,15 @@ namespace LLama.Examples.Examples
{
string modelPath = UserSettings.GetModelPath();
var prompt = (await File.ReadAllTextAsync("Assets/dan.txt")).Trim();
var prompt = File.ReadAllText("Assets/dan.txt").Trim();
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InstructExecutor(context);

View File

@ -13,10 +13,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -1,126 +0,0 @@
using LLama.Abstractions;
using LLama.Common;
namespace LLama.Examples.Examples;
// When using chatsession, it's a common case that you want to strip the role names
// rather than display them. This example shows how to use transforms to strip them.
public class LLama3ChatSession
{
public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
Seed = 1337,
GpuLayerCount = 10
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);
session.WithHistoryTransform(new LLama3HistoryTransform());
session.WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(
new string[] { "User:", "Assistant:", "<22>" },
redundancyLength: 5));
InferenceParams inferenceParams = new InferenceParams()
{
Temperature = 0.6f,
AntiPrompts = new List<string> { "User:" }
};
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("The chat session has started.");
// show the prompt
Console.ForegroundColor = ConsoleColor.Green;
string userInput = Console.ReadLine() ?? "";
while (userInput != "exit")
{
await foreach (
var text
in session.ChatAsync(
new ChatHistory.Message(AuthorRole.User, userInput),
inferenceParams))
{
Console.ForegroundColor = ConsoleColor.White;
Console.Write(text);
}
Console.WriteLine();
Console.ForegroundColor = ConsoleColor.Green;
userInput = Console.ReadLine() ?? "";
Console.ForegroundColor = ConsoleColor.White;
}
}
class LLama3HistoryTransform : IHistoryTransform
{
/// <summary>
/// Convert a ChatHistory instance to plain text.
/// </summary>
/// <param name="history">The ChatHistory instance</param>
/// <returns></returns>
public string HistoryToText(ChatHistory history)
{
string res = Bos;
foreach (var message in history.Messages)
{
res += EncodeMessage(message);
}
res += EncodeHeader(new ChatHistory.Message(AuthorRole.Assistant, ""));
return res;
}
private string EncodeHeader(ChatHistory.Message message)
{
string res = StartHeaderId;
res += message.AuthorRole.ToString();
res += EndHeaderId;
res += "\n\n";
return res;
}
private string EncodeMessage(ChatHistory.Message message)
{
string res = EncodeHeader(message);
res += message.Content;
res += EndofTurn;
return res;
}
/// <summary>
/// Converts plain text to a ChatHistory instance.
/// </summary>
/// <param name="role">The role for the author.</param>
/// <param name="text">The chat history as plain text.</param>
/// <returns>The updated history.</returns>
public ChatHistory TextToHistory(AuthorRole role, string text)
{
return new ChatHistory(new ChatHistory.Message[] { new ChatHistory.Message(role, text) });
}
/// <summary>
/// Copy the transform.
/// </summary>
/// <returns></returns>
public IHistoryTransform Clone()
{
return new LLama3HistoryTransform();
}
private const string StartHeaderId = "<|start_header_id|>";
private const string EndHeaderId = "<|end_header_id|>";
private const string Bos = "<|begin_of_text|>";
private const string Eos = "<|end_of_text|>";
private const string EndofTurn = "<|eot_id|>";
}
}

View File

@ -1,7 +1,7 @@
using System.Text.RegularExpressions;
using LLama.Batched;
using LLama.Common;
using Spectre.Console;
using LLama.Native;
namespace LLama.Examples.Examples
{
@ -18,15 +18,18 @@ namespace LLama.Examples.Examples
var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
var parameters = new ModelParams(modelPath)
{
ContextSize = 4096,
Seed = 1337,
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
// Llava Init
using var clipModel = await LLavaWeights.LoadFromFileAsync(multiModalProj);
using var clipModel = LLavaWeights.LoadFromFile(multiModalProj);
var ex = new InteractiveExecutor(context, clipModel);
var ex = new InteractiveExecutor(context, clipModel );
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to {0} and the context size is {1}.", maxTokens, parameters.ContextSize );
@ -42,16 +45,16 @@ namespace LLama.Examples.Examples
var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imageCount = imageMatches.Count();
var hasImages = imageCount > 0;
byte[][] imageBytes = null;
if (hasImages)
{
var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value);
List<byte[]> imageBytes;
try
{
imageBytes = imagePaths.Select(File.ReadAllBytes).ToList();
imageBytes = imagePaths.Select(File.ReadAllBytes).ToArray();
}
catch (IOException exception)
{
@ -64,17 +67,15 @@ namespace LLama.Examples.Examples
break;
}
// Each prompt with images we clear cache
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
{
// First image replace to tag <image, the rest of the images delete the tag
prompt = prompt.Replace(path, index++ == 0 ? "<image>" : "");
if (index++ == 0)
prompt = prompt.Replace(path, "<image>");
else
prompt = prompt.Replace(path, "");
}
@ -95,12 +96,9 @@ namespace LLama.Examples.Examples
Console.WriteLine();
// Initialize Images in executor
// Initilize Images in executor
//
foreach (var image in imagePaths)
{
ex.Images.Add(await File.ReadAllBytesAsync(image));
}
ex.ImagePaths = imagePaths.ToList();
}
Console.ForegroundColor = Color.White;
@ -115,7 +113,7 @@ namespace LLama.Examples.Examples
// let the user finish with exit
//
if (prompt != null && prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
if (prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
break;
}

View File

@ -12,10 +12,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -13,10 +13,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -16,7 +16,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
var chatGPT = new LLamaSharpChatCompletion(ex);

View File

@ -20,10 +20,10 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
Seed = seed,
Embeddings = true
EmbeddingMode = true
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var embedding = new LLamaEmbedder(model, parameters);
Console.WriteLine("====================================================");

View File

@ -1,9 +1,9 @@
using LLama.Common;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using LLamaSharp.SemanticKernel.TextCompletion;
using Microsoft.SemanticKernel.TextGeneration;
using Microsoft.Extensions.DependencyInjection;
using LLamaSharp.SemanticKernel;
namespace LLama.Examples.Examples
{
@ -19,7 +19,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
var builder = Kernel.CreateBuilder();
@ -31,7 +31,7 @@ namespace LLama.Examples.Examples
One line TLDR with the fewest words.";
LLamaSharpPromptExecutionSettings settings = new() { MaxTokens = 100 };
ChatRequestSettings settings = new() { MaxTokens = 100 };
var summarize = kernel.CreateFunctionFromPrompt(prompt, settings);
string text1 = @"

View File

@ -1,253 +0,0 @@
using LLama.Common;
using NAudio.Wave;
using Whisper.net;
namespace LLama.Examples.Examples
{
public class SpeechChat
{
public static async Task Run()
{
ConsoleStyleHelpers.WriteLine(
"""
This example demonstrates the basics of audio transcriptions, speech recognition, and speech commands,
as well as how to recognize a user's voice in real time and then get a response from LLM.
It uses whisper.net and models could be found in: https://huggingface.co/ggerganov/whisper.cpp/tree/main.
To use it, you need a working microphone and enough RAM to host both audio + language models.
Once you've selected the models, just speak to your microphone and watch the LLM continue your text.
While it's going, you can say something like 'Okay, stop', or 'Stop now', to interrupt the LLM's inference.
NOTE: You may need to poke around with the voice detection threshold, based on your mic's sensitivity.
-----------------------------------------------------------------------------------------------------------
""", ConsoleColor.Yellow);
if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; }
bool loadFinished = false;
var loading = ConsoleStyleHelpers.LoadPrint("Loading transcription model...", () => loadFinished);
using var speechRecognitionServer = new SpeechRecognitionServer(model);
loadFinished = true; loading.Wait();
Console.WriteLine("Audio model loaded. Insert path for language model.");
using var _ = new LlamaSession_SpeechListener(speechRecognitionServer);
await ConsoleStyleHelpers.WaitUntilExit();
}
class LlamaSession_SpeechListener : ISpeechListener, IDisposable
{
bool isModelResponding;
SpeechRecognitionServer audioServer;
LLamaWeights model;
LLamaContext context;
InteractiveExecutor executor;
string fullPrompt = "";
bool canceled;
public LlamaSession_SpeechListener(SpeechRecognitionServer server)
{
var parameters = new ModelParams(UserSettings.GetModelPath()) { Seed = 1337, GpuLayerCount = 99 };
model = LLamaWeights.LoadFromFile(parameters);
context = model.CreateContext(parameters);
executor = new InteractiveExecutor(context);
(audioServer = server).ServiceUsers.Add(this);
}
// Whisper is struggling with single words and very short phrases without context, so it's actually better to say something like "Ok, Stop!" to have it work better.
bool ISpeechListener.IsInterested(string audioTranscription) => !isModelResponding || audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase);
void ISpeechListener.HandleSpeech(string audioTranscription)
{
if (isModelResponding && audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) { canceled = true; }
else if (!isModelResponding) { _ = SendMessage(audioTranscription); }
}
async Task SendMessage(string newMessage)
{
// While a response is queried, we want to detect short phrases/commands like 'stop',
audioServer.detectionSettings = (1, 2); // ..so we lower the min Speech Detection time.
isModelResponding = true;
AddToPrompt($"\n{newMessage}\n", ConsoleColor.Blue);
await foreach (var token in executor.InferAsync(fullPrompt))
{
AddToPrompt(token, ConsoleColor.Yellow);
if (canceled) { AddToPrompt("[...stopped]", ConsoleColor.Red); break; }
}
audioServer.detectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives.
(isModelResponding, canceled) = (false, false); // Reset the state variables to their default.
}
void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow)
{
fullPrompt += msg;
ConsoleStyleHelpers.Write(msg, color);
}
void IDisposable.Dispose()
{
model.Dispose();
context.Dispose();
}
}
public interface ISpeechListener
{
bool IsInterested(string audioTranscription);
void HandleSpeech(string audioTranscription);
}
public class SpeechRecognitionServer : IDisposable
{
const int clipLength = 250; // ms
const float voiceDetectionThreshold = 0.01f; // Adjust as needed
readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"];
WaveInEvent waveIn;
WaveFormat waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel
List<byte> recordedBytes = [];
WhisperFactory? whisperFactory;
WhisperProcessor? processor;
string whisperPrompt =
"""
The short audio comes from a user that is speaking to an AI Language Model in real time.
Pay extra attentions for commands like 'ok stop' or just 'stop'.
In case of inaudible sentences that might be, assume they're saying 'stop'.
""".Trim();
// Tracked stats for Speech Recognition, Parsing, and Serving.
int currentBlankClips; // Ideally would work with milliseconds,
int totalNonBlankClips; // ..but for example's sake they work on a
int nonIdleTime; // ..clip-based quant-length (1 = clipLength).
// Default detection settings: A speech of 750ms, followed by pause of 500ms. (2x250ms)
public (int minBlanksPerSeparation, int minNonBlanksForValidMessages) detectionSettings = (2, 3);
public HashSet<ISpeechListener> ServiceUsers = [];
public SpeechRecognitionServer(string modelPath)
{
// Adjust the path based on your GPU's type. On your build you ideally want just the correct runtime build for your project, but here we're having all references, so it's getting confused.
var libPath = @$"{Environment.GetFolderPath(Environment.SpecialFolder.UserProfile)}\.nuget\packages\whisper.net.runtime.cublas\1.5.0\build\win-x64\whisper.dll"; // Defaulting to cuBlas.
if (!File.Exists(libPath)) { ConsoleStyleHelpers.WriteLine($"Could not find dll file at {libPath}.\nWhisper will load with the default runtime (possibly CPU).\nIf you own a non-Nvidia GPU, you need to adjust the library path based on your GPU's type.", ConsoleColor.Red); libPath = null; }
whisperFactory = WhisperFactory.FromPath(modelPath, libraryPath: libPath);
var builder = whisperFactory.CreateBuilder().WithThreads(16).WithPrompt(whisperPrompt).WithSingleSegment().WithLanguage("en");
(builder.WithBeamSearchSamplingStrategy() as BeamSearchSamplingStrategyBuilder)!.WithPatience(0.2f).WithBeamSize(5);
processor = builder.Build();
waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat };
waveIn.DataAvailable += OnAudioDataAvailable;
waveIn.StartRecording();
}
void OnAudioDataAvailable(object? sender, WaveInEventArgs e)
{
// Cache the recorded bytes
recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]);
if (recordedBytes.Count > 110000000) { recordedBytes.RemoveRange(0, 50000000); }
// Get the max volume contained inside the clip. Since the clip is recorded as bytes, we need to translate them to samples before getting their volume.
var maxVolume = 0f; // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values
for (int i = 0; i < e.BytesRecorded; i += 2) { maxVolume = Math.Max(maxVolume, Math.Abs((short) ((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); }
// Compare the volume with the threshold and act accordingly. Once an interesting and 'full' set of clips pops up, serve it.
if (maxVolume >= voiceDetectionThreshold) { currentBlankClips = 0; totalNonBlankClips++; nonIdleTime++; }
else if (++currentBlankClips < detectionSettings.minBlanksPerSeparation) { nonIdleTime++; }
else
{
if (totalNonBlankClips >= detectionSettings.minNonBlanksForValidMessages) { SendTranscription(); }
else if (totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything.
(currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0);
}
async void SendTranscription()
{
var bytesPerClip = waveFormat.BitsPerSample * clipLength * 2;
var capturedClipBytes = recordedBytes.TakeLast(bytesPerClip * (nonIdleTime + 2)).ToArray();
var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); // Save to temporary file.
if (knownFalsePositives.Contains(transcribedText)) { return; } // False positive.. yikes!
foreach (var user in ServiceUsers.Where(x => x.IsInterested(transcribedText))) { user.HandleSpeech(transcribedText); }
}
}
/// <summary> Requests a transcription and responds with the text. </summary>
async Task<string> ProcessAudio(byte[] bytes, string tempWavFilePath)
{
await using var wavStream = new MemoryStream();
using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); }
using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); }
wavStream.Seek(0, SeekOrigin.Begin);
Console.Beep();
return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(x => x.Text).ToListAsync()).Trim();
}
void IDisposable.Dispose()
{
waveIn.Dispose();
processor?.Dispose();
}
}
public static class ConsoleStyleHelpers
{
public static string? SelectAudioModel()
{
var models = Directory.GetFiles("Assets", "*bin");
if (models.Length == 1) { return models[0]; }
else if (models.Length != 0)
{
WriteLine("Available Models:", ConsoleColor.Green);
for (int i = 0; i < models.Length; i++)
{
Write($"{i + 1}. ", ConsoleColor.Blue);
WriteLine(models[i]["Assets\\".Length..], ConsoleColor.Yellow);
}
while (true)
{
Write($"Please choose a model (1-{models.Length}): ", ConsoleColor.DarkCyan);
if (!int.TryParse(Console.ReadKey().KeyChar.ToString(), out var i) || i > models.Length || i <= 0) { Console.WriteLine(); continue; }
Console.WriteLine();
return models[i - 1];
}
}
else
{
WriteLine($"Download a non-quantized model and place it in the executing directory:", ConsoleColor.Red);
WriteLine($"\t{Environment.CurrentDirectory}\\Assets", ConsoleColor.Yellow);
WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ", ConsoleColor.Red);
WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main", ConsoleColor.Blue);
return null;
}
}
public static async Task LoadPrint(string initialText, Func<bool> ShouldContinue)
{
var startTime = DateTime.Now;
Console.WriteLine(initialText);
while (!ShouldContinue()) { Console.Write("."); await Task.Delay(100); }
Console.WriteLine($" Completed in {(DateTime.Now - startTime).TotalSeconds:f2}s.");
}
public async static Task WaitUntilExit()
{
WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.", ConsoleColor.Green);
await Task.Delay(1000);
Console.ReadKey();
}
public static void Write(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.Write(text));
public static void WriteLine(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.WriteLine(text));
public static void ColorAction(ConsoleColor consoleColor, Action action)
{
Console.ForegroundColor = consoleColor;
action?.Invoke();
Console.ForegroundColor = ConsoleColor.White;
}
}
}
}
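
The volume check in the handler above follows the NAudio RecordingLevelMeter approach: each pair of bytes in the 16-bit little-endian PCM buffer is combined into a signed sample and divided by 32768 to normalise it into the -1..1 range, and the running maximum of the absolute values is the peak level. A minimal standalone sketch of that calculation (the class and method names are illustrative, not part of the example above):

using System;

internal static class PeakVolume
{
    // Peak level of a 16-bit little-endian PCM buffer, normalised to the 0..1 range.
    public static float OfPcm16(byte[] buffer, int bytesRecorded)
    {
        var peak = 0f;
        for (var i = 0; i + 1 < bytesRecorded; i += 2)
        {
            // Low byte first, high byte second: reconstruct the signed 16-bit sample.
            var sample = (short)((buffer[i + 1] << 8) | buffer[i]);
            peak = Math.Max(peak, Math.Abs(sample / 32768f));
        }
        return peak;
    }
}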

View File

@ -11,10 +11,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
Console.ForegroundColor = ConsoleColor.Yellow;

View File

@ -12,7 +12,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var @params = new ModelParams(modelPath);
using var weights = await LLamaWeights.LoadFromFileAsync(@params);
using var weights = LLamaWeights.LoadFromFile(@params);
// Create 2 contexts sharing the same weights
using var aliceCtx = weights.CreateContext(@params);
@ -21,7 +21,7 @@ namespace LLama.Examples.Examples
var bob = new InteractiveExecutor(bobCtx);
// Initial alice prompt
var alicePrompt = "Transcript of a dialog, where the Alice interacts with a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
var alicePrompt = "Transcript of a dialog, where the Alice interacts a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
var aliceResponse = await Prompt(alice, ConsoleColor.Green, alicePrompt, false, false);
// Initial bob prompt

View File

@ -18,14 +18,8 @@
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.34.240313.1" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.6.2" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="Spectre.Console" Version="0.48.0" />
<PackageReference Include="Spectre.Console.ImageSharp" Version="0.48.0" />
<PackageReference Include="Whisper.net" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Clblast" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.CoreML" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Cublas" Version="1.5.0" />
</ItemGroup>
<ItemGroup>

View File

@ -16,20 +16,11 @@ AnsiConsole.MarkupLineInterpolated(
""");
// Configure native library to use. This must be done before any other llama.cpp methods are called!
// Configure native library to use
NativeLibraryConfig
.Instance
.WithCuda();
// Configure logging. Change this to `true` to see log messages from llama.cpp
var showLLamaCppLogs = false;
NativeLibraryConfig
.Instance
.WithLogCallback((level, message) =>
{
if (showLLamaCppLogs)
Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
});
.WithCuda()
.WithLogs(LLamaLogLevel.Info);
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();

View File

@ -84,7 +84,7 @@ namespace LLamaSharp.KernelMemory
ContextSize = config?.ContextSize ?? 2048,
Seed = config?.Seed ?? 0,
GpuLayerCount = config?.GpuLayerCount ?? 20,
Embeddings = true,
EmbeddingMode = true,
MainGpu = config?.MainGpu ?? 0,
SplitMode = config?.SplitMode ?? GPUSplitMode.None,
};

View File

@ -4,7 +4,7 @@
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.11.0</Version>
<Authors>Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@ -17,7 +17,7 @@
The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.11.0 updated the kernel-memory package and Fixed System.ArgumentException: EmbeddingMode must be true.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>

View File

@ -29,7 +29,7 @@ namespace LLamaSharp.KernelMemory
this._config = config;
var @params = new ModelParams(_config.ModelPath)
{
Embeddings = true,
EmbeddingMode = true,
MainGpu = _config.MainGpu,
SplitMode = _config.SplitMode
};
@ -49,7 +49,7 @@ namespace LLamaSharp.KernelMemory
this._config = config;
var @params = new ModelParams(_config.ModelPath)
{
Embeddings = true,
EmbeddingMode = true,
MainGpu = _config.MainGpu,
SplitMode = _config.SplitMode
};
@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory
}
/// <inheritdoc/>
public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length;
}
}

View File

@ -1,7 +1,13 @@
using LLama;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
using Microsoft.KernelMemory.AI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LLamaSharp.KernelMemory
{
@ -105,6 +111,6 @@ namespace LLamaSharp.KernelMemory
}
/// <inheritdoc/>
public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
public int CountTokens(string text) => _context.Tokenize(text).Length;
}
}

View File

@ -4,7 +4,6 @@ using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel.ChatCompletion;
[Obsolete("Use LLamaSharpPromptExecutionSettings instead")]
public class ChatRequestSettings : PromptExecutionSettings
{
/// <summary>

View File

@ -8,7 +8,6 @@ namespace LLamaSharp.SemanticKernel.ChatCompletion;
/// <summary>
/// JSON converter for <see cref="ChatRequestSettings"/>
/// </summary>
[Obsolete("Use LLamaSharpPromptExecutionSettingsConverter instead")]
public class ChatRequestSettingsConverter : JsonConverter<ChatRequestSettings>
{
/// <inheritdoc/>

View File

@ -7,7 +7,6 @@ using System;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
using static LLama.InteractiveExecutor;
using static LLama.LLamaTransforms;
namespace LLamaSharp.SemanticKernel.ChatCompletion;
@ -18,18 +17,17 @@ namespace LLamaSharp.SemanticKernel.ChatCompletion;
public sealed class LLamaSharpChatCompletion : IChatCompletionService
{
private readonly ILLamaExecutor _model;
private LLamaSharpPromptExecutionSettings defaultRequestSettings;
private ChatRequestSettings defaultRequestSettings;
private readonly IHistoryTransform historyTransform;
private readonly ITextStreamTransform outputTransform;
private readonly Dictionary<string, object?> _attributes = new();
private readonly bool _isStatefulExecutor;
public IReadOnlyDictionary<string, object?> Attributes => this._attributes;
static LLamaSharpPromptExecutionSettings GetDefaultSettings()
static ChatRequestSettings GetDefaultSettings()
{
return new LLamaSharpPromptExecutionSettings
return new ChatRequestSettings
{
MaxTokens = 256,
Temperature = 0,
@ -39,12 +37,11 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
}
public LLamaSharpChatCompletion(ILLamaExecutor model,
LLamaSharpPromptExecutionSettings? defaultRequestSettings = default,
ChatRequestSettings? defaultRequestSettings = default,
IHistoryTransform? historyTransform = null,
ITextStreamTransform? outputTransform = null)
{
this._model = model;
this._isStatefulExecutor = this._model is StatefulExecutorBase;
this.defaultRequestSettings = defaultRequestSettings ?? GetDefaultSettings();
this.historyTransform = historyTransform ?? new HistoryTransform();
this.outputTransform = outputTransform ?? new KeywordTextOutputStreamTransform(new[] { $"{LLama.Common.AuthorRole.User}:",
@ -68,10 +65,10 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
public async Task<IReadOnlyList<ChatMessageContent>> GetChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
{
var settings = executionSettings != null
? LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings)
? ChatRequestSettings.FromRequestSettings(executionSettings)
: defaultRequestSettings;
var prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
string prompt = this._getFormattedPrompt(chatHistory);
var result = _model.InferAsync(prompt, settings.ToLLamaSharpInferenceParams(), cancellationToken);
var output = outputTransform.TransformAsync(result);
@ -89,10 +86,10 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
public async IAsyncEnumerable<StreamingChatMessageContent> GetStreamingChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var settings = executionSettings != null
? LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings)
? ChatRequestSettings.FromRequestSettings(executionSettings)
: defaultRequestSettings;
var prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
string prompt = this._getFormattedPrompt(chatHistory);
var result = _model.InferAsync(prompt, settings.ToLLamaSharpInferenceParams(), cancellationToken);
var output = outputTransform.TransformAsync(result);
@ -102,33 +99,4 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
yield return new StreamingChatMessageContent(AuthorRole.Assistant, token);
}
}
/// <summary>
/// Return either the entire formatted chatHistory or just the most recent message based on
/// whether the model extends StatefulExecutorBase or not.
/// </summary>
/// <param name="chatHistory"></param>
/// <returns>The formatted prompt</returns>
private string _getFormattedPrompt(ChatHistory chatHistory){
string prompt;
if (this._isStatefulExecutor){
InteractiveExecutorState state = (InteractiveExecutorState)((StatefulExecutorBase)this._model).GetStateData();
if (state.IsPromptRun)
{
prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
}
else
{
ChatHistory temp_history = new();
temp_history.AddUserMessage(chatHistory.Last().Content);
prompt = historyTransform.HistoryToText(temp_history.ToLLamaSharpChatHistory());
}
}
else
{
prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
}
return prompt;
}
}

View File

@ -1,4 +1,5 @@
using Microsoft.SemanticKernel.ChatCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.ChatCompletion;
namespace LLamaSharp.SemanticKernel;
public static class ExtensionMethods
@ -22,11 +23,11 @@ public static class ExtensionMethods
}
/// <summary>
/// Convert LLamaSharpPromptExecutionSettings to LLamaSharp InferenceParams
/// Convert ChatRequestSettings to LLamaSharp InferenceParams
/// </summary>
/// <param name="requestSettings"></param>
/// <returns></returns>
internal static global::LLama.Common.InferenceParams ToLLamaSharpInferenceParams(this LLamaSharpPromptExecutionSettings requestSettings)
internal static global::LLama.Common.InferenceParams ToLLamaSharpInferenceParams(this ChatRequestSettings requestSettings)
{
if (requestSettings is null)
{

View File

@ -10,7 +10,7 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.11.0</Version>
<Authors>Tim Miller, Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@ -23,7 +23,7 @@
The integration of LLamaSharp and Microsoft semantic-kernel.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.11.0 updates the semantic-kernel package.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
@ -34,7 +34,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.2" />
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.3" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">

View File

@ -1,131 +0,0 @@

/* Unmerged change from project 'LLamaSharp.SemanticKernel (netstandard2.0)'
Before:
using Microsoft.SemanticKernel;
After:
using LLamaSharp;
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
*/
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel;
public class LLamaSharpPromptExecutionSettings : PromptExecutionSettings
{
/// <summary>
/// Temperature controls the randomness of the completion.
/// The higher the temperature, the more random the completion.
/// </summary>
[JsonPropertyName("temperature")]
public double Temperature { get; set; } = 0;
/// <summary>
/// TopP controls the diversity of the completion.
/// The higher the TopP, the more diverse the completion.
/// </summary>
[JsonPropertyName("top_p")]
public double TopP { get; set; } = 0;
/// <summary>
/// Number between -2.0 and 2.0. Positive values penalize new tokens
/// based on whether they appear in the text so far, increasing the
/// model's likelihood to talk about new topics.
/// </summary>
[JsonPropertyName("presence_penalty")]
public double PresencePenalty { get; set; } = 0;
/// <summary>
/// Number between -2.0 and 2.0. Positive values penalize new tokens
/// based on their existing frequency in the text so far, decreasing
/// the model's likelihood to repeat the same line verbatim.
/// </summary>
[JsonPropertyName("frequency_penalty")]
public double FrequencyPenalty { get; set; } = 0;
/// <summary>
/// Sequences where the completion will stop generating further tokens.
/// </summary>
[JsonPropertyName("stop_sequences")]
public IList<string> StopSequences { get; set; } = Array.Empty<string>();
/// <summary>
/// How many completions to generate for each prompt. Default is 1.
/// Note: Because this parameter generates many completions, it can quickly consume your token quota.
/// Use carefully and ensure that you have reasonable settings for max_tokens and stop.
/// </summary>
[JsonPropertyName("results_per_prompt")]
public int ResultsPerPrompt { get; set; } = 1;
/// <summary>
/// The maximum number of tokens to generate in the completion.
/// </summary>
[JsonPropertyName("max_tokens")]
public int? MaxTokens { get; set; }
/// <summary>
/// Modify the likelihood of specified tokens appearing in the completion.
/// </summary>
[JsonPropertyName("token_selection_biases")]
public IDictionary<int, int> TokenSelectionBiases { get; set; } = new Dictionary<int, int>();
/// <summary>
/// Indicates the format of the response which can be used downstream to post-process the messages. Handlebars: handlebars_object. JSON: json_object, etc.
/// </summary>
[JsonPropertyName("response_format")]
public string ResponseFormat { get; set; } = string.Empty;
/// <summary>
/// Create a new settings object with the values from another settings object.
/// </summary>
/// <param name="requestSettings">Template configuration</param>
/// <param name="defaultMaxTokens">Default max tokens</param>
/// <returns>An instance of LLamaSharpPromptExecutionSettings</returns>
public static LLamaSharpPromptExecutionSettings FromRequestSettings(PromptExecutionSettings? requestSettings, int? defaultMaxTokens = null)
{
if (requestSettings is null)
{
return new LLamaSharpPromptExecutionSettings()
{
MaxTokens = defaultMaxTokens
};
}
if (requestSettings is LLamaSharpPromptExecutionSettings requestSettingsChatRequestSettings)
{
return requestSettingsChatRequestSettings;
}
var json = JsonSerializer.Serialize(requestSettings);
var chatRequestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, s_options);
if (chatRequestSettings is not null)
{
return chatRequestSettings;
}
throw new ArgumentException($"Invalid request settings, cannot convert to {nameof(LLamaSharpPromptExecutionSettings)}", nameof(requestSettings));
}
private static readonly JsonSerializerOptions s_options = CreateOptions();
private static JsonSerializerOptions CreateOptions()
{
JsonSerializerOptions options = new()
{
WriteIndented = true,
MaxDepth = 20,
AllowTrailingCommas = true,
PropertyNameCaseInsensitive = true,
ReadCommentHandling = JsonCommentHandling.Skip,
Converters = { new LLamaSharpPromptExecutionSettingsConverter() }
};
return options;
}
}
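
As a usage sketch (not part of the diff), these settings can either be constructed directly or converted from any Semantic Kernel PromptExecutionSettings through FromRequestSettings, which falls back to a JSON round-trip using the converter registered in s_options above; the values below are illustrative:

using LLamaSharp.SemanticKernel;
using Microsoft.SemanticKernel;

// Construct the settings directly...
var direct = new LLamaSharpPromptExecutionSettings
{
    Temperature = 0.7,
    TopP = 0.9,
    MaxTokens = 256,
    StopSequences = new[] { "User:" },
};

// Converting null settings applies the default max tokens...
var fromNull = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, defaultMaxTokens: 256);

// ...while other PromptExecutionSettings types are round-tripped through JSON
// using LLamaSharpPromptExecutionSettingsConverter.
PromptExecutionSettings generic = new() { ModelId = "llama" };
var converted = LLamaSharpPromptExecutionSettings.FromRequestSettings(generic);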

View File

@ -1,104 +0,0 @@
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel;
/// <summary>
/// JSON converter for <see cref="LLamaSharpPromptExecutionSettings"/>
/// </summary>
public class LLamaSharpPromptExecutionSettingsConverter : JsonConverter<LLamaSharpPromptExecutionSettings>
{
/// <inheritdoc/>
public override LLamaSharpPromptExecutionSettings? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
{
var requestSettings = new LLamaSharpPromptExecutionSettings();
while (reader.Read() && reader.TokenType != JsonTokenType.EndObject)
{
if (reader.TokenType == JsonTokenType.PropertyName)
{
string? propertyName = reader.GetString();
if (propertyName is not null)
{
// normalise property name to uppercase
propertyName = propertyName.ToUpperInvariant();
}
reader.Read();
switch (propertyName)
{
case "MODELID":
case "MODEL_ID":
requestSettings.ModelId = reader.GetString();
break;
case "TEMPERATURE":
requestSettings.Temperature = reader.GetDouble();
break;
case "TOPP":
case "TOP_P":
requestSettings.TopP = reader.GetDouble();
break;
case "FREQUENCYPENALTY":
case "FREQUENCY_PENALTY":
requestSettings.FrequencyPenalty = reader.GetDouble();
break;
case "PRESENCEPENALTY":
case "PRESENCE_PENALTY":
requestSettings.PresencePenalty = reader.GetDouble();
break;
case "MAXTOKENS":
case "MAX_TOKENS":
requestSettings.MaxTokens = reader.GetInt32();
break;
case "STOPSEQUENCES":
case "STOP_SEQUENCES":
requestSettings.StopSequences = JsonSerializer.Deserialize<IList<string>>(ref reader, options) ?? Array.Empty<string>();
break;
case "RESULTSPERPROMPT":
case "RESULTS_PER_PROMPT":
requestSettings.ResultsPerPrompt = reader.GetInt32();
break;
case "TOKENSELECTIONBIASES":
case "TOKEN_SELECTION_BIASES":
requestSettings.TokenSelectionBiases = JsonSerializer.Deserialize<IDictionary<int, int>>(ref reader, options) ?? new Dictionary<int, int>();
break;
default:
reader.Skip();
break;
}
}
}
return requestSettings;
}
/// <inheritdoc/>
public override void Write(Utf8JsonWriter writer, LLamaSharpPromptExecutionSettings value, JsonSerializerOptions options)
{
writer.WriteStartObject();
writer.WriteNumber("temperature", value.Temperature);
writer.WriteNumber("top_p", value.TopP);
writer.WriteNumber("frequency_penalty", value.FrequencyPenalty);
writer.WriteNumber("presence_penalty", value.PresencePenalty);
if (value.MaxTokens is null)
{
writer.WriteNull("max_tokens");
}
else
{
writer.WriteNumber("max_tokens", (decimal)value.MaxTokens);
}
writer.WritePropertyName("stop_sequences");
JsonSerializer.Serialize(writer, value.StopSequences, options);
writer.WriteNumber("results_per_prompt", value.ResultsPerPrompt);
writer.WritePropertyName("token_selection_biases");
JsonSerializer.Serialize(writer, value.TokenSelectionBiases, options);
writer.WriteEndObject();
}
}
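
A small round-trip sketch of the converter above, mirroring the pattern used by the unit tests later in this diff; the JSON values are illustrative:

using System.Text.Json;
using LLamaSharp.SemanticKernel;

var options = new JsonSerializerOptions { AllowTrailingCommas = true };
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());

// Property names may be snake_case or PascalCase; Read() upper-cases them before matching.
var json = @"{ ""temperature"": 0.5, ""max_tokens"": 250, ""stop_sequences"": [ ""User:"" ] }";
var settings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);

// Write() emits the snake_case property names shown above.
var roundTripped = JsonSerializer.Serialize(settings, options);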

View File

@ -1,4 +1,5 @@
using LLama.Abstractions;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Services;
using Microsoft.SemanticKernel.TextGeneration;
@ -23,7 +24,7 @@ public sealed class LLamaSharpTextCompletion : ITextGenerationService
/// <inheritdoc/>
public async Task<IReadOnlyList<TextContent>> GetTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
{
var settings = LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings);
var settings = ChatRequestSettings.FromRequestSettings(executionSettings);
var result = executor.InferAsync(prompt, settings?.ToLLamaSharpInferenceParams(), cancellationToken);
var sb = new StringBuilder();
await foreach (var token in result)
@ -36,7 +37,7 @@ public sealed class LLamaSharpTextCompletion : ITextGenerationService
/// <inheritdoc/>
public async IAsyncEnumerable<StreamingTextContent> GetStreamingTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var settings = LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings);
var settings = ChatRequestSettings.FromRequestSettings(executionSettings);
var result = executor.InferAsync(prompt, settings?.ToLLamaSharpInferenceParams(), cancellationToken);
await foreach (var token in result)
{

View File

@ -15,10 +15,9 @@ namespace LLama.Unittest
public BasicTest(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -15,10 +15,9 @@ public sealed class BeamTests
public BeamTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}
@ -28,7 +27,7 @@ public sealed class BeamTests
_model.Dispose();
}
[Fact]
[Fact(Skip = "Very very slow in CI")]
public void BasicBeam()
{
const int num_beams = 2;
@ -37,15 +36,15 @@ public sealed class BeamTests
var context = _model.CreateContext(_params);
var initial_tokens = context.Tokenize(prompt);
var batch = new LLamaBatch();
batch.AddRange(initial_tokens, 0, LLamaSeqId.Zero, true);
context.Decode(batch);
var result = new StringBuilder();
var initial_tokens = context.Tokenize(prompt);
result.Append(prompt);
//context.Eval(initial_tokens.AsSpan(), 0);
throw new NotImplementedException("Replace Eval");
var decoder = new StreamingTokenDecoder(context);
NativeApi.llama_beam_search(context.NativeHandle, (data, state) =>
{
// Show the current state of every beam.
for (var i = 0; i < state.Beams.Length; i++)
{
ref var view = ref state.Beams[i];
@ -57,17 +56,20 @@ public sealed class BeamTests
_testOutputHelper.WriteLine($"B{i} ({view.CumulativeProbability}) => '{tokens}'");
}
// Once all beams agree on some tokens read them and append them to the output decoder
if (state.CommonPrefixLength > 0)
{
var view = state.Beams[0];
var decoder = new StreamingTokenDecoder(context);
decoder.AddRange(view.Tokens.Slice(0, (int)state.CommonPrefixLength));
var tokens = decoder.Read();
result.Append(tokens);
}
}, IntPtr.Zero, num_beams, initial_tokens.Length, n_predict, Math.Max(1, Environment.ProcessorCount / 2));
_testOutputHelper.WriteLine($"Final: {prompt}{decoder.Read()}");
_testOutputHelper.WriteLine($"Final: {result}");
}
}

View File

@ -1,34 +1,10 @@
using System.Runtime.InteropServices;
namespace LLama.Unittest
namespace LLama.Unittest
{
internal static class Constants
{
public static readonly string GenerativeModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";
public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
/// <summary>
/// Calculate the GpuLayerCount to use in unit tests
/// </summary>
/// <returns> Defaults to 20 in all cases, except MacOS/OSX release builds where it returns 0 (to disable METAL on GitHub CI)</returns>
public static int CIGpuLayerCount
{
get
{
if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
{
#if DEBUG
return 20;
#else
return 0;
#endif
}
else return 20;
}
}
public static string ModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
public static string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
public static string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
public static string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
}
}

View File

@ -12,11 +12,10 @@ namespace LLama.Unittest
public GrammarTest()
{
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
Seed = 92,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -31,9 +31,6 @@
<DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf" DestinationFolder="Models" DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf" SkipUnchangedFiles="true"></DownloadFile>
</Target>
<ItemGroup>
@ -46,9 +43,6 @@
</ItemGroup>
<ItemGroup>
<None Update="Models\all-MiniLM-L12-v2.Q8_0.gguf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Models\llama-2-7b-chat.Q3_K_S.gguf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>

View File

@ -11,10 +11,9 @@ namespace LLama.Unittest
public LLamaContextTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 768,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_weights = LLamaWeights.LoadFromFile(@params);
_context = _weights.CreateContext(@params);

View File

@ -1,15 +1,30 @@
using LLama.Common;
using LLama.Common;
using Xunit.Abstractions;
namespace LLama.Unittest;
public sealed class LLamaEmbedderTests
: IDisposable
{
private readonly ITestOutputHelper _testOutputHelper;
private readonly LLamaEmbedder _embedder;
public LLamaEmbedderTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 4096,
Threads = 5,
EmbeddingMode = true,
};
using var weights = LLamaWeights.LoadFromFile(@params);
_embedder = new(weights, @params);
}
public void Dispose()
{
_embedder.Dispose();
}
private static float Dot(float[] a, float[] b)
@ -18,26 +33,13 @@ public sealed class LLamaEmbedderTests
return a.Zip(b, (x, y) => x * y).Sum();
}
private async Task CompareEmbeddings(string modelPath)
[Fact]
public async Task EmbedCompare()
{
var @params = new ModelParams(modelPath)
{
ContextSize = 8,
Threads = 4,
Embeddings = true,
GpuLayerCount = Constants.CIGpuLayerCount,
};
using var weights = LLamaWeights.LoadFromFile(@params);
using var embedder = new LLamaEmbedder(weights, @params);
var cat = await embedder.GetEmbeddings("The cat is cute");
Assert.DoesNotContain(float.NaN, cat);
var kitten = await embedder.GetEmbeddings("The kitten is kawaii");
Assert.DoesNotContain(float.NaN, kitten);
var spoon = await embedder.GetEmbeddings("The spoon is not real");
Assert.DoesNotContain(float.NaN, spoon);
var cat = await _embedder.GetEmbeddings("The cat is cute");
var kitten = await _embedder.GetEmbeddings("The kitten is kawaii");
var spoon = await _embedder.GetEmbeddings("The spoon is not real");
_testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]");
_testOutputHelper.WriteLine($"Kitten = [{string.Join(",", kitten.AsMemory().Slice(0, 7).ToArray())}...]");
@ -45,23 +47,6 @@ public sealed class LLamaEmbedderTests
var close = 1 - Dot(cat, kitten);
var far = 1 - Dot(cat, spoon);
_testOutputHelper.WriteLine("");
_testOutputHelper.WriteLine($"Cat.Kitten (Close): {close:F4}");
_testOutputHelper.WriteLine($"Cat.Spoon (Far): {far:F4}");
Assert.True(close < far);
}
[Fact]
public async Task EmbedCompareEmbeddingModel()
{
await CompareEmbeddings(Constants.EmbeddingModelPath);
}
[Fact]
public async Task EmbedCompareGenerateModel()
{
await CompareEmbeddings(Constants.GenerativeModelPath);
}
}
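
The comparison above treats each embedding as a unit vector, so 1 - Dot(a, b) behaves like a cosine distance. When that normalisation cannot be assumed, dividing by the vector magnitudes gives the cosine similarity explicitly; a short sketch (the helper name is illustrative):

using System;
using System.Linq;

// dot(a, b) / (|a| * |b|); identical to the plain dot product when both vectors have unit length.
static float CosineSimilarity(float[] a, float[] b)
{
    var dot = a.Zip(b, (x, y) => x * y).Sum();
    var magnitudeA = MathF.Sqrt(a.Sum(x => x * x));
    var magnitudeB = MathF.Sqrt(b.Sum(x => x * x));
    return dot / (magnitudeA * magnitudeB);
}

Console.WriteLine(CosineSimilarity(new[] { 1f, 0f }, new[] { 1f, 0f })); // 1
Console.WriteLine(CosineSimilarity(new[] { 1f, 0f }, new[] { 0f, 1f })); // 0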

View File

@ -14,11 +14,10 @@ namespace LLama.Unittest
public LLavaWeightTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
// Llava models requires big context
ContextSize = 4096,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 4096
};
_llamaWeights = LLamaWeights.LoadFromFile(@params);
_lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);
@ -33,7 +32,7 @@ namespace LLama.Unittest
_lLavaWeights.Dispose();
}
[Fact,Trait("Category", "NoCI")]
[Fact(Skip = "Very very slow in CI")]
public void EmbedImageAsFileName()
{
int n_past = 0;
@ -41,7 +40,7 @@ namespace LLama.Unittest
Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
}
[Fact,Trait("Category", "NoCI")]
[Fact(Skip = "Very very slow in CI")]
public void EmbedImageAsBinary()
{
int n_past = 0;

View File

@ -7,10 +7,9 @@ public class MemoryDisposalTests
[Fact]
public void ModelDisposal()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = 0,
ContextSize = 2048
};
var model = LLamaWeights.LoadFromFile(@params);
@ -22,10 +21,9 @@ public class MemoryDisposalTests
[Fact]
public void ContextDisposal()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
var model = LLamaWeights.LoadFromFile(@params);

View File

@ -1,5 +1,4 @@
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
using System.Text.Json;
namespace LLama.Unittest.SemanticKernel
@ -11,11 +10,11 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
var options = new JsonSerializerOptions();
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = "{}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);
@ -37,7 +36,7 @@ namespace LLama.Unittest.SemanticKernel
// Arrange
var options = new JsonSerializerOptions();
options.AllowTrailingCommas = true;
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = @"{
""frequency_penalty"": 0.5,
""max_tokens"": 250,
@ -50,7 +49,7 @@ namespace LLama.Unittest.SemanticKernel
}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);
@ -74,7 +73,7 @@ namespace LLama.Unittest.SemanticKernel
// Arrange
var options = new JsonSerializerOptions();
options.AllowTrailingCommas = true;
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = @"{
""FrequencyPenalty"": 0.5,
""MaxTokens"": 250,
@ -87,7 +86,7 @@ namespace LLama.Unittest.SemanticKernel
}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);

View File

@ -1,4 +1,4 @@
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
namespace LLama.Unittest.SemanticKernel
@ -10,7 +10,7 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, null);
var requestSettings = ChatRequestSettings.FromRequestSettings(null, null);
// Assert
Assert.NotNull(requestSettings);
@ -31,7 +31,7 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, 200);
var requestSettings = ChatRequestSettings.FromRequestSettings(null, 200);
// Assert
Assert.NotNull(requestSettings);
@ -51,7 +51,7 @@ namespace LLama.Unittest.SemanticKernel
public void ChatRequestSettings_FromExistingRequestSettings()
{
// Arrange
var originalRequestSettings = new LLamaSharpPromptExecutionSettings()
var originalRequestSettings = new ChatRequestSettings()
{
FrequencyPenalty = 0.5,
MaxTokens = 100,
@ -64,7 +64,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -81,7 +81,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -109,7 +109,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -148,7 +148,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);

View File

@ -37,7 +37,7 @@ namespace LLamaSharp.SemanticKernel.Tests
public void ToLLamaSharpInferenceParams_StateUnderTest_ExpectedBehavior()
{
// Arrange
var requestSettings = new LLamaSharpPromptExecutionSettings();
var requestSettings = new ChatRequestSettings();
// Act
var result = ExtensionMethods.ToLLamaSharpInferenceParams(

View File

@ -15,12 +15,11 @@ namespace LLama.Unittest
public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 60,
Seed = 1754,
BatchSize = 2,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_weights = LLamaWeights.LoadFromFile(_params);
}

View File

@ -14,7 +14,7 @@ public class StreamingTextDecoderTests
public StreamingTextDecoderTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath);
_params = new ModelParams(Constants.ModelPath);
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -1,252 +0,0 @@
using System.Text;
using LLama.Common;
using LLama.Native;
namespace LLama.Unittest;
public sealed class TemplateTests
: IDisposable
{
private readonly LLamaWeights _model;
public TemplateTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
{
ContextSize = 1,
GpuLayerCount = Constants.CIGpuLayerCount
};
_model = LLamaWeights.LoadFromFile(@params);
}
public void Dispose()
{
_model.Dispose();
}
[Fact]
public void BasicTemplate()
{
var templater = new LLamaTemplate(_model);
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
templater.Add("assistant", "222");
Assert.Equal(5, templater.Count);
templater.Add("user", "bbb");
Assert.Equal(6, templater.Count);
templater.Add("assistant", "333");
Assert.Equal(7, templater.Count);
templater.Add("user", "ccc");
Assert.Equal(8, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(8, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(8, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<|im_start|>assistant\nhello<|im_end|>\n" +
"<|im_start|>user\nworld<|im_end|>\n" +
"<|im_start|>assistant\n" +
"111<|im_end|>" +
"\n<|im_start|>user\n" +
"aaa<|im_end|>\n" +
"<|im_start|>assistant\n" +
"222<|im_end|>\n" +
"<|im_start|>user\n" +
"bbb<|im_end|>\n" +
"<|im_start|>assistant\n" +
"333<|im_end|>\n" +
"<|im_start|>user\n" +
"ccc<|im_end|>\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void CustomTemplate()
{
var templater = new LLamaTemplate("gemma");
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(4, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(4, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<start_of_turn>model\n" +
"hello<end_of_turn>\n" +
"<start_of_turn>user\n" +
"world<end_of_turn>\n" +
"<start_of_turn>model\n" +
"111<end_of_turn>\n" +
"<start_of_turn>user\n" +
"aaa<end_of_turn>\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void BasicTemplateWithAddAssistant()
{
var templater = new LLamaTemplate(_model)
{
AddAssistant = true,
};
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
templater.Add("assistant", "222");
Assert.Equal(5, templater.Count);
templater.Add("user", "bbb");
Assert.Equal(6, templater.Count);
templater.Add("assistant", "333");
Assert.Equal(7, templater.Count);
templater.Add("user", "ccc");
Assert.Equal(8, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(8, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(8, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<|im_start|>assistant\nhello<|im_end|>\n" +
"<|im_start|>user\nworld<|im_end|>\n" +
"<|im_start|>assistant\n" +
"111<|im_end|>" +
"\n<|im_start|>user\n" +
"aaa<|im_end|>\n" +
"<|im_start|>assistant\n" +
"222<|im_end|>\n" +
"<|im_start|>user\n" +
"bbb<|im_end|>\n" +
"<|im_start|>assistant\n" +
"333<|im_end|>\n" +
"<|im_start|>user\n" +
"ccc<|im_end|>\n" +
"<|im_start|>assistant\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void GetOutOfRangeThrows()
{
var templater = new LLamaTemplate(_model);
Assert.Throws<ArgumentOutOfRangeException>(() => templater[0]);
templater.Add("assistant", "1");
templater.Add("user", "2");
Assert.Throws<ArgumentOutOfRangeException>(() => templater[-1]);
Assert.Throws<ArgumentOutOfRangeException>(() => templater[2]);
}
[Fact]
public void RemoveMid()
{
var templater = new LLamaTemplate(_model);
templater.Add("assistant", "1");
templater.Add("user", "2");
templater.Add("assistant", "3");
templater.Add("user", "4a");
templater.Add("user", "4b");
templater.Add("assistant", "5");
Assert.Equal("user", templater[3].Role);
Assert.Equal("4a", templater[3].Content);
Assert.Equal("assistant", templater[5].Role);
Assert.Equal("5", templater[5].Content);
Assert.Equal(6, templater.Count);
templater.RemoveAt(3);
Assert.Equal(5, templater.Count);
Assert.Equal("user", templater[3].Role);
Assert.Equal("4b", templater[3].Content);
Assert.Equal("assistant", templater[4].Role);
Assert.Equal("5", templater[4].Content);
}
[Fact]
public void RemoveLast()
{
var templater = new LLamaTemplate(_model);
templater.Add("assistant", "1");
templater.Add("user", "2");
templater.Add("assistant", "3");
templater.Add("user", "4a");
templater.Add("user", "4b");
templater.Add("assistant", "5");
Assert.Equal(6, templater.Count);
templater.RemoveAt(5);
Assert.Equal(5, templater.Count);
Assert.Equal("user", templater[4].Role);
Assert.Equal("4b", templater[4].Content);
}
[Fact]
public void RemoveOutOfRange()
{
var templater = new LLamaTemplate(_model);
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(0));
templater.Add("assistant", "1");
templater.Add("user", "2");
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(-1));
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(2));
}
}

View File

@ -12,10 +12,9 @@ public sealed class TokenTests
public TokenTests()
{
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -1,7 +1,7 @@
namespace LLama.Web.Async
{
/// <summary>
/// Create an Async locking using statement
/// Create an Async locking using statment
/// </summary>
public sealed class AsyncLock
{

View File

@ -29,13 +29,9 @@ namespace LLama.Web.Common
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
public uint SeqMax { get; }
/// <inheritdoc />
public uint Seed { get; set; } = 1686349486;
public bool Embeddings { get; }
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;
@ -61,7 +57,7 @@ namespace LLama.Web.Common
public uint BatchSize { get; set; } = 512;
/// <inheritdoc />
public uint UBatchSize { get; set; } = 512;
public bool EmbeddingMode { get; set; } = false;
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();
@ -112,6 +108,6 @@ namespace LLama.Web.Common
public float DefragThreshold { get; set; }
/// <inheritdoc />
public LLamaPoolingType PoolingType { get; set; }
public bool DoPooling { get; set; }
}
}

View File

@ -34,14 +34,14 @@ namespace LLama.Web
private static List<string> CombineCSV(List<string> list, string csv)
{
var results = list is null || list.Count == 0
? CommaSeparatedToList(csv)
: CommaSeparatedToList(csv).Concat(list);
? CommaSeperatedToList(csv)
: CommaSeperatedToList(csv).Concat(list);
return results
.Distinct()
.ToList();
}
private static List<string> CommaSeparatedToList(string value)
private static List<string> CommaSeperatedToList(string value)
{
if (string.IsNullOrEmpty(value))
return new List<string>();

View File

@ -30,7 +30,7 @@ namespace LLama.Web.Hubs
{
_logger.Log(LogLevel.Information, "[OnDisconnectedAsync], Id: {0}", Context.ConnectionId);
// Remove connections session on disconnect
// Remove connections session on dissconnect
await _modelSessionService.CloseAsync(Context.ConnectionId);
await base.OnDisconnectedAsync(exception);
}

View File

@ -1,8 +1,8 @@
## LLama.Web - Basic ASP.NET Core examples of LLamaSharp in action
LLama.Web has no heavy dependencies and no extra frameworks over bootstrap and jquery to keep the examples clean and easy to copy over to your own project
LLama.Web has no heavy dependencies and no extra frameworks ove bootstrap and jquery to keep the examples clean and easy to copy over to your own project
## Websockets
Using signalr websockets simplifies the streaming of responses and model per connection management
Using signalr websockets simplifys the streaming of responses and model per connection management
@ -23,7 +23,7 @@ Example:
{
"Name": "Alpaca",
"Path": "D:\\Repositories\\AI\\Prompts\\alpaca.txt",
"Prompt": "Alternatively to can set a prompt text directly and omit the Path"
"Prompt": "Alternativly to can set a prompt text directly and omit the Path"
"AntiPrompt": [
"User:"
],

View File

@ -8,7 +8,7 @@ namespace LLama.Web.Services
{
/// <summary>
/// Service for handling Models,Weights & Contexts
/// Sercive for handling Models,Weights & Contexts
/// </summary>
public class ModelService : IModelService
{

View File

@ -9,7 +9,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.VisualStudio.Validation" Version="17.8.8" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.3" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.2" />
<PackageReference Include="Swashbuckle.AspNetCore" Version="6.5.0" />
</ItemGroup>

View File

@ -14,29 +14,20 @@ public interface IContextParams
uint? ContextSize { get; }
/// <summary>
/// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
uint BatchSize { get; }
/// <summary>
/// Physical batch size
/// </summary>
uint UBatchSize { get; }
/// <summary>
/// max number of sequences (i.e. distinct states for recurrent models)
/// </summary>
uint SeqMax { get; }
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
uint Seed { get; }
/// <summary>
/// If true, extract embeddings (together with logits).
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
bool Embeddings { get; }
bool EmbeddingMode { get; }
/// <summary>
/// RoPE base frequency (null to fetch from the model)
@ -114,7 +105,7 @@ public interface IContextParams
float DefragThreshold { get; }
/// <summary>
/// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
/// Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
/// </summary>
LLamaPoolingType PoolingType { get; }
bool DoPooling { get; }
}

View File

@ -6,7 +6,7 @@ using LLama.Sampling;
namespace LLama.Abstractions
{
/// <summary>
/// The parameters used for inference.
/// The paramters used for inference.
/// </summary>
public interface IInferenceParams
{

View File

@ -20,15 +20,16 @@ namespace LLama.Abstractions
/// </summary>
public bool IsMultiModal { get; }
/// <summary>
/// Multi-Modal Projections / Clip Model weights
/// Muti-Modal Projections / Clip Model weights
/// </summary>
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel { get; }
/// <summary>
/// List of images: List of images in byte array format.
/// List of images: Image filename and path (jpeg images).
/// </summary>
public List<byte[]> Images { get; }
public List<string> ImagePaths { get; set; }
/// <summary>
/// Asynchronously infers a response from the model.
/// </summary>

View File

@ -232,7 +232,7 @@ namespace LLama.Abstractions
public sealed record MetadataOverride
{
/// <summary>
/// Get the key being overridden by this override
/// Get the key being overriden by this override
/// </summary>
public string Key { get; }

View File

@ -55,6 +55,23 @@ public sealed class BatchedExecutor
Epoch = 1;
}
/// <summary>
/// Start a new <see cref="Conversation"/> with the given prompt
/// </summary>
/// <param name="prompt"></param>
/// <returns></returns>
[Obsolete("Use BatchedExecutor.Create instead")]
public Conversation Prompt(string prompt)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Prompt(prompt);
return conversation;
}
/// <summary>
/// Start a new <see cref="Conversation"/>
/// </summary>
@ -67,39 +84,6 @@ public sealed class BatchedExecutor
return new Conversation(this, GetNextSequenceId());
}
/// <summary>
/// Load a conversation that was previously saved to a file. Once loaded the conversation will
/// need to be prompted.
/// </summary>
/// <param name="filepath"></param>
/// <returns></returns>
/// <exception cref="ObjectDisposedException"></exception>
public Conversation Load(string filepath)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Load(filepath);
return conversation;
}
/// <summary>
/// Load a conversation that was previously saved into memory. Once loaded the conversation will need to be prompted.
/// </summary>
/// <param name="state"></param>
/// <returns></returns>
/// <exception cref="ObjectDisposedException"></exception>
public Conversation Load(Conversation.State state)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Load(state);
return conversation;
}
/// <summary>
/// Run inference for all conversations in the batch which have pending tokens.
///

View File

@ -2,7 +2,6 @@
using System.Buffers;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text.Json;
using LLama.Native;
namespace LLama.Batched;
@ -15,7 +14,7 @@ public sealed class Conversation
{
private ulong _requiredEpoch;
private LLamaPos _end;
private int _batchSampleIndex;
private int _batchIndex;
private bool _disposed;
private bool _forked;
@ -108,7 +107,7 @@ public sealed class Conversation
// logits, so sampling one conversation may mess up the fork! Setting the "forked" flag on both sequences ensures
// they both copy the logits before the next sampling run, to fix this issue.
_requiredEpoch = _requiredEpoch,
_batchSampleIndex = _batchSampleIndex,
_batchIndex = _batchIndex,
_forked = true,
_end = _end,
@ -141,7 +140,7 @@ public sealed class Conversation
if (_requiredEpoch > Executor.Epoch)
throw new CannotSampleRequiresInferenceException();
var span = Executor.Context.NativeHandle.GetLogitsIth(_batchSampleIndex);
var span = Executor.Context.NativeHandle.GetLogitsIth(_batchIndex);
// If necessary copy the span, to protect it from modification. This is only done when
// this conversation has been forked in this epoch.
@ -166,12 +165,11 @@ public sealed class Conversation
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
[Obsolete("Tokenize the text and pass the tokens instead")]
public void Prompt(string input, bool addBos, bool special)
public void Prompt(string input)
{
AssertCanBePrompted();
Prompt(Executor.Context.Tokenize(input, addBos, special));
Prompt(Executor.Context.Tokenize(input));
}
/// <summary>
@ -222,7 +220,7 @@ public sealed class Conversation
// Add the prompt to the batch
for (var i = 0; i < tokens.Length; i++)
_batchSampleIndex = Executor.Batch.Add(tokens[i], _end++, ConversationId, i == tokens.Length - 1);
_batchIndex = Executor.Batch.Add(tokens[i], _end++, ConversationId, i == tokens.Length - 1);
// Mark this conversation as needing inference/sampling
_requiredEpoch = Executor.Epoch + 1;
@ -352,168 +350,4 @@ public sealed class Conversation
/// <returns>The new end token position</returns>
public delegate LLamaPos ModifyKvCache(LLamaPos end, KvAccessor kv);
#endregion
#region save/load
private void AssertCanLoad()
{
AssertNotDisposed();
if (_end.Value > 0)
throw new InvalidOperationException("Cannot load into a non-empty conversation");
}
private void AssertCanSave()
{
AssertNotDisposed();
if (RequiresInference)
throw new CannotSaveWhileRequiresInferenceException();
}
/// <summary>
/// Save the complete state of this conversation to a file. If the file already exists, it will be overwritten.
/// </summary>
/// <param name="filepath"></param>
/// <exception cref="CannotSaveWhileRequiresInferenceException"></exception>
public void Save(string filepath)
{
AssertCanSave();
// Prepare extra state to put into file header
var state = GetState();
var bytes = JsonSerializer.SerializeToUtf8Bytes(state);
// Save extra state along with the KV cache
Executor.Context.SaveState(filepath, ConversationId, bytes);
}
/// <summary>
/// Save the complete state of this conversation in system memory.
/// </summary>
/// <returns></returns>
public State Save()
{
AssertCanSave();
return new PrivateState(
Executor.Context.GetState(ConversationId),
GetState()
);
}
/// <summary>
/// Load state from a file
/// This should only ever be called by the BatchedExecutor, on a newly created conversation object!
/// </summary>
/// <param name="filepath"></param>
/// <exception cref="InvalidOperationException"></exception>
internal void Load(string filepath)
{
AssertCanLoad();
// Load the state from file into the KV cache
Executor.Context.LoadState(filepath, ConversationId, out var header);
// deserialize the extra state in the file header
var state = JsonSerializer.Deserialize<SerializableConversationState>(header);
if (state == null)
{
Dispose();
throw new InvalidOperationException("Failed to deserialize - deserialized header state was null");
}
Load(state);
}
/// <summary>
/// Load state from a previously saved state.
/// This should only ever be called by the BatchedExecutor, on a newly created conversation object!
/// </summary>
/// <param name="state"></param>
internal void Load(State state)
{
AssertCanLoad();
// There is only one class that extends State and it is PrivateState, so this cast is safe.
var priv = (PrivateState)state;
// Load the state from file into the KV cache
Executor.Context.LoadState(priv.SequenceState, ConversationId);
Load(priv.ConversationState);
}
private void Load(SerializableConversationState state)
{
if (state.Version != 1)
throw new InvalidOperationException("Failed to deserialize - mismatched version number");
// Load extra conversation state
_end = state.TokenCount;
}
private SerializableConversationState GetState()
{
return new SerializableConversationState(
Version: 1,
TokenCount: TokenCount
);
}
private record SerializableConversationState(int Version, int TokenCount);
private sealed class PrivateState
: State
{
public readonly LLamaContext.SequenceState SequenceState;
public readonly SerializableConversationState ConversationState;
public override ulong Size => SequenceState.Size;
public PrivateState(LLamaContext.SequenceState sequenceState, SerializableConversationState conversationState)
{
SequenceState = sequenceState;
ConversationState = conversationState;
}
/// <inheritdoc />
public override void Dispose()
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(State));
IsDisposed = true;
SequenceState.Dispose();
}
}
/// <summary>
/// In memory saved state of a <see cref="Conversation"/>
/// </summary>
public abstract class State
: IDisposable
{
/// <summary>
/// Indicates if this state has been disposed
/// </summary>
public bool IsDisposed { get; protected set; }
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public abstract ulong Size { get; }
/// <inheritdoc />
public abstract void Dispose();
/// <summary>
/// Internal constructor to prevent anyone outside of LLamaSharp from extending this class
/// </summary>
internal State()
{
}
}
#endregion
}
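
Combined with the BatchedExecutor.Load overloads shown earlier in this diff, the save/load members above let a conversation be checkpointed to disk and resumed later. A rough usage sketch, assuming an already constructed executor; Infer() appears in this diff only by name, so awaiting it as a Task is an assumption:

using System.Threading.Tasks;
using LLama.Batched;

internal static class ConversationCheckpointSketch
{
    public static async Task RunAsync(BatchedExecutor executor, string prompt)
    {
        var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Inference must run before saving, otherwise Save() throws
        // CannotSaveWhileRequiresInferenceException (defined above).
        await executor.Infer();

        // Persists the KV cache for this sequence plus the small JSON header written by Save().
        conversation.Save("conversation.bin");
        conversation.Dispose();

        // Later: rebuild a conversation from the file. It must be prompted again before sampling.
        using var restored = executor.Load("conversation.bin");
    }
}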

View File

@ -56,6 +56,18 @@ public class CannotSampleRequiresPromptException
}
}
/// <summary>
/// This exception is thrown when <see cref="Conversation.Fork"/> is called when <see cref="Conversation.RequiresInference"/> = true
/// </summary>
public class CannotForkWhileRequiresInferenceException
: ExperimentalBatchedExecutorException
{
internal CannotForkWhileRequiresInferenceException()
: base("Cannot `Fork()` a conversation while RequiresInference is true")
{
}
}
/// <summary>
/// This exception is thrown when <see cref="Conversation.Modify"/> is called when <see cref="Conversation.RequiresInference"/> = true
/// </summary>
@ -66,18 +78,4 @@ public class CannotModifyWhileRequiresInferenceException
: base("Cannot `Modify()` a conversation while RequiresInference is true")
{
}
}
/// <summary>
/// This exception is thrown when "Save()" is called on a <see cref="Conversation"/> which has
/// already been prompted and before "Infer()" has been called on the <see cref="BatchedExecutor"/>.
/// </summary>
public class CannotSaveWhileRequiresInferenceException
: ExperimentalBatchedExecutorException
{
internal CannotSaveWhileRequiresInferenceException()
: base("Must call `Infer()` before saving this Conversation")
{
}
}
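
A short sketch of the guard these exceptions enforce: Fork(), Modify() and Save() all throw while RequiresInference is true, so inference must run first. Conversation.RequiresInference and Conversation.Fork() are referenced above; BatchedExecutor.Infer() is assumed from the wider library and its exact signature may differ.

using System.Threading.Tasks;
using LLama.Batched;

internal static class ForkGuardExample
{
    public static async Task<Conversation> SafeForkAsync(BatchedExecutor executor, Conversation conversation)
    {
        // Run the pending batch before forking, otherwise CannotForkWhileRequiresInferenceException is thrown
        if (conversation.RequiresInference)
            await executor.Infer();

        return conversation.Fork();
    }
}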

View File

@ -1,117 +0,0 @@
using System;
using System.Buffers.Binary;
using System.IO;
using System.IO.MemoryMappedFiles;
using LLama.Native;
namespace LLama.Batched;
internal static class LLamaContextExtensions
{
private const uint FileHeaderMagic = 3430400180;
/// <summary>
/// Save the state of a particular sequence to specified path. Also save some extra data which will be returned when loading.
/// Data saved with this method <b>must</b> be loaded with <see cref="LoadState(LLamaContext, string, LLamaSeqId, out byte[])"/>
/// </summary>
/// <param name="context"></param>
/// <param name="filename"></param>
/// <param name="sequence"></param>
/// <param name="header"></param>
internal static void SaveState(this LLamaContext context, string filename, LLamaSeqId sequence, ReadOnlySpan<byte> header)
{
// Delete that file before overwriting it
if (File.Exists(filename))
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)context.NativeHandle.GetStateSize(sequence));
// Space for the "extra" header data plus an 8 byte prefix (4 byte magic number + 4 byte length)
var prefixSize = header.Length + 8;
// Total file size is the prefix plus the estimated state size
var totalFileSize = prefixSize + estimatedStateSize;
// Map the file and write the bytes directly to it.
long writtenBytes = 0;
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Create, null, totalFileSize))
{
using (var view = file.CreateViewAccessor(0, totalFileSize))
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
// Write prefix data
BinaryPrimitives.WriteUInt32BigEndian(new Span<byte>(ptr + writtenBytes, 4), FileHeaderMagic);
writtenBytes += 4;
BinaryPrimitives.WriteUInt32BigEndian(new Span<byte>(ptr + writtenBytes, 4), (uint)header.Length);
writtenBytes += 4;
header.CopyTo(new Span<byte>(ptr + writtenBytes, header.Length));
writtenBytes += header.Length;
// Write state data
writtenBytes += (long)context.NativeHandle.GetState(ptr + writtenBytes, (ulong)estimatedStateSize, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
// Truncate the file to the actual size of data that was written
using (var fileStream = new FileStream(filename, FileMode.Open))
fileStream.SetLength(writtenBytes);
}
/// <summary>
/// Load the state from the specified path into a particular sequence, also reading back the header data. Must only be used with
/// data previously saved with <see cref="SaveState(LLamaContext, string, LLamaSeqId, ReadOnlySpan{byte})"/>
/// </summary>
/// <param name="context"></param>
/// <param name="filename"></param>
/// <param name="sequence"></param>
/// <param name="header"></param>
/// <exception cref="InvalidOperationException"></exception>
internal static void LoadState(this LLamaContext context, string filename, LLamaSeqId sequence, out byte[] header)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Open, null))
using (var view = file.CreateViewAccessor())
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
var readBytes = 0;
// Read header
var magic = BinaryPrimitives.ReadUInt32BigEndian(new ReadOnlySpan<byte>(ptr + readBytes, 4));
readBytes += 4;
if (magic != FileHeaderMagic)
throw new InvalidOperationException("Invalid file header");
var headerLength = checked((int)BinaryPrimitives.ReadUInt32BigEndian(new ReadOnlySpan<byte>(ptr + readBytes, 4)));
readBytes += 4;
header = new byte[headerLength];
new Span<byte>(ptr + readBytes, headerLength).CopyTo(header);
readBytes += headerLength;
context.NativeHandle.SetState(ptr + readBytes, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
}
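
The file layout written by SaveState above is: a 4 byte big-endian magic number, a 4 byte big-endian header length, the header bytes, then the raw sequence state. A small sketch, using only BCL calls (nothing assumed from LLamaSharp), that reads back just that prefix:

using System;
using System.Buffers.Binary;
using System.IO;

internal static class StateFileInspector
{
    private const uint FileHeaderMagic = 3430400180; // matches the constant above

    public static byte[] ReadHeader(string filename)
    {
        using var reader = new BinaryReader(File.OpenRead(filename));

        // 4 byte big-endian magic number
        if (BinaryPrimitives.ReadUInt32BigEndian(reader.ReadBytes(4)) != FileHeaderMagic)
            throw new InvalidOperationException("Invalid file header");

        // 4 byte big-endian header length, then the header bytes themselves;
        // everything after this point is the raw llama.cpp sequence state
        var headerLength = checked((int)BinaryPrimitives.ReadUInt32BigEndian(reader.ReadBytes(4)));
        return reader.ReadBytes(headerLength);
    }
}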

View File

@ -74,21 +74,15 @@ public class ChatSession
/// </summary>
/// <param name="executor">The executor for this session</param>
/// <param name="history">History for this session</param>
/// <param name="transform">History Transform for this session</param>
/// <returns>A new chat session.</returns>
/// <returns></returns>
public static async Task<ChatSession> InitializeSessionFromHistoryAsync(
ILLamaExecutor executor, ChatHistory history, IHistoryTransform? transform = null)
ILLamaExecutor executor, ChatHistory history)
{
if (executor is not StatefulExecutorBase statefulExecutor)
{
throw new ArgumentException("Executor must have a StatefulExecutorBase", nameof(executor));
}
var session = new ChatSession(executor, history);
if (transform != null)
{
session = session.WithHistoryTransform(transform);
}
await statefulExecutor.PrefillPromptAsync(session.HistoryTransform.HistoryToText(history));
return session;
}
@ -551,7 +545,7 @@ public class ChatSession
InferenceParams? inferenceParams = null,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
// Make sure the last message is an assistant message (response from the LLM).
// Make sure the last message is an assistant message (reponse from the LLM).
ChatHistory.Message? lastAssistantMessage = History.Messages.LastOrDefault();
if (lastAssistantMessage is null
@ -786,4 +780,4 @@ public record SessionState
outputTransform,
historyTransform);
}
}
}
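
A hedged sketch of calling the history-prefill helper above. The executor setup is assumed (any StatefulExecutorBase such as InteractiveExecutor satisfies the type check, but its construction is not shown here); ChatHistory and AuthorRole come from LLama.Common.

using System.Threading.Tasks;
using LLama;
using LLama.Common;

internal static class SessionBootstrapExample
{
    public static async Task<ChatSession> StartAsync(InteractiveExecutor executor)
    {
        var history = new ChatHistory();
        history.AddMessage(AuthorRole.System, "You are a concise assistant.");
        history.AddMessage(AuthorRole.User, "Hello!");

        // Prefill the prompt built from the history before the first chat turn
        return await ChatSession.InitializeSessionFromHistoryAsync(executor, history);
    }
}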

View File

@ -7,7 +7,7 @@ using LLama.Sampling;
namespace LLama.Common
{
/// <summary>
/// The parameters used for inference.
/// The paramters used for inference.
/// </summary>
public record InferenceParams
: IInferenceParams

View File

@ -24,9 +24,6 @@ namespace LLama.Common
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
/// <inheritdoc />
public uint SeqMax { get; set; } = 1;
/// <inheritdoc />
public uint Seed { get; set; } = 0xFFFFFFFF;
@ -55,10 +52,7 @@ namespace LLama.Common
public uint BatchSize { get; set; } = 512;
/// <inheritdoc />
public uint UBatchSize { get; set; } = 512;
/// <inheritdoc />
public bool Embeddings { get; set; }
public bool EmbeddingMode { get; set; }
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();
@ -103,7 +97,7 @@ namespace LLama.Common
public float DefragThreshold { get; set; }
/// <inheritdoc />
public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;
public bool DoPooling { get; set; }
/// <inheritdoc />
public bool VocabOnly { get; set; }

View File

@ -6,7 +6,7 @@ using LLama.Native;
namespace LLama.Extensions
{
/// <summary>
/// Extension methods to the IContextParams interface
/// Extention methods to the IContextParams interface
/// </summary>
public static class IContextParamsExtensions
{
@ -20,14 +20,11 @@ namespace LLama.Extensions
/// <exception cref="ArgumentException"></exception>
public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
{
result = LLamaContextParams.Default();
result = NativeApi.llama_context_default_params();
result.n_ctx = @params.ContextSize ?? 0;
result.n_batch = @params.BatchSize;
result.n_ubatch = @params.UBatchSize;
result.n_seq_max = @params.SeqMax;
result.seed = @params.Seed;
result.embeddings = @params.Embeddings;
result.embedding = @params.EmbeddingMode;
result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
@ -44,13 +41,10 @@ namespace LLama.Extensions
result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;
result.abort_callback = IntPtr.Zero;
result.abort_callback_user_data = IntPtr.Zero;
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.llama_pooling_type = @params.PoolingType;
result.do_pooling = @params.DoPooling;
result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

View File

@ -7,7 +7,7 @@ using LLama.Native;
namespace LLama.Extensions;
/// <summary>
/// Extension methods to the IModelParams interface
/// Extention methods to the IModelParams interface
/// </summary>
public static class IModelParamsExtensions
{
@ -28,8 +28,7 @@ public static class IModelParamsExtensions
var disposer = new GroupDisposable();
result = LLamaModelParams.Default();
result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;

View File

@ -8,5 +8,3 @@ using System.Diagnostics.CodeAnalysis;
[assembly: SuppressMessage("Interoperability", "CA1401:P/Invokes should not be visible", Justification = "LLamaSharp intentionally exports the native llama.cpp API")]
[assembly: SuppressMessage("Style", "IDE0070:Use 'System.HashCode'", Justification = "Not compatible with netstandard2.0")]
[assembly: SuppressMessage("Interoperability", "SYSLIB1054:Use 'LibraryImportAttribute' instead of 'DllImportAttribute' to generate P/Invoke marshalling code at compile time", Justification = "Not compatible with netstandard2.0")]

View File

@ -1,4 +1,4 @@
using LLama.Exceptions;
using LLama.Exceptions;
using LLama.Native;
using System;
using System.Collections.Generic;
@ -152,7 +152,6 @@ namespace LLama
return decoder.Read();
}
#region state load/save
/// <summary>
/// Save the state to specified path.
/// </summary>
@ -164,7 +163,7 @@ namespace LLama
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)NativeHandle.GetStateSize());
var estimatedStateSize = (long)NativeApi.llama_get_state_size(NativeHandle);
// Map the file and write the bytes directly to it. This saves copying the bytes into a C# array
long writtenBytes;
@ -175,53 +174,8 @@ namespace LLama
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
writtenBytes = (long)NativeHandle.GetState(ptr, (ulong)estimatedStateSize);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
// Truncate the file to the actual size of data that was written
using (var fileStream = new FileStream(filename, FileMode.Open))
fileStream.SetLength(writtenBytes);
}
/// <summary>
/// Save the state of a particular sequence to specified path.
/// </summary>
/// <param name="filename"></param>
/// <param name="sequence"></param>
public void SaveState(string filename, LLamaSeqId sequence)
{
// Delete that file before overwriting it
if (File.Exists(filename))
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)NativeHandle.GetStateSize(sequence));
// Map the file and write the bytes directly to it. This saves copying the bytes into a C# array
long writtenBytes;
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Create, null, estimatedStateSize))
using (var view = file.CreateViewAccessor(0, estimatedStateSize))
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
writtenBytes = (long)NativeHandle.GetState(ptr, (ulong)estimatedStateSize, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
writtenBytes = (long)NativeApi.llama_copy_state_data(NativeHandle, ptr);
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
@ -233,7 +187,7 @@ namespace LLama
/// <summary>
/// Get the state data as an opaque handle, which can be loaded later using <see cref="LoadState(State)"/>
/// </summary>
/// <remarks>Use <see cref="SaveState(string)"/> if you intend to save this state to disk.</remarks>
/// <remarks>Use <see cref="SaveState"/> if you intend to save this state to disk.</remarks>
/// <returns></returns>
public State GetState()
{
@ -244,11 +198,7 @@ namespace LLama
try
{
// Copy the state data into memory, discover the actual size required
ulong actualSize;
unsafe
{
actualSize = NativeHandle.GetState((byte*)memory, stateSize);
}
var actualSize = NativeHandle.GetState(memory, stateSize);
// Shrink to size
memory = Marshal.ReAllocHGlobal(memory, (nint)actualSize);
@ -268,48 +218,11 @@ namespace LLama
}
}
/// <summary>
/// Get the state data as an opaque handle, which can be loaded later using <see cref="LoadState(State)"/>
/// </summary>
/// <remarks>Use <see cref="SaveState(string, LLamaSeqId)"/> if you intend to save this state to disk.</remarks>
/// <returns></returns>
public SequenceState GetState(LLamaSeqId sequence)
{
var stateSize = NativeHandle.GetStateSize(sequence);
// Allocate a chunk of memory large enough to hold the entire state
var memory = Marshal.AllocHGlobal((nint)stateSize);
try
{
// Copy the state data into memory, discover the actual size required
ulong actualSize;
unsafe
{
actualSize = NativeHandle.GetState((byte*)memory, stateSize, sequence);
}
// Shrink to size
memory = Marshal.ReAllocHGlobal(memory, (nint)actualSize);
// Wrap memory in a "state"
var state = new SequenceState(memory, actualSize);
// Set memory to zero, to prevent it being freed in finally block
memory = IntPtr.Zero;
return state;
}
finally
{
if (memory != IntPtr.Zero)
Marshal.FreeHGlobal(memory);
}
}
/// <summary>
/// Load the state from specified path.
/// </summary>
/// <param name="filename"></param>
/// <exception cref="RuntimeError"></exception>
public void LoadState(string filename)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
@ -320,41 +233,8 @@ namespace LLama
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
NativeHandle.SetState(ptr);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
/// <summary>
/// Load the state from specified path into a particular sequence
/// </summary>
/// <param name="filename"></param>
/// <param name="sequence"></param>
public void LoadState(string filename, LLamaSeqId sequence)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Open, null))
using (var view = file.CreateViewAccessor())
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
NativeHandle.SetState(ptr, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
NativeApi.llama_set_state_data(NativeHandle, ptr);
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
@ -368,25 +248,10 @@ namespace LLama
{
unsafe
{
NativeHandle.SetState((byte*)state.DangerousGetHandle());
NativeHandle.SetState((byte*)state.DangerousGetHandle().ToPointer());
}
}
/// <summary>
/// Load the state from memory into a particular sequence
/// </summary>
/// <param name="state"></param>
/// <param name="sequence"></param>
/// <exception cref="RuntimeError"></exception>
public void LoadState(SequenceState state, LLamaSeqId sequence)
{
unsafe
{
NativeHandle.SetState((byte*)state.DangerousGetHandle(), sequence);
}
}
#endregion
/// <summary>
/// Sample a single token from this context, using the given sampling pipeline
/// </summary>
@ -492,8 +357,8 @@ namespace LLama
}
// Save the newline logit value
var nl_token = NativeHandle.ModelHandle.Tokens.Newline;
var nl_logit = logits[(int?)nl_token ?? 0];
var nl_token = NativeApi.llama_token_nl(NativeHandle.ModelHandle);
var nl_logit = logits[(int)nl_token];
// Convert logits into token candidates
var candidates_p = LLamaTokenDataArray.Create(logits);
@ -506,7 +371,7 @@ namespace LLama
candidates_p.RepetitionPenalty(NativeHandle, last_n_array, repeatPenalty, alphaFrequency, alphaPresence);
// Restore newline token logit value if necessary
if (!penalizeNL && nl_token.HasValue)
if (!penalizeNL)
{
var candidatesSpan = candidates_p.data.Span;
for (var i = 0; i < candidates_p.data.Length; i++)
@ -521,17 +386,6 @@ namespace LLama
return candidates_p;
}
/// <summary>
/// Gets whether or not the Bos token should be added.
/// From common.cpp https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/common/common.cpp#L2417
/// </summary>
/// <returns></returns>
public bool ShouldAddBosToken()
{
var addBos = NativeApi.llama_add_bos_token(NativeHandle.ModelHandle);
return addBos != -1 ? Convert.ToBoolean(addBos) : NativeHandle.LLamaVocabType == LLamaVocabType.SentencePiece;
}
#region eval overloads
/// <summary>
/// </summary>
@ -563,16 +417,12 @@ namespace LLama
}
/// <summary>
/// The state of this context, which can be reloaded later
/// The state of this model, which can be reloaded later
/// </summary>
public class State
: SafeLLamaHandleBase
{
private readonly ulong _size;
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public ulong Size => _size;
private ulong _size;
internal State(IntPtr memory, ulong size)
: base(memory, true)
@ -591,7 +441,6 @@ namespace LLama
/// Convert this state to a byte array
/// </summary>
/// <returns></returns>
[Obsolete("It is not generally safe to convert a state into a byte array - it will fail if the state is very large")]
public byte[] ToByteArray()
{
var bytes = new byte[_size];
@ -604,7 +453,6 @@ namespace LLama
/// </summary>
/// <param name="bytes"></param>
/// <returns></returns>
[Obsolete("It is not generally safe to convert a state into a byte array - it will fail if the state is very large")]
public static State FromByteArray(byte[] bytes)
{
var memory = Marshal.AllocHGlobal(bytes.Length);
@ -612,49 +460,5 @@ namespace LLama
return new State(memory, (ulong)bytes.Length);
}
}
/// <summary>
/// The state of a single sequence, which can be reloaded later
/// </summary>
public class SequenceState
: SafeLLamaHandleBase
{
private readonly ulong _size;
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public ulong Size => _size;
internal SequenceState(IntPtr memory, ulong size)
: base(memory, true)
{
_size = size;
}
/// <inheritdoc />
protected override bool ReleaseHandle()
{
Marshal.FreeHGlobal(handle);
return true;
}
/// <summary>
/// Copy bytes to a destination pointer.
/// </summary>
/// <param name="dst">Destination to write to</param>
/// <param name="length">Length of the destination buffer</param>
/// <param name="offset">Offset from start of src to start copying from</param>
/// <returns>Number of bytes written to destination</returns>
public unsafe ulong CopyTo(byte* dst, ulong length, ulong offset = 0)
{
var copy = Math.Min(length, _size - offset);
var src = (byte*)DangerousGetHandle();
src += offset;
Buffer.MemoryCopy(src, dst, length, copy);
return copy;
}
}
}
}
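
A short sketch using the per-sequence save/load overloads shown above (they exist on only one side of this diff). It assumes a LLamaContext that already has tokens decoded into sequence 0.

using LLama;
using LLama.Native;

internal static class SequenceStateExample
{
    public static void RoundTrip(LLamaContext context)
    {
        // Persist only sequence 0 of the KV cache rather than the whole context
        context.SaveState("seq0.bin", LLamaSeqId.Zero);

        // ...later, restore that file back into the same sequence
        context.LoadState("seq0.bin", LLamaSeqId.Zero);
    }
}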

View File

@ -32,7 +32,7 @@ namespace LLama
/// <param name="logger"></param>
public LLamaEmbedder(LLamaWeights weights, IContextParams @params, ILogger? logger = null)
{
if (!@params.Embeddings)
if (!@params.EmbeddingMode)
throw new ArgumentException("EmbeddingMode must be true", nameof(@params));
Context = weights.CreateContext(@params, logger);
@ -75,7 +75,7 @@ namespace LLama
n_eval = batchSize;
batch.Clear();
batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true);
batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, false);
n_past += n_eval;
var returnCode = await Context.DecodeAsync(batch, cancellationToken);
@ -97,18 +97,10 @@ namespace LLama
private float[] GetEmbeddingsArray()
{
unsafe
{
var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
if (embeddings == null)
embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
if (embeddings == null)
return Array.Empty<float>();
return new Span<float>(embeddings, Context.EmbeddingSize).ToArray();
}
var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
if (embeddings == null)
return Array.Empty<float>();
return embeddings.ToArray();
}
private static void Normalize(Span<float> embeddings)
@ -119,10 +111,6 @@ namespace LLama
lengthSqr += value * value;
var length = (float)Math.Sqrt(lengthSqr);
// Do not divide by length if it is zero
if (length <= float.Epsilon)
return;
// Normalize
for (var i = 0; i < embeddings.Length; i++)
embeddings[i] /= length;

View File

@ -1,4 +1,4 @@
using LLama.Abstractions;
using LLama.Abstractions;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
@ -76,11 +76,11 @@ namespace LLama
}
/// <inheritdoc />
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel { get; }
/// <inheritdoc />
public List<byte[]> Images { get; }
public List<string> ImagePaths { get; set; }
/// <summary>
/// Current "mu" value for mirostat sampling
/// </summary>
@ -95,7 +95,7 @@ namespace LLama
/// <param name="logger"></param>
protected StatefulExecutorBase(LLamaContext context, ILogger? logger = null)
{
Images = new List<byte[]>();
ImagePaths = new List<string>();
_logger = logger;
Context = context;
_pastTokensCount = 0;
@ -105,12 +105,6 @@ namespace LLama
_decoder = new StreamingTokenDecoder(context);
}
/// <summary>
///
/// </summary>
/// <param name="context"></param>
/// <param name="lLavaWeights"></param>
/// <param name="logger"></param>
public StatefulExecutorBase(LLamaContext context, LLavaWeights lLavaWeights, ILogger? logger = null) :
this( context, logger )
{
@ -135,7 +129,7 @@ namespace LLama
{
_logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
var session_tokens = new LLamaToken[Context.ContextSize];
if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
if (!NativeApi.llama_load_session_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
{
_logger?.LogError($"[LLamaExecutor] Failed to load session file {filename}");
throw new RuntimeError($"Failed to load session file {_pathSession}");
@ -183,7 +177,7 @@ namespace LLama
public void SaveSessionFile(string filename)
{
var session_token_array = _session_tokens.ToArray();
NativeApi.llama_state_save_file(Context.NativeHandle, filename, session_token_array, (ulong)session_token_array.Length);
NativeApi.llama_save_session_file(Context.NativeHandle, filename, session_token_array, (ulong)session_token_array.Length);
}
/// <summary>
@ -195,14 +189,13 @@ namespace LLama
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
var n_left = _pastTokensCount - tokensToKeep;
var n_discard = n_left / 2;
int n_left = _pastTokensCount - tokensToKeep;
NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep, tokensToKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
_pastTokensCount = Math.Max(1, tokensToKeep);
// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
_pastTokensCount -= n_discard;
// stop saving session if we run out of context
_pathSession = string.Empty;
}
@ -210,7 +203,7 @@ namespace LLama
/// <summary>
/// Try to reuse the matching prefix from the session file.
/// </summary>
protected virtual void TryReuseMatchingPrefix()
protected virtual void TryReuseMathingPrefix()
{
if (_n_session_consumed < _session_tokens.Count)
{

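
A worked example of the context-shift arithmetic in HandleRunOutOfContext above, on the side of the diff that computes n_discard. The token counts are made up for illustration; the NativeApi calls named in the comments are the ones shown in the diff.

// Illustrative numbers: tokensToKeep = 32, _pastTokensCount = 4000
var tokensToKeep = 32;
var pastTokensCount = 4000;

var n_left = pastTokensCount - tokensToKeep; // 3968 tokens are eligible for shifting
var n_discard = n_left / 2;                  // 1984 tokens are dropped

// llama_kv_cache_seq_rm removes cache entries [32, 32 + 1984),
// llama_kv_cache_seq_add shifts the remaining entries back by 1984 positions,
// and _pastTokensCount falls from 4000 to 2016, freeing room for new tokens.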
View File

@ -1,4 +1,4 @@
using LLama.Abstractions;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
using System;
@ -38,8 +38,8 @@ namespace LLama
ILogger? logger = null)
: base(context, logger)
{
_inp_pfx = Context.Tokenize(instructionPrefix, true, true);
_inp_sfx = Context.Tokenize(instructionSuffix, false, true);
_inp_pfx = Context.Tokenize(instructionPrefix, true);
_inp_sfx = Context.Tokenize(instructionSuffix, false);
_instructionPrefix = instructionPrefix;
}
@ -124,7 +124,7 @@ namespace LLama
if (_is_prompt_run)
{
// When running the first input (prompt) in interactive mode, we should process it specially.
_embed_inps = Context.Tokenize(text, true, true).ToList();
_embed_inps = Context.Tokenize(text, true).ToList();
}
else
{
@ -135,7 +135,7 @@ namespace LLama
_consumedTokensCount = _embed_inps.Count;
_embed_inps.AddRange(_inp_pfx);
var line_inp = Context.Tokenize(text, false, true);
var line_inp = Context.Tokenize(text, false);
_embed_inps.AddRange(line_inp);
_embed_inps.AddRange(_inp_sfx);
@ -163,7 +163,7 @@ namespace LLama
}
}
if (_embeds.Count > 0 && _embeds.Last() == Context.NativeHandle.ModelHandle.Tokens.EOS)
if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle.ModelHandle))
{
args.WaitForInput = true;
}
@ -186,13 +186,10 @@ namespace LLama
_is_prompt_run = false;
if (_pastTokensCount + _embeds.Count > Context.ContextSize)
{
// Ported from https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/examples/main/main.cpp#L334
// Instruct always uses input token size.
var tokensToKeep = _embed_inps.Count;
HandleRunOutOfContext(tokensToKeep);
HandleRunOutOfContext(inferenceParams.TokensKeep);
}
TryReuseMatchingPrefix();
TryReuseMathingPrefix();
var (result, _) = Context.NativeHandle.Decode(_embeds, LLamaSeqId.Zero, batch, ref _pastTokensCount);
if (result != DecodeResult.Ok)
@ -262,7 +259,7 @@ namespace LLama
return Task.CompletedTask;
}
/// <summary>
/// The descriptor of the state of the instruct executor.
/// The desciptor of the state of the instruct executor.
/// </summary>
public class InstructExecutorState : ExecutorBaseState
{

Some files were not shown because too many files have changed in this diff.