Compare commits

..

1 Commit

Author SHA1 Message Date
dependabot[bot] 96bf214427
build(deps): bump Microsoft.SemanticKernel.Abstractions
Bumps [Microsoft.SemanticKernel.Abstractions](https://github.com/microsoft/semantic-kernel) from 1.6.2 to 1.6.3.
- [Release notes](https://github.com/microsoft/semantic-kernel/releases)
- [Commits](https://github.com/microsoft/semantic-kernel/compare/dotnet-1.6.2...dotnet-1.6.3)

---
updated-dependencies:
- dependency-name: Microsoft.SemanticKernel.Abstractions
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-01 06:26:17 +00:00
189 changed files with 2617 additions and 10421 deletions
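
The bump itself is a one-attribute change: a semver-patch update only moves the Version of the affected PackageReference. A minimal sketch of how that edit would look in a project file is shown below; the actual .csproj touched is not part of this compare, so the surrounding markup and placement are assumptions.

  <!-- before: pinned to the previous patch release -->
  <PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.2" />

  <!-- after: dependabot applies the semver-patch bump -->
  <PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.3" />

Because the dependency is classified as direct:production, the reference lives in a shipped project rather than a test-only one.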

View File

@ -1,86 +0,0 @@
[*]
charset = utf-8
end_of_line = lf
trim_trailing_whitespace = false
insert_final_newline = false
indent_style = space
indent_size = 4
# Microsoft .NET properties
csharp_new_line_before_members_in_object_initializers = false
csharp_preferred_modifier_order = public, private, protected, internal, file, new, static, abstract, virtual, sealed, readonly, override, extern, unsafe, volatile, async, required:suggestion
csharp_style_prefer_utf8_string_literals = true:suggestion
csharp_style_var_elsewhere = true:suggestion
csharp_style_var_for_built_in_types = true:suggestion
csharp_style_var_when_type_is_apparent = true:suggestion
dotnet_naming_rule.enum_member_rule.import_to_resharper = True
dotnet_naming_rule.enum_member_rule.resharper_description = Enum members
dotnet_naming_rule.enum_member_rule.resharper_guid = 8b8504e3-f0be-4c14-9103-c732f2bddc15
dotnet_naming_rule.enum_member_rule.resharper_style = AA_BB, AaBb
dotnet_naming_rule.enum_member_rule.severity = warning
dotnet_naming_rule.enum_member_rule.style = all_upper_style
dotnet_naming_rule.enum_member_rule.symbols = enum_member_symbols
dotnet_naming_rule.unity_serialized_field_rule.import_to_resharper = True
dotnet_naming_rule.unity_serialized_field_rule.resharper_description = Unity serialized field
dotnet_naming_rule.unity_serialized_field_rule.resharper_guid = 5f0fdb63-c892-4d2c-9324-15c80b22a7ef
dotnet_naming_rule.unity_serialized_field_rule.severity = warning
dotnet_naming_rule.unity_serialized_field_rule.style = lower_camel_case_style
dotnet_naming_rule.unity_serialized_field_rule.symbols = unity_serialized_field_symbols
dotnet_naming_style.all_upper_style.capitalization = all_upper
dotnet_naming_style.all_upper_style.word_separator = _
dotnet_naming_style.lower_camel_case_style.capitalization = camel_case
dotnet_naming_symbols.enum_member_symbols.applicable_accessibilities = *
dotnet_naming_symbols.enum_member_symbols.applicable_kinds =
dotnet_naming_symbols.enum_member_symbols.resharper_applicable_kinds = enum_member
dotnet_naming_symbols.enum_member_symbols.resharper_required_modifiers = any
dotnet_naming_symbols.unity_serialized_field_symbols.applicable_accessibilities = *
dotnet_naming_symbols.unity_serialized_field_symbols.applicable_kinds =
dotnet_naming_symbols.unity_serialized_field_symbols.resharper_applicable_kinds = unity_serialised_field
dotnet_naming_symbols.unity_serialized_field_symbols.resharper_required_modifiers = instance
dotnet_style_parentheses_in_arithmetic_binary_operators = never_if_unnecessary:none
dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:none
dotnet_style_parentheses_in_relational_binary_operators = never_if_unnecessary:none
dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
dotnet_style_predefined_type_for_member_access = true:suggestion
dotnet_style_qualification_for_event = false:suggestion
dotnet_style_qualification_for_field = false:suggestion
dotnet_style_qualification_for_method = false:suggestion
dotnet_style_qualification_for_property = false:suggestion
dotnet_style_require_accessibility_modifiers = for_non_interface_members:suggestion
# ReSharper properties
resharper_autodetect_indent_settings = true
resharper_formatter_off_tag = @formatter:off
resharper_formatter_on_tag = @formatter:on
resharper_formatter_tags_enabled = true
resharper_use_indent_from_vs = false
# ReSharper inspection severities
resharper_arrange_redundant_parentheses_highlighting = hint
resharper_arrange_this_qualifier_highlighting = hint
resharper_arrange_type_member_modifiers_highlighting = hint
resharper_arrange_type_modifiers_highlighting = hint
resharper_built_in_type_reference_style_for_member_access_highlighting = hint
resharper_built_in_type_reference_style_highlighting = hint
resharper_razor_assembly_not_resolved_highlighting = warning
resharper_redundant_base_qualifier_highlighting = warning
resharper_suggest_var_or_type_built_in_types_highlighting = hint
resharper_suggest_var_or_type_elsewhere_highlighting = hint
resharper_suggest_var_or_type_simple_types_highlighting = hint
resharper_web_config_module_not_resolved_highlighting = warning
resharper_web_config_type_not_resolved_highlighting = warning
resharper_web_config_wrong_module_highlighting = warning
[{*.har,*.jsb2,*.jsb3,*.json,*.jsonc,*.postman_collection,*.postman_collection.json,*.postman_environment,*.postman_environment.json,.babelrc,.eslintrc,.prettierrc,.stylelintrc,bowerrc,jest.config}]
indent_style = space
indent_size = 2
[*.map]
indent_style = space
indent_size = 2
[*.{appxmanifest,asax,ascx,aspx,axaml,build,c,c++,c++m,cc,ccm,cginc,compute,cp,cpp,cppm,cs,cshtml,cu,cuh,cxx,cxxm,dtd,fs,fsi,fsscript,fsx,fx,fxh,h,hh,hlsl,hlsli,hlslinc,hpp,hxx,inc,inl,ino,ipp,ixx,master,ml,mli,mpp,mq4,mq5,mqh,mxx,nuspec,paml,razor,resw,resx,shader,skin,tpp,usf,ush,uxml,vb,xaml,xamlx,xoml,xsd}]
indent_style = space
indent_size = 4
tab_width = 4

View File

@ -1,12 +0,0 @@
name: Blank Issue
description: Submit any other kind of issue.
labels: [Blank Issue]
body:
  - type: textarea
    id: description
    attributes:
      label: Description
      description: Please describe the issue here.
      placeholder: Description
    validations:
      required: false

View File

@ -1,52 +0,0 @@
name: BUG Report
description: Report a bug in LLamaSharp.
title: "[BUG]: "
labels: [bug-report]
body:
  - type: markdown
    attributes:
      value: |
        To help us fix your problem more quickly, please check the following steps first.
        - [ ] I have read the related documents.
        - [ ] I have searched the keywords in the issues.
  - type: textarea
    id: background
    attributes:
      label: Description
      description: Please share a clear description of the problem.
      placeholder: Description
    validations:
      required: true
  - type: textarea
    id: repro-steps
    attributes:
      label: Reproduction Steps
      description: |
        Please describe how to reproduce the problem here. A minimal code example is best.
      placeholder: Reproduction Steps
    validations:
      required: true
  - type: textarea
    id: configuration
    attributes:
      label: Environment & Configuration
      description: |
        Please provide information about your environment and configuration.
      placeholder: Environment & Configuration
      value: |
        - Operating system:
        - .NET runtime version:
        - LLamaSharp version:
        - CUDA version (if you are using the cuda backend):
        - CPU & GPU device:
    validations:
      required: true
  - type: textarea
    id: known-workarounds
    attributes:
      label: Known Workarounds
      description: |
        Please describe any known workarounds.
      placeholder: Known Workarounds
    validations:
      required: false

View File

@ -1,35 +0,0 @@
name: Feature Request
description: Request/Propose a new feature in LLamaSharp.
title: "[Feature]: "
labels: [feature-request]
body:
  - type: markdown
    attributes:
      value: |
        Feature proposals/requests are always welcome!
  - type: textarea
    id: background
    attributes:
      label: Background & Description
      description: Please describe the purpose and value of the new feature here.
      placeholder: Background & Description
    validations:
      required: true
  - type: textarea
    id: api-proposal
    attributes:
      label: API & Usage
      description: |
        Please tell us about any new APIs related to the feature, and describe when and how they are used.
      placeholder: API & Usage
    validations:
      required: false
  - type: textarea
    id: implementation
    attributes:
      label: How to implement
      description: |
        Please describe how you think the feature should be implemented. It's okay to leave this blank.
      placeholder: How to implement
    validations:
      required: false

16
.github/_typos.toml vendored
View File

@ -1,16 +0,0 @@
# Typos configuration file
#
# Info: https://github.com/marketplace/actions/typos-action
# Install: brew install typos-cli
# Install: conda install typos
# Run: typos -c .github/_typos.toml
[files]
extend-exclude = [
"_typos.toml",
"docs/xmldocs/",
"LLama.Web/wwwroot/",
"LLama/runtimes/deps/",
"LLama.Benchmark/Assets/",
"LLama.Examples/Assets/"
]

View File

@ -1,20 +0,0 @@
from huggingface_hub import hf_hub_download
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()
    with open(args.model_list, 'r') as f:
        repo_id, filename = f.readline().split(',')
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=args.model_dir,
            local_dir_use_symlinks=False,
            endpoint=args.endpoint
        )

View File

@ -1,117 +0,0 @@
name: Benchmark Test
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
  cancel-in-progress: true
jobs:
  linux-benchmark-cuda:
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, linux, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11, cuda12]
        include:
          - build: cuda11
            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
          - build: cuda12
            image: nvidia/cuda:12.1.1-devel-ubuntu22.04
            modeldir: /llamasharp_ci/models_benchmark
    container:
      image: ${{ matrix.image }}
      env:
        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
      ports:
        - 80
      volumes:
        - /llamasharp_ci:/llamasharp_ci
      options: --gpus=all --ipc=host --runtime=nvidia
    steps:
      - uses: actions/checkout@v4
      - name: Install libraries
        run: |
          apt update
          apt install -y curl libicu-dev
          apt-get install wget
          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
          dpkg -i packages-microsoft-prod.deb
          rm packages-microsoft-prod.deb
          apt-get update && apt-get install -y dotnet-sdk-8.0
      - name: Prepare models
        run: |
          apt-get update
          apt-get install -y python3.10 python3-pip
          python3 --version
          pip install huggingface_hub
          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*
  windows-benchmark-cuda:
    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
    runs-on: [self-hosted, windows, gpu]
    strategy:
      fail-fast: false
      matrix:
        build: [cuda11]
        include:
          - build: cuda11
            modeldir: F:\Models\LLamaSharpBenchmark
    env:
      AGENT_TOOLSDIRECTORY: D:\Libs\github\runner-cache
      BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
    steps:
      - name: Settings
        run: |
          set http_proxy=127.0.0.1:7891
          set https_proxy=127.0.0.1:7891
      - uses: actions/checkout@v4
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: |
          dotnet clean
          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
      - name: Run benchmark test
        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: Benchmark_Results
          path: BenchmarkDotNet.Artifacts/results/*

View File

@ -1,26 +0,0 @@
name: .NET code format check
on:
  # Currently we don't trigger this workflow.
  # It's only used to show how the format check should be used
  # and may be enabled in the future.
  push:
    branches: [ "PLACEHOLDER" ]
  pull_request:
    branches: [ "PLACEHOLDER" ]
jobs:
  dotnet-format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Setup .NET
        uses: actions/setup-dotnet@v3
        with:
          dotnet-version: 8.0.x
      - name: Restore dependencies
        run: dotnet restore
      - name: Format
        run: dotnet format --verify-no-changes --verbosity diagnostic

View File

@ -48,12 +48,12 @@ jobs:
cd build
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- uses: actions/upload-artifact@v4
- uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
name: llama-bin-linux-${{ matrix.build }}-x64.so
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.so
name: llava-bin-linux-${{ matrix.build }}-x64.so
@ -89,13 +89,13 @@ jobs:
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-x64.dll
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-x64.dll
@ -121,7 +121,6 @@ jobs:
uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
ref: '${{ github.event.inputs.llama_cpp_commit }}'
- name: Download dependencies - Linux
if: ${{ matrix.os == 'ubuntu-22.04' }}
run: |
@ -170,7 +169,7 @@ jobs:
ls -R
- name: Upload artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
.\build\bin\Release\llama.dll
@ -178,14 +177,14 @@ jobs:
name: llama-bin-win-clblast-x64.dll
- name: Upload llava artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
.\build\bin\Release\llava_shared.dll
name: llava-bin-win-clblast-x64.dll
- name: Upload artifacts (linux)
if: ${{ matrix.os == 'ubuntu-22.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
./build/libllama.so
@ -193,7 +192,7 @@ jobs:
name: llama-bin-linux-clblast-x64.so
- name: Upload llava artifacts (linux)
if: ${{ matrix.os == 'ubuntu-22.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: |
./build/examples/llava/libllava_shared.so
@ -244,25 +243,25 @@ jobs:
- name: Upload artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-cublas-cu${{ matrix.cuda }}-x64.dll
- name: Upload llava artifacts (Windows)
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-cublas-cu${{ matrix.cuda }}-x64.dll
- name: Upload artifacts (Linux)
if: ${{ matrix.os == 'ubuntu-20.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
name: llama-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so
- name: Upload llava artifacts (Linux)
if: ${{ matrix.os == 'ubuntu-20.04' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.so
name: llava-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so
@ -275,7 +274,7 @@ jobs:
matrix:
include:
- build: 'arm64'
defines: '-DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL_EMBED_LIBRARY=ON'
defines: '-DCMAKE_OSX_ARCHITECTURES=arm64'
- build: 'x64'
defines: '-DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON'
runs-on: macos-latest
@ -297,18 +296,18 @@ jobs:
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/libllama.dylib
name: llama-bin-osx-${{ matrix.build }}.dylib
- name: Upload Llava
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/examples/llava/libllava_shared.dylib
name: llava-bin-osx-${{ matrix.build }}.dylib
- name: Upload Metal
if: ${{ matrix.build != 'x64' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: ./build/bin/ggml-metal.metal
name: ggml-metal.metal
@ -371,7 +370,7 @@ jobs:
cp artifacts/llava-bin-win-cublas-cu11.7.1-x64.dll/llava_shared.dll deps/cu11.7.1/llava_shared.dll
cp artifacts/llama-bin-linux-cublas-cu11.7.1-x64.so/libllama.so deps/cu11.7.1/libllama.so
cp artifacts/llava-bin-linux-cublas-cu11.7.1-x64.so/libllava_shared.so deps/cu11.7.1/libllava_shared.so
cp artifacts/llava-bin-linux-cublas-cu11.7.1-x64.so/libllava_shared.so deps/cu11.7.1/libllama_shared.so
cp artifacts/llama-bin-win-cublas-cu12.1.0-x64.dll/llama.dll deps/cu12.1.0/llama.dll
cp artifacts/llava-bin-win-cublas-cu12.1.0-x64.dll/llava_shared.dll deps/cu12.1.0/llava_shared.dll
@ -380,20 +379,19 @@ jobs:
cp artifacts/llava-bin-linux-cublas-cu12.1.0-x64.so/libllava_shared.so deps/cu12.1.0/libllava_shared.so
cp artifacts/llama-bin-win-clblast-x64.dll/{llama,clblast}.dll deps/clblast/
cp artifacts/llava-bin-win-clblast-x64.dll/llava_shared.dll deps/clblast/llava_shared.dll
cp artifacts/llama-bin-linux-clblast-x64.so/libllama.so deps/clblast/
cp artifacts/llava-bin-linux-clblast-x64.so/libllava_shared.so deps/clblast/libllava_shared.so
- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: deps/
name: deps
- name: Remove Artifacts
uses: geekyeggo/delete-artifact@v5
uses: geekyeggo/delete-artifact@v2
with:
name: |
llama-*

View File

@ -1,4 +1,4 @@
name: Unit Test
name: CI
on:
push:
branches: [master]
@ -13,6 +13,7 @@ jobs:
name: Test
runs-on: ${{ matrix.os }}
strategy:
max-parallel: 2
fail-fast: false
matrix:
build: [linux-release, windows-release, osx-release]
@ -20,9 +21,9 @@ jobs:
- build: linux-release
os: ubuntu-latest
config: release
- build: osx-release
os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
config: release
- build: osx-release
os: macos-latest
config: release
- build: windows-release
os: windows-2019
config: release
@ -30,7 +31,8 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-dotnet@v4
with:
dotnet-version: |
dotnet-version: |
7.0.x
8.0.x
- name: Cache Packages
uses: actions/cache@v4
@ -45,7 +47,7 @@ jobs:
- name: Build
run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
- name: Test
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
- name: Upload artifacts
if: always()
uses: actions/upload-artifact@v3

View File

@ -1,31 +0,0 @@
# Check pull requests for typos.
#
# Configuration: .github/_typos.toml
#
# Info: https://github.com/marketplace/actions/typos-action
# Local install: brew install typos-cli
# Local install: conda install typos
# Local run: typos -c .github/_typos.toml
name: Spell Check
on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]
jobs:
  run:
    name: Spell check
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v3
      - name: Use custom config file
        uses: crate-ci/typos@master
        with:
          config: .github/_typos.toml
          write_changes: false
          quiet: true

2
.gitignore vendored
View File

@ -346,5 +346,3 @@ site/
/LLama.Unittest/Models/*.bin
/LLama.Unittest/Models/*.gguf
/LLama.Benchmark/Models/*.bin
/LLama.Benchmark/Models/*.gguf

File diff suppressed because it is too large

Binary file not shown.


View File

@ -1 +0,0 @@
TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf

View File

@ -1,10 +0,0 @@
namespace LLama.Benchmark
{
public enum ExecutorType
{
Interactive,
Instruct,
Stateless
}
}

View File

@ -1,23 +0,0 @@
namespace LLama.Benchmark
{
internal static class Constants
{
public static string ModelDir
{
get
{
return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
}
}
public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");
public static string LLavaModelPath => Path.Combine("llava-v1.6-mistral-7b.Q3_K_XS.gguf");
public static string LLavaMmpPath => Path.Combine("mmproj-model-f16.gguf");
public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
}
}

View File

@ -1,30 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<Import Project="..\LLama\LLamaSharp.Runtime.targets" />
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Configuration>Release</Configuration>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
<PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\LLama\LLamaSharp.csproj" />
</ItemGroup>
<ItemGroup>
<None Update="Assets\TextCompletionPrompts.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Models\extreme-ironing-taxi-610x427.jpg">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@ -1,138 +0,0 @@
#pragma warning disable CS8618
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
[BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
[BenchmarkCategory("Executor", "LLama")]
[SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
[MemoryDiagnoser]
[MinIterationCount(1)]
[MaxIterationCount(16)]
[RPlotExporter]
public class PrefillBenchmark
{
/// <summary>
/// (prompt length, context length)
/// </summary>
public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
{
(512, 2048),
(2024, 2048)
};
/// <summary>
/// (model path, gpu layer count)
/// </summary>
public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
// TODO: specify the native library to load here to test cpu case better.
{
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 0),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 10),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 20)
};
public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
{
ExecutorType.Interactive,
ExecutorType.Stateless
};
[ParamsSource(nameof(PromptAndContextLengths))]
public (int, uint) PromptAndContextLength { get; set; }
[ParamsSource(nameof(ModelAndGpuLayerCounts))]
public (string, int) ModelAndGpuLayerCount { get; set; }
[ParamsSource(nameof(ExecutorTypes))]
public ExecutorType ExecutorType { get; set; }
/// <summary>
/// Params used to create a model.
/// </summary>
public ModelParams ModelParams { get; set; }
/// <summary>
/// Params used in inference.
/// </summary>
public InferenceParams InferenceParams { get; set; }
/// <summary>
/// Prompt used to run text generation.
/// </summary>
public string Prompt { get; set; }
public ILLamaExecutor Executor { get; set; }
private void InitializeParamsAndModel()
{
ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
{
ContextSize = PromptAndContextLength.Item2,
GpuLayerCount = ModelAndGpuLayerCount.Item2
};
Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
InferenceParams = new InferenceParams()
{
Temperature = 0.6f,
MaxTokens = 1 // Only prefill, no generation here.
};
LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
LLamaContext context = weights.CreateContext(ModelParams);
Executor = ExecutorType switch
{
ExecutorType.Interactive => new InteractiveExecutor(context),
ExecutorType.Instruct => new InstructExecutor(context),
ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
_ => throw new NotSupportedException()
};
}
[GlobalSetup(Targets = [nameof(Basic)])]
public void GlobalSetup()
{
var showLLamaCppLogs = true;
NativeLibraryConfig
.Instance
.WithLogCallback((level, message) =>
{
if (showLLamaCppLogs)
Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
}).WithCuda().SkipCheck().WithAutoFallback(false);
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
InitializeParamsAndModel();
}
[IterationCleanup(Targets = [nameof(Basic)])]
public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
Executor.Context.NativeHandle.KvCacheClear();
}
}
[Benchmark]
public async Task<string> Basic()
{
StringBuilder sb = new();
await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
{
sb.Append(text);
}
return sb.ToString();
}
}
}

View File

@ -1,13 +0,0 @@
using BenchmarkDotNet.Running;
namespace LLama.Benchmark
{
public class Program
{
public static void Main(string[] args)
{
var summary = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
Console.WriteLine(summary);
}
}
}

View File

@ -5,7 +5,6 @@ public class ExampleRunner
{
private static readonly Dictionary<string, Func<Task>> Examples = new()
{
{ "Chat Session: LLama3", LLama3ChatSession.Run },
{ "Chat Session: History", ChatSessionWithHistory.Run },
{ "Chat Session: Role names", ChatSessionWithRoleName.Run },
{ "Chat Session: Role names stripped", ChatSessionStripRoleName.Run },
@ -27,11 +26,9 @@ public class ExampleRunner
{ "Semantic Kernel: Prompt", SemanticKernelPrompt.Run },
{ "Semantic Kernel: Chat", SemanticKernelChat.Run },
{ "Semantic Kernel: Store", SemanticKernelMemory.Run },
{ "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
{ "Batched Executor: Guidance", BatchedExecutorGuidance.Run },
{ "Speech Chat: Integration with Whisper.net", SpeechChat.Run },
{ "Exit", () => { Environment.Exit(0); return Task.CompletedTask; } }
};

View File

@ -19,7 +19,7 @@ public class BatchedExecutorFork
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
@ -32,7 +32,7 @@ public class BatchedExecutorFork
// Evaluate the initial prompt to create one conversation
using var start = executor.Create();
start.Prompt(executor.Context.Tokenize(prompt));
start.Prompt(prompt);
await executor.Infer();
// Create the root node of the tree

View File

@ -19,7 +19,7 @@ public class BatchedExecutorGuidance
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
@ -34,9 +34,9 @@ public class BatchedExecutorGuidance
// Load the two prompts into two conversations
using var guided = executor.Create();
guided.Prompt(executor.Context.Tokenize(positivePrompt));
guided.Prompt(positivePrompt);
using var guidance = executor.Create();
guidance.Prompt(executor.Context.Tokenize(negativePrompt));
guidance.Prompt(negativePrompt);
// Run inference to evaluate prompts
await AnsiConsole
@ -79,7 +79,7 @@ public class BatchedExecutorGuidance
guidance.Prompt(g);
// Early exit if we reach the natural end of the guided sentence
if (g == model.Tokens.EOS)
if (g == model.EndOfSentenceToken)
break;
// Update progress bar

View File

@ -20,7 +20,7 @@ public class BatchedExecutorRewind
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
@ -33,7 +33,7 @@ public class BatchedExecutorRewind
// Evaluate the initial prompt to create one conversation
using var conversation = executor.Create();
conversation.Prompt(executor.Context.Tokenize(prompt));
conversation.Prompt(prompt);
// Create the start node wrapping the conversation
var node = new Node(executor.Context);

View File

@ -1,108 +0,0 @@
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;
namespace LLama.Examples.Examples;
/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorSaveAndLoad
{
private const int n_len = 18;
public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
// Create an executor that can evaluate a batch of conversations together
using var executor = new BatchedExecutor(model, parameters);
// Print some info
var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
Console.WriteLine($"Created executor with model: {name}");
// Create a conversation
var conversation = executor.Create();
conversation.Prompt(executor.Context.Tokenize(prompt));
// Run inference loop
var decoder = new StreamingTokenDecoder(executor.Context);
var sampler = new DefaultSamplingPipeline();
var lastToken = await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Can't save a conversation while RequiresInference is true
if (conversation.RequiresInference)
await executor.Infer();
// Save this conversation to a file and dispose it
conversation.Save("demo_conversation.state");
conversation.Dispose();
AnsiConsole.WriteLine($"Saved state: {new FileInfo("demo_conversation.state").Length} bytes");
// Now create a new conversation by loading that state
conversation = executor.Load("demo_conversation.state");
AnsiConsole.WriteLine("Loaded state");
// Prompt it again with the last token, so we can continue generating
conversation.Rewind(1);
conversation.Prompt(lastToken);
// Continue generating text
lastToken = await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Can't save a conversation while RequiresInference is true
if (conversation.RequiresInference)
await executor.Infer();
// Save the conversation again, this time into system memory
using (var state = conversation.Save())
{
conversation.Dispose();
AnsiConsole.WriteLine($"Saved state to memory: {state.Size} bytes");
// Now create a new conversation by loading that state
conversation = executor.Load("demo_conversation.state");
AnsiConsole.WriteLine("Loaded state");
}
// Prompt it again with the last token, so we can continue generating
conversation.Rewind(1);
conversation.Prompt(lastToken);
// Continue generating text
await GenerateTokens(executor, conversation, sampler, decoder, n_len);
// Display final output
AnsiConsole.MarkupLine($"[red]{prompt}{decoder.Read()}[/]");
}
private static async Task<LLamaToken> GenerateTokens(BatchedExecutor executor, Conversation conversation, ISamplingPipeline sampler, StreamingTokenDecoder decoder, int count = 15)
{
var token = (LLamaToken)0;
for (var i = 0; i < count; i++)
{
// Run inference
await executor.Infer();
// Use sampling pipeline to pick a token
token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), ReadOnlySpan<LLamaToken>.Empty);
// Add it to the decoder, so it can be converted into text later
decoder.Add(token);
// Prompt the conversation with the token
conversation.Prompt(token);
}
return token;
}
}

View File

@ -27,11 +27,12 @@ public class ChatChineseGB2312
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5,
Encoding = Encoding.UTF8
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

View File

@ -12,14 +12,15 @@ public class ChatSessionStripRoleName
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);

View File

@ -10,10 +10,11 @@ public class ChatSessionWithHistory
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

View File

@ -10,14 +10,15 @@ public class ChatSessionWithRestart
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession prototypeSession =
await ChatSession.InitializeSessionFromHistoryAsync(executor, chatHistory);

View File

@ -10,14 +10,15 @@ public class ChatSessionWithRoleName
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);

View File

@ -29,7 +29,7 @@
{
ContextSize = 4096
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InstructExecutor(context, InstructionPrefix, InstructionSuffix, null);

View File

@ -9,7 +9,7 @@ namespace LLama.Examples.Examples
string modelPath = UserSettings.GetModelPath();
Console.ForegroundColor = ConsoleColor.DarkGray;
var @params = new ModelParams(modelPath) { Embeddings = true };
var @params = new ModelParams(modelPath) { EmbeddingMode = true };
using var weights = LLamaWeights.LoadFromFile(@params);
var embedder = new LLamaEmbedder(weights, @params);

View File

@ -9,15 +9,16 @@ namespace LLama.Examples.Examples
{
string modelPath = UserSettings.GetModelPath();
var gbnf = (await File.ReadAllTextAsync("Assets/json.gbnf")).Trim();
var gbnf = File.ReadAllText("Assets/json.gbnf").Trim();
var grammar = Grammar.Parse(gbnf, "root");
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
Console.ForegroundColor = ConsoleColor.Yellow;

View File

@ -9,14 +9,15 @@ namespace LLama.Examples.Examples
{
string modelPath = UserSettings.GetModelPath();
var prompt = (await File.ReadAllTextAsync("Assets/dan.txt")).Trim();
var prompt = File.ReadAllText("Assets/dan.txt").Trim();
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InstructExecutor(context);

View File

@ -13,10 +13,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -1,126 +0,0 @@
using LLama.Abstractions;
using LLama.Common;
namespace LLama.Examples.Examples;
// When using chatsession, it's a common case that you want to strip the role names
// rather than display them. This example shows how to use transforms to strip them.
public class LLama3ChatSession
{
public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
Seed = 1337,
GpuLayerCount = 10
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);
var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
ChatSession session = new(executor, chatHistory);
session.WithHistoryTransform(new LLama3HistoryTransform());
session.WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(
new string[] { "User:", "Assistant:", "<22>" },
redundancyLength: 5));
InferenceParams inferenceParams = new InferenceParams()
{
Temperature = 0.6f,
AntiPrompts = new List<string> { "User:" }
};
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("The chat session has started.");
// show the prompt
Console.ForegroundColor = ConsoleColor.Green;
string userInput = Console.ReadLine() ?? "";
while (userInput != "exit")
{
await foreach (
var text
in session.ChatAsync(
new ChatHistory.Message(AuthorRole.User, userInput),
inferenceParams))
{
Console.ForegroundColor = ConsoleColor.White;
Console.Write(text);
}
Console.WriteLine();
Console.ForegroundColor = ConsoleColor.Green;
userInput = Console.ReadLine() ?? "";
Console.ForegroundColor = ConsoleColor.White;
}
}
class LLama3HistoryTransform : IHistoryTransform
{
/// <summary>
/// Convert a ChatHistory instance to plain text.
/// </summary>
/// <param name="history">The ChatHistory instance</param>
/// <returns></returns>
public string HistoryToText(ChatHistory history)
{
string res = Bos;
foreach (var message in history.Messages)
{
res += EncodeMessage(message);
}
res += EncodeHeader(new ChatHistory.Message(AuthorRole.Assistant, ""));
return res;
}
private string EncodeHeader(ChatHistory.Message message)
{
string res = StartHeaderId;
res += message.AuthorRole.ToString();
res += EndHeaderId;
res += "\n\n";
return res;
}
private string EncodeMessage(ChatHistory.Message message)
{
string res = EncodeHeader(message);
res += message.Content;
res += EndofTurn;
return res;
}
/// <summary>
/// Converts plain text to a ChatHistory instance.
/// </summary>
/// <param name="role">The role for the author.</param>
/// <param name="text">The chat history as plain text.</param>
/// <returns>The updated history.</returns>
public ChatHistory TextToHistory(AuthorRole role, string text)
{
return new ChatHistory(new ChatHistory.Message[] { new ChatHistory.Message(role, text) });
}
/// <summary>
/// Copy the transform.
/// </summary>
/// <returns></returns>
public IHistoryTransform Clone()
{
return new LLama3HistoryTransform();
}
private const string StartHeaderId = "<|start_header_id|>";
private const string EndHeaderId = "<|end_header_id|>";
private const string Bos = "<|begin_of_text|>";
private const string Eos = "<|end_of_text|>";
private const string EndofTurn = "<|eot_id|>";
}
}

View File

@ -1,7 +1,7 @@
using System.Text.RegularExpressions;
using LLama.Batched;
using LLama.Common;
using Spectre.Console;
using LLama.Native;
namespace LLama.Examples.Examples
{
@ -18,15 +18,18 @@ namespace LLama.Examples.Examples
var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
var parameters = new ModelParams(modelPath)
{
ContextSize = 4096,
Seed = 1337,
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
// Llava Init
using var clipModel = await LLavaWeights.LoadFromFileAsync(multiModalProj);
using var clipModel = LLavaWeights.LoadFromFile(multiModalProj);
var ex = new InteractiveExecutor(context, clipModel);
var ex = new InteractiveExecutor(context, clipModel );
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to {0} and the context size is {1}.", maxTokens, parameters.ContextSize );
@ -42,16 +45,16 @@ namespace LLama.Examples.Examples
var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imageCount = imageMatches.Count();
var hasImages = imageCount > 0;
byte[][] imageBytes = null;
if (hasImages)
{
var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value);
List<byte[]> imageBytes;
try
{
imageBytes = imagePaths.Select(File.ReadAllBytes).ToList();
imageBytes = imagePaths.Select(File.ReadAllBytes).ToArray();
}
catch (IOException exception)
{
@ -64,17 +67,15 @@ namespace LLama.Examples.Examples
break;
}
// Each prompt with images we clear cache
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
{
// First image replace to tag <image, the rest of the images delete the tag
prompt = prompt.Replace(path, index++ == 0 ? "<image>" : "");
if (index++ == 0)
prompt = prompt.Replace(path, "<image>");
else
prompt = prompt.Replace(path, "");
}
@ -95,12 +96,9 @@ namespace LLama.Examples.Examples
Console.WriteLine();
// Initialize Images in executor
// Initilize Images in executor
//
foreach (var image in imagePaths)
{
ex.Images.Add(await File.ReadAllBytesAsync(image));
}
ex.ImagePaths = imagePaths.ToList();
}
Console.ForegroundColor = Color.White;
@ -115,7 +113,7 @@ namespace LLama.Examples.Examples
// let the user finish with exit
//
if (prompt != null && prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
if (prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
break;
}

View File

@ -12,10 +12,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -13,10 +13,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);

View File

@ -16,7 +16,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
var chatGPT = new LLamaSharpChatCompletion(ex);

View File

@ -20,10 +20,10 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
Seed = seed,
Embeddings = true
EmbeddingMode = true
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var embedding = new LLamaEmbedder(model, parameters);
Console.WriteLine("====================================================");

View File

@ -1,9 +1,9 @@
using LLama.Common;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using LLamaSharp.SemanticKernel.TextCompletion;
using Microsoft.SemanticKernel.TextGeneration;
using Microsoft.Extensions.DependencyInjection;
using LLamaSharp.SemanticKernel;
namespace LLama.Examples.Examples
{
@ -19,7 +19,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var parameters = new ModelParams(modelPath);
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
var builder = Kernel.CreateBuilder();
@ -31,7 +31,7 @@ namespace LLama.Examples.Examples
One line TLDR with the fewest words.";
LLamaSharpPromptExecutionSettings settings = new() { MaxTokens = 100 };
ChatRequestSettings settings = new() { MaxTokens = 100 };
var summarize = kernel.CreateFunctionFromPrompt(prompt, settings);
string text1 = @"

View File

@ -1,253 +0,0 @@
using LLama.Common;
using NAudio.Wave;
using Whisper.net;
namespace LLama.Examples.Examples
{
public class SpeechChat
{
public static async Task Run()
{
ConsoleStyleHelpers.WriteLine(
"""
This example demonstrates the basics of audio transcriptions, speech recognition, and speech commands,
as well as how to recognize a user's voice in real time and then get a response from LLM.
It uses whisper.net and models could be found in: https://huggingface.co/ggerganov/whisper.cpp/tree/main.
To use it, you need a working microphone and enough RAM to host both audio + language models.
Once you've selected the models, just speak to your microphone and watch the LLM continue your text.
While it's going, you can say something like 'Okay, stop', or 'Stop now', to interrupt the LLM's inference.
NOTE: You may need to poke around with the voice detection threshold, based on your mic's sensitivity.
-----------------------------------------------------------------------------------------------------------
""", ConsoleColor.Yellow);
if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; }
bool loadFinished = false;
var loading = ConsoleStyleHelpers.LoadPrint("Loading transcription model...", () => loadFinished);
using var speechRecognitionServer = new SpeechRecognitionServer(model);
loadFinished = true; loading.Wait();
Console.WriteLine("Audio model loaded. Insert path for language model.");
using var _ = new LlamaSession_SpeechListener(speechRecognitionServer);
await ConsoleStyleHelpers.WaitUntilExit();
}
class LlamaSession_SpeechListener : ISpeechListener, IDisposable
{
bool isModelResponding;
SpeechRecognitionServer audioServer;
LLamaWeights model;
LLamaContext context;
InteractiveExecutor executor;
string fullPrompt = "";
bool canceled;
public LlamaSession_SpeechListener(SpeechRecognitionServer server)
{
var parameters = new ModelParams(UserSettings.GetModelPath()) { Seed = 1337, GpuLayerCount = 99 };
model = LLamaWeights.LoadFromFile(parameters);
context = model.CreateContext(parameters);
executor = new InteractiveExecutor(context);
(audioServer = server).ServiceUsers.Add(this);
}
// Whisper is struggling with single words and very short phrases without context, so it's actually better to say something like "Ok, Stop!" to have it work better.
bool ISpeechListener.IsInterested(string audioTranscription) => !isModelResponding || audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase);
void ISpeechListener.HandleSpeech(string audioTranscription)
{
if (isModelResponding && audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) { canceled = true; }
else if (!isModelResponding) { _ = SendMessage(audioTranscription); }
}
async Task SendMessage(string newMessage)
{
// While a response is queried, we want to detect short phrases/commands like 'stop',
audioServer.detectionSettings = (1, 2); // ..so we lower the min Speech Detection time.
isModelResponding = true;
AddToPrompt($"\n{newMessage}\n", ConsoleColor.Blue);
await foreach (var token in executor.InferAsync(fullPrompt))
{
AddToPrompt(token, ConsoleColor.Yellow);
if (canceled) { AddToPrompt("[...stopped]", ConsoleColor.Red); break; }
}
audioServer.detectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives.
(isModelResponding, canceled) = (false, false); // Reset the state variables to their default.
}
void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow)
{
fullPrompt += msg;
ConsoleStyleHelpers.Write(msg, color);
}
void IDisposable.Dispose()
{
model.Dispose();
context.Dispose();
}
}
public interface ISpeechListener
{
bool IsInterested(string audioTranscription);
void HandleSpeech(string audioTranscription);
}
public class SpeechRecognitionServer : IDisposable
{
const int clipLength = 250; // ms
const float voiceDetectionThreshold = 0.01f; // Adjust as needed
readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"];
WaveInEvent waveIn;
WaveFormat waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel
List<byte> recordedBytes = [];
WhisperFactory? whisperFactory;
WhisperProcessor? processor;
string whisperPrompt =
"""
The short audio comes from a user that is speaking to an AI Language Model in real time.
Pay extra attentions for commands like 'ok stop' or just 'stop'.
In case of inaudible sentences that might be, assume they're saying 'stop'.
""".Trim();
// Tracked stats for Speech Recognition, Parsing, and Serving.
int currentBlankClips; // Ideally would work with milliseconds,
int totalNonBlankClips; // ..but for example's sake they work on a
int nonIdleTime; // ..clip-based quant-length (1 = clipLength).
// Default detection settings: A speech of 750ms, followed by pause of 500ms. (2x250ms)
public (int minBlanksPerSeparation, int minNonBlanksForValidMessages) detectionSettings = (2, 3);
public HashSet<ISpeechListener> ServiceUsers = [];
public SpeechRecognitionServer(string modelPath)
{
// Adjust the path based on your GPU's type. On your build you ideally want just the correct runtime build for your project, but here we're having all references, so it's getting confused.
var libPath = @$"{Environment.GetFolderPath(Environment.SpecialFolder.UserProfile)}\.nuget\packages\whisper.net.runtime.cublas\1.5.0\build\win-x64\whisper.dll"; // Defaulting to cuBlas.
if (!File.Exists(libPath)) { ConsoleStyleHelpers.WriteLine($"Could not find dll file at {libPath}.\nWhisper will load with the default runtime (possibly CPU).\nIf you own a non-Nvidia GPU, you need to adjust the library path based on your GPU's type.", ConsoleColor.Red); libPath = null; }
whisperFactory = WhisperFactory.FromPath(modelPath, libraryPath: libPath);
var builder = whisperFactory.CreateBuilder().WithThreads(16).WithPrompt(whisperPrompt).WithSingleSegment().WithLanguage("en");
(builder.WithBeamSearchSamplingStrategy() as BeamSearchSamplingStrategyBuilder)!.WithPatience(0.2f).WithBeamSize(5);
processor = builder.Build();
waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat };
waveIn.DataAvailable += OnAudioDataAvailable;
waveIn.StartRecording();
}
void OnAudioDataAvailable(object? sender, WaveInEventArgs e)
{
// Cache the recorded bytes
recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]);
if (recordedBytes.Count > 110000000) { recordedBytes.RemoveRange(0, 50000000); }
// Get the max volume contained inside the clip. Since the clip is recorded as bytes, we need to translate them to samples before getting their volume.
var maxVolume = 0f; // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values
for (int i = 0; i < e.BytesRecorded; i += 2) { maxVolume = Math.Max(maxVolume, Math.Abs((short) ((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); }
// Compare the volume with the threshold and act accordingly. Once an interesting and 'full' set of clips pops up, serve it.
if (maxVolume >= voiceDetectionThreshold) { currentBlankClips = 0; totalNonBlankClips++; nonIdleTime++; }
else if (++currentBlankClips < detectionSettings.minBlanksPerSeparation) { nonIdleTime++; }
else
{
if (totalNonBlankClips >= detectionSettings.minNonBlanksForValidMessages) { SendTranscription(); }
else if (totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything.
(currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0);
}
async void SendTranscription()
{
var bytesPerClip = waveFormat.BitsPerSample * clipLength * 2;
var capturedClipBytes = recordedBytes.TakeLast(bytesPerClip * (nonIdleTime + 2)).ToArray();
var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); // Save to temporary file.
if (knownFalsePositives.Contains(transcribedText)) { return; } // False positive.. yikes!
foreach (var user in ServiceUsers.Where(x => x.IsInterested(transcribedText))) { user.HandleSpeech(transcribedText); }
}
}
/// <summary> Requests a transcription and responds with the text. </summary>
async Task<string> ProcessAudio(byte[] bytes, string tempWavFilePath)
{
await using var wavStream = new MemoryStream();
using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); }
using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); }
wavStream.Seek(0, SeekOrigin.Begin);
Console.Beep();
return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(x => x.Text).ToListAsync()).Trim();
}
void IDisposable.Dispose()
{
waveIn.Dispose();
processor?.Dispose();
}
}
public static class ConsoleStyleHelpers
{
public static string? SelectAudioModel()
{
var models = Directory.GetFiles("Assets", "*bin");
if (models.Length == 1) { return models[0]; }
else if (models.Length != 0)
{
WriteLine("Available Models:", ConsoleColor.Green);
for (int i = 0; i < models.Length; i++)
{
Write($"{i + 1}. ", ConsoleColor.Blue);
WriteLine(models[i]["Assets\\".Length..], ConsoleColor.Yellow);
}
while (true)
{
Write($"Please choose a model (1-{models.Length}): ", ConsoleColor.DarkCyan);
if (!int.TryParse(Console.ReadKey().KeyChar.ToString(), out var i) || i > models.Length || i <= 0) { Console.WriteLine(); continue; }
Console.WriteLine();
return models[i - 1];
}
}
else
{
WriteLine($"Download a non-quantized model and place it in the executing directory:", ConsoleColor.Red);
WriteLine($"\t{Environment.CurrentDirectory}\\Assets", ConsoleColor.Yellow);
WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ", ConsoleColor.Red);
WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main", ConsoleColor.Blue);
return null;
}
}
public static async Task LoadPrint(string initialText, Func<bool> ShouldContinue)
{
var startTime = DateTime.Now;
Console.WriteLine(initialText);
while (!ShouldContinue()) { Console.Write("."); await Task.Delay(100); }
Console.WriteLine($" Completed in {(DateTime.Now - startTime).TotalSeconds:f2}s.");
}
public async static Task WaitUntilExit()
{
WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.", ConsoleColor.Green);
await Task.Delay(1000);
Console.ReadKey();
}
public static void Write(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.Write(text));
public static void WriteLine(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.WriteLine(text));
public static void ColorAction(ConsoleColor consoleColor, Action action)
{
Console.ForegroundColor = consoleColor;
action?.Invoke();
Console.ForegroundColor = ConsoleColor.White;
}
}
}
}
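
The volume check in the handler above follows the NAudio RecordingLevelMeter approach: each pair of bytes in the 16-bit little-endian PCM buffer is combined into a signed sample and divided by 32768 to normalise it into the -1..1 range, and the running maximum of the absolute values is the peak level. A minimal standalone sketch of that calculation (the class and method names are illustrative, not part of the example above):

using System;

internal static class PeakVolume
{
    // Peak level of a 16-bit little-endian PCM buffer, normalised to the 0..1 range.
    public static float OfPcm16(byte[] buffer, int bytesRecorded)
    {
        var peak = 0f;
        for (var i = 0; i + 1 < bytesRecorded; i += 2)
        {
            // Low byte first, high byte second: reconstruct the signed 16-bit sample.
            var sample = (short)((buffer[i + 1] << 8) | buffer[i]);
            peak = Math.Max(peak, Math.Abs(sample / 32768f));
        }
        return peak;
    }
}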

View File

@ -11,10 +11,11 @@ namespace LLama.Examples.Examples
var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);
Console.ForegroundColor = ConsoleColor.Yellow;

View File

@ -12,7 +12,7 @@ namespace LLama.Examples.Examples
// Load weights into memory
var @params = new ModelParams(modelPath);
using var weights = await LLamaWeights.LoadFromFileAsync(@params);
using var weights = LLamaWeights.LoadFromFile(@params);
// Create 2 contexts sharing the same weights
using var aliceCtx = weights.CreateContext(@params);
@ -21,7 +21,7 @@ namespace LLama.Examples.Examples
var bob = new InteractiveExecutor(bobCtx);
// Initial alice prompt
var alicePrompt = "Transcript of a dialog, where the Alice interacts with a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
var alicePrompt = "Transcript of a dialog, where the Alice interacts a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
var aliceResponse = await Prompt(alice, ConsoleColor.Green, alicePrompt, false, false);
// Initial bob prompt

View File

@ -18,14 +18,8 @@
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.34.240313.1" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.6.2" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="Spectre.Console" Version="0.48.0" />
<PackageReference Include="Spectre.Console.ImageSharp" Version="0.48.0" />
<PackageReference Include="Whisper.net" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Clblast" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.CoreML" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime.Cublas" Version="1.5.0" />
</ItemGroup>
<ItemGroup>

View File

@ -16,20 +16,11 @@ AnsiConsole.MarkupLineInterpolated(
""");
// Configure native library to use. This must be done before any other llama.cpp methods are called!
// Configure native library to use
NativeLibraryConfig
.Instance
.WithCuda();
// Configure logging. Change this to `true` to see log messages from llama.cpp
var showLLamaCppLogs = false;
NativeLibraryConfig
.Instance
.WithLogCallback((level, message) =>
{
if (showLLamaCppLogs)
Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
});
.WithCuda()
.WithLogs(LLamaLogLevel.Info);
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();

View File

@ -84,7 +84,7 @@ namespace LLamaSharp.KernelMemory
ContextSize = config?.ContextSize ?? 2048,
Seed = config?.Seed ?? 0,
GpuLayerCount = config?.GpuLayerCount ?? 20,
Embeddings = true,
EmbeddingMode = true,
MainGpu = config?.MainGpu ?? 0,
SplitMode = config?.SplitMode ?? GPUSplitMode.None,
};

View File

@ -4,7 +4,7 @@
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.11.0</Version>
<Authors>Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@ -17,7 +17,7 @@
The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.11.0 updated the kernel-memory package and Fixed System.ArgumentException: EmbeddingMode must be true.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>

View File

@ -29,7 +29,7 @@ namespace LLamaSharp.KernelMemory
this._config = config;
var @params = new ModelParams(_config.ModelPath)
{
Embeddings = true,
EmbeddingMode = true,
MainGpu = _config.MainGpu,
SplitMode = _config.SplitMode
};
@ -49,7 +49,7 @@ namespace LLamaSharp.KernelMemory
this._config = config;
var @params = new ModelParams(_config.ModelPath)
{
Embeddings = true,
EmbeddingMode = true,
MainGpu = _config.MainGpu,
SplitMode = _config.SplitMode
};
@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory
}
/// <inheritdoc/>
public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length;
}
}

View File

@ -1,7 +1,13 @@
using LLama;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
using Microsoft.KernelMemory.AI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LLamaSharp.KernelMemory
{
@ -105,6 +111,6 @@ namespace LLamaSharp.KernelMemory
}
/// <inheritdoc/>
public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
public int CountTokens(string text) => _context.Tokenize(text).Length;
}
}

View File

@ -4,7 +4,6 @@ using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel.ChatCompletion;
[Obsolete("Use LLamaSharpPromptExecutionSettings instead")]
public class ChatRequestSettings : PromptExecutionSettings
{
/// <summary>

View File

@ -8,7 +8,6 @@ namespace LLamaSharp.SemanticKernel.ChatCompletion;
/// <summary>
/// JSON converter for <see cref="ChatRequestSettings"/>
/// </summary>
[Obsolete("Use LLamaSharpPromptExecutionSettingsConverter instead")]
public class ChatRequestSettingsConverter : JsonConverter<ChatRequestSettings>
{
/// <inheritdoc/>

View File

@ -7,7 +7,6 @@ using System;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
using static LLama.InteractiveExecutor;
using static LLama.LLamaTransforms;
namespace LLamaSharp.SemanticKernel.ChatCompletion;
@ -18,18 +17,17 @@ namespace LLamaSharp.SemanticKernel.ChatCompletion;
public sealed class LLamaSharpChatCompletion : IChatCompletionService
{
private readonly ILLamaExecutor _model;
private LLamaSharpPromptExecutionSettings defaultRequestSettings;
private ChatRequestSettings defaultRequestSettings;
private readonly IHistoryTransform historyTransform;
private readonly ITextStreamTransform outputTransform;
private readonly Dictionary<string, object?> _attributes = new();
private readonly bool _isStatefulExecutor;
public IReadOnlyDictionary<string, object?> Attributes => this._attributes;
static LLamaSharpPromptExecutionSettings GetDefaultSettings()
static ChatRequestSettings GetDefaultSettings()
{
return new LLamaSharpPromptExecutionSettings
return new ChatRequestSettings
{
MaxTokens = 256,
Temperature = 0,
@ -39,12 +37,11 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
}
public LLamaSharpChatCompletion(ILLamaExecutor model,
LLamaSharpPromptExecutionSettings? defaultRequestSettings = default,
ChatRequestSettings? defaultRequestSettings = default,
IHistoryTransform? historyTransform = null,
ITextStreamTransform? outputTransform = null)
{
this._model = model;
this._isStatefulExecutor = this._model is StatefulExecutorBase;
this.defaultRequestSettings = defaultRequestSettings ?? GetDefaultSettings();
this.historyTransform = historyTransform ?? new HistoryTransform();
this.outputTransform = outputTransform ?? new KeywordTextOutputStreamTransform(new[] { $"{LLama.Common.AuthorRole.User}:",
@ -68,10 +65,10 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
public async Task<IReadOnlyList<ChatMessageContent>> GetChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
{
var settings = executionSettings != null
? LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings)
? ChatRequestSettings.FromRequestSettings(executionSettings)
: defaultRequestSettings;
var prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
string prompt = this._getFormattedPrompt(chatHistory);
var result = _model.InferAsync(prompt, settings.ToLLamaSharpInferenceParams(), cancellationToken);
var output = outputTransform.TransformAsync(result);
@ -89,10 +86,10 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
public async IAsyncEnumerable<StreamingChatMessageContent> GetStreamingChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var settings = executionSettings != null
? LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings)
? ChatRequestSettings.FromRequestSettings(executionSettings)
: defaultRequestSettings;
var prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
string prompt = this._getFormattedPrompt(chatHistory);
var result = _model.InferAsync(prompt, settings.ToLLamaSharpInferenceParams(), cancellationToken);
var output = outputTransform.TransformAsync(result);
@ -102,33 +99,4 @@ public sealed class LLamaSharpChatCompletion : IChatCompletionService
yield return new StreamingChatMessageContent(AuthorRole.Assistant, token);
}
}
/// <summary>
/// Return either the entire formatted chatHistory or just the most recent message based on
/// whether the model extends StatefulExecutorBase or not.
/// </summary>
/// <param name="chatHistory"></param>
/// <returns>The formatted prompt</returns>
private string _getFormattedPrompt(ChatHistory chatHistory){
string prompt;
if (this._isStatefulExecutor){
InteractiveExecutorState state = (InteractiveExecutorState)((StatefulExecutorBase)this._model).GetStateData();
if (state.IsPromptRun)
{
prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
}
else
{
ChatHistory temp_history = new();
temp_history.AddUserMessage(chatHistory.Last().Content);
prompt = historyTransform.HistoryToText(temp_history.ToLLamaSharpChatHistory());
}
}
else
{
prompt = historyTransform.HistoryToText(chatHistory.ToLLamaSharpChatHistory());
}
return prompt;
}
}

View File

@ -1,4 +1,5 @@
using Microsoft.SemanticKernel.ChatCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.ChatCompletion;
namespace LLamaSharp.SemanticKernel;
public static class ExtensionMethods
@ -22,11 +23,11 @@ public static class ExtensionMethods
}
/// <summary>
/// Convert LLamaSharpPromptExecutionSettings to LLamaSharp InferenceParams
/// Convert ChatRequestSettings to LLamaSharp InferenceParams
/// </summary>
/// <param name="requestSettings"></param>
/// <returns></returns>
internal static global::LLama.Common.InferenceParams ToLLamaSharpInferenceParams(this LLamaSharpPromptExecutionSettings requestSettings)
internal static global::LLama.Common.InferenceParams ToLLamaSharpInferenceParams(this ChatRequestSettings requestSettings)
{
if (requestSettings is null)
{

View File

@ -10,7 +10,7 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.11.0</Version>
<Authors>Tim Miller, Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@ -23,7 +23,7 @@
The integration of LLamaSharp and Microsoft semantic-kernel.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.11.0 updates the semantic-kernel package.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
@ -34,7 +34,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.2" />
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.6.3" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">

View File

@ -1,131 +0,0 @@

/* Unmerged change from project 'LLamaSharp.SemanticKernel (netstandard2.0)'
Before:
using Microsoft.SemanticKernel;
After:
using LLamaSharp;
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
*/
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel;
public class LLamaSharpPromptExecutionSettings : PromptExecutionSettings
{
/// <summary>
/// Temperature controls the randomness of the completion.
/// The higher the temperature, the more random the completion.
/// </summary>
[JsonPropertyName("temperature")]
public double Temperature { get; set; } = 0;
/// <summary>
/// TopP controls the diversity of the completion.
/// The higher the TopP, the more diverse the completion.
/// </summary>
[JsonPropertyName("top_p")]
public double TopP { get; set; } = 0;
/// <summary>
/// Number between -2.0 and 2.0. Positive values penalize new tokens
/// based on whether they appear in the text so far, increasing the
/// model's likelihood to talk about new topics.
/// </summary>
[JsonPropertyName("presence_penalty")]
public double PresencePenalty { get; set; } = 0;
/// <summary>
/// Number between -2.0 and 2.0. Positive values penalize new tokens
/// based on their existing frequency in the text so far, decreasing
/// the model's likelihood to repeat the same line verbatim.
/// </summary>
[JsonPropertyName("frequency_penalty")]
public double FrequencyPenalty { get; set; } = 0;
/// <summary>
/// Sequences where the completion will stop generating further tokens.
/// </summary>
[JsonPropertyName("stop_sequences")]
public IList<string> StopSequences { get; set; } = Array.Empty<string>();
/// <summary>
/// How many completions to generate for each prompt. Default is 1.
/// Note: Because this parameter generates many completions, it can quickly consume your token quota.
/// Use carefully and ensure that you have reasonable settings for max_tokens and stop.
/// </summary>
[JsonPropertyName("results_per_prompt")]
public int ResultsPerPrompt { get; set; } = 1;
/// <summary>
/// The maximum number of tokens to generate in the completion.
/// </summary>
[JsonPropertyName("max_tokens")]
public int? MaxTokens { get; set; }
/// <summary>
/// Modify the likelihood of specified tokens appearing in the completion.
/// </summary>
[JsonPropertyName("token_selection_biases")]
public IDictionary<int, int> TokenSelectionBiases { get; set; } = new Dictionary<int, int>();
/// <summary>
/// Indicates the format of the response which can be used downstream to post-process the messages. Handlebars: handlebars_object. JSON: json_object, etc.
/// </summary>
[JsonPropertyName("response_format")]
public string ResponseFormat { get; set; } = string.Empty;
/// <summary>
/// Create a new settings object with the values from another settings object.
/// </summary>
/// <param name="requestSettings">Template configuration</param>
/// <param name="defaultMaxTokens">Default max tokens</param>
/// <returns>An instance of LLamaSharpPromptExecutionSettings</returns>
public static LLamaSharpPromptExecutionSettings FromRequestSettings(PromptExecutionSettings? requestSettings, int? defaultMaxTokens = null)
{
if (requestSettings is null)
{
return new LLamaSharpPromptExecutionSettings()
{
MaxTokens = defaultMaxTokens
};
}
if (requestSettings is LLamaSharpPromptExecutionSettings requestSettingsChatRequestSettings)
{
return requestSettingsChatRequestSettings;
}
var json = JsonSerializer.Serialize(requestSettings);
var chatRequestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, s_options);
if (chatRequestSettings is not null)
{
return chatRequestSettings;
}
throw new ArgumentException($"Invalid request settings, cannot convert to {nameof(LLamaSharpPromptExecutionSettings)}", nameof(requestSettings));
}
private static readonly JsonSerializerOptions s_options = CreateOptions();
private static JsonSerializerOptions CreateOptions()
{
JsonSerializerOptions options = new()
{
WriteIndented = true,
MaxDepth = 20,
AllowTrailingCommas = true,
PropertyNameCaseInsensitive = true,
ReadCommentHandling = JsonCommentHandling.Skip,
Converters = { new LLamaSharpPromptExecutionSettingsConverter() }
};
return options;
}
}
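
As a usage sketch (not part of the diff), these settings can either be constructed directly or converted from any Semantic Kernel PromptExecutionSettings through FromRequestSettings, which falls back to a JSON round-trip using the converter registered in s_options above; the values below are illustrative:

using LLamaSharp.SemanticKernel;
using Microsoft.SemanticKernel;

// Construct the settings directly...
var direct = new LLamaSharpPromptExecutionSettings
{
    Temperature = 0.7,
    TopP = 0.9,
    MaxTokens = 256,
    StopSequences = new[] { "User:" },
};

// Converting null settings applies the default max tokens...
var fromNull = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, defaultMaxTokens: 256);

// ...while other PromptExecutionSettings types are round-tripped through JSON
// using LLamaSharpPromptExecutionSettingsConverter.
PromptExecutionSettings generic = new() { ModelId = "llama" };
var converted = LLamaSharpPromptExecutionSettings.FromRequestSettings(generic);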

View File

@ -1,104 +0,0 @@
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace LLamaSharp.SemanticKernel;
/// <summary>
/// JSON converter for <see cref="LLamaSharpPromptExecutionSettings"/>
/// </summary>
public class LLamaSharpPromptExecutionSettingsConverter : JsonConverter<LLamaSharpPromptExecutionSettings>
{
/// <inheritdoc/>
public override LLamaSharpPromptExecutionSettings? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
{
var requestSettings = new LLamaSharpPromptExecutionSettings();
while (reader.Read() && reader.TokenType != JsonTokenType.EndObject)
{
if (reader.TokenType == JsonTokenType.PropertyName)
{
string? propertyName = reader.GetString();
if (propertyName is not null)
{
// normalise property name to uppercase
propertyName = propertyName.ToUpperInvariant();
}
reader.Read();
switch (propertyName)
{
case "MODELID":
case "MODEL_ID":
requestSettings.ModelId = reader.GetString();
break;
case "TEMPERATURE":
requestSettings.Temperature = reader.GetDouble();
break;
case "TOPP":
case "TOP_P":
requestSettings.TopP = reader.GetDouble();
break;
case "FREQUENCYPENALTY":
case "FREQUENCY_PENALTY":
requestSettings.FrequencyPenalty = reader.GetDouble();
break;
case "PRESENCEPENALTY":
case "PRESENCE_PENALTY":
requestSettings.PresencePenalty = reader.GetDouble();
break;
case "MAXTOKENS":
case "MAX_TOKENS":
requestSettings.MaxTokens = reader.GetInt32();
break;
case "STOPSEQUENCES":
case "STOP_SEQUENCES":
requestSettings.StopSequences = JsonSerializer.Deserialize<IList<string>>(ref reader, options) ?? Array.Empty<string>();
break;
case "RESULTSPERPROMPT":
case "RESULTS_PER_PROMPT":
requestSettings.ResultsPerPrompt = reader.GetInt32();
break;
case "TOKENSELECTIONBIASES":
case "TOKEN_SELECTION_BIASES":
requestSettings.TokenSelectionBiases = JsonSerializer.Deserialize<IDictionary<int, int>>(ref reader, options) ?? new Dictionary<int, int>();
break;
default:
reader.Skip();
break;
}
}
}
return requestSettings;
}
/// <inheritdoc/>
public override void Write(Utf8JsonWriter writer, LLamaSharpPromptExecutionSettings value, JsonSerializerOptions options)
{
writer.WriteStartObject();
writer.WriteNumber("temperature", value.Temperature);
writer.WriteNumber("top_p", value.TopP);
writer.WriteNumber("frequency_penalty", value.FrequencyPenalty);
writer.WriteNumber("presence_penalty", value.PresencePenalty);
if (value.MaxTokens is null)
{
writer.WriteNull("max_tokens");
}
else
{
writer.WriteNumber("max_tokens", (decimal)value.MaxTokens);
}
writer.WritePropertyName("stop_sequences");
JsonSerializer.Serialize(writer, value.StopSequences, options);
writer.WriteNumber("results_per_prompt", value.ResultsPerPrompt);
writer.WritePropertyName("token_selection_biases");
JsonSerializer.Serialize(writer, value.TokenSelectionBiases, options);
writer.WriteEndObject();
}
}
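
A small round-trip sketch of the converter above, mirroring the pattern used by the unit tests later in this diff; the JSON values are illustrative:

using System.Text.Json;
using LLamaSharp.SemanticKernel;

var options = new JsonSerializerOptions { AllowTrailingCommas = true };
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());

// Property names may be snake_case or PascalCase; Read() upper-cases them before matching.
var json = @"{ ""temperature"": 0.5, ""max_tokens"": 250, ""stop_sequences"": [ ""User:"" ] }";
var settings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);

// Write() emits the snake_case property names shown above.
var roundTripped = JsonSerializer.Serialize(settings, options);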

View File

@ -1,4 +1,5 @@
using LLama.Abstractions;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Services;
using Microsoft.SemanticKernel.TextGeneration;
@ -23,7 +24,7 @@ public sealed class LLamaSharpTextCompletion : ITextGenerationService
/// <inheritdoc/>
public async Task<IReadOnlyList<TextContent>> GetTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
{
var settings = LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings);
var settings = ChatRequestSettings.FromRequestSettings(executionSettings);
var result = executor.InferAsync(prompt, settings?.ToLLamaSharpInferenceParams(), cancellationToken);
var sb = new StringBuilder();
await foreach (var token in result)
@ -36,7 +37,7 @@ public sealed class LLamaSharpTextCompletion : ITextGenerationService
/// <inheritdoc/>
public async IAsyncEnumerable<StreamingTextContent> GetStreamingTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var settings = LLamaSharpPromptExecutionSettings.FromRequestSettings(executionSettings);
var settings = ChatRequestSettings.FromRequestSettings(executionSettings);
var result = executor.InferAsync(prompt, settings?.ToLLamaSharpInferenceParams(), cancellationToken);
await foreach (var token in result)
{

View File

@ -15,10 +15,9 @@ namespace LLama.Unittest
public BasicTest(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -15,10 +15,9 @@ public sealed class BeamTests
public BeamTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}
@ -28,7 +27,7 @@ public sealed class BeamTests
_model.Dispose();
}
[Fact]
[Fact(Skip = "Very very slow in CI")]
public void BasicBeam()
{
const int num_beams = 2;
@ -37,15 +36,15 @@ public sealed class BeamTests
var context = _model.CreateContext(_params);
var initial_tokens = context.Tokenize(prompt);
var batch = new LLamaBatch();
batch.AddRange(initial_tokens, 0, LLamaSeqId.Zero, true);
context.Decode(batch);
var result = new StringBuilder();
var initial_tokens = context.Tokenize(prompt);
result.Append(prompt);
//context.Eval(initial_tokens.AsSpan(), 0);
throw new NotImplementedException("Replace Eval");
var decoder = new StreamingTokenDecoder(context);
NativeApi.llama_beam_search(context.NativeHandle, (data, state) =>
{
// Show the current state of every beam.
for (var i = 0; i < state.Beams.Length; i++)
{
ref var view = ref state.Beams[i];
@ -57,17 +56,20 @@ public sealed class BeamTests
_testOutputHelper.WriteLine($"B{i} ({view.CumulativeProbability}) => '{tokens}'");
}
// Once all beams agree on some tokens read them and append them to the output decoder
if (state.CommonPrefixLength > 0)
{
var view = state.Beams[0];
var decoder = new StreamingTokenDecoder(context);
decoder.AddRange(view.Tokens.Slice(0, (int)state.CommonPrefixLength));
var tokens = decoder.Read();
result.Append(tokens);
}
}, IntPtr.Zero, num_beams, initial_tokens.Length, n_predict, Math.Max(1, Environment.ProcessorCount / 2));
_testOutputHelper.WriteLine($"Final: {prompt}{decoder.Read()}");
_testOutputHelper.WriteLine($"Final: {result}");
}
}

View File

@ -1,34 +1,10 @@
using System.Runtime.InteropServices;
namespace LLama.Unittest
namespace LLama.Unittest
{
internal static class Constants
{
public static readonly string GenerativeModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";
public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
/// <summary>
/// Calculate the GpuLayerCount to use in unit tests
/// </summary>
/// <returns> Defaults to 20 in all cases, except MacOS/OSX release builds where it returns 0 (to disable METAL on GitHub CI)</returns>
public static int CIGpuLayerCount
{
get
{
if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
{
#if DEBUG
return 20;
#else
return 0;
#endif
}
else return 20;
}
}
public static string ModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
public static string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
public static string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
public static string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
}
}

View File

@ -12,11 +12,10 @@ namespace LLama.Unittest
public GrammarTest()
{
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
Seed = 92,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -31,9 +31,6 @@
<DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true"></DownloadFile>
<DownloadFile SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf" DestinationFolder="Models" DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf" SkipUnchangedFiles="true"></DownloadFile>
</Target>
<ItemGroup>
@ -46,9 +43,6 @@
</ItemGroup>
<ItemGroup>
<None Update="Models\all-MiniLM-L12-v2.Q8_0.gguf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Models\llama-2-7b-chat.Q3_K_S.gguf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>

View File

@ -11,10 +11,9 @@ namespace LLama.Unittest
public LLamaContextTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 768,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_weights = LLamaWeights.LoadFromFile(@params);
_context = _weights.CreateContext(@params);

View File

@ -1,15 +1,30 @@
using LLama.Common;
using LLama.Common;
using Xunit.Abstractions;
namespace LLama.Unittest;
public sealed class LLamaEmbedderTests
: IDisposable
{
private readonly ITestOutputHelper _testOutputHelper;
private readonly LLamaEmbedder _embedder;
public LLamaEmbedderTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 4096,
Threads = 5,
EmbeddingMode = true,
};
using var weights = LLamaWeights.LoadFromFile(@params);
_embedder = new(weights, @params);
}
public void Dispose()
{
_embedder.Dispose();
}
private static float Dot(float[] a, float[] b)
@ -18,26 +33,13 @@ public sealed class LLamaEmbedderTests
return a.Zip(b, (x, y) => x * y).Sum();
}
private async Task CompareEmbeddings(string modelPath)
[Fact]
public async Task EmbedCompare()
{
var @params = new ModelParams(modelPath)
{
ContextSize = 8,
Threads = 4,
Embeddings = true,
GpuLayerCount = Constants.CIGpuLayerCount,
};
using var weights = LLamaWeights.LoadFromFile(@params);
using var embedder = new LLamaEmbedder(weights, @params);
var cat = await embedder.GetEmbeddings("The cat is cute");
Assert.DoesNotContain(float.NaN, cat);
var kitten = await embedder.GetEmbeddings("The kitten is kawaii");
Assert.DoesNotContain(float.NaN, kitten);
var spoon = await embedder.GetEmbeddings("The spoon is not real");
Assert.DoesNotContain(float.NaN, spoon);
var cat = await _embedder.GetEmbeddings("The cat is cute");
var kitten = await _embedder.GetEmbeddings("The kitten is kawaii");
var spoon = await _embedder.GetEmbeddings("The spoon is not real");
_testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]");
_testOutputHelper.WriteLine($"Kitten = [{string.Join(",", kitten.AsMemory().Slice(0, 7).ToArray())}...]");
@ -45,23 +47,6 @@ public sealed class LLamaEmbedderTests
var close = 1 - Dot(cat, kitten);
var far = 1 - Dot(cat, spoon);
_testOutputHelper.WriteLine("");
_testOutputHelper.WriteLine($"Cat.Kitten (Close): {close:F4}");
_testOutputHelper.WriteLine($"Cat.Spoon (Far): {far:F4}");
Assert.True(close < far);
}
[Fact]
public async Task EmbedCompareEmbeddingModel()
{
await CompareEmbeddings(Constants.EmbeddingModelPath);
}
[Fact]
public async Task EmbedCompareGenerateModel()
{
await CompareEmbeddings(Constants.GenerativeModelPath);
}
}
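
The comparison above treats each embedding as a unit vector, so 1 - Dot(a, b) behaves like a cosine distance. When that normalisation cannot be assumed, dividing by the vector magnitudes gives the cosine similarity explicitly; a short sketch (the helper name is illustrative):

using System;
using System.Linq;

// dot(a, b) / (|a| * |b|); identical to the plain dot product when both vectors have unit length.
static float CosineSimilarity(float[] a, float[] b)
{
    var dot = a.Zip(b, (x, y) => x * y).Sum();
    var magnitudeA = MathF.Sqrt(a.Sum(x => x * x));
    var magnitudeB = MathF.Sqrt(b.Sum(x => x * x));
    return dot / (magnitudeA * magnitudeB);
}

Console.WriteLine(CosineSimilarity(new[] { 1f, 0f }, new[] { 1f, 0f })); // 1
Console.WriteLine(CosineSimilarity(new[] { 1f, 0f }, new[] { 0f, 1f })); // 0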

View File

@ -14,11 +14,10 @@ namespace LLama.Unittest
public LLavaWeightTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
// Llava models requires big context
ContextSize = 4096,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 4096
};
_llamaWeights = LLamaWeights.LoadFromFile(@params);
_lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);
@ -33,7 +32,7 @@ namespace LLama.Unittest
_lLavaWeights.Dispose();
}
[Fact,Trait("Category", "NoCI")]
[Fact(Skip = "Very very slow in CI")]
public void EmbedImageAsFileName()
{
int n_past = 0;
@ -41,7 +40,7 @@ namespace LLama.Unittest
Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
}
[Fact,Trait("Category", "NoCI")]
[Fact(Skip = "Very very slow in CI")]
public void EmbedImageAsBinary()
{
int n_past = 0;

View File

@ -7,10 +7,9 @@ public class MemoryDisposalTests
[Fact]
public void ModelDisposal()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = 0,
ContextSize = 2048
};
var model = LLamaWeights.LoadFromFile(@params);
@ -22,10 +21,9 @@ public class MemoryDisposalTests
[Fact]
public void ContextDisposal()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
var @params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
var model = LLamaWeights.LoadFromFile(@params);

View File

@ -1,5 +1,4 @@
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
using System.Text.Json;
namespace LLama.Unittest.SemanticKernel
@ -11,11 +10,11 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
var options = new JsonSerializerOptions();
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = "{}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);
@ -37,7 +36,7 @@ namespace LLama.Unittest.SemanticKernel
// Arrange
var options = new JsonSerializerOptions();
options.AllowTrailingCommas = true;
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = @"{
""frequency_penalty"": 0.5,
""max_tokens"": 250,
@ -50,7 +49,7 @@ namespace LLama.Unittest.SemanticKernel
}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);
@ -74,7 +73,7 @@ namespace LLama.Unittest.SemanticKernel
// Arrange
var options = new JsonSerializerOptions();
options.AllowTrailingCommas = true;
options.Converters.Add(new LLamaSharpPromptExecutionSettingsConverter());
options.Converters.Add(new ChatRequestSettingsConverter());
var json = @"{
""FrequencyPenalty"": 0.5,
""MaxTokens"": 250,
@ -87,7 +86,7 @@ namespace LLama.Unittest.SemanticKernel
}";
// Act
var requestSettings = JsonSerializer.Deserialize<LLamaSharpPromptExecutionSettings>(json, options);
var requestSettings = JsonSerializer.Deserialize<ChatRequestSettings>(json, options);
// Assert
Assert.NotNull(requestSettings);

View File

@ -1,4 +1,4 @@
using LLamaSharp.SemanticKernel;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
namespace LLama.Unittest.SemanticKernel
@ -10,7 +10,7 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, null);
var requestSettings = ChatRequestSettings.FromRequestSettings(null, null);
// Assert
Assert.NotNull(requestSettings);
@ -31,7 +31,7 @@ namespace LLama.Unittest.SemanticKernel
{
// Arrange
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(null, 200);
var requestSettings = ChatRequestSettings.FromRequestSettings(null, 200);
// Assert
Assert.NotNull(requestSettings);
@ -51,7 +51,7 @@ namespace LLama.Unittest.SemanticKernel
public void ChatRequestSettings_FromExistingRequestSettings()
{
// Arrange
var originalRequestSettings = new LLamaSharpPromptExecutionSettings()
var originalRequestSettings = new ChatRequestSettings()
{
FrequencyPenalty = 0.5,
MaxTokens = 100,
@ -64,7 +64,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -81,7 +81,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -109,7 +109,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);
@ -148,7 +148,7 @@ namespace LLama.Unittest.SemanticKernel
};
// Act
var requestSettings = LLamaSharpPromptExecutionSettings.FromRequestSettings(originalRequestSettings);
var requestSettings = ChatRequestSettings.FromRequestSettings(originalRequestSettings);
// Assert
Assert.NotNull(requestSettings);

View File

@ -37,7 +37,7 @@ namespace LLamaSharp.SemanticKernel.Tests
public void ToLLamaSharpInferenceParams_StateUnderTest_ExpectedBehavior()
{
// Arrange
var requestSettings = new LLamaSharpPromptExecutionSettings();
var requestSettings = new ChatRequestSettings();
// Act
var result = ExtensionMethods.ToLLamaSharpInferenceParams(

View File

@ -15,12 +15,11 @@ namespace LLama.Unittest
public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 60,
Seed = 1754,
BatchSize = 2,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_weights = LLamaWeights.LoadFromFile(_params);
}

View File

@ -14,7 +14,7 @@ public class StreamingTextDecoderTests
public StreamingTextDecoderTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
_params = new ModelParams(Constants.GenerativeModelPath);
_params = new ModelParams(Constants.ModelPath);
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -1,252 +0,0 @@
using System.Text;
using LLama.Common;
using LLama.Native;
namespace LLama.Unittest;
public sealed class TemplateTests
: IDisposable
{
private readonly LLamaWeights _model;
public TemplateTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath)
{
ContextSize = 1,
GpuLayerCount = Constants.CIGpuLayerCount
};
_model = LLamaWeights.LoadFromFile(@params);
}
public void Dispose()
{
_model.Dispose();
}
[Fact]
public void BasicTemplate()
{
var templater = new LLamaTemplate(_model);
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
templater.Add("assistant", "222");
Assert.Equal(5, templater.Count);
templater.Add("user", "bbb");
Assert.Equal(6, templater.Count);
templater.Add("assistant", "333");
Assert.Equal(7, templater.Count);
templater.Add("user", "ccc");
Assert.Equal(8, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(8, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(8, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<|im_start|>assistant\nhello<|im_end|>\n" +
"<|im_start|>user\nworld<|im_end|>\n" +
"<|im_start|>assistant\n" +
"111<|im_end|>" +
"\n<|im_start|>user\n" +
"aaa<|im_end|>\n" +
"<|im_start|>assistant\n" +
"222<|im_end|>\n" +
"<|im_start|>user\n" +
"bbb<|im_end|>\n" +
"<|im_start|>assistant\n" +
"333<|im_end|>\n" +
"<|im_start|>user\n" +
"ccc<|im_end|>\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void CustomTemplate()
{
var templater = new LLamaTemplate("gemma");
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(4, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(4, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<start_of_turn>model\n" +
"hello<end_of_turn>\n" +
"<start_of_turn>user\n" +
"world<end_of_turn>\n" +
"<start_of_turn>model\n" +
"111<end_of_turn>\n" +
"<start_of_turn>user\n" +
"aaa<end_of_turn>\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void BasicTemplateWithAddAssistant()
{
var templater = new LLamaTemplate(_model)
{
AddAssistant = true,
};
Assert.Equal(0, templater.Count);
templater.Add("assistant", "hello");
Assert.Equal(1, templater.Count);
templater.Add("user", "world");
Assert.Equal(2, templater.Count);
templater.Add("assistant", "111");
Assert.Equal(3, templater.Count);
templater.Add("user", "aaa");
Assert.Equal(4, templater.Count);
templater.Add("assistant", "222");
Assert.Equal(5, templater.Count);
templater.Add("user", "bbb");
Assert.Equal(6, templater.Count);
templater.Add("assistant", "333");
Assert.Equal(7, templater.Count);
templater.Add("user", "ccc");
Assert.Equal(8, templater.Count);
// Call once with empty array to discover length
var length = templater.Apply(Array.Empty<byte>());
var dest = new byte[length];
Assert.Equal(8, templater.Count);
// Call again to get contents
length = templater.Apply(dest);
Assert.Equal(8, templater.Count);
var templateResult = Encoding.UTF8.GetString(dest.AsSpan(0, length));
const string expected = "<|im_start|>assistant\nhello<|im_end|>\n" +
"<|im_start|>user\nworld<|im_end|>\n" +
"<|im_start|>assistant\n" +
"111<|im_end|>" +
"\n<|im_start|>user\n" +
"aaa<|im_end|>\n" +
"<|im_start|>assistant\n" +
"222<|im_end|>\n" +
"<|im_start|>user\n" +
"bbb<|im_end|>\n" +
"<|im_start|>assistant\n" +
"333<|im_end|>\n" +
"<|im_start|>user\n" +
"ccc<|im_end|>\n" +
"<|im_start|>assistant\n";
Assert.Equal(expected, templateResult);
}
[Fact]
public void GetOutOfRangeThrows()
{
var templater = new LLamaTemplate(_model);
Assert.Throws<ArgumentOutOfRangeException>(() => templater[0]);
templater.Add("assistant", "1");
templater.Add("user", "2");
Assert.Throws<ArgumentOutOfRangeException>(() => templater[-1]);
Assert.Throws<ArgumentOutOfRangeException>(() => templater[2]);
}
[Fact]
public void RemoveMid()
{
var templater = new LLamaTemplate(_model);
templater.Add("assistant", "1");
templater.Add("user", "2");
templater.Add("assistant", "3");
templater.Add("user", "4a");
templater.Add("user", "4b");
templater.Add("assistant", "5");
Assert.Equal("user", templater[3].Role);
Assert.Equal("4a", templater[3].Content);
Assert.Equal("assistant", templater[5].Role);
Assert.Equal("5", templater[5].Content);
Assert.Equal(6, templater.Count);
templater.RemoveAt(3);
Assert.Equal(5, templater.Count);
Assert.Equal("user", templater[3].Role);
Assert.Equal("4b", templater[3].Content);
Assert.Equal("assistant", templater[4].Role);
Assert.Equal("5", templater[4].Content);
}
[Fact]
public void RemoveLast()
{
var templater = new LLamaTemplate(_model);
templater.Add("assistant", "1");
templater.Add("user", "2");
templater.Add("assistant", "3");
templater.Add("user", "4a");
templater.Add("user", "4b");
templater.Add("assistant", "5");
Assert.Equal(6, templater.Count);
templater.RemoveAt(5);
Assert.Equal(5, templater.Count);
Assert.Equal("user", templater[4].Role);
Assert.Equal("4b", templater[4].Content);
}
[Fact]
public void RemoveOutOfRange()
{
var templater = new LLamaTemplate(_model);
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(0));
templater.Add("assistant", "1");
templater.Add("user", "2");
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(-1));
Assert.Throws<ArgumentOutOfRangeException>(() => templater.RemoveAt(2));
}
}

View File

@ -12,10 +12,9 @@ public sealed class TokenTests
public TokenTests()
{
_params = new ModelParams(Constants.GenerativeModelPath)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 2048,
GpuLayerCount = Constants.CIGpuLayerCount,
ContextSize = 2048
};
_model = LLamaWeights.LoadFromFile(_params);
}

View File

@ -1,7 +1,7 @@
namespace LLama.Web.Async
{
/// <summary>
/// Create an Async locking using statement
/// Create an Async locking using statment
/// </summary>
public sealed class AsyncLock
{

View File

@ -29,13 +29,9 @@ namespace LLama.Web.Common
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
public uint SeqMax { get; }
/// <inheritdoc />
public uint Seed { get; set; } = 1686349486;
public bool Embeddings { get; }
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;
@ -61,7 +57,7 @@ namespace LLama.Web.Common
public uint BatchSize { get; set; } = 512;
/// <inheritdoc />
public uint UBatchSize { get; set; } = 512;
public bool EmbeddingMode { get; set; } = false;
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();
@ -112,6 +108,6 @@ namespace LLama.Web.Common
public float DefragThreshold { get; set; }
/// <inheritdoc />
public LLamaPoolingType PoolingType { get; set; }
public bool DoPooling { get; set; }
}
}

View File

@ -34,14 +34,14 @@ namespace LLama.Web
private static List<string> CombineCSV(List<string> list, string csv)
{
var results = list is null || list.Count == 0
? CommaSeparatedToList(csv)
: CommaSeparatedToList(csv).Concat(list);
? CommaSeperatedToList(csv)
: CommaSeperatedToList(csv).Concat(list);
return results
.Distinct()
.ToList();
}
private static List<string> CommaSeparatedToList(string value)
private static List<string> CommaSeperatedToList(string value)
{
if (string.IsNullOrEmpty(value))
return new List<string>();

View File

@ -30,7 +30,7 @@ namespace LLama.Web.Hubs
{
_logger.Log(LogLevel.Information, "[OnDisconnectedAsync], Id: {0}", Context.ConnectionId);
// Remove connections session on disconnect
// Remove connections session on dissconnect
await _modelSessionService.CloseAsync(Context.ConnectionId);
await base.OnDisconnectedAsync(exception);
}

View File

@ -1,8 +1,8 @@
## LLama.Web - Basic ASP.NET Core examples of LLamaSharp in action
LLama.Web has no heavy dependencies and no extra frameworks over bootstrap and jquery to keep the examples clean and easy to copy over to your own project
LLama.Web has no heavy dependencies and no extra frameworks ove bootstrap and jquery to keep the examples clean and easy to copy over to your own project
## Websockets
Using signalr websockets simplifies the streaming of responses and model per connection management
Using signalr websockets simplifys the streaming of responses and model per connection management
@ -23,7 +23,7 @@ Example:
{
"Name": "Alpaca",
"Path": "D:\\Repositories\\AI\\Prompts\\alpaca.txt",
"Prompt": "Alternatively to can set a prompt text directly and omit the Path"
"Prompt": "Alternativly to can set a prompt text directly and omit the Path"
"AntiPrompt": [
"User:"
],

View File

@ -8,7 +8,7 @@ namespace LLama.Web.Services
{
/// <summary>
/// Service for handling Models,Weights & Contexts
/// Sercive for handling Models,Weights & Contexts
/// </summary>
public class ModelService : IModelService
{

View File

@ -9,7 +9,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.VisualStudio.Validation" Version="17.8.8" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.3" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.2" />
<PackageReference Include="Swashbuckle.AspNetCore" Version="6.5.0" />
</ItemGroup>

View File

@ -14,29 +14,20 @@ public interface IContextParams
uint? ContextSize { get; }
/// <summary>
/// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
uint BatchSize { get; }
/// <summary>
/// Physical batch size
/// </summary>
uint UBatchSize { get; }
/// <summary>
/// max number of sequences (i.e. distinct states for recurrent models)
/// </summary>
uint SeqMax { get; }
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
uint Seed { get; }
/// <summary>
/// If true, extract embeddings (together with logits).
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
bool Embeddings { get; }
bool EmbeddingMode { get; }
/// <summary>
/// RoPE base frequency (null to fetch from the model)
@ -114,7 +105,7 @@ public interface IContextParams
float DefragThreshold { get; }
/// <summary>
/// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
/// Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
/// </summary>
LLamaPoolingType PoolingType { get; }
bool DoPooling { get; }
}

View File

@ -6,7 +6,7 @@ using LLama.Sampling;
namespace LLama.Abstractions
{
/// <summary>
/// The parameters used for inference.
/// The paramters used for inference.
/// </summary>
public interface IInferenceParams
{

View File

@ -20,15 +20,16 @@ namespace LLama.Abstractions
/// </summary>
public bool IsMultiModal { get; }
/// <summary>
/// Multi-Modal Projections / Clip Model weights
/// Muti-Modal Projections / Clip Model weights
/// </summary>
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel { get; }
/// <summary>
/// List of images: List of images in byte array format.
/// List of images: Image filename and path (jpeg images).
/// </summary>
public List<byte[]> Images { get; }
public List<string> ImagePaths { get; set; }
/// <summary>
/// Asynchronously infers a response from the model.
/// </summary>

View File

@ -232,7 +232,7 @@ namespace LLama.Abstractions
public sealed record MetadataOverride
{
/// <summary>
/// Get the key being overridden by this override
/// Get the key being overriden by this override
/// </summary>
public string Key { get; }

View File

@ -55,6 +55,23 @@ public sealed class BatchedExecutor
Epoch = 1;
}
/// <summary>
/// Start a new <see cref="Conversation"/> with the given prompt
/// </summary>
/// <param name="prompt"></param>
/// <returns></returns>
[Obsolete("Use BatchedExecutor.Create instead")]
public Conversation Prompt(string prompt)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Prompt(prompt);
return conversation;
}
/// <summary>
/// Start a new <see cref="Conversation"/>
/// </summary>
@ -67,39 +84,6 @@ public sealed class BatchedExecutor
return new Conversation(this, GetNextSequenceId());
}
/// <summary>
/// Load a conversation that was previously saved to a file. Once loaded the conversation will
/// need to be prompted.
/// </summary>
/// <param name="filepath"></param>
/// <returns></returns>
/// <exception cref="ObjectDisposedException"></exception>
public Conversation Load(string filepath)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Load(filepath);
return conversation;
}
/// <summary>
/// Load a conversation that was previously saved into memory. Once loaded the conversation will need to be prompted.
/// </summary>
/// <param name="state"></param>
/// <returns></returns>
/// <exception cref="ObjectDisposedException"></exception>
public Conversation Load(Conversation.State state)
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(BatchedExecutor));
var conversation = Create();
conversation.Load(state);
return conversation;
}
/// <summary>
/// Run inference for all conversations in the batch which have pending tokens.
///

View File

@ -2,7 +2,6 @@
using System.Buffers;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text.Json;
using LLama.Native;
namespace LLama.Batched;
@ -15,7 +14,7 @@ public sealed class Conversation
{
private ulong _requiredEpoch;
private LLamaPos _end;
private int _batchSampleIndex;
private int _batchIndex;
private bool _disposed;
private bool _forked;
@ -108,7 +107,7 @@ public sealed class Conversation
// logits, so sampling one conversation may mess up the fork! Setting the "forked" flag on both sequences ensures
// they both copy the logits before the next sampling run, to fix this issue.
_requiredEpoch = _requiredEpoch,
_batchSampleIndex = _batchSampleIndex,
_batchIndex = _batchIndex,
_forked = true,
_end = _end,
@ -141,7 +140,7 @@ public sealed class Conversation
if (_requiredEpoch > Executor.Epoch)
throw new CannotSampleRequiresInferenceException();
var span = Executor.Context.NativeHandle.GetLogitsIth(_batchSampleIndex);
var span = Executor.Context.NativeHandle.GetLogitsIth(_batchIndex);
// If necessary copy the span, to protect it from modification. This is only done when
// this conversation has been forked in this epoch.
@ -166,12 +165,11 @@ public sealed class Conversation
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
[Obsolete("Tokenize the text and pass the tokens instead")]
public void Prompt(string input, bool addBos, bool special)
public void Prompt(string input)
{
AssertCanBePrompted();
Prompt(Executor.Context.Tokenize(input, addBos, special));
Prompt(Executor.Context.Tokenize(input));
}
/// <summary>
@ -222,7 +220,7 @@ public sealed class Conversation
// Add the prompt to the batch
for (var i = 0; i < tokens.Length; i++)
_batchSampleIndex = Executor.Batch.Add(tokens[i], _end++, ConversationId, i == tokens.Length - 1);
_batchIndex = Executor.Batch.Add(tokens[i], _end++, ConversationId, i == tokens.Length - 1);
// Mark this conversation as needing inference/sampling
_requiredEpoch = Executor.Epoch + 1;
@ -352,168 +350,4 @@ public sealed class Conversation
/// <returns>The new end token position</returns>
public delegate LLamaPos ModifyKvCache(LLamaPos end, KvAccessor kv);
#endregion
#region save/load
private void AssertCanLoad()
{
AssertNotDisposed();
if (_end.Value > 0)
throw new InvalidOperationException("Cannot load into a non-empty conversation");
}
private void AssertCanSave()
{
AssertNotDisposed();
if (RequiresInference)
throw new CannotSaveWhileRequiresInferenceException();
}
/// <summary>
/// Save the complete state of this conversation to a file. If the file already exists, it will be overwritten.
/// </summary>
/// <param name="filepath"></param>
/// <exception cref="CannotSaveWhileRequiresInferenceException"></exception>
public void Save(string filepath)
{
AssertCanSave();
// Prepare extra state to put into file header
var state = GetState();
var bytes = JsonSerializer.SerializeToUtf8Bytes(state);
// Save extra state along with the KV cache
Executor.Context.SaveState(filepath, ConversationId, bytes);
}
/// <summary>
/// Save the complete state of this conversation in system memory.
/// </summary>
/// <returns></returns>
public State Save()
{
AssertCanSave();
return new PrivateState(
Executor.Context.GetState(ConversationId),
GetState()
);
}
/// <summary>
/// Load state from a file
/// This should only ever be called by the BatchedExecutor, on a newly created conversation object!
/// </summary>
/// <param name="filepath"></param>
/// <exception cref="InvalidOperationException"></exception>
internal void Load(string filepath)
{
AssertCanLoad();
// Load the state from file into the KV cache
Executor.Context.LoadState(filepath, ConversationId, out var header);
// deserialize the extra state in the file header
var state = JsonSerializer.Deserialize<SerializableConversationState>(header);
if (state == null)
{
Dispose();
throw new InvalidOperationException("Failed to deserialize - deserialized header state was null");
}
Load(state);
}
/// <summary>
/// Load state from a previously saved state.
/// This should only ever be called by the BatchedExecutor, on a newly created conversation object!
/// </summary>
/// <param name="state"></param>
internal void Load(State state)
{
AssertCanLoad();
// There is only one class that extends State and it is PrivateState, so this cast is safe.
var priv = (PrivateState)state;
// Load the state from file into the KV cache
Executor.Context.LoadState(priv.SequenceState, ConversationId);
Load(priv.ConversationState);
}
private void Load(SerializableConversationState state)
{
if (state.Version != 1)
throw new InvalidOperationException("Failed to deserialize - mismatched version number");
// Load extra conversation state
_end = state.TokenCount;
}
private SerializableConversationState GetState()
{
return new SerializableConversationState(
Version: 1,
TokenCount: TokenCount
);
}
private record SerializableConversationState(int Version, int TokenCount);
private sealed class PrivateState
: State
{
public readonly LLamaContext.SequenceState SequenceState;
public readonly SerializableConversationState ConversationState;
public override ulong Size => SequenceState.Size;
public PrivateState(LLamaContext.SequenceState sequenceState, SerializableConversationState conversationState)
{
SequenceState = sequenceState;
ConversationState = conversationState;
}
/// <inheritdoc />
public override void Dispose()
{
if (IsDisposed)
throw new ObjectDisposedException(nameof(State));
IsDisposed = true;
SequenceState.Dispose();
}
}
/// <summary>
/// In memory saved state of a <see cref="Conversation"/>
/// </summary>
public abstract class State
: IDisposable
{
/// <summary>
/// Indicates if this state has been disposed
/// </summary>
public bool IsDisposed { get; protected set; }
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public abstract ulong Size { get; }
/// <inheritdoc />
public abstract void Dispose();
/// <summary>
/// Internal constructor to prevent anyone outside of LLamaSharp from extending this class
/// </summary>
internal State()
{
}
}
#endregion
}
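
Combined with the BatchedExecutor.Load overloads shown earlier in this diff, the save/load members above let a conversation be checkpointed to disk and resumed later. A rough usage sketch, assuming an already constructed executor; Infer() appears in this diff only by name, so awaiting it as a Task is an assumption:

using System.Threading.Tasks;
using LLama.Batched;

internal static class ConversationCheckpointSketch
{
    public static async Task RunAsync(BatchedExecutor executor, string prompt)
    {
        var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Inference must run before saving, otherwise Save() throws
        // CannotSaveWhileRequiresInferenceException (defined above).
        await executor.Infer();

        // Persists the KV cache for this sequence plus the small JSON header written by Save().
        conversation.Save("conversation.bin");
        conversation.Dispose();

        // Later: rebuild a conversation from the file. It must be prompted again before sampling.
        using var restored = executor.Load("conversation.bin");
    }
}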

View File

@ -56,6 +56,18 @@ public class CannotSampleRequiresPromptException
}
}
/// <summary>
/// This exception is thrown when <see cref="Conversation.Fork"/> is called when <see cref="Conversation.RequiresInference"/> = true
/// </summary>
public class CannotForkWhileRequiresInferenceException
: ExperimentalBatchedExecutorException
{
internal CannotForkWhileRequiresInferenceException()
: base("Cannot `Fork()` a conversation while RequiresInference is true")
{
}
}
/// <summary>
/// This exception is thrown when <see cref="Conversation.Modify"/> is called when <see cref="Conversation.RequiresInference"/> = true
/// </summary>
@ -66,18 +78,4 @@ public class CannotModifyWhileRequiresInferenceException
: base("Cannot `Modify()` a conversation while RequiresInference is true")
{
}
}
/// <summary>
/// This exception is thrown when "Save()" is called on a <see cref="Conversation"/> which has
/// already been prompted and before "Infer()" has been called on the <see cref="BatchedExecutor"/>.
/// </summary>
public class CannotSaveWhileRequiresInferenceException
: ExperimentalBatchedExecutorException
{
internal CannotSaveWhileRequiresInferenceException()
: base("Must call `Infer()` before saving this Conversation")
{
}
}
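
A short sketch of the guard these exceptions enforce: Fork(), Modify() and Save() all throw while RequiresInference is true, so inference must run first. Conversation.RequiresInference and Conversation.Fork() are referenced above; BatchedExecutor.Infer() is assumed from the wider library and its exact signature may differ.

using System.Threading.Tasks;
using LLama.Batched;

internal static class ForkGuardExample
{
    public static async Task<Conversation> SafeForkAsync(BatchedExecutor executor, Conversation conversation)
    {
        // Run the pending batch before forking, otherwise CannotForkWhileRequiresInferenceException is thrown
        if (conversation.RequiresInference)
            await executor.Infer();

        return conversation.Fork();
    }
}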

View File

@ -1,117 +0,0 @@
using System;
using System.Buffers.Binary;
using System.IO;
using System.IO.MemoryMappedFiles;
using LLama.Native;
namespace LLama.Batched;
internal static class LLamaContextExtensions
{
private const uint FileHeaderMagic = 3430400180;
/// <summary>
/// Save the state of a particular sequence to specified path. Also save some extra data which will be returned when loading.
/// Data saved with this method <b>must</b> be loaded with <see cref="LoadState(LLamaContext, string, LLamaSeqId, out byte[])"/>
/// </summary>
/// <param name="context"></param>
/// <param name="filename"></param>
/// <param name="sequence"></param>
/// <param name="header"></param>
internal static void SaveState(this LLamaContext context, string filename, LLamaSeqId sequence, ReadOnlySpan<byte> header)
{
// Delete that file before overwriting it
if (File.Exists(filename))
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)context.NativeHandle.GetStateSize(sequence));
// Space for the "extra" header data plus an 8 byte prefix (4 byte magic number + 4 byte length)
var prefixSize = header.Length + 8;
// Total file size is the prefix plus the estimated state size
var totalFileSize = prefixSize + estimatedStateSize;
// Map the file and write the bytes directly to it.
long writtenBytes = 0;
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Create, null, totalFileSize))
{
using (var view = file.CreateViewAccessor(0, totalFileSize))
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
// Write prefix data
BinaryPrimitives.WriteUInt32BigEndian(new Span<byte>(ptr + writtenBytes, 4), FileHeaderMagic);
writtenBytes += 4;
BinaryPrimitives.WriteUInt32BigEndian(new Span<byte>(ptr + writtenBytes, 4), (uint)header.Length);
writtenBytes += 4;
header.CopyTo(new Span<byte>(ptr + writtenBytes, header.Length));
writtenBytes += header.Length;
// Write state data
writtenBytes += (long)context.NativeHandle.GetState(ptr + writtenBytes, (ulong)estimatedStateSize, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
// Truncate the file to the actual size of data that was written
using (var fileStream = new FileStream(filename, FileMode.Open))
fileStream.SetLength(writtenBytes);
}
/// <summary>
/// Load the state from the specified path into a particular sequence, also reading back the header data. Must only be used with
/// data previously saved with <see cref="SaveState(LLamaContext, string, LLamaSeqId, ReadOnlySpan{byte})"/>
/// </summary>
/// <param name="context"></param>
/// <param name="filename"></param>
/// <param name="sequence"></param>
/// <param name="header"></param>
/// <exception cref="InvalidOperationException"></exception>
internal static void LoadState(this LLamaContext context, string filename, LLamaSeqId sequence, out byte[] header)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Open, null))
using (var view = file.CreateViewAccessor())
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
var readBytes = 0;
// Read header
var magic = BinaryPrimitives.ReadUInt32BigEndian(new ReadOnlySpan<byte>(ptr + readBytes, 4));
readBytes += 4;
if (magic != FileHeaderMagic)
throw new InvalidOperationException("Invalid file header");
var headerLength = checked((int)BinaryPrimitives.ReadUInt32BigEndian(new ReadOnlySpan<byte>(ptr + readBytes, 4)));
readBytes += 4;
header = new byte[headerLength];
new Span<byte>(ptr + readBytes, headerLength).CopyTo(header);
readBytes += headerLength;
context.NativeHandle.SetState(ptr + readBytes, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
}
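
The file layout written by SaveState above is: a 4 byte big-endian magic number, a 4 byte big-endian header length, the header bytes, then the raw sequence state. A small sketch, using only BCL calls (nothing assumed from LLamaSharp), that reads back just that prefix:

using System;
using System.Buffers.Binary;
using System.IO;

internal static class StateFileInspector
{
    private const uint FileHeaderMagic = 3430400180; // matches the constant above

    public static byte[] ReadHeader(string filename)
    {
        using var reader = new BinaryReader(File.OpenRead(filename));

        // 4 byte big-endian magic number
        if (BinaryPrimitives.ReadUInt32BigEndian(reader.ReadBytes(4)) != FileHeaderMagic)
            throw new InvalidOperationException("Invalid file header");

        // 4 byte big-endian header length, then the header bytes themselves;
        // everything after this point is the raw llama.cpp sequence state
        var headerLength = checked((int)BinaryPrimitives.ReadUInt32BigEndian(reader.ReadBytes(4)));
        return reader.ReadBytes(headerLength);
    }
}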

View File

@ -74,21 +74,15 @@ public class ChatSession
/// </summary>
/// <param name="executor">The executor for this session</param>
/// <param name="history">History for this session</param>
/// <param name="transform">History Transform for this session</param>
/// <returns>A new chat session.</returns>
/// <returns></returns>
public static async Task<ChatSession> InitializeSessionFromHistoryAsync(
ILLamaExecutor executor, ChatHistory history, IHistoryTransform? transform = null)
ILLamaExecutor executor, ChatHistory history)
{
if (executor is not StatefulExecutorBase statefulExecutor)
{
throw new ArgumentException("Executor must have a StatefulExecutorBase", nameof(executor));
}
var session = new ChatSession(executor, history);
if (transform != null)
{
session = session.WithHistoryTransform(transform);
}
await statefulExecutor.PrefillPromptAsync(session.HistoryTransform.HistoryToText(history));
return session;
}
@ -551,7 +545,7 @@ public class ChatSession
InferenceParams? inferenceParams = null,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
// Make sure the last message is an assistant message (response from the LLM).
// Make sure the last message is an assistant message (reponse from the LLM).
ChatHistory.Message? lastAssistantMessage = History.Messages.LastOrDefault();
if (lastAssistantMessage is null
@ -786,4 +780,4 @@ public record SessionState
outputTransform,
historyTransform);
}
}
}
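
A hedged sketch of calling the history-prefill helper above. The executor setup is assumed (any StatefulExecutorBase such as InteractiveExecutor satisfies the type check, but its construction is not shown here); ChatHistory and AuthorRole come from LLama.Common.

using System.Threading.Tasks;
using LLama;
using LLama.Common;

internal static class SessionBootstrapExample
{
    public static async Task<ChatSession> StartAsync(InteractiveExecutor executor)
    {
        var history = new ChatHistory();
        history.AddMessage(AuthorRole.System, "You are a concise assistant.");
        history.AddMessage(AuthorRole.User, "Hello!");

        // Prefill the prompt built from the history before the first chat turn
        return await ChatSession.InitializeSessionFromHistoryAsync(executor, history);
    }
}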

View File

@ -7,7 +7,7 @@ using LLama.Sampling;
namespace LLama.Common
{
/// <summary>
/// The parameters used for inference.
/// The paramters used for inference.
/// </summary>
public record InferenceParams
: IInferenceParams

View File

@ -24,9 +24,6 @@ namespace LLama.Common
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
/// <inheritdoc />
public uint SeqMax { get; set; } = 1;
/// <inheritdoc />
public uint Seed { get; set; } = 0xFFFFFFFF;
@ -55,10 +52,7 @@ namespace LLama.Common
public uint BatchSize { get; set; } = 512;
/// <inheritdoc />
public uint UBatchSize { get; set; } = 512;
/// <inheritdoc />
public bool Embeddings { get; set; }
public bool EmbeddingMode { get; set; }
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();
@ -103,7 +97,7 @@ namespace LLama.Common
public float DefragThreshold { get; set; }
/// <inheritdoc />
public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;
public bool DoPooling { get; set; }
/// <inheritdoc />
public bool VocabOnly { get; set; }

View File

@ -6,7 +6,7 @@ using LLama.Native;
namespace LLama.Extensions
{
/// <summary>
/// Extension methods to the IContextParams interface
/// Extention methods to the IContextParams interface
/// </summary>
public static class IContextParamsExtensions
{
@ -20,14 +20,11 @@ namespace LLama.Extensions
/// <exception cref="ArgumentException"></exception>
public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
{
result = LLamaContextParams.Default();
result = NativeApi.llama_context_default_params();
result.n_ctx = @params.ContextSize ?? 0;
result.n_batch = @params.BatchSize;
result.n_ubatch = @params.UBatchSize;
result.n_seq_max = @params.SeqMax;
result.seed = @params.Seed;
result.embeddings = @params.Embeddings;
result.embedding = @params.EmbeddingMode;
result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
@ -44,13 +41,10 @@ namespace LLama.Extensions
result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;
result.abort_callback = IntPtr.Zero;
result.abort_callback_user_data = IntPtr.Zero;
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.llama_pooling_type = @params.PoolingType;
result.do_pooling = @params.DoPooling;
result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

View File

@ -7,7 +7,7 @@ using LLama.Native;
namespace LLama.Extensions;
/// <summary>
/// Extension methods to the IModelParams interface
/// Extention methods to the IModelParams interface
/// </summary>
public static class IModelParamsExtensions
{
@ -28,8 +28,7 @@ public static class IModelParamsExtensions
var disposer = new GroupDisposable();
result = LLamaModelParams.Default();
result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;

View File

@ -8,5 +8,3 @@ using System.Diagnostics.CodeAnalysis;
[assembly: SuppressMessage("Interoperability", "CA1401:P/Invokes should not be visible", Justification = "LLamaSharp intentionally exports the native llama.cpp API")]
[assembly: SuppressMessage("Style", "IDE0070:Use 'System.HashCode'", Justification = "Not compatible with netstandard2.0")]
[assembly: SuppressMessage("Interoperability", "SYSLIB1054:Use 'LibraryImportAttribute' instead of 'DllImportAttribute' to generate P/Invoke marshalling code at compile time", Justification = "Not compatible with netstandard2.0")]

View File

@ -1,4 +1,4 @@
using LLama.Exceptions;
using LLama.Exceptions;
using LLama.Native;
using System;
using System.Collections.Generic;
@ -152,7 +152,6 @@ namespace LLama
return decoder.Read();
}
#region state load/save
/// <summary>
/// Save the state to specified path.
/// </summary>
@ -164,7 +163,7 @@ namespace LLama
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)NativeHandle.GetStateSize());
var estimatedStateSize = (long)NativeApi.llama_get_state_size(NativeHandle);
// Map the file and write the bytes directly to it. This saves copying the bytes into a C# array
long writtenBytes;
@ -175,53 +174,8 @@ namespace LLama
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
writtenBytes = (long)NativeHandle.GetState(ptr, (ulong)estimatedStateSize);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
// Truncate the file to the actual size of data that was written
using (var fileStream = new FileStream(filename, FileMode.Open))
fileStream.SetLength(writtenBytes);
}
/// <summary>
/// Save the state of a particular sequence to specified path.
/// </summary>
/// <param name="filename"></param>
/// <param name="sequence"></param>
public void SaveState(string filename, LLamaSeqId sequence)
{
// Delete that file before overwriting it
if (File.Exists(filename))
File.Delete(filename);
// Estimate size of state to write to disk, this is always equal to or greater than the actual size
var estimatedStateSize = checked((long)NativeHandle.GetStateSize(sequence));
// Map the file and write the bytes directly to it. This saves copying the bytes into a C# array
long writtenBytes;
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Create, null, estimatedStateSize))
using (var view = file.CreateViewAccessor(0, estimatedStateSize))
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
writtenBytes = (long)NativeHandle.GetState(ptr, (ulong)estimatedStateSize, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
writtenBytes = (long)NativeApi.llama_copy_state_data(NativeHandle, ptr);
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
@ -233,7 +187,7 @@ namespace LLama
/// <summary>
/// Get the state data as an opaque handle, which can be loaded later using <see cref="LoadState(State)"/>
/// </summary>
/// <remarks>Use <see cref="SaveState(string)"/> if you intend to save this state to disk.</remarks>
/// <remarks>Use <see cref="SaveState"/> if you intend to save this state to disk.</remarks>
/// <returns></returns>
public State GetState()
{
@ -244,11 +198,7 @@ namespace LLama
try
{
// Copy the state data into memory, discover the actual size required
ulong actualSize;
unsafe
{
actualSize = NativeHandle.GetState((byte*)memory, stateSize);
}
var actualSize = NativeHandle.GetState(memory, stateSize);
// Shrink to size
memory = Marshal.ReAllocHGlobal(memory, (nint)actualSize);
@ -268,48 +218,11 @@ namespace LLama
}
}
/// <summary>
/// Get the state data as an opaque handle, which can be loaded later using <see cref="LoadState(State)"/>
/// </summary>
/// <remarks>Use <see cref="SaveState(string, LLamaSeqId)"/> if you intend to save this state to disk.</remarks>
/// <returns></returns>
public SequenceState GetState(LLamaSeqId sequence)
{
var stateSize = NativeHandle.GetStateSize(sequence);
// Allocate a chunk of memory large enough to hold the entire state
var memory = Marshal.AllocHGlobal((nint)stateSize);
try
{
// Copy the state data into memory, discover the actual size required
ulong actualSize;
unsafe
{
actualSize = NativeHandle.GetState((byte*)memory, stateSize, sequence);
}
// Shrink to size
memory = Marshal.ReAllocHGlobal(memory, (nint)actualSize);
// Wrap memory in a "state"
var state = new SequenceState(memory, actualSize);
// Set memory to zero, to prevent it being freed in finally block
memory = IntPtr.Zero;
return state;
}
finally
{
if (memory != IntPtr.Zero)
Marshal.FreeHGlobal(memory);
}
}
/// <summary>
/// Load the state from specified path.
/// </summary>
/// <param name="filename"></param>
/// <exception cref="RuntimeError"></exception>
public void LoadState(string filename)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
@ -320,41 +233,8 @@ namespace LLama
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
NativeHandle.SetState(ptr);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
}
/// <summary>
/// Load the state from specified path into a particular sequence
/// </summary>
/// <param name="filename"></param>
/// <param name="sequence"></param>
public void LoadState(string filename, LLamaSeqId sequence)
{
// Map state file into memory and pass that pointer directly to `llama_set_state_data` to load from
using (var file = MemoryMappedFile.CreateFromFile(filename, FileMode.Open, null))
using (var view = file.CreateViewAccessor())
{
unsafe
{
byte* ptr = null;
view.SafeMemoryMappedViewHandle.AcquirePointer(ref ptr);
try
{
NativeHandle.SetState(ptr, sequence);
}
finally
{
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
NativeApi.llama_set_state_data(NativeHandle, ptr);
view.SafeMemoryMappedViewHandle.ReleasePointer();
}
}
}
@ -368,25 +248,10 @@ namespace LLama
{
unsafe
{
NativeHandle.SetState((byte*)state.DangerousGetHandle());
NativeHandle.SetState((byte*)state.DangerousGetHandle().ToPointer());
}
}
/// <summary>
/// Load the state from memory into a particular sequence
/// </summary>
/// <param name="state"></param>
/// <param name="sequence"></param>
/// <exception cref="RuntimeError"></exception>
public void LoadState(SequenceState state, LLamaSeqId sequence)
{
unsafe
{
NativeHandle.SetState((byte*)state.DangerousGetHandle(), sequence);
}
}
#endregion
/// <summary>
/// Sample a single token from this context, using the given sampling pipeline
/// </summary>
@ -492,8 +357,8 @@ namespace LLama
}
// Save the newline logit value
var nl_token = NativeHandle.ModelHandle.Tokens.Newline;
var nl_logit = logits[(int?)nl_token ?? 0];
var nl_token = NativeApi.llama_token_nl(NativeHandle.ModelHandle);
var nl_logit = logits[(int)nl_token];
// Convert logits into token candidates
var candidates_p = LLamaTokenDataArray.Create(logits);
@ -506,7 +371,7 @@ namespace LLama
candidates_p.RepetitionPenalty(NativeHandle, last_n_array, repeatPenalty, alphaFrequency, alphaPresence);
// Restore newline token logit value if necessary
if (!penalizeNL && nl_token.HasValue)
if (!penalizeNL)
{
var candidatesSpan = candidates_p.data.Span;
for (var i = 0; i < candidates_p.data.Length; i++)
@ -521,17 +386,6 @@ namespace LLama
return candidates_p;
}
/// <summary>
/// Gets whether or not the Bos token should be added.
/// From common.cpp https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/common/common.cpp#L2417
/// </summary>
/// <returns></returns>
public bool ShouldAddBosToken()
{
var addBos = NativeApi.llama_add_bos_token(NativeHandle.ModelHandle);
return addBos != -1 ? Convert.ToBoolean(addBos) : NativeHandle.LLamaVocabType == LLamaVocabType.SentencePiece;
}
#region eval overloads
/// <summary>
/// </summary>
@ -563,16 +417,12 @@ namespace LLama
}
/// <summary>
/// The state of this context, which can be reloaded later
/// The state of this model, which can be reloaded later
/// </summary>
public class State
: SafeLLamaHandleBase
{
private readonly ulong _size;
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public ulong Size => _size;
private ulong _size;
internal State(IntPtr memory, ulong size)
: base(memory, true)
@ -591,7 +441,6 @@ namespace LLama
/// Convert this state to a byte array
/// </summary>
/// <returns></returns>
[Obsolete("It is not generally safe to convert a state into a byte array - it will fail if the state is very large")]
public byte[] ToByteArray()
{
var bytes = new byte[_size];
@ -604,7 +453,6 @@ namespace LLama
/// </summary>
/// <param name="bytes"></param>
/// <returns></returns>
[Obsolete("It is not generally safe to convert a state into a byte array - it will fail if the state is very large")]
public static State FromByteArray(byte[] bytes)
{
var memory = Marshal.AllocHGlobal(bytes.Length);
@ -612,49 +460,5 @@ namespace LLama
return new State(memory, (ulong)bytes.Length);
}
}
/// <summary>
/// The state of a single sequence, which can be reloaded later
/// </summary>
public class SequenceState
: SafeLLamaHandleBase
{
private readonly ulong _size;
/// <summary>
/// Get the size in bytes of this state object
/// </summary>
public ulong Size => _size;
internal SequenceState(IntPtr memory, ulong size)
: base(memory, true)
{
_size = size;
}
/// <inheritdoc />
protected override bool ReleaseHandle()
{
Marshal.FreeHGlobal(handle);
return true;
}
/// <summary>
/// Copy bytes to a destination pointer.
/// </summary>
/// <param name="dst">Destination to write to</param>
/// <param name="length">Length of the destination buffer</param>
/// <param name="offset">Offset from start of src to start copying from</param>
/// <returns>Number of bytes written to destination</returns>
public unsafe ulong CopyTo(byte* dst, ulong length, ulong offset = 0)
{
var copy = Math.Min(length, _size - offset);
var src = (byte*)DangerousGetHandle();
src += offset;
Buffer.MemoryCopy(src, dst, length, copy);
return copy;
}
}
}
}
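
A short sketch using the per-sequence save/load overloads shown above (they exist on only one side of this diff). It assumes a LLamaContext that already has tokens decoded into sequence 0.

using LLama;
using LLama.Native;

internal static class SequenceStateExample
{
    public static void RoundTrip(LLamaContext context)
    {
        // Persist only sequence 0 of the KV cache rather than the whole context
        context.SaveState("seq0.bin", LLamaSeqId.Zero);

        // ...later, restore that file back into the same sequence
        context.LoadState("seq0.bin", LLamaSeqId.Zero);
    }
}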

View File

@ -32,7 +32,7 @@ namespace LLama
/// <param name="logger"></param>
public LLamaEmbedder(LLamaWeights weights, IContextParams @params, ILogger? logger = null)
{
if (!@params.Embeddings)
if (!@params.EmbeddingMode)
throw new ArgumentException("EmbeddingMode must be true", nameof(@params));
Context = weights.CreateContext(@params, logger);
@ -75,7 +75,7 @@ namespace LLama
n_eval = batchSize;
batch.Clear();
batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true);
batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, false);
n_past += n_eval;
var returnCode = await Context.DecodeAsync(batch, cancellationToken);
@ -97,18 +97,10 @@ namespace LLama
private float[] GetEmbeddingsArray()
{
unsafe
{
var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
if (embeddings == null)
embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
if (embeddings == null)
return Array.Empty<float>();
return new Span<float>(embeddings, Context.EmbeddingSize).ToArray();
}
var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
if (embeddings == null)
return Array.Empty<float>();
return embeddings.ToArray();
}
private static void Normalize(Span<float> embeddings)
@ -119,10 +111,6 @@ namespace LLama
lengthSqr += value * value;
var length = (float)Math.Sqrt(lengthSqr);
// Do not divide by length if it is zero
if (length <= float.Epsilon)
return;
// Normalize
for (var i = 0; i < embeddings.Length; i++)
embeddings[i] /= length;

View File

@ -1,4 +1,4 @@
using LLama.Abstractions;
using LLama.Abstractions;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
@ -76,11 +76,11 @@ namespace LLama
}
/// <inheritdoc />
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel { get; }
/// <inheritdoc />
public List<byte[]> Images { get; }
public List<string> ImagePaths { get; set; }
/// <summary>
/// Current "mu" value for mirostat sampling
/// </summary>
@ -95,7 +95,7 @@ namespace LLama
/// <param name="logger"></param>
protected StatefulExecutorBase(LLamaContext context, ILogger? logger = null)
{
Images = new List<byte[]>();
ImagePaths = new List<string>();
_logger = logger;
Context = context;
_pastTokensCount = 0;
@ -105,12 +105,6 @@ namespace LLama
_decoder = new StreamingTokenDecoder(context);
}
/// <summary>
///
/// </summary>
/// <param name="context"></param>
/// <param name="lLavaWeights"></param>
/// <param name="logger"></param>
public StatefulExecutorBase(LLamaContext context, LLavaWeights lLavaWeights, ILogger? logger = null) :
this( context, logger )
{
@ -135,7 +129,7 @@ namespace LLama
{
_logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
var session_tokens = new LLamaToken[Context.ContextSize];
if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
if (!NativeApi.llama_load_session_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
{
_logger?.LogError($"[LLamaExecutor] Failed to load session file {filename}");
throw new RuntimeError($"Failed to load session file {_pathSession}");
@ -183,7 +177,7 @@ namespace LLama
public void SaveSessionFile(string filename)
{
var session_token_array = _session_tokens.ToArray();
NativeApi.llama_state_save_file(Context.NativeHandle, filename, session_token_array, (ulong)session_token_array.Length);
NativeApi.llama_save_session_file(Context.NativeHandle, filename, session_token_array, (ulong)session_token_array.Length);
}
/// <summary>
@ -195,14 +189,13 @@ namespace LLama
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
var n_left = _pastTokensCount - tokensToKeep;
var n_discard = n_left / 2;
int n_left = _pastTokensCount - tokensToKeep;
NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep, tokensToKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
_pastTokensCount = Math.Max(1, tokensToKeep);
// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
_pastTokensCount -= n_discard;
// stop saving session if we run out of context
_pathSession = string.Empty;
}
@ -210,7 +203,7 @@ namespace LLama
/// <summary>
/// Try to reuse the matching prefix from the session file.
/// </summary>
protected virtual void TryReuseMatchingPrefix()
protected virtual void TryReuseMathingPrefix()
{
if (_n_session_consumed < _session_tokens.Count)
{

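
A worked example of the context-shift arithmetic in HandleRunOutOfContext above, on the side of the diff that computes n_discard. The token counts are made up for illustration; the NativeApi calls named in the comments are the ones shown in the diff.

// Illustrative numbers: tokensToKeep = 32, _pastTokensCount = 4000
var tokensToKeep = 32;
var pastTokensCount = 4000;

var n_left = pastTokensCount - tokensToKeep; // 3968 tokens are eligible for shifting
var n_discard = n_left / 2;                  // 1984 tokens are dropped

// llama_kv_cache_seq_rm removes cache entries [32, 32 + 1984),
// llama_kv_cache_seq_add shifts the remaining entries back by 1984 positions,
// and _pastTokensCount falls from 4000 to 2016, freeing room for new tokens.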
View File

@ -1,4 +1,4 @@
using LLama.Abstractions;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
using System;
@ -38,8 +38,8 @@ namespace LLama
ILogger? logger = null)
: base(context, logger)
{
_inp_pfx = Context.Tokenize(instructionPrefix, true, true);
_inp_sfx = Context.Tokenize(instructionSuffix, false, true);
_inp_pfx = Context.Tokenize(instructionPrefix, true);
_inp_sfx = Context.Tokenize(instructionSuffix, false);
_instructionPrefix = instructionPrefix;
}
@ -124,7 +124,7 @@ namespace LLama
if (_is_prompt_run)
{
// When running the first input (prompt) in interactive mode, we should process it specially.
_embed_inps = Context.Tokenize(text, true, true).ToList();
_embed_inps = Context.Tokenize(text, true).ToList();
}
else
{
@ -135,7 +135,7 @@ namespace LLama
_consumedTokensCount = _embed_inps.Count;
_embed_inps.AddRange(_inp_pfx);
var line_inp = Context.Tokenize(text, false, true);
var line_inp = Context.Tokenize(text, false);
_embed_inps.AddRange(line_inp);
_embed_inps.AddRange(_inp_sfx);
@ -163,7 +163,7 @@ namespace LLama
}
}
if (_embeds.Count > 0 && _embeds.Last() == Context.NativeHandle.ModelHandle.Tokens.EOS)
if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle.ModelHandle))
{
args.WaitForInput = true;
}
@ -186,13 +186,10 @@ namespace LLama
_is_prompt_run = false;
if (_pastTokensCount + _embeds.Count > Context.ContextSize)
{
// Ported from https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/examples/main/main.cpp#L334
// Instruct always uses input token size.
var tokensToKeep = _embed_inps.Count;
HandleRunOutOfContext(tokensToKeep);
HandleRunOutOfContext(inferenceParams.TokensKeep);
}
TryReuseMatchingPrefix();
TryReuseMathingPrefix();
var (result, _) = Context.NativeHandle.Decode(_embeds, LLamaSeqId.Zero, batch, ref _pastTokensCount);
if (result != DecodeResult.Ok)
@ -262,7 +259,7 @@ namespace LLama
return Task.CompletedTask;
}
/// <summary>
/// The descriptor of the state of the instruct executor.
/// The desciptor of the state of the instruct executor.
/// </summary>
public class InstructExecutorState : ExecutorBaseState
{

Some files were not shown because too many files have changed in this diff.