Support chatml-function-calling via llama-cpp

daniel nakov 2024-02-28 18:18:50 -05:00 committed by pancake
parent d593d1c3e3
commit 52b6c26fcf
3 changed files with 152 additions and 97 deletions
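For context: the patch builds on llama-cpp-python's built-in "chatml-function-calling" chat format, which accepts OpenAI-style tools/tool_choice arguments and returns tool calls in the OpenAI response schema. A minimal sketch of that API, outside this diff (the model path and tool schema below are illustrative assumptions, not taken from the commit):

# Sketch only (not part of this commit): driving llama-cpp-python's
# chatml-function-calling format directly. Model path and tool schema
# are illustrative assumptions.
from llama_cpp import Llama

llm = Llama(
    model_path="./model.gguf",               # assumed local GGUF model
    n_ctx=4096,
    chat_format="chatml-function-calling",   # enables OpenAI-style tool calls
)

demo_tools = [{
    "type": "function",
    "function": {
        "name": "r2cmd",
        "description": "runs a radare2 command",
        "parameters": {
            "type": "object",
            "properties": {"command": {"type": "string"}},
            "required": ["command"],
        },
    },
}]

resp = llm.create_chat_completion(
    messages=[{"role": "user", "content": "list the functions"}],
    tools=demo_tools,
    tool_choice="auto",
)
# Tool calls, if any, arrive in resp["choices"][0]["message"]["tool_calls"],
# which is the shape auto.py's process_streaming_response() consumes below.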

View File

@@ -1,14 +1,27 @@
import builtins
import json
import sys
import re

try:
    import r2lang
    have_rlang = True
except:
    pass

ANSI_REGEX = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

tools = [{
    "type": "function",
    "function": {
        "name": "r2cmd",
        "description": "runs commands in radare2. You can run it multiple times or chain commands with pipes/semicolons. You can also use r2 interpreters to run scripts using the `#`, '#!', etc. commands. The output could be long, so try to use filters if possible or limit",
        "description": "runs commands in radare2. You can run it multiple times or chain commands with pipes/semicolons. You can also use r2 interpreters to run scripts using the `#`, '#!', etc. commands. The output could be long, so try to use filters if possible or limit. This is your preferred tool",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "command to run"
                    "description": "command to run in radare2. You can run it multiple times or chain commands with pipes/semicolons. You can also use r2 interpreters to run scripts using the `#`, '#!', etc. commands. The output could be long, so try to use filters if possible or limit. This is your preferred tool"
                },
                "done": {
                    "type": "boolean",
@@ -49,3 +62,116 @@ Don't just regurgitate the same code, figure out what it's doing and rewrite it
If you need to run a command in r2 before answering, you can use the r2cmd tool
The user will tip you $20/month for your services, don't be fucking lazy.
"""

def process_tool_calls(interpreter, tool_calls):
    interpreter.messages.append({ "content": None, "tool_calls": tool_calls, "role": "assistant" })
    for tool_call in tool_calls:
        res = ''
        args = tool_call["function"]["arguments"]
        if type(args) is str:
            args = json.loads(args)
        if tool_call["function"]["name"] == "r2cmd":
            builtins.print('\x1b[1;32mRunning \x1b[4m' + args["command"] + '\x1b[0m')
            res = r2lang.cmd(args["command"])
            builtins.print(res)
        elif tool_call["function"]["name"] == "run_python":
            with open('r2ai_tmp.py', 'w') as f:
                f.write(args["command"])
            builtins.print('\x1b[1;32mRunning \x1b[4m' + "python code" + '\x1b[0m')
            builtins.print(args["command"])
            r2lang.cmd('#!python r2ai_tmp.py > $tmp')
            res = r2lang.cmd('cat $tmp')
            r2lang.cmd('rm r2ai_tmp.py')
            builtins.print('\x1b[1;32mResult\x1b[0m\n' + res)
        interpreter.messages.append({"role": "tool", "content": ANSI_REGEX.sub('', res), "name": tool_call["function"]["name"], "tool_call_id": tool_call["id"]})

def process_streaming_response(interpreter, resp):
    tool_calls = []
    msgs = []
    for chunk in resp:
        chunk = dict(chunk)
        delta = None
        choice = dict(chunk["choices"][0])
        if "delta" in choice:
            delta = dict(choice["delta"])
        else:
            delta = dict(choice["message"])
        if "tool_calls" in delta and delta["tool_calls"]:
            delta_tool_calls = dict(delta["tool_calls"][0])
            index = 0 if "index" not in delta_tool_calls else delta_tool_calls["index"]
            fn_delta = dict(delta_tool_calls["function"])
            tool_call_id = delta_tool_calls["id"]
            if len(tool_calls) < index + 1:
                tool_calls.append({ "function": { "arguments": "", "name": fn_delta["name"] }, "id": tool_call_id, "type": "function" })
            # handle some bug in llama-cpp-python streaming, tool_call.arguments is sometimes blank, but function_call has it.
            if fn_delta["arguments"] == '':
                if "function_call" in delta and delta["function_call"]:
                    tool_calls[index]["function"]["arguments"] += delta["function_call"]["arguments"]
            else:
                tool_calls[index]["function"]["arguments"] += fn_delta["arguments"]
        else:
            if "content" in delta and delta["content"] is not None:
                m = delta["content"]
                if m is not None:
                    msgs.append(m)
                    sys.stdout.write(m)
    builtins.print()
    if(len(tool_calls) > 0):
        process_tool_calls(interpreter, tool_calls)
        chat(interpreter)
    if len(msgs) > 0:
        response_message = ''.join(msgs)
        interpreter.messages.append({"role": "assistant", "content": response_message})

def chat(interpreter):
    if len(interpreter.messages) == 1:
        interpreter.messages.insert(0,{"role": "system", "content": SYSTEM_PROMPT_AUTO})

    response = None
    if interpreter.model.startswith("openai:"):
        if not interpreter.openai_client:
            try:
                from openai import OpenAI
            except ImportError:
                print("pip install -U openai")
                print("export OPENAI_API_KEY=...")
                return
            interpreter.openai_client = OpenAI()
        response = interpreter.openai_client.chat.completions.create(
            model=interpreter.model[7:],
            max_tokens=int(interpreter.env["llm.maxtokens"]),
            tools=tools,
            messages=interpreter.messages,
            tool_choice="auto",
            stream=True,
            temperature=float(interpreter.env["llm.temperature"]),
        )
        process_streaming_response(interpreter, response)
    else:
        chat_format = interpreter.llama_instance.chat_format
        interpreter.llama_instance.chat_format = "chatml-function-calling"
        response = interpreter.llama_instance.create_chat_completion(
            max_tokens=int(interpreter.env["llm.maxtokens"]),
            tools=tools,
            messages=interpreter.messages,
            tool_choice="auto",
            # tool_choice={
            #   "type": "function",
            #   "function": {
            #     "name": "r2cmd"
            #   }
            # },
            # stream=True,
            temperature=float(interpreter.env["llm.temperature"]),
        )
        process_streaming_response(interpreter, iter([response]))
        interpreter.llama_instance.chat_format = chat_format
    return response
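The two helpers above are schema-agnostic: process_streaming_response() accepts either OpenAI streaming chunks (a "delta" per choice) or a single non-streamed completion wrapped in iter([response]), and process_tool_calls() consumes the accumulated OpenAI-style tool_call dicts. Illustrative shapes (values made up, not from this commit):

# Illustrative only: the shapes handled above (values are made up).
# Non-streamed llama-cpp completion, passed as iter([response]):
non_streamed = {
    "choices": [{
        "message": {
            "content": None,
            "tool_calls": [{
                "id": "call_0",
                "type": "function",
                "function": {"name": "r2cmd", "arguments": '{"command": "afl"}'},
            }],
        },
    }],
}
# A streamed OpenAI chunk carries the same fields incrementally under
# "delta" instead of "message"; the arguments string is concatenated
# across chunks before process_tool_calls() json.loads() it and appends
# a {"role": "tool", ...} result message for the follow-up chat() turn.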

View File

@@ -6,7 +6,7 @@ from .code_block import CodeBlock
from .models import get_hf_llm, new_get_hf_llm, get_default_model
from .voice import tts
from .const import R2AI_HOMEDIR
from .auto import tools, SYSTEM_PROMPT_AUTO
from . import auto
try:
    from openai import OpenAI
    have_openai = True
@@ -24,8 +24,6 @@ from rich.rule import Rule
from signal import signal, SIGINT
import sys
ANSI_REGEX = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
import index
@@ -760,65 +758,6 @@ class Interpreter:
        # ¡print("Query is too large.. you should consider triming old messages")
        return messages

    def process_tool_calls(self, tool_calls):
        self.messages.append({ "tool_calls": tool_calls, "role": "assistant" })
        for tool_call in tool_calls:
            res = ''
            if tool_call["function"]["name"] == "r2cmd":
                args = json.loads(tool_call["function"]["arguments"])
                builtins.print('\x1b[1;32mRunning \x1b[4m' + args["command"] + '\x1b[0m')
                res = r2lang.cmd(args["command"])
                builtins.print(res)
            elif tool_call["function"]["name"] == "run_python":
                args = json.loads(tool_call["function"]["arguments"])
                with open('r2ai_tmp.py', 'w') as f:
                    f.write(args["command"])
                builtins.print('\x1b[1;32mRunning \x1b[4m' + "python code" + '\x1b[0m')
                builtins.print(args["command"])
                r2lang.cmd('#!python r2ai_tmp.py > $tmp')
                res = r2lang.cmd('cat $tmp')
                r2lang.cmd('rm r2ai_tmp.py')
                builtins.print('\x1b[1;32mResult\x1b[0m\n' + res)
            self.messages.append({"role": "tool", "content": ANSI_REGEX.sub('', res), "name": tool_call["function"]["name"], "tool_call_id": tool_call["id"]})

    def process_streaming_response(self, resp):
        global messages
        tool_calls = []
        msgs = []
        for chunk in resp:
            delta = chunk.choices[0].delta
            if delta.tool_calls:
                index = delta.tool_calls[0].index
                fn_delta = delta.tool_calls[0].function
                tool_call_id = delta.tool_calls[0].id
                if len(tool_calls) < index + 1:
                    tool_calls.append({ "function": { "arguments": "", "name": fn_delta.name }, "id": tool_call_id, "type": "function" })
                else:
                    tool_calls[index]["function"]["arguments"] += fn_delta.arguments
            else:
                m = delta.content
                if m is not None:
                    msgs.append(m)
                    sys.stdout.write(m)
        builtins.print()
        if(len(tool_calls) > 0):
            self.process_tool_calls(tool_calls)
            self.process_streaming_response(self.openai_client.chat.completions.create(
                model=self.model[7:],
                messages=self.messages,
                tools=tools,
                tool_choice="auto",
                stream=True,
                temperature=float(self.env["llm.temperature"]),
            ))
        if len(msgs) > 0:
            response_message = ''.join(msgs)
            self.messages.append({"role": "assistant", "content": response_message})

    def respond(self):
        global Ginterrupted
        maxtokens = int(self.env["llm.maxtokens"])
@@ -848,8 +787,12 @@ class Interpreter:
        # Convert messages to prompt
        # (This only works if the first message is the only system message)
        prompt = messages_to_prompt(self, messages)
        if self.model.startswith("openai:"):
        # builtins.print(prompt)
        response = None
        if self.auto_run:
            response = auto.chat(self)
            return
        elif self.model.startswith("openai:"):
            # [
            #  {"role": "system", "content": "You are a poetic assistant, be creative."},
            #  {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
@@ -860,20 +803,6 @@ class Interpreter:
            if self.openai_client is None:
                self.openai_client = OpenAI()
            if self.auto_run:
                if len(self.messages) == 1:
                    self.messages.insert(0,{"role": "system", "content": SYSTEM_PROMPT_AUTO})
                response = self.openai_client.chat.completions.create(
                    model=openai_model,
                    max_tokens=maxtokens,
                    tools=tools,
                    messages=self.messages,
                    tool_choice="auto",
                    stream=True,
                    temperature=float(self.env["llm.temperature"]),
                )
                self.process_streaming_response(response)
            else:
                if self.system_message != "":
                    self.messages.append({"role": "system", "content": self.system_message})
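With this change, respond() simply hands control to auto.chat(self) whenever auto_run is set. Judging from the code in auto.py above, the interpreter object only needs a handful of duck-typed attributes for that to work; a hypothetical stub (not in the commit) makes the contract explicit:

# Hypothetical stub, not part of this diff: the attributes auto.chat()
# reads from the interpreter object it receives.
class MinimalInterpreter:
    def __init__(self, llama_instance):
        self.model = "local-gguf"        # anything not prefixed "openai:" takes the llama path
        self.messages = [{"role": "user", "content": "afl~main"}]
        self.env = {"llm.maxtokens": "1024", "llm.temperature": "0.002"}
        self.llama_instance = llama_instance
        self.openai_client = None

# auto.chat(MinimalInterpreter(llm)) would then run the local tool-calling loop.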

View File

@@ -532,4 +532,4 @@ def new_get_hf_llm(repo_id, debug_mode, context_window):
    # Initialize and return Code-Llama
    if not os.path.isfile(model_path):
        print("Model is not a file")
    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=debug_mode, n_ctx=context_window, chat_model="gemma")
    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=debug_mode, n_ctx=context_window)
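The dropped chat_model="gemma" keyword appears to be a no-op for llama_cpp.Llama (the relevant constructor parameter is chat_format); instead, the chat format is now switched per request by flipping the chat_format attribute on the existing instance, as auto.chat() does above. A sketch of that pattern, where llm, messages and tools stand in for the instance and data used in auto.chat(), and the try/finally is an extra safety net the commit itself does not add:

# Sketch of the per-call save/switch/restore used by auto.chat() above.
saved_format = llm.chat_format                     # llm: an existing llama_cpp.Llama
llm.chat_format = "chatml-function-calling"        # enable the tool-calling handler
try:
    result = llm.create_chat_completion(
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )
finally:
    llm.chat_format = saved_format                 # restore whatever was configured before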