Add support for VectorDB (-e data.vectordb=true)

This commit is contained in:
pancake 2023-11-28 11:05:01 +01:00
parent e24fdcaa30
commit 2a34bc145f
4 changed files with 68 additions and 7 deletions

View File

@ -32,12 +32,28 @@ Run a language model in local, without internet, to entertain you or help answer
This is optional and system-dependent, but on recent Debian/Ubuntu systems the `pip` tool no longer works because it conflicts with the system packages. The best way to work around this is with `venv`:
```bash
python -m venv r2ai
. r2ai/bin/activate
python -m venv venv
. venv/bin/activate
pip install -r requirements.txt
```
Optionally, if you want a better indexer for the data, install vectordb:
```bash
# on Linux
pip install vectordb2
# on macOS
pip install vectordb2 spacy
python -m spacy download en_core_web_sm
brew install llvm
export PATH=/opt/homebrew/Cellar/llvm/17.0.5/bin/:$PATH
CC=clang CXX=clang++ pip install git+https://github.com/teemupitkanen/mrpt/
```
And now you should be able to run it like this
```bash
pip install -r requirements.txt
r2pm -r r2ai
```

View File

@ -337,3 +337,5 @@ elif len(sys.argv) > 1:
r2ai_repl()
elif not within_r2 and have_r2pipe:
r2ai_repl()
else:
r2ai_repl()

View File

@ -11,6 +11,9 @@ except:
from utils import slurp
R2AI_HISTFILE = "/dev/null"
have_vectordb = None
vectordb_instance = None
MAXCHARS = 128
MAXMATCHES = 5
MASTODON_KEY = ""
@ -127,6 +130,42 @@ def smart_slurp(file):
text = md2txt(text)
return text
def vectordb_search2(query_text, use_mastodon):
    """Query the already-initialized vector database.

    Reads the module-level ``have_vectordb`` and ``vectordb_instance``
    globals; when the database is not available an empty list is returned.

    Args:
        query_text: Text handed to ``vectordb_instance.search``.
        use_mastodon: When truthy, a notice is printed because Mastodon
            search is not wired into this backend.

    Returns:
        The entries returned by the search (at most three are requested)
        whose ``'distance'`` value is below 1, in the order received.
    """
    if use_mastodon:
        print("TODO: mastodon search not supported for indexdb yet")
    # Nothing to query unless the import succeeded and an index was built.
    if have_vectordb != True or vectordb_instance is None:
        return []
    hits = vectordb_instance.search(query_text, top_n=3)
    # Keep only the sufficiently close matches.
    return [hit for hit in hits if hit['distance'] < 1]
def vectordb_search(query_text, source_files, use_mastodon, use_debug):
    """Search the vector database, building the index on first use.

    The outcome of the optional ``vectordb`` import is remembered in the
    module-level ``have_vectordb`` flag, and the populated index is kept in
    ``vectordb_instance`` so later calls skip both the import and the
    indexing step.

    Args:
        query_text: Text to look up.
        source_files: Paths whose contents are indexed line by line the
            first time the database is created.
        use_mastodon: Forwarded to ``vectordb_search2``.
        use_debug: Accepted for signature parity with the other indexer
            entry points; not used here.

    Returns:
        The matches reported by ``vectordb_search2``, or an empty list when
        the ``vectordb`` module cannot be imported.
    """
    global have_vectordb, vectordb_instance
    # A previous import attempt already failed: do not retry or re-print.
    if have_vectordb is False:
        return []
    # Index already built: go straight to the query.
    if have_vectordb is True and vectordb_instance is not None:
        return vectordb_search2(query_text, use_mastodon)
    try:
        import vectordb
    except Exception:
        # The dependency chain can fail in several ways, not only with
        # ImportError, so keep this best-effort but let KeyboardInterrupt
        # and SystemExit propagate.
        have_vectordb = False
        print("To better data index use:")
        print(" pip install vectordb2")
        print("On macOS you'll need to also do this:")
        print(" python -m pip install spacy")
        print(" python -m spacy download en_core_web_sm")
        # Without the module there is nothing to index or query; falling
        # through would hit the unbound local name `vectordb`.
        return []
    have_vectordb = True
    vectordb_instance = vectordb.Memory()
    # Index every line of every source file.
    # TODO: attach per-line metadata, e.g. {"title": file, "url": file}.
    for file in source_files:
        for line in smart_slurp(file).splitlines():
            vectordb_instance.save(line)
    return vectordb_search2(query_text, use_mastodon)
class compute_rarity():
use_mastodon = MASTODON_KEY != "" # False
use_debug = False
@ -241,12 +280,14 @@ def find_sources(srcdir):
res.append(f"{srcdir}/{f2}")
return res
def main_indexer(text, datadir, hist, use_mastodon, use_debug):
def main_indexer(text, datadir, hist, use_mastodon, use_debug, use_vectordb):
source_files = []
if datadir is not None and datadir != "":
source_files.extend(find_sources(datadir))
source_files.extend(find_sources(datadir))
if hist:
source_files.append(R2AI_HISTFILE)
source_files.append(R2AI_HISTFILE)
if use_vectordb:
return vectordb_search(text, source_files, use_mastodon, use_debug)
raredb = compute_rarity(source_files, use_mastodon, use_debug)
res = raredb.find_matches(text)
res = sorted(set(res))

View File

@ -358,6 +358,7 @@ class Interpreter:
self.env["data.local"] = "false"
self.env["data.hist"] = "false"
self.env["data.mastodon"] = "false"
self.env["data.vectordb"] = "false"
self.env["key.mastodon"] = ""
self.env["key.openai"] = ""
# self.env["chat.temperature"] = "0.002" # TODO
@ -452,11 +453,12 @@ class Interpreter:
if self.env["data.use"] == "true":
hist = self.env["data.hist"] == "true"
use_mastodon = self.env["data.mastodon"] == "true"
use_vectordb = self.env["data.vectordb"] == "true"
use_debug = self.env["debug"] == "true"
datadir = None
if self.env["data.local"] == "true":
datadir = self.env["data.path"]
matches = main_indexer(message, datadir, hist, use_mastodon, use_debug)
matches = main_indexer(message, datadir, hist, use_mastodon, use_debug, use_vectordb)
if len(matches) > 0:
for m in matches:
if self.env["debug"] == "true":