Add support for VectorDB (-e data.vectordb=true)
This commit is contained in:
parent
e24fdcaa30
commit
2a34bc145f
22
README.md
22
README.md
|
@ -32,12 +32,28 @@ Run a language model in local, without internet, to entertain you or help answer
|
|||
This is optional and system-dependent, but on recent Debian/Ubuntu systems the `pip` tool no longer works, because it conflicts with the system packages. The best way to do this is with `venv`:
|
||||
|
||||
```bash
|
||||
python -m venv r2ai
|
||||
. r2ai/bin/activate
|
||||
python -m venv venv
|
||||
. venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Optionally, if you want a better indexer for the data, install vectordb.
|
||||
|
||||
```bash
|
||||
# on Linux
|
||||
pip install vectordb2
|
||||
|
||||
# on macOS
|
||||
pip install vectordb2 spacy
|
||||
python -m spacy download en_core_web_sm
|
||||
brew install llvm
|
||||
export PATH=/opt/homebrew/Cellar/llvm/17.0.5/bin/:$PATH
|
||||
CC=clang CXX=clang++ pip install git+https://github.com/teemupitkanen/mrpt/
|
||||
```
|
||||
|
||||
And now you should be able to run it like this:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
r2pm -r r2ai
|
||||
```
|
||||
|
||||
|
|
2
main.py
2
main.py
|
@ -337,3 +337,5 @@ elif len(sys.argv) > 1:
|
|||
r2ai_repl()
|
||||
elif not within_r2 and have_r2pipe:
|
||||
r2ai_repl()
|
||||
else:
|
||||
r2ai_repl()
|
||||
|
|
|
@ -11,6 +11,9 @@ except:
|
|||
from utils import slurp
|
||||
R2AI_HISTFILE = "/dev/null"
|
||||
|
||||
have_vectordb = None
|
||||
vectordb_instance = None
|
||||
|
||||
MAXCHARS = 128
|
||||
MAXMATCHES = 5
|
||||
MASTODON_KEY = ""
|
||||
|
@ -127,6 +130,42 @@ def smart_slurp(file):
|
|||
text = md2txt(text)
|
||||
return text
|
||||
|
||||
def vectordb_search2(query_text, use_mastodon):
|
||||
result = []
|
||||
if use_mastodon:
|
||||
print("TODO: mastodon search not supported for indexdb yet")
|
||||
if have_vectordb == True and vectordb_instance is not None:
|
||||
res = vectordb_instance.search(query_text, top_n=3)
|
||||
for r in res:
|
||||
if r['distance'] < 1:
|
||||
result.append(r)
|
||||
return result
|
||||
|
||||
def vectordb_search(query_text, source_files, use_mastodon, use_debug):
|
||||
global have_vectordb, vectordb_instance
|
||||
if have_vectordb == False:
|
||||
return []
|
||||
if have_vectordb == True and vectordb_instance is not None:
|
||||
return vectordb_search2(query_text, use_mastodon)
|
||||
try:
|
||||
import vectordb
|
||||
have_vectordb = True
|
||||
except:
|
||||
have_vectordb = False
|
||||
print("To better data index use:")
|
||||
print(" pip install vectordb2")
|
||||
print("On macOS you'll need to also do this:")
|
||||
print(" python -m pip install spacy")
|
||||
print(" python -m spacy download en_core_web_sm")
|
||||
vectordb_instance = vectordb.Memory()
|
||||
# indexing data
|
||||
for file in source_files:
|
||||
lines = smart_slurp(file).splitlines()
|
||||
for line in lines:
|
||||
vectordb_instance.save(line)
|
||||
# vectordb_instance.save(line, {"title":file, "url": file})
|
||||
return vectordb_search2(query_text, use_mastodon)
|
||||
|
||||
class compute_rarity():
|
||||
use_mastodon = MASTODON_KEY != "" # False
|
||||
use_debug = False
|
||||
|
@ -241,12 +280,14 @@ def find_sources(srcdir):
|
|||
res.append(f"{srcdir}/{f2}")
|
||||
return res
|
||||
|
||||
def main_indexer(text, datadir, hist, use_mastodon, use_debug):
|
||||
def main_indexer(text, datadir, hist, use_mastodon, use_debug, use_vectordb):
|
||||
source_files = []
|
||||
if datadir is not None and datadir != "":
|
||||
source_files.extend(find_sources(datadir))
|
||||
source_files.extend(find_sources(datadir))
|
||||
if hist:
|
||||
source_files.append(R2AI_HISTFILE)
|
||||
source_files.append(R2AI_HISTFILE)
|
||||
if use_vectordb:
|
||||
return vectordb_search(text, source_files, use_mastodon, use_debug)
|
||||
raredb = compute_rarity(source_files, use_mastodon, use_debug)
|
||||
res = raredb.find_matches(text)
|
||||
res = sorted(set(res))
|
||||
|
|
|
@ -358,6 +358,7 @@ class Interpreter:
|
|||
self.env["data.local"] = "false"
|
||||
self.env["data.hist"] = "false"
|
||||
self.env["data.mastodon"] = "false"
|
||||
self.env["data.vectordb"] = "false"
|
||||
self.env["key.mastodon"] = ""
|
||||
self.env["key.openai"] = ""
|
||||
# self.env["chat.temperature"] = "0.002" # TODO
|
||||
|
@ -452,11 +453,12 @@ class Interpreter:
|
|||
if self.env["data.use"] == "true":
|
||||
hist = self.env["data.hist"] == "true"
|
||||
use_mastodon = self.env["data.mastodon"] == "true"
|
||||
use_vectordb = self.env["data.vectordb"] == "true"
|
||||
use_debug = self.env["debug"] == "true"
|
||||
datadir = None
|
||||
if self.env["data.local"] == "true":
|
||||
datadir = self.env["data.path"]
|
||||
matches = main_indexer(message, datadir, hist, use_mastodon, use_debug)
|
||||
matches = main_indexer(message, datadir, hist, use_mastodon, use_debug, use_vectordb)
|
||||
if len(matches) > 0:
|
||||
for m in matches:
|
||||
if self.env["debug"] == "true":
|
||||
|
|
Loading…
Reference in New Issue