Spaces:
Runtime error
Runtime error
pond918
committed on
Commit
·
0f6d38f
1
Parent(s):
610b037
feat: workaround to support update
Browse files- faiss_vdb.py +44 -17
faiss_vdb.py
CHANGED
@@ -10,8 +10,9 @@ for development only.
|
|
10 |
db_path = './data'
|
11 |
|
12 |
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
|
|
|
13 |
db = FAISS.load_local(db_path, embeddings) if os.path.exists(
|
14 |
-
db_path) else FAISS.from_texts([''], embeddings)
|
15 |
|
16 |
|
17 |
def upsert(text, meta):
|
@@ -22,13 +23,8 @@ def upsert(text, meta):
|
|
22 |
Returns:
|
23 |
total docs count
|
24 |
"""
|
25 |
-
id = meta['id']
|
26 |
-
|
27 |
-
if doc and not isinstance(doc, str):
|
28 |
-
FAISS_append_txt(text, meta, id, doc)
|
29 |
-
print('WARN: existing doc, insert duplicate', id)
|
30 |
-
else:
|
31 |
-
db.add_texts([text], metadatas=[meta], ids=[id])
|
32 |
db.save_local(db_path)
|
33 |
|
34 |
return True
|
@@ -38,20 +34,51 @@ def total():
|
|
38 |
return len(db.index_to_docstore_id)
|
39 |
|
40 |
|
41 |
-
def search(text, size=4):
|
42 |
docs = db.similarity_search(text, size)
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
return data
|
45 |
|
46 |
|
47 |
-
def FAISS_append_txt(text, meta, id, doc):
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
vector = np.array(emb, dtype=np.float32)
|
53 |
if db._normalize_L2:
|
54 |
faiss = db.dependable_faiss_import()
|
55 |
faiss.normalize_L2(vector)
|
56 |
-
|
57 |
-
# TODO set db.index_to_docstore_id[old_idx] = ''
|
|
|
10 |
# Directory where the FAISS index is persisted between runs.
db_path = './data'

# Sentence-embedding model shared by indexing and querying.
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

# TODO clear empty entry on load
if os.path.exists(db_path):
    # A saved index is present: load it.
    db = FAISS.load_local(db_path, embeddings)
else:
    # No saved index: create one seeded with a single empty text whose
    # metadata carries the "none" marker.
    db = FAISS.from_texts([''], embeddings, metadatas=[{"none": 1}])
|
16 |
|
17 |
|
18 |
def upsert(text, meta):
|
|
|
23 |
Returns:
|
24 |
total docs count
|
25 |
"""
|
26 |
+
id = clear_existing(meta['id'])
|
27 |
+
db.add_texts([text], metadatas=[meta], ids=[id])
|
|
|
|
|
|
|
|
|
|
|
28 |
db.save_local(db_path)
|
29 |
|
30 |
return True
|
|
|
34 |
return len(db.index_to_docstore_id)
|
35 |
|
36 |
|
37 |
+
def search(text, size=4, offset=0):
    """Return the metadata of the stored documents most similar to ``text``.

    Hits whose metadata has an ``'id'`` key are returned. Hits whose metadata
    has neither ``'id'`` nor ``'none'`` are counted as deleted; when the query
    came back full and some hits were deleted, the query is repeated with a
    larger window and only the newly revealed hits are examined. Hits carrying
    the ``'none'`` marker are neither returned nor counted.

    Args:
        text: query text.
        size: number of hits requested from the vector store.
        offset: number of leading hits to skip in the first query.

    Returns:
        List of metadata dicts, in the order the hits were returned.
    """
    found = []
    limit, skip = size, offset
    while True:
        hits = db.similarity_search(text, limit)
        # Fewer hits than requested means the store has nothing more to give.
        saturated = len(hits) >= limit

        removed = 0
        for hit in hits[skip:]:
            meta = hit.metadata
            if 'id' in meta:
                found.append(meta)
            elif 'none' not in meta:
                removed += 1

        if not saturated or removed == 0:
            return found

        # Widen the window by the number of deleted hits and skip everything
        # that has already been examined.
        skip, limit = limit, limit + removed
|
48 |
|
49 |
|
50 |
+
# def FAISS_append_txt(text, meta, id, doc):
|
51 |
+
# """ hack """
|
52 |
+
# doc.page_content = text
|
53 |
+
# doc.metadata = meta
|
54 |
+
# emb = [db.embedding_function(text)]
|
55 |
+
# vector = np.array(emb, dtype=np.float32)
|
56 |
+
# if db._normalize_L2:
|
57 |
+
# faiss = db.dependable_faiss_import()
|
58 |
+
# faiss.normalize_L2(vector)
|
59 |
+
# db.index.add(vector).update()
|
60 |
+
# starting_len = len(self.index_to_docstore_id)
|
61 |
+
# # TODO set db.index_to_docstore_id[old_idx] = ''
|
62 |
+
|
63 |
+
def clear_existing(id):
    """Blank every document stored under ``id`` and return an unused id.

    Starting from ``id``, each document found in ``db.docstore`` has its
    metadata and content emptied, then ``'#'`` is appended to the id and the
    lookup is repeated. The first id for which no document is found is
    returned.

    Args:
        id: docstore id to look up; a string, since ``'#'`` is appended to it.

    Returns:
        The first id in the sequence ``id``, ``id + '#'``, ``id + '##'``, ...
        that has no stored document.
    """
    while True:
        doc = db.docstore.search(id)
        # A falsy or plain-string result is treated as "nothing stored here".
        if not doc or isinstance(doc, str):
            print('[VDB] insert new doc', id)
            return id

        # TODO: remove the stale vector from db.index instead of only
        # blanking the document; nothing is removed from the index here.
        doc.metadata = {}  # clear meta to disable it
        # Keep the content a string (was a dict), matching the type of the
        # texts this module stores.
        doc.page_content = ''
        id = id + '#'
|
76 |
+
|
77 |
+
|
78 |
+
def embed_text(txt):
    """Embed ``txt`` and return the result as a float32 NumPy array.

    Args:
        txt: text handed to ``db.embedding_function``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` holding the single embedding
        wrapped in a one-element batch. It is L2-normalised in place when
        ``db._normalize_L2`` is truthy.
    """
    # Pass the text through; the previous call supplied no argument, so the
    # ``txt`` parameter was never used.
    emb = [db.embedding_function(txt)]
    vector = np.array(emb, dtype=np.float32)
    if db._normalize_L2:
        # NOTE(review): assumes ``db`` exposes ``dependable_faiss_import`` --
        # confirm against the installed LangChain version.
        faiss = db.dependable_faiss_import()
        faiss.normalize_L2(vector)
    return vector
|
|