pond918 committed on
Commit 0f6d38f · 1 Parent(s): 610b037

feat: workaround to support update

Files changed (1)
  1. faiss_vdb.py +44 -17
faiss_vdb.py CHANGED
@@ -10,8 +10,9 @@ for development only.
 db_path = './data'
 
 embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
+# TODO clear empty entry on load
 db = FAISS.load_local(db_path, embeddings) if os.path.exists(
-    db_path) else FAISS.from_texts([''], embeddings)
+    db_path) else FAISS.from_texts([''], embeddings, metadatas=[{"none": 1}])
 
 
 def upsert(text, meta):
@@ -22,13 +23,8 @@ def upsert(text, meta):
     Returns:
       total docs count
     """
-    id = meta['id']
-    doc = db.docstore.search(id)
-    if doc and not isinstance(doc, str):
-        FAISS_append_txt(text, meta, id, doc)
-        print('WARN: existing doc, insert duplicate', id)
-    else:
-        db.add_texts([text], metadatas=[meta], ids=[id])
+    id = clear_existing(meta['id'])
+    db.add_texts([text], metadatas=[meta], ids=[id])
     db.save_local(db_path)
 
     return True
@@ -38,20 +34,51 @@ def total():
     return len(db.index_to_docstore_id)
 
 
-def search(text, size=4):
+def search(text, size=4, offset=0):
     docs = db.similarity_search(text, size)
-    data = [doc.metadata for doc in docs if ('id' in doc.metadata)]
+    full = size <= len(docs)
+    docs = docs[offset:]
+    data, deled = [], 0
+    for doc in docs:
+        if 'id' in doc.metadata: data.append(doc.metadata)
+        elif 'none' not in doc.metadata: deled = deled + 1
+    if (full and deled > 0):
+        data = data + search(text, size + deled, size)
     return data
 
 
-def FAISS_append_txt(text, meta, id, doc):
-    """ hack """
-    doc.page_content = text
-    doc.metadata = meta
-    emb = [db.embedding_function(text)]
+# def FAISS_append_txt(text, meta, id, doc):
+#     """ hack """
+#     doc.page_content = text
+#     doc.metadata = meta
+#     emb = [db.embedding_function(text)]
+#     vector = np.array(emb, dtype=np.float32)
+#     if db._normalize_L2:
+#         faiss = db.dependable_faiss_import()
+#         faiss.normalize_L2(vector)
+#     db.index.add(vector).update()
+#     starting_len = len(self.index_to_docstore_id)
+#     # TODO set db.index_to_docstore_id[old_idx] = ''
+
+def clear_existing(id):
+    while True:
+        doc = db.docstore.search(id)
+        if not doc or isinstance(doc, str):
+            print('[VDB] insert new doc', id)
+            return id
+
+        # emb = embed_text(doc.page_content)
+        # idx = db.index.assign(emb)
+        # db.index.remove_ids()
+        doc.metadata = {}  # clear meta to disable it
+        doc.page_content = {}
+        id = id + '#'
+
+
+def embed_text(txt):
+    emb = [db.embedding_function()]
     vector = np.array(emb, dtype=np.float32)
     if db._normalize_L2:
         faiss = db.dependable_faiss_import()
         faiss.normalize_L2(vector)
-    db.index.add(vector)
-    # TODO set db.index_to_docstore_id[old_idx] = ''
+    return vector
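For reference, a minimal usage sketch of the new upsert/search flow, assuming this module is importable as faiss_vdb and that langchain, faiss and sentence-transformers are installed; the ids and payloads below are made up for illustration:

import faiss_vdb

# First write: no existing doc with this id, so the text is simply added under 'doc-1'.
faiss_vdb.upsert('hello vector world', {'id': 'doc-1', 'title': 'greeting'})

# "Update" workaround: clear_existing() tombstones the old entry (blanks its
# metadata and content, since FAISS cannot update vectors in place) and the new
# text is re-added under the suffixed id 'doc-1#'.
faiss_vdb.upsert('hello updated world', {'id': 'doc-1', 'title': 'greeting v2'})

# search() drops tombstoned hits (metadata without 'id') and, if a full page came
# back, recursively over-fetches to backfill the dropped slots.
print(faiss_vdb.search('hello', size=4))

# total() counts index entries, including the seed doc and tombstones.
print(faiss_vdb.total())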