File size: 2,270 Bytes
8c76b59
 
 
08dc3c0
 
8c76b59
 
 
 
 
 
 
0f6d38f
8c76b59
0f6d38f
8c76b59
 
08dc3c0
8c76b59
 
 
 
08dc3c0
 
8c76b59
0f6d38f
 
8c76b59
 
3b67239
08dc3c0
610b037
08dc3c0
 
 
8c76b59
0f6d38f
8c76b59
0f6d38f
 
 
 
 
 
 
 
8c76b59
08dc3c0
610b037
0f6d38f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fdb6b2
610b037
 
 
 
0f6d38f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import os

import numpy as np
"""
for development only.
"""

# Directory where the FAISS index + docstore are persisted across runs.
db_path = './data'

embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
# Load the persisted index if the data directory exists; otherwise seed a
# brand-new store with a single placeholder text (FAISS.from_texts needs at
# least one entry). The placeholder is tagged {"none": 1} so search() can
# recognise and skip it.
# NOTE(review): os.path.exists only checks the directory, not that the
# index files inside it are present/valid — TODO confirm.
# TODO clear empty entry on load
db = FAISS.load_local(db_path, embeddings) if os.path.exists(
    db_path) else FAISS.from_texts([''], embeddings, metadatas=[{"none": 1}])


def upsert(text, meta):
    """Insert or update a document identified by meta['id'].

    Any document already stored under that id is tombstoned first (see
    clear_existing), then the new text is embedded, added, and the whole
    store is persisted to disk.

    Args:
        text: string, the document body to embed and store.
        meta: dict, must contain an 'id' key.

    Returns:
        True on success.
    """
    # clear_existing tombstones old versions and hands back a free
    # docstore id (possibly suffixed with '#') to insert under.
    doc_id = clear_existing(meta['id'])
    db.add_texts([text], metadatas=[meta], ids=[doc_id])
    # Persist immediately so the on-disk index never lags the in-memory one.
    db.save_local(db_path)

    return True


def total():
    """Return the number of entries tracked by the FAISS index mapping."""
    index_map = db.index_to_docstore_id
    return len(index_map)


def search(text, size=4, offset=0):
    """Similarity search that skips tombstoned entries.

    Returns a list of metadata dicts for live hits (those carrying an
    'id' key). Hits whose metadata was blanked by clear_existing count
    as tombstones; when the result page came back full, the search
    recurses with a widened size to backfill what the tombstones ate.
    The seed placeholder (tagged 'none') is silently ignored.
    """
    hits = db.similarity_search(text, size)
    page_full = len(hits) >= size

    results = []
    tombstones = 0
    for hit in hits[offset:]:
        meta = hit.metadata
        if 'id' in meta:
            results.append(meta)
        elif 'none' not in meta:
            tombstones += 1

    if page_full and tombstones:
        # Re-query with a larger window, skipping what we already saw.
        results += search(text, size + tombstones, size)
    return results


# def FAISS_append_txt(text, meta, id, doc):
#     """ hack """
#     doc.page_content = text
#     doc.metadata = meta
#     emb = [db.embedding_function(text)]
#     vector = np.array(emb, dtype=np.float32)
#     if db._normalize_L2:
#         faiss = db.dependable_faiss_import()
#         faiss.normalize_L2(vector)
#     db.index.add(vector).update()
#     starting_len = len(self.index_to_docstore_id)
#     # TODO set db.index_to_docstore_id[old_idx] = ''

def clear_existing(id):
    """Tombstone every document already stored under *id* (and its '#' variants).

    FAISS vectors cannot be removed from the index here (see the commented
    hack above), so old documents are disabled in place by blanking their
    metadata and content; search() then treats them as tombstones. A fresh,
    unused id is derived by appending '#' per existing generation.

    Args:
        id: string docstore id the caller wants to insert under.
            (Name shadows the builtin `id`; kept for interface stability.)

    Returns:
        A free id — the input, possibly suffixed with one or more '#'.
    """
    while True:
        doc = db.docstore.search(id)
        # docstore.search returns a "not found" string (or falsy) when the
        # id is absent — that id is free to use.
        if not doc or isinstance(doc, str):
            print('[VDB] insert new doc', id)
            return id

        # emb = embed_text(doc.page_content)
        # idx = db.index.assign(emb)
        # db.index.remove_ids()
        doc.metadata = {}  # clear meta to disable it
        # Was `doc.page_content = {}` — page_content is typed str on the
        # Document model; use an empty string instead of a dict.
        doc.page_content = ''
        id = id + '#'


def embed_text(txt):
    """Embed *txt* with the store's embedding function.

    Returns a float32 numpy array of shape (1, dim), L2-normalized in
    place when the store was built with normalization enabled.
    """
    raw = db.embedding_function(txt)
    vector = np.array([raw], dtype=np.float32)
    if db._normalize_L2:
        faiss_lib = db.dependable_faiss_import()
        faiss_lib.normalize_L2(vector)
    return vector