Spaces:
Sleeping
Sleeping
Upload functions.py
Browse files- functions.py +43 -0
functions.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#----------- SETUP -----------
|
2 |
+
from langchain_community.document_loaders import WebBaseLoader
|
3 |
+
from langchain_text_splitters import CharacterTextSplitter
|
4 |
+
from langchain_community.vectorstores import FAISS
|
5 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
import os
|
8 |
+
import logging
|
9 |
+
logging.getLogger("langchain.text_splitter").setLevel(logging.ERROR)
|
10 |
+
import warnings
|
11 |
+
warnings.filterwarnings("ignore")
|
12 |
+
import yaml
|
13 |
+
|
14 |
+
# ----------- PARAMS -----------
|
15 |
+
# Load the runtime settings once, at import time, from ./config.yaml
# (the path is resolved against the current working directory).
with open('./config.yaml', 'r', encoding='utf-8') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the .get() lookups below yield None instead of raising
    # AttributeError.
    config = yaml.safe_load(file) or {}

# Each constant is None when its key is absent from the YAML file.
EMBEDDING_MODEL = config.get('EMBEDDING_MODEL')
LLM_MODEL = config.get('LLM_MODEL')
REBUILD_VECTOR_STORE = config.get('REBUILD_VECTOR_STORE')
CHUNK_SIZE = config.get('CHUNK_SIZE')
CHUNK_OVERLAP = config.get('CHUNK_OVERLAP')
CACHE_FOLDER = config.get('CACHE_FOLDER')
URL_LIST = config.get('URL_LIST')
VS_BASE = config.get('VS_BASE')
|
25 |
+
|
26 |
+
# ----------- VECTOR STORE CREATION -----------
|
27 |
+
def fn_rebuild_vector_store(REBUILD_VECTOR_STORE, URL_LIST, VS_BASE, EMBEDDING_MODEL, CACHE_FOLDER, CHUNK_SIZE, CHUNK_OVERLAP):
    """Rebuild and persist the FAISS vector store when requested.

    When ``REBUILD_VECTOR_STORE`` is truthy, the pages in ``URL_LIST`` are
    loaded with ``WebBaseLoader``, split with ``CharacterTextSplitter``,
    embedded with ``HuggingFaceEmbeddings`` and indexed with ``FAISS``; the
    resulting index is saved under ``VS_BASE``. Otherwise nothing is built
    and only status messages are printed.

    Args:
        REBUILD_VECTOR_STORE: Truthy to rebuild the store, falsy to skip.
        URL_LIST: URL(s) handed to ``WebBaseLoader`` as ``web_paths``. A
            single string is accepted and treated as a one-element list.
            Must be non-empty when a rebuild is requested.
        VS_BASE: Directory the index is saved to; created if missing.
        EMBEDDING_MODEL: ``model_name`` passed to ``HuggingFaceEmbeddings``.
        CACHE_FOLDER: ``cache_folder`` passed to ``HuggingFaceEmbeddings``.
        CHUNK_SIZE: ``chunk_size`` passed to ``CharacterTextSplitter``.
        CHUNK_OVERLAP: ``chunk_overlap`` passed to ``CharacterTextSplitter``.

    Returns:
        None.

    Raises:
        ValueError: If a rebuild is requested but ``URL_LIST`` is empty
            or ``None``.
    """
    if REBUILD_VECTOR_STORE:
        print("[INFO] REBUILD_VECTOR_STORE was set True. Recreating the vector store...")

        # Fail fast with a clear message: without any URL there is nothing
        # to load, split or index.
        if not URL_LIST:
            raise ValueError(
                "URL_LIST must contain at least one URL when "
                "REBUILD_VECTOR_STORE is enabled."
            )
        # A bare string is itself a sequence; wrap it so it is handled as one
        # URL instead of being iterated character by character.
        if isinstance(URL_LIST, str):
            URL_LIST = [URL_LIST]

        loader = WebBaseLoader(web_paths=URL_LIST)
        docs = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        split_docs = text_splitter.split_documents(docs)

        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            cache_folder=CACHE_FOLDER,
        )
        vector_store = FAISS.from_documents(split_docs, embeddings)

        # Make sure the target directory exists before saving the index.
        os.makedirs(VS_BASE, exist_ok=True)
        vector_store.save_local(VS_BASE)
        print(f"[INFO] Vector Store saved in the path: {VS_BASE}")
    else:
        print("[INFO] REBUILD_VECTOR_STORE was set False. Using the current vector store...")

    # Print the closing message and fall through to the implicit None return
    # (the original returned the None produced by print()).
    print("[INFO] End of vector store process")
|