Added vdb-v3-wikisplitter metadata
app.py CHANGED
@@ -11,7 +11,7 @@ MAX_NEW_TOKENS = 700
 SHOW_MODEL_PARAMETERS_IN_UI = os.environ.get("SHOW_MODEL_PARAMETERS_IN_UI", default="False") == "True"
 import logging
 
-logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
 
 setup()
 
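The only functional change in app.py is the extra %(name)s field in the log format, which stamps each record with the name of the logger that emitted it. A minimal sketch of the difference, using only the standard library (the logger name "rag" and the sample timestamps are illustrative assumptions):

import logging

# Format from this commit: %(name)s adds the emitting logger's name to each record.
logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')

logging.getLogger("rag").info("Documents retrieved: 5")
# -> [2025-06-01 12:00:00,000][rag][INFO] - Documents retrieved: 5
# Module-level calls such as logging.info(...), as used in this codebase,
# go through the root logger, so the new field prints as "root".
# The previous format produced the same record without the name field:
# -> [2025-06-01 12:00:00,000][INFO] - Documents retrieved: 5

Since logging.basicConfig is a no-op once the root logger already has handlers, whichever of app.py or rag.py configures logging first wins; the commit puts both calls on the same new format, so the output is consistent either way.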
rag.py CHANGED
@@ -9,9 +9,10 @@ from huggingface_hub import snapshot_download, InferenceClient
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
+from termcolor import cprint
 
-logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
 # logging.getLogger().setLevel(logging.INFO)
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
 
 
 class RAG:
@@ -99,6 +100,15 @@ class RAG:
         documents_retrieved = self.vectore_store.similarity_search_with_score_by_vector(embedding, k=number_of_contexts)
         logging.info(f"Documents retrieved: {len(documents_retrieved)}")
 
+        for i, (doc, score) in enumerate(documents_retrieved):
+            logging.info(f"Document {i+1}:")
+            logging.info(f"Score: {score:.5f}")
+            logging.info(f"Title: {doc.metadata}")
+            # logging.info(f"Source: {doc.metadata['src']}")
+            # logging.info(f"Redirected: {doc.metadata['redirected']}")
+            # url = doc.metadata['url']
+            # logging.info(f"Revision ID: {url}")
+            # logging.info(f'URL: <a href="{url}" target="_blank">{url}</a><br>')
 
         # Reranking
         # ==============================================================================================================
@@ -137,6 +147,7 @@ class RAG:
 
         return response.json()[0]["generated_text"].split("###")[-1][8:]
 
+
     def predict_completion(self, instruction, context, model_parameters):
 
         client = OpenAI(
@@ -183,22 +194,39 @@ class RAG:
 
         return text_context, full_context, source_context
 
+
     def get_response(self, prompt: str, model_parameters: dict) -> str:
         try:
             docs = self.get_context(prompt, model_parameters["NUM_CHUNKS"])
 
             response = ""
 
-            for i, (doc, score) in enumerate(docs):
+            for i, (doc, score) in enumerate(docs):
 
+                # ----------------------------------------------------------------------------
+                # vector_db__BAAI__bge-m3__cfg-v3-wikisplitter => metadata
+                # ----------------------------------------------------------------------------
+                # {
+                #     'document_id': '1535',
+                #     'title': 'Intel·ligència artificial',
+                #     'url': 'https://ca.wikipedia.org/wiki?curid=1535',
+                #     'language': 'ca',
+                #     'src': '/gpfs/projects/bsc88/apps/projects/__wiki-rag__/_data/json_extractor/cawiki-20250501/wiki_00.jsonl',
+                #     'section_title': 'Centres tecnològics a Catalunya i les seves aportacions i investigacions en la IA.',
+                #     'section_id': 32,
+                #     'section_len': 3403,
+                #     'split_level': 'section'
+                # }
+                # ----------------------------------------------------------------------------
                 response += "\n\n" + "="*100
                 response += f"\nDocument {i+1}"
                 response += "\n" + "="*100
                 response += f"\nScore: {score:.5f}"
                 response += f"\nTitle: {doc.metadata['title']}"
+                response += f"\nSection title: {doc.metadata['section_title']}"
                 response += f"\nURL: {doc.metadata['url']}"
-                response += f"\nID: {doc.metadata['
-                response += f"\nStart index: {doc.metadata['start_index']}"
+                response += f"\nID: {doc.metadata['document_id']}"
+                # response += f"\nStart index: {doc.metadata['start_index']}"
                 # response += f"\nSource: {doc.metadata['src']}"
                 # response += f"\nRedirected: {doc.metadata['redirected']}"
                 # url = doc.metadata['url']
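The substance of the commit is the new vector_db__BAAI__bge-m3__cfg-v3-wikisplitter metadata schema documented in the comment block above: get_response now reports section_title and document_id instead of the old start_index field. A minimal sketch of rendering one retrieved chunk with those keys, mirroring the updated loop (Document comes from langchain_core, which the langchain_community vector stores return; the helper name format_retrieved_doc and the sample score are illustrative assumptions):

from langchain_core.documents import Document

def format_retrieved_doc(doc: Document, score: float, index: int) -> str:
    # Render one retrieved chunk with the vdb-v3-wikisplitter metadata keys,
    # the same fields the updated get_response emits.
    md = doc.metadata
    return "\n".join([
        "=" * 100,
        f"Document {index + 1}",
        "=" * 100,
        f"Score: {score:.5f}",
        f"Title: {md['title']}",
        f"Section title: {md['section_title']}",
        f"URL: {md['url']}",
        f"ID: {md['document_id']}",
    ])

# Example with the sample metadata from the comment block above:
doc = Document(
    page_content="...",
    metadata={
        "document_id": "1535",
        "title": "Intel·ligència artificial",
        "url": "https://ca.wikipedia.org/wiki?curid=1535",
        "language": "ca",
        "src": "/gpfs/projects/bsc88/apps/projects/__wiki-rag__/_data/json_extractor/cawiki-20250501/wiki_00.jsonl",
        "section_title": "Centres tecnològics a Catalunya i les seves aportacions i investigacions en la IA.",
        "section_id": 32,
        "section_len": 3403,
        "split_level": "section",
    },
)
print(format_retrieved_doc(doc, score=0.83215, index=0))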