nurasaki committed on
Commit
2ad5136
·
1 Parent(s): a880965

Added vdb-v3-wikisplitter metadata

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. rag.py +32 -4
app.py CHANGED
@@ -11,7 +11,7 @@ MAX_NEW_TOKENS = 700
11
  SHOW_MODEL_PARAMETERS_IN_UI = os.environ.get("SHOW_MODEL_PARAMETERS_IN_UI", default="False") == "True"
12
  import logging
13
 
14
- logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
15
 
16
  setup()
17
 
 
11
  SHOW_MODEL_PARAMETERS_IN_UI = os.environ.get("SHOW_MODEL_PARAMETERS_IN_UI", default="False") == "True"
12
  import logging
13
 
14
+ logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
15
 
16
  setup()
17
 
rag.py CHANGED
@@ -9,9 +9,10 @@ from huggingface_hub import snapshot_download, InferenceClient
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
 
 
12
 
13
- logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
14
  # logging.getLogger().setLevel(logging.INFO)
 
15
 
16
 
17
  class RAG:
@@ -99,6 +100,15 @@ class RAG:
99
  documents_retrieved = self.vectore_store.similarity_search_with_score_by_vector(embedding, k=number_of_contexts)
100
  logging.info(f"Documents retrieved: {len(documents_retrieved)}")
101
 
 
 
 
 
 
 
 
 
 
102
 
103
  # Reranking
104
  # ==============================================================================================================
@@ -137,6 +147,7 @@ class RAG:
137
 
138
  return response.json()[0]["generated_text"].split("###")[-1][8:]
139
 
 
140
  def predict_completion(self, instruction, context, model_parameters):
141
 
142
  client = OpenAI(
@@ -183,22 +194,39 @@ class RAG:
183
 
184
  return text_context, full_context, source_context
185
 
 
186
  def get_response(self, prompt: str, model_parameters: dict) -> str:
187
  try:
188
  docs = self.get_context(prompt, model_parameters["NUM_CHUNKS"])
189
 
190
  response = ""
191
 
192
- for i, (doc, score) in enumerate(docs):
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  response += "\n\n" + "="*100
195
  response += f"\nDocument {i+1}"
196
  response += "\n" + "="*100
197
  response += f"\nScore: {score:.5f}"
198
  response += f"\nTitle: {doc.metadata['title']}"
 
199
  response += f"\nURL: {doc.metadata['url']}"
200
- response += f"\nID: {doc.metadata['id']}"
201
- response += f"\nStart index: {doc.metadata['start_index']}"
202
  # response += f"\nSource: {doc.metadata['src']}"
203
  # response += f"\nRedirected: {doc.metadata['redirected']}"
204
  # url = doc.metadata['url']
 
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
 
12
+ from termcolor import cprint
13
 
 
14
  # logging.getLogger().setLevel(logging.INFO)
15
+ logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
16
 
17
 
18
  class RAG:
 
100
  documents_retrieved = self.vectore_store.similarity_search_with_score_by_vector(embedding, k=number_of_contexts)
101
  logging.info(f"Documents retrieved: {len(documents_retrieved)}")
102
 
103
+ for i, (doc, score) in enumerate(documents_retrieved):
104
+ logging.info(f"Document {i+1}:")
105
+ logging.info(f"Score: {score:.5f}")
106
+ logging.info(f"Title: {doc.metadata}")
107
+ # logging.info(f"Source: {doc.metadata['src']}")
108
+ # logging.info(f"Redirected: {doc.metadata['redirected']}")
109
+ # url = doc.metadata['url']
110
+ # logging.info(f"Revision ID: {url}")
111
+ # logging.info(f'URL: <a href="{url}" target="_blank">{url}</a><br>')
112
 
113
  # Reranking
114
  # ==============================================================================================================
 
147
 
148
  return response.json()[0]["generated_text"].split("###")[-1][8:]
149
 
150
+
151
  def predict_completion(self, instruction, context, model_parameters):
152
 
153
  client = OpenAI(
 
194
 
195
  return text_context, full_context, source_context
196
 
197
+
198
  def get_response(self, prompt: str, model_parameters: dict) -> str:
199
  try:
200
  docs = self.get_context(prompt, model_parameters["NUM_CHUNKS"])
201
 
202
  response = ""
203
 
204
+ for i, (doc, score) in enumerate(docs):
205
 
206
+ # ----------------------------------------------------------------------------
207
+ # vector_db__BAAI__bge-m3__cfg-v3-wikisplitter => metadata
208
+ # ----------------------------------------------------------------------------
209
+ # {
210
+ # 'document_id': '1535',
211
+ # 'title': 'Intel·ligència artificial',
212
+ # 'url': 'https://ca.wikipedia.org/wiki?curid=1535',
213
+ # 'language': 'ca',
214
+ # 'src': '/gpfs/projects/bsc88/apps/projects/__wiki-rag__/_data/json_extractor/cawiki-20250501/wiki_00.jsonl',
215
+ # 'section_title': 'Centres tecnològics a Catalunya i les seves aportacions i investigacions en la IA.',
216
+ # 'section_id': 32,
217
+ # 'section_len': 3403,
218
+ # 'split_level': 'section'
219
+ # }
220
+ # ----------------------------------------------------------------------------
221
  response += "\n\n" + "="*100
222
  response += f"\nDocument {i+1}"
223
  response += "\n" + "="*100
224
  response += f"\nScore: {score:.5f}"
225
  response += f"\nTitle: {doc.metadata['title']}"
226
+ response += f"\nSection title: {doc.metadata['section_title']}"
227
  response += f"\nURL: {doc.metadata['url']}"
228
+ response += f"\nID: {doc.metadata['document_id']}"
229
+ # response += f"\nStart index: {doc.metadata['start_index']}"
230
  # response += f"\nSource: {doc.metadata['src']}"
231
  # response += f"\nRedirected: {doc.metadata['redirected']}"
232
  # url = doc.metadata['url']