Spaces: Running on T4
measure retriever and reader time
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import logging
 import asyncio
 import os
+import time
 from uuid import uuid4
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -139,8 +140,13 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
     vectorstore = vectorstores["docling"]
 
     ##------------------------------get context----------------------------------------------
+
+    ### adding for assessing computation time
+    start_time = time.time()
     context_retrieved = get_context(vectorstore=vectorstore,query=query,reports=reports,
                                     sources=sources,subtype=subtype)
+    end_time = time.time()
+    print("Time for retriever:",end_time - start_time)
     context_retrieved_formatted = "||".join(doc.page_content for doc in context_retrieved)
     context_retrieved_lst = [doc.page_content for doc in context_retrieved]
 
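The hunk above brackets the get_context() call with time.time() readings. A reusable alternative is a small context manager built on time.perf_counter(), whose monotonic clock is better suited to elapsed-time measurement than the wall-clock time.time(); this is a sketch, not part of the commit:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # perf_counter() is monotonic, so the difference between two readings
    # is a reliable duration even if the system clock is adjusted.
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"Time for {label}: {time.perf_counter() - start:.3f}s")

# Hypothetical usage mirroring the hunk above:
# with timed("retriever"):
#     context_retrieved = get_context(vectorstore=vectorstore, query=query,
#                                     reports=reports, sources=sources, subtype=subtype)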
@@ -186,18 +192,19 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         "session_duration_seconds": session_duration,
         "client_location": session_data['location_info'],
         "platform": session_data['platform_info'],
-
-
-
+        "system_prompt": SYSTEM_PROMPT,
+        "sources": sources,
+        "reports": reports,
         "subtype": subtype,
         #"year": year,
         "question": query,
         "retriever": model_config.get('retriever','MODEL'),
         "endpoint_type": model_config.get('reader','TYPE'),
         "reader": model_config.get('reader','NVIDIA_MODEL'),
-
+        "docs": [doc.page_content for doc in context_retrieved],
     }
 
+
     if model_config.get('reader','TYPE') == 'NVIDIA':
         chat_model = nvidia_client()
         async def process_stream():
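The two-argument lookups such as model_config.get('reader','TYPE') read like configparser's ConfigParser.get(section, option) rather than a dict lookup with a default value. Assuming model_config is indeed a ConfigParser (the commit does not show how it is built), a minimal sketch with hypothetical section contents:

import configparser

model_config = configparser.ConfigParser()
model_config.read_string("""
[retriever]
MODEL = some-embedding-model

[reader]
TYPE = DEDICATED
NVIDIA_MODEL = some-nvidia-model
""")

# ConfigParser.get(section, option), not dict.get(key, default):
print(model_config.get('reader', 'TYPE'))  # -> DEDICATED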
@@ -226,6 +233,8 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
     elif model_config.get('reader','TYPE') == 'DEDICATED':
         chat_model = dedicated_endpoint()
+        ### adding for assessing computation time
+        start_time = time.time()
         async def process_stream():
             # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
             # instead of modifying the one from the outer scope.
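The comment retained at the end of this hunk explains why process_stream() declares answer_yet as nonlocal. A standalone illustration of that scoping rule (not taken from app.py):

def outer():
    answer_yet = ""
    def accumulate(token):
        # Without this declaration, `answer_yet += token` would treat
        # answer_yet as a new local variable and raise UnboundLocalError.
        nonlocal answer_yet
        answer_yet += token
    for token in ("Hel", "lo"):
        accumulate(token)
    return answer_yet

print(outer())  # -> Hello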
@@ -236,11 +245,15 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
                 answer_yet += token
                 parsed_answer = parse_output_llm_with_sources(answer_yet)
                 history[-1] = (query, parsed_answer)
+                logs_data["answer"] = parsed_answer
                 yield [tuple(x) for x in history], docs_html, logs_data, session_id
-
+            end_time = time.time()
+            print("Time for reader:",end_time - start_time)
+
         # Stream the response updates
         async for update in process_stream():
             yield update
+
 
     else:
         chat_model = serverless_api() # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
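Because the reader streams its answer token by token, the end_time reading above only fires once process_stream() has been exhausted, so it measures total generation time rather than responsiveness. If time-to-first-token also matters, a wrapper along these lines (a sketch assuming any async generator such as process_stream()) would report both:

import time

async def consume_with_timing(stream):
    # Re-yield updates from an async generator while recording when the
    # first update arrives and when the stream finishes.
    start = time.perf_counter()
    first = None
    async for update in stream:
        if first is None:
            first = time.perf_counter() - start
        yield update
    if first is not None:
        print(f"First update: {first:.3f}s, total: {time.perf_counter() - start:.3f}s")

# Hypothetical usage inside chat():
# async for update in consume_with_timing(process_stream()):
#     yield update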
@@ -276,6 +289,7 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
         async for update in process_stream():
             yield update
+
 
         # logging the event
         try: