Update app.py
app.py CHANGED
@@ -224,23 +224,23 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         async for update in process_stream():
             yield update
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    elif model_config.get('reader','TYPE') == 'DEDICATED':
+        chat_model = dedicated_endpoint()
+        async def process_stream():
+            # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
+            # instead of modifying the one from the outer scope.
+            nonlocal answer_yet  # Use the outer scope's answer_yet variable
+            # Iterate over the streaming response chunks
+            async for chunk in chat_model.astream(messages):
+                token = chunk.content
+                answer_yet += token
+                parsed_answer = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query, parsed_answer)
+                yield [tuple(x) for x in history], docs_html
+
+        # Stream the response updates
+        async for update in process_stream():
+            yield update
 
     else:
         chat_model = serverless_api()  # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
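The inline comments in the diff explain why nonlocal is required inside the nested process_stream() generator. Below is a minimal, self-contained sketch of the same pattern; FakeChunk and FakeChatModel are stand-ins for the real LangChain chat model returned by dedicated_endpoint(), and the history/docs_html handling is simplified so the example runs on its own.

import asyncio

class FakeChunk:
    """Stand-in for a streaming chunk; only .content is used, as in the diff."""
    def __init__(self, content):
        self.content = content

class FakeChatModel:
    """Stand-in for the chat model returned by dedicated_endpoint()."""
    async def astream(self, messages):
        for token in ["Str", "eam", "ed ", "answer"]:
            yield FakeChunk(token)

async def chat(query, history):
    answer_yet = ""
    chat_model = FakeChatModel()

    async def process_stream():
        # Without nonlocal, the augmented assignment below would make
        # answer_yet local to process_stream() and raise UnboundLocalError.
        nonlocal answer_yet
        async for chunk in chat_model.astream([query]):
            answer_yet += chunk.content
            history[-1] = (query, answer_yet)
            yield [tuple(x) for x in history]

    async for update in process_stream():
        yield update

async def main():
    history = [("What does the report say?", "")]
    async for update in chat("What does the report say?", history):
        print(update[-1][1])  # prints the progressively growing answer

asyncio.run(main())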
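The diff calls dedicated_endpoint() but its body is not part of this hunk. A hypothetical sketch of what such a factory might look like, assuming the app uses the langchain-huggingface integration; the environment variable names, model_id, and generation parameters are illustrative, not taken from the repository:

import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

def dedicated_endpoint():
    # HF_ENDPOINT_URL / HF_TOKEN are assumed environment variables,
    # not names confirmed by the repository.
    llm = HuggingFaceEndpoint(
        endpoint_url=os.environ["HF_ENDPOINT_URL"],
        huggingfacehub_api_token=os.environ["HF_TOKEN"],
        task="text-generation",
        max_new_tokens=512,
        streaming=True,
    )
    # ChatHuggingFace wraps the raw endpoint so it accepts chat messages and
    # exposes .astream(), which process_stream() in the diff above relies on.
    return ChatHuggingFace(llm=llm, model_id="meta-llama/Meta-Llama-3-8B-Instruct")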