Update app/main.py
Browse files- app/main.py +19 -5
app/main.py
CHANGED
@@ -1079,8 +1079,9 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
1079 |
|
1080 |
|
1081 |
if request.stream:
|
1082 |
-
# Check if fake streaming is enabled
|
1083 |
-
|
|
|
1084 |
return await fake_stream_generator(model_name, prompt, current_gen_config, request)
|
1085 |
|
1086 |
# Regular streaming call
|
@@ -1219,7 +1220,13 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
1219 |
# This assumes the generator correctly terminates after yielding the error.
|
1220 |
# Re-evaluate if this causes issues. The goal is to avoid double responses.
|
1221 |
# It seems returning the StreamingResponse object itself is the correct FastAPI pattern.
|
1222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1223 |
|
1224 |
|
1225 |
else:
|
@@ -1259,7 +1266,12 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
1259 |
return JSONResponse(status_code=500, content=error_response)
|
1260 |
else:
|
1261 |
# Let the StreamingResponse handle yielding the error
|
1262 |
-
|
|
|
|
|
|
|
|
|
|
|
1263 |
|
1264 |
|
1265 |
except Exception as e:
|
@@ -1366,7 +1378,9 @@ async def fake_stream_generator(model_name, prompt, current_gen_config, request)
|
|
1366 |
keep_alive_sent += 1
|
1367 |
|
1368 |
# Wait before sending the next keep-alive message
|
1369 |
-
|
|
|
|
|
1370 |
|
1371 |
try:
|
1372 |
# Get the response from the completed task
|
|
|
1079 |
|
1080 |
|
1081 |
if request.stream:
|
1082 |
+
# Check if fake streaming is enabled (directly from environment variable)
|
1083 |
+
fake_streaming = os.environ.get("FAKE_STREAMING", "false").lower() == "true"
|
1084 |
+
if fake_streaming:
|
1085 |
return await fake_stream_generator(model_name, prompt, current_gen_config, request)
|
1086 |
|
1087 |
# Regular streaming call
|
|
|
1220 |
# This assumes the generator correctly terminates after yielding the error.
|
1221 |
# Re-evaluate if this causes issues. The goal is to avoid double responses.
|
1222 |
# It seems returning the StreamingResponse object itself is the correct FastAPI pattern.
|
1223 |
+
# For streaming requests, we need to return a new StreamingResponse with an error
|
1224 |
+
# since we can't access the previous StreamingResponse objects
|
1225 |
+
async def error_stream():
|
1226 |
+
yield f"data: {json.dumps(error_response)}\n\n"
|
1227 |
+
yield "data: [DONE]\n\n"
|
1228 |
+
|
1229 |
+
return StreamingResponse(error_stream(), media_type="text/event-stream")
|
1230 |
|
1231 |
|
1232 |
else:
|
|
|
1266 |
return JSONResponse(status_code=500, content=error_response)
|
1267 |
else:
|
1268 |
# Let the StreamingResponse handle yielding the error
|
1269 |
+
# For streaming requests, create a new error stream
|
1270 |
+
async def error_stream():
|
1271 |
+
yield f"data: {json.dumps(error_response)}\n\n"
|
1272 |
+
yield "data: [DONE]\n\n"
|
1273 |
+
|
1274 |
+
return StreamingResponse(error_stream(), media_type="text/event-stream")
|
1275 |
|
1276 |
|
1277 |
except Exception as e:
|
|
|
1378 |
keep_alive_sent += 1
|
1379 |
|
1380 |
# Wait before sending the next keep-alive message
|
1381 |
+
# Get interval from environment variable directly
|
1382 |
+
fake_streaming_interval = float(os.environ.get("FAKE_STREAMING_INTERVAL", "1.0"))
|
1383 |
+
await asyncio.sleep(fake_streaming_interval)
|
1384 |
|
1385 |
try:
|
1386 |
# Get the response from the completed task
|