bibibi12345 committed on
Commit aa79ca3 · Parent: 63eb410

fixed auto retry for fake streaming

Files changed (2)
  1. app/api_helpers.py +47 -27
  2. app/routes/chat_api.py +10 -4
app/api_helpers.py CHANGED
@@ -66,21 +66,13 @@ def is_response_valid(response):
                 # print(f"DEBUG: Response valid due to part.text in candidate's content part")
                 return True
 
-    # Check for prompt_feedback, which indicates the API processed the request,
-    # even if the content is empty (e.g. due to safety filtering).
-    # The fake_stream_generator should still attempt to process this to convey safety messages if present.
-    if hasattr(response, 'prompt_feedback'):
-        # Check if there's any block reason, which might be interesting to log or handle
-        if hasattr(response.prompt_feedback, 'block_reason') and response.prompt_feedback.block_reason:
-            print(f"DEBUG: Response has prompt_feedback with block_reason: {response.prompt_feedback.block_reason}, considering it valid for processing.")
-        else:
-            print("DEBUG: Response has prompt_feedback (no block_reason), considering it valid for processing.")
-        return True
-
-    print("DEBUG: Response is invalid, no usable text content or prompt_feedback found.")
+    # prompt_feedback is no longer a sole criterion for validity:
+    # a response only counts as valid if actual text content is found.
+    # Block reasons are checked explicitly by callers that need to treat them as errors for retries.
+    print("DEBUG: Response is invalid, no usable text content found by is_response_valid.")
     return False
 
-async def fake_stream_generator(client_instance, model_name: str, prompt: Union[types.Content, List[types.Content]], current_gen_config: Dict[str, Any], request_obj: OpenAIRequest):
+async def fake_stream_generator(client_instance, model_name: str, prompt: Union[types.Content, List[types.Content]], current_gen_config: Dict[str, Any], request_obj: OpenAIRequest, is_auto_attempt: bool):
     response_id = f"chatcmpl-{int(time.time())}"
     async def fake_stream_inner():
         print(f"FAKE STREAMING: Making non-streaming request to Gemini API (Model: {model_name})")
@@ -98,8 +90,20 @@ async def fake_stream_generator(client_instance, model_name: str, prompt: Union[
                 await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
             try:
                 response = api_call_task.result()
-                if not is_response_valid(response):
-                    raise ValueError(f"Invalid/empty response in fake stream: {str(response)[:200]}")
+
+                # Check for safety blocks first, as these should trigger a retry in auto-mode
+                if hasattr(response, 'prompt_feedback') and \
+                   hasattr(response.prompt_feedback, 'block_reason') and \
+                   response.prompt_feedback.block_reason:
+                    block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason}"
+                    if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message:
+                        block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason_message} (Reason: {response.prompt_feedback.block_reason})"
+                    print(f"DEBUG: {block_message} (in fake_stream_generator)")  # Log this specific condition
+                    raise ValueError(block_message)  # Caught by the `except Exception as e` handler below
+
+                if not is_response_valid(response):  # is_response_valid now only checks for actual text
+                    raise ValueError(f"Invalid/empty response in fake stream (no text content): {str(response)[:200]}")
+
                 full_text = ""
                 if hasattr(response, 'text'):
                     full_text = response.text or ""  # Coalesce None to empty string
@@ -136,10 +140,12 @@ async def fake_stream_generator(client_instance, model_name: str, prompt: Union[
             # It's good practice to log the JSON payload here too for consistency,
             # though the main concern was the true streaming path.
             json_payload_for_fake_stream_error = json.dumps(err_resp)
-            print(f"DEBUG: Yielding error JSON payload during fake streaming: {json_payload_for_fake_stream_error}")
-            yield f"data: {json_payload_for_fake_stream_error}\n\n"
-            yield "data: [DONE]\n\n"
-            raise # Re-raise the exception to allow auto-mode retries
+            # Log the error JSON; during auto-mode attempts it is suppressed here, otherwise it is sent to the client below.
+            print(f"DEBUG: Internal error in fake_stream_generator. JSON error for handler: {json_payload_for_fake_stream_error}")
+            if not is_auto_attempt:
+                yield f"data: {json_payload_for_fake_stream_error}\n\n"
+                yield "data: [DONE]\n\n"
+            raise e  # Re-raise the original exception
     return fake_stream_inner()
 
 async def execute_gemini_call(
@@ -147,14 +153,15 @@ async def execute_gemini_call(
     model_to_call: str,
     prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
    gen_config_for_call: Dict[str, Any],
-    request_obj: OpenAIRequest # Pass the whole request object
+    request_obj: OpenAIRequest,  # Pass the whole request object
+    is_auto_attempt: bool = False
 ):
     actual_prompt_for_call = prompt_func(request_obj.messages)
 
     if request_obj.stream:
         if app_config.FAKE_STREAMING_ENABLED:
             return StreamingResponse(
-                await fake_stream_generator(current_client, model_to_call, actual_prompt_for_call, gen_config_for_call, request_obj),
+                await fake_stream_generator(current_client, model_to_call, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt=is_auto_attempt),
                 media_type="text/event-stream"
             )
 
@@ -180,15 +187,28 @@ async def execute_gemini_call(
 
                 err_resp_content_call = create_openai_error_response(500, error_message_str, "server_error")
                 json_payload_for_error = json.dumps(err_resp_content_call)
-                print(f"DEBUG: Yielding error JSON payload during true streaming: {json_payload_for_error}")
-                yield f"data: {json_payload_for_error}\n\n"
-                yield "data: [DONE]\n\n"
-                raise # Re-raise to be caught by retry logic if any
+                # Log the error JSON; during auto-mode attempts it is suppressed here, otherwise it is sent to the client below.
+                print(f"DEBUG: Internal error in _stream_generator_inner_for_execute. JSON error for handler: {json_payload_for_error}")
+                if not is_auto_attempt:  # is_auto_attempt comes from execute_gemini_call's scope
+                    yield f"data: {json_payload_for_error}\n\n"
+                    yield "data: [DONE]\n\n"
+                raise e_stream_call  # Re-raise the original exception
         return StreamingResponse(_stream_generator_inner_for_execute(), media_type="text/event-stream")
     else:
         response_obj_call = await current_client.aio.models.generate_content(
             model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
         )
-        if not is_response_valid(response_obj_call):
-            raise ValueError("Invalid/empty response from non-streaming Gemini call in _execute_gemini_call.")
+
+        # Check for safety blocks first for non-streaming calls
+        if hasattr(response_obj_call, 'prompt_feedback') and \
+           hasattr(response_obj_call.prompt_feedback, 'block_reason') and \
+           response_obj_call.prompt_feedback.block_reason:
+            block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason}"
+            if hasattr(response_obj_call.prompt_feedback, 'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message:
+                block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason_message} (Reason: {response_obj_call.prompt_feedback.block_reason})"
+            print(f"DEBUG: {block_message} (in execute_gemini_call non-streaming)")  # Log this specific condition
+            raise ValueError(block_message)
+
+        if not is_response_valid(response_obj_call):  # is_response_valid now only checks for actual text
+            raise ValueError("Invalid/empty response from non-streaming Gemini call (no text content).")
         return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
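The remaining hunks all serve the new is_auto_attempt flag: on failure, both streaming paths log the error JSON but only emit the client-facing error frame and the terminating data: [DONE] event when the call is not an auto-mode attempt, and they always re-raise so the retry loop can take over. Once a client receives [DONE] it treats the stream as finished, so a retried attempt could never reach it. A minimal sketch of the pattern, with make_call as a hypothetical stand-in for the Gemini request:

import json
from typing import AsyncGenerator, Awaitable, Callable

# Hedged sketch of the is_auto_attempt pattern; not code from this repo.
async def guarded_sse_stream(
    make_call: Callable[[], Awaitable[str]],  # hypothetical upstream call
    is_auto_attempt: bool,
) -> AsyncGenerator[str, None]:
    try:
        text = await make_call()
        yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
        yield "data: [DONE]\n\n"
    except Exception as e:
        error_payload = json.dumps({"error": {"message": str(e), "type": "server_error"}})
        if not is_auto_attempt:
            # Non-auto (or final) attempt: the client sees the error and
            # the stream is closed with [DONE].
            yield f"data: {error_payload}\n\n"
            yield "data: [DONE]\n\n"
        # Always re-raise so an auto-mode caller can move to its next attempt.
        raise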
 
app/routes/chat_api.py CHANGED
@@ -240,7 +240,8 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
             print(f"Auto-mode attempting: '{attempt['name']}' for model {attempt['model']}")
             current_gen_config = attempt["config_modifier"](generation_config.copy())
             try:
-                return await execute_gemini_call(client_to_use, attempt["model"], attempt["prompt_func"], current_gen_config, request)
+                # Pass is_auto_attempt=True for auto-mode calls
+                return await execute_gemini_call(client_to_use, attempt["model"], attempt["prompt_func"], current_gen_config, request, is_auto_attempt=True)
             except Exception as e_auto:
                 last_err = e_auto
                 print(f"Auto-attempt '{attempt['name']}' for model {attempt['model']} failed: {e_auto}")
@@ -251,11 +252,15 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
         if not request.stream and last_err:
             return JSONResponse(status_code=500, content=create_openai_error_response(500, err_msg, "server_error"))
         elif request.stream:
-            async def final_error_stream():
+            # Final error handling for auto-mode when all attempts fail AND the request was streaming
+            async def final_auto_error_stream():
                 err_content = create_openai_error_response(500, err_msg, "server_error")
-                yield f"data: {json.dumps(err_content)}\n\n"
+                json_payload_final_auto_error = json.dumps(err_content)
+                # Log the final error sent to the client after all auto-retries failed
+                print(f"DEBUG: Auto-mode all attempts failed. Yielding final error JSON: {json_payload_final_auto_error}")
+                yield f"data: {json_payload_final_auto_error}\n\n"
                 yield "data: [DONE]\n\n"
-            return StreamingResponse(final_error_stream(), media_type="text/event-stream")
+            return StreamingResponse(final_auto_error_stream(), media_type="text/event-stream")
         return JSONResponse(status_code=500, content=create_openai_error_response(500, "All auto-mode attempts failed without specific error.", "server_error"))
 
     else: # Not an auto model
@@ -284,6 +289,7 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
             # This means if `request.model` was "gemini-1.5-pro-search", `base_model_name` becomes "gemini-1.5-pro"
             # but the API call might need the full "gemini-1.5-pro-search".
             # Let's use `request.model` for the API call here, and `base_model_name` for checks like Express eligibility.
+            # For non-auto mode, is_auto_attempt defaults to False in execute_gemini_call
             return await execute_gemini_call(client_to_use, base_model_name, current_prompt_func, generation_config, request)
 
         except Exception as e:
 
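On the route side, auto-mode now passes is_auto_attempt=True, so intermediate failures stay server-side and only after every attempt is exhausted does the client receive a single error stream. A self-contained sketch of that control flow, with run_auto_attempts, attempts, and execute as illustrative stand-ins for the route's attempt loop and execute_gemini_call:

import json
from typing import Any, Awaitable, Callable, Dict, List, Optional

# Hedged sketch of the auto-mode retry flow; names are illustrative.
async def run_auto_attempts(
    attempts: List[Dict[str, Any]],
    execute: Callable[..., Awaitable[Any]],
) -> Any:
    last_err: Optional[Exception] = None
    for attempt in attempts:
        try:
            # is_auto_attempt=True keeps a failing attempt from emitting its
            # own SSE error frame, so the loop can try the next attempt.
            return await execute(attempt, is_auto_attempt=True)
        except Exception as e_auto:
            last_err = e_auto
            print(f"Auto-attempt '{attempt['name']}' failed: {e_auto}")

    # Every attempt failed: emit one final, client-visible error stream.
    async def final_auto_error_stream():
        message = str(last_err) if last_err else "All auto-mode attempts failed."
        err = {"error": {"message": message, "type": "server_error"}}
        yield f"data: {json.dumps(err)}\n\n"
        yield "data: [DONE]\n\n"
    return final_auto_error_stream()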