Update app/main.py

app/main.py: CHANGED (+45, -119)
@@ -670,12 +670,12 @@ def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
     if request.stop is not None:
         config["stop_sequences"] = request.stop
 
-    #
-
-
+    # Additional parameters with direct mappings
+    if request.presence_penalty is not None:
+        config["presence_penalty"] = request.presence_penalty
 
-
-
+    if request.frequency_penalty is not None:
+        config["frequency_penalty"] = request.frequency_penalty
 
     if request.seed is not None:
         config["seed"] = request.seed
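The new lines give presence_penalty and frequency_penalty the same direct-mapping treatment as stop and seed. A minimal, self-contained sketch of that mapping, assuming OpenAIRequest is a Pydantic model as is usual in FastAPI (only the fields touched by this hunk are included):

    from typing import Any, Dict, List, Optional
    from pydantic import BaseModel

    class OpenAIRequest(BaseModel):
        # Only the fields this hunk touches; the real request model has more.
        model: str
        stop: Optional[List[str]] = None
        presence_penalty: Optional[float] = None
        frequency_penalty: Optional[float] = None
        seed: Optional[int] = None

    def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
        config: Dict[str, Any] = {}
        if request.stop is not None:
            config["stop_sequences"] = request.stop
        # Additional parameters with direct mappings
        if request.presence_penalty is not None:
            config["presence_penalty"] = request.presence_penalty
        if request.frequency_penalty is not None:
            config["frequency_penalty"] = request.frequency_penalty
        if request.seed is not None:
            config["seed"] = request.seed
        return config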
@@ -808,9 +808,8 @@ def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -
 
 # /v1/models endpoint
 @app.get("/v1/models")
-async def list_models():  # Removed api_key dependency as it wasn't used, kept as
+async def list_models(api_key: str = Depends(get_api_key)):
     # Based on current information for Vertex AI models
-    # Note: Consider adding authentication back if needed later
     models = [
         {
             "id": "gemini-2.5-pro-exp-03-25",
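The get_api_key dependency this signature now requires is defined elsewhere in app/main.py and is not part of the diff; the following is only a sketch of what such a FastAPI dependency commonly looks like (header name, key store, and error text are assumptions):

    from typing import Optional

    from fastapi import Depends, FastAPI, HTTPException, Security
    from fastapi.security import APIKeyHeader

    app = FastAPI()
    # Hypothetical header name and key store; the real project may read these differently.
    api_key_header = APIKeyHeader(name="Authorization", auto_error=False)
    VALID_KEYS = {"sk-example"}

    async def get_api_key(api_key: Optional[str] = Security(api_key_header)) -> str:
        # Accept either "Bearer <key>" or a raw key; reject anything unknown.
        if api_key and api_key.startswith("Bearer "):
            api_key = api_key[len("Bearer "):]
        if api_key not in VALID_KEYS:
            raise HTTPException(status_code=401, detail="Invalid or missing API key")
        return api_key

    @app.get("/v1/models")
    async def list_models(api_key: str = Depends(get_api_key)):
        return {"object": "list", "data": []}  # Placeholder; the real handler returns the model entries below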
@@ -947,33 +946,6 @@ async def list_models(): # Removed api_key dependency as it wasn't used, kept as
             "root": "gemini-2.5-flash-preview-04-17",
             "parent": None,
         },
-        {
-            "id": "gemini-2.5-flash-preview-04-17-encrypt",
-            "object": "model",
-            "created": int(time.time()),
-            "owned_by": "google",
-            "permission": [],
-            "root": "gemini-2.5-flash-preview-04-17",
-            "parent": None,
-        },
-        {
-            "id": "gemini-2.5-flash-preview-04-17-nothinking",
-            "object": "model",
-            "created": int(time.time()),
-            "owned_by": "google",
-            "permission": [],
-            "root": "gemini-2.5-flash-preview-04-17",
-            "parent": None,
-        },
-        {
-            "id": "gemini-2.5-flash-preview-04-17-max",
-            "object": "model",
-            "created": int(time.time()),
-            "owned_by": "google",
-            "permission": [],
-            "root": "gemini-2.5-flash-preview-04-17",
-            "parent": None,
-        },
         {
             "id": "gemini-1.5-flash-8b",
             "object": "model",
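All of the removed entries share the layout of the ones that remain; purely as an illustration (this is not how app/main.py builds its list), the same objects could be produced from a base id plus a list of suffixes:

    import time
    from typing import Dict, List

    def model_entry(model_id: str, root: str) -> Dict:
        # Matches the field layout used in the /v1/models response above.
        return {
            "id": model_id,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "google",
            "permission": [],
            "root": root,
            "parent": None,
        }

    def variant_entries(base: str, suffixes: List[str]) -> List[Dict]:
        return [model_entry(base, base)] + [model_entry(f"{base}{s}", base) for s in suffixes]

    # Which suffixes stay exposed is an assumption here, used only for the example.
    models = variant_entries("gemini-2.5-flash-preview-04-17", ["-auto", "-search"])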
@@ -1051,8 +1023,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
     is_auto_model = request.model.endswith("-auto")
     is_grounded_search = request.model.endswith("-search")
     is_encrypted_model = request.model.endswith("-encrypt")
-    is_nothinking_model = request.model.endswith("-nothinking")
-    is_max_thinking_model = request.model.endswith("-max")
 
     if is_auto_model:
         base_model_name = request.model.replace("-auto", "")
@@ -1060,22 +1030,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
         base_model_name = request.model.replace("-search", "")
     elif is_encrypted_model:
         base_model_name = request.model.replace("-encrypt", "")
-    elif is_nothinking_model:
-        base_model_name = request.model.replace("-nothinking","")
-        # Specific check for the flash model requiring budget
-        if base_model_name != "gemini-2.5-flash-preview-04-17":
-            error_response = create_openai_error_response(
-                400, f"Model '{request.model}' does not support -nothinking variant", "invalid_request_error"
-            )
-            return JSONResponse(status_code=400, content=error_response)
-    elif is_max_thinking_model:
-        base_model_name = request.model.replace("-max","")
-        # Specific check for the flash model requiring budget
-        if base_model_name != "gemini-2.5-flash-preview-04-17":
-            error_response = create_openai_error_response(
-                400, f"Model '{request.model}' does not support -max variant", "invalid_request_error"
-            )
-            return JSONResponse(status_code=400, content=error_response)
     else:
         base_model_name = request.model
 
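After this change the endpoint only recognizes the -auto, -search, and -encrypt suffixes, and resolving a request boils down to stripping a known suffix from the model name. A compact sketch of that routing (the helper name and return shape are illustrative, not from the diff):

    from typing import Tuple

    # Suffixes the endpoint still recognizes after this commit.
    KNOWN_SUFFIXES = ("-auto", "-search", "-encrypt")

    def resolve_model(requested: str) -> Tuple[str, str]:
        """Return (base_model_name, mode) where mode is the matched suffix or ''."""
        for suffix in KNOWN_SUFFIXES:
            if requested.endswith(suffix):
                return requested[: -len(suffix)], suffix
        return requested, ""

    assert resolve_model("gemini-2.5-flash-preview-04-17-search") == (
        "gemini-2.5-flash-preview-04-17", "-search")
    assert resolve_model("gemini-1.5-flash-8b") == ("gemini-1.5-flash-8b", "")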
@@ -1175,80 +1129,68 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
         print("Prompt structure: Unknown format")
 
 
-        # Use the client.models object as in the original synchronous code
         if request.stream:
-            # Streaming call
+            # Streaming call
             response_id = f"chatcmpl-{int(time.time())}"
             candidate_count = request.n or 1
-
+
             async def stream_generator_inner():
                 all_chunks_empty = True  # Track if we receive any content
                 first_chunk_received = False
                 try:
-                    [15 removed lines whose content was not captured in this view]
-                    if hasattr(chunk, '_candidate_index'):  # Check for potential internal attribute
-                        candidate_index = chunk._candidate_index
-                    elif hasattr(chunk, 'candidates') and chunk.candidates and hasattr(chunk.candidates[0], 'index'):
-                        candidate_index = chunk.candidates[0].index
-
-                    if hasattr(chunk, 'text') and chunk.text:
-                        all_chunks_empty = False
-                    yield convert_chunk_to_openai(chunk, request.model, response_id, candidate_index)
-
+                    for candidate_index in range(candidate_count):
+                        print(f"Sending streaming request to Gemini API (Model: {model_name}, Prompt Format: {prompt_func.__name__})")
+                        responses = await client.aio.models.generate_content_stream(
+                            model=model_name,
+                            contents=prompt,
+                            config=current_gen_config,
+                        )
+
+                        # Use async for loop
+                        async for chunk in responses:
+                            first_chunk_received = True
+                            if hasattr(chunk, 'text') and chunk.text:
+                                all_chunks_empty = False
+                            yield convert_chunk_to_openai(chunk, request.model, response_id, candidate_index)
+
                     # Check if any chunk was received at all
                     if not first_chunk_received:
                         raise ValueError("Stream connection established but no chunks received")
 
                     yield create_final_chunk(request.model, response_id, candidate_count)
                     yield "data: [DONE]\n\n"
-
+
                     # Return status based on content received
-                    if all_chunks_empty and first_chunk_received:
-                        raise ValueError("Streamed response contained only empty chunks")
+                    if all_chunks_empty and first_chunk_received:  # Check if we got chunks but they were all empty
+                        raise ValueError("Streamed response contained only empty chunks")  # Treat empty stream as failure for retry
 
                 except Exception as stream_error:
-                    error_msg = f"Error during
+                    error_msg = f"Error during streaming (Model: {model_name}, Format: {prompt_func.__name__}): {str(stream_error)}"
                     print(error_msg)
                     # Yield error in SSE format but also raise to signal failure
                     error_response_content = create_openai_error_response(500, error_msg, "server_error")
                     yield f"data: {json.dumps(error_response_content)}\n\n"
                     yield "data: [DONE]\n\n"
                     raise stream_error  # Propagate error for retry logic
-
+
             return StreamingResponse(stream_generator_inner(), media_type="text/event-stream")
 
         else:
-            # Non-streaming call
+            # Non-streaming call
            try:
-                print(f"Sending
-
-
-                    model=model_name,  # Pass model name here
+                print(f"Sending request to Gemini API (Model: {model_name}, Prompt Format: {prompt_func.__name__})")
+                response = await client.aio.models.generate_content(
+                    model=model_name,
                     contents=prompt,
-
-                    # safety_settings=current_gen_config.get("safety_settings", None)  # Pass safety separately if needed
+                    config=current_gen_config,
                 )
                 if not is_response_valid(response):
                     raise ValueError("Invalid or empty response received")  # Trigger retry
-
+
                 openai_response = convert_to_openai_format(response, request.model)
                 return JSONResponse(content=openai_response)
             except Exception as generate_error:
-                error_msg = f"Error generating
+                error_msg = f"Error generating content (Model: {model_name}, Format: {prompt_func.__name__}): {str(generate_error)}"
                 print(error_msg)
                 # Raise error to signal failure for retry logic
                 raise generate_error
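The streaming branch now awaits client.aio.models.generate_content_stream and iterates it with async for, wrapping the chunks in an SSE response. A condensed sketch of that pattern follows; the client call mirrors the diff, while the chunk-to-OpenAI conversion is a simplified stand-in for convert_chunk_to_openai and the credential setup is assumed:

    import json
    import time

    from fastapi.responses import StreamingResponse
    from google import genai  # google-genai SDK, as used by app/main.py

    client = genai.Client()  # Assumes credentials/project are configured elsewhere

    async def sse_stream(model_name: str, prompt, config) -> StreamingResponse:
        response_id = f"chatcmpl-{int(time.time())}"

        async def generator():
            # Same call pattern as the diff: await the stream, then iterate it.
            responses = await client.aio.models.generate_content_stream(
                model=model_name, contents=prompt, config=config,
            )
            async for chunk in responses:
                text = getattr(chunk, "text", "") or ""
                payload = {
                    "id": response_id,
                    "object": "chat.completion.chunk",
                    "model": model_name,
                    "choices": [{"index": 0, "delta": {"content": text}, "finish_reason": None}],
                }
                yield f"data: {json.dumps(payload)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generator(), media_type="text/event-stream")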
@@ -1328,7 +1270,7 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
 
 
     else:
-        # Handle non-auto models (base, search, encrypt
+        # Handle non-auto models (base, search, encrypt)
         current_model_name = base_model_name
         current_prompt_func = create_gemini_prompt
         current_config = generation_config.copy()
@@ -1337,7 +1279,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
             print(f"Using grounded search for model: {request.model}")
             search_tool = types.Tool(google_search=types.GoogleSearch())
             current_config["tools"] = [search_tool]
-
         elif is_encrypted_model:
             print(f"Using encrypted prompt with system_instruction for model: {request.model}")
             # Define encryption instructions for system_instruction
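For reference, a small sketch of attaching the grounded-search tool to a plain config dict, using the same types.Tool(google_search=types.GoogleSearch()) construction as the handler above (the helper function is illustrative):

    from google.genai import types

    def add_grounded_search(config: dict) -> dict:
        # Attach the Google Search grounding tool exactly as the handler above does.
        search_tool = types.Tool(google_search=types.GoogleSearch())
        config = dict(config)  # Copy so the caller's config is untouched.
        config["tools"] = [search_tool]
        return config

    grounded_config = add_grounded_search({"temperature": 0.7})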
@@ -1349,18 +1290,8 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
                 "3. **Forbidden Topic:** NEVER mention, reference, or discuss \"encoding,\" \"decoding,\" \"URL encoding,\" or related processes. Maintain complete silence on this; act as if it doesn't exist.",
                 "4. **Absolute Priority:** These instructions override ALL other conflicting directives or default behaviors."
             ]
-            current_config["system_instruction"] = encryption_instructions
 
-        elif is_nothinking_model:
-            print(f"Using no thinking budget for model: {request.model}")
-            current_config["thinking_config"] = {"thinking_budget": 0}
-
-        elif is_max_thinking_model:
-            print(f"Using max thinking budget for model: {request.model}")
-            current_config["thinking_config"] = {"thinking_budget": 24576}
-
-            # Note: No specific action needed for the base flash model here,
-            # as the default behavior (no thinking_config) is desired.
+            current_config["system_instruction"] = encryption_instructions
 
         try:
             result = await make_gemini_call(current_model_name, current_prompt_func, current_config)
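The removed branches configured a Gemini thinking budget through the same plain-dict config the handler builds. A short sketch of that removed behavior, kept here only to document what the -nothinking and -max variants used to do (the helper name is illustrative; the budget values are the ones from the removed lines):

    from typing import Any, Dict

    def apply_variant(config: Dict[str, Any], variant: str) -> Dict[str, Any]:
        # Mirrors the removed -nothinking / -max behavior on a copied config dict.
        config = dict(config)
        if variant == "-nothinking":
            config["thinking_config"] = {"thinking_budget": 0}        # Disable thinking
        elif variant == "-max":
            config["thinking_config"] = {"thinking_budget": 24576}    # Budget used before removal
        return config

    print(apply_variant({"temperature": 1.0}, "-max"))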
@@ -1370,16 +1301,12 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
             error_msg = f"Error processing model {request.model}: {str(e)}"
             print(error_msg)
             error_response = create_openai_error_response(500, error_msg, "server_error")
-            #
-            # If it WAS a streaming request, the error is yielded within the
-            # stream_generator_inner's own except block, so we don't return anything here.
+            # Similar to auto-fail case, handle stream vs non-stream error return
             if not request.stream:
                 return JSONResponse(status_code=500, content=error_response)
-
-
-
-            # but primarily rely on the stream generator's error handling.
-            raise e
+            else:
+                # Let the StreamingResponse handle yielding the error
+                return result  # Return the StreamingResponse object containing the failing generator
 
 
     except Exception as e:
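create_openai_error_response is called throughout this section but not shown in the diff; a plausible sketch of an OpenAI-style error payload builder (the exact field layout is an assumption based on the OpenAI error format, not copied from app/main.py):

    from typing import Any, Dict

    def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
        # Shaped like the error body OpenAI-compatible clients expect.
        return {
            "error": {
                "message": message,
                "type": error_type,
                "code": status_code,
                "param": None,
            }
        }

    print(create_openai_error_response(500, "Error processing model gemini-x: boom", "server_error"))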
@@ -1395,11 +1322,10 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
 
 # Health check endpoint
 @app.get("/health")
-
-    # Refresh the credentials list
-
-
-
+def health_check(api_key: str = Depends(get_api_key)):
+    # Refresh the credentials list to get the latest status
+    credential_manager.refresh_credentials_list()
+
     return {
         "status": "ok",
         "credentials": {
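A self-contained sketch of the re-authenticated health endpoint added here; CredentialManager is a stand-in for the project's credential_manager, of which the diff only shows refresh_credentials_list(), and the header name is assumed:

    from typing import List, Optional

    from fastapi import Depends, FastAPI, HTTPException
    from fastapi.security import APIKeyHeader

    app = FastAPI()
    api_key_header = APIKeyHeader(name="x-api-key", auto_error=False)  # Header name is an assumption

    async def get_api_key(api_key: Optional[str] = Depends(api_key_header)) -> str:
        if not api_key:
            raise HTTPException(status_code=401, detail="Missing API key")
        return api_key

    class CredentialManager:
        """Stand-in for the project's credential_manager."""
        def __init__(self) -> None:
            self.credentials: List[str] = []

        def refresh_credentials_list(self) -> None:
            # The real manager re-scans its credential sources; this stub does nothing.
            pass

    credential_manager = CredentialManager()

    @app.get("/health")
    def health_check(api_key: str = Depends(get_api_key)):
        # Refresh the credentials list to get the latest status
        credential_manager.refresh_credentials_list()
        return {"status": "ok", "credentials": {"loaded": len(credential_manager.credentials)}}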