Commit 71235a6
1 Parent(s): aa79ca3

separated vertex and express models

Files changed:
- app/model_loader.py (+5, -1)
- app/routes/chat_api.py (+74, -33)
- app/routes/models_api.py (+3, -1)
app/model_loader.py
CHANGED
@@ -31,9 +31,13 @@ async def fetch_and_parse_models_config() -> Optional[Dict[str, List[str]]]:
            "vertex_models" in data and isinstance(data["vertex_models"], list) and \
            "vertex_express_models" in data and isinstance(data["vertex_express_models"], list):
             print("Successfully fetched and parsed model configuration.")
+
+            # Add [EXPRESS] prefix to express models
+            prefixed_express_models = [f"[EXPRESS] {model_name}" for model_name in data["vertex_express_models"]]
+
             return {
                 "vertex_models": data["vertex_models"],
-                "vertex_express_models": data["vertex_express_models"]
+                "vertex_express_models": prefixed_express_models
             }
         else:
             print(f"ERROR: Fetched model configuration has an invalid structure: {data}")
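For context, a runnable sketch of what the new loader step produces. The input dict mirrors the structure the loader validates above, but the model names are illustrative, not taken from the real remote config:

# Sketch: how the loader now labels express models (model names are made up).
data = {
    "vertex_models": ["gemini-2.5-pro-preview-05-06"],
    "vertex_express_models": ["gemini-2.0-flash-001"],
}

prefixed_express_models = [f"[EXPRESS] {model_name}" for model_name in data["vertex_express_models"]]
config = {
    "vertex_models": data["vertex_models"],
    "vertex_express_models": prefixed_express_models,
}

print(config["vertex_express_models"])  # ['[EXPRESS] gemini-2.0-flash-001']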
app/routes/chat_api.py
CHANGED
@@ -37,11 +37,12 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
     OPENAI_DIRECT_SUFFIX = "-openai"
     EXPERIMENTAL_MARKER = "-exp-"
     PAY_PREFIX = "[PAY]"
+    EXPRESS_PREFIX = "[EXPRESS] " # Note the space for easier stripping
 
     # Model validation based on a predefined list has been removed as per user request.
     # The application will now attempt to use any provided model string.
     # We still need to fetch vertex_express_model_ids for the Express Mode logic.
-    vertex_express_model_ids = await get_vertex_express_models()
+    # vertex_express_model_ids = await get_vertex_express_models() # We'll use the prefix now
 
     # Updated logic for is_openai_direct_model
     is_openai_direct_model = False
@@ -57,25 +58,37 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
     is_encrypted_full_model = request.model.endswith("-encrypt-full")
     is_nothinking_model = request.model.endswith("-nothinking")
     is_max_thinking_model = request.model.endswith("-max")
-    base_model_name = request.model
+    base_model_name = request.model # Start with the full model name
 
-    # Determine base_model_name by stripping known suffixes
-    #
-    if is_openai_direct_model:
-        # The general PAY_PREFIX stripper later will handle if this result starts with [PAY]
-        base_model_name = request.model[:-len(OPENAI_DIRECT_SUFFIX)]
-    elif is_auto_model: base_model_name = request.model[:-len("-auto")]
-    elif is_grounded_search: base_model_name = request.model[:-len("-search")]
-    elif is_encrypted_full_model: base_model_name = request.model[:-len("-encrypt-full")] # Must be before -encrypt
-    elif is_encrypted_model: base_model_name = request.model[:-len("-encrypt")]
-    elif is_nothinking_model: base_model_name = request.model[:-len("-nothinking")]
-    elif is_max_thinking_model: base_model_name = request.model[:-len("-max")]
+    # Determine base_model_name by stripping known prefixes and suffixes
+    # Order of stripping: Prefixes first, then suffixes.
 
-
-
+    is_express_model_request = False
+    if base_model_name.startswith(EXPRESS_PREFIX):
+        is_express_model_request = True
+        base_model_name = base_model_name[len(EXPRESS_PREFIX):]
+
     if base_model_name.startswith(PAY_PREFIX):
         base_model_name = base_model_name[len(PAY_PREFIX):]
-
+
+    # Suffix stripping (applied to the name after prefix removal)
+    # This order matters if a model could have multiple (e.g. -encrypt-auto, though not currently a pattern)
+    if is_openai_direct_model: # This check is based on request.model, so it's fine here
+        # If it was an OpenAI direct model, its base name is request.model minus suffix.
+        # We need to ensure PAY_PREFIX or EXPRESS_PREFIX are also stripped if they were part of the original.
+        temp_base_for_openai = request.model[:-len(OPENAI_DIRECT_SUFFIX)]
+        if temp_base_for_openai.startswith(EXPRESS_PREFIX):
+            temp_base_for_openai = temp_base_for_openai[len(EXPRESS_PREFIX):]
+        if temp_base_for_openai.startswith(PAY_PREFIX):
+            temp_base_for_openai = temp_base_for_openai[len(PAY_PREFIX):]
+        base_model_name = temp_base_for_openai # Assign the fully stripped name
+    elif is_auto_model: base_model_name = base_model_name[:-len("-auto")]
+    elif is_grounded_search: base_model_name = base_model_name[:-len("-search")]
+    elif is_encrypted_full_model: base_model_name = base_model_name[:-len("-encrypt-full")] # Must be before -encrypt
+    elif is_encrypted_model: base_model_name = base_model_name[:-len("-encrypt")]
+    elif is_nothinking_model: base_model_name = base_model_name[:-len("-nothinking")]
+    elif is_max_thinking_model: base_model_name = base_model_name[:-len("-max")]
+
     # Specific model variant checks (if any remain exclusive and not covered dynamically)
     if is_nothinking_model and base_model_name != "gemini-2.5-flash-preview-04-17":
         return JSONResponse(status_code=400, content=create_openai_error_response(400, f"Model '{request.model}' (-nothinking) is only supported for 'gemini-2.5-flash-preview-04-17'.", "invalid_request_error"))
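To make the stripping order easier to verify in isolation, here is a self-contained sketch of the same rules. parse_model_name is a hypothetical helper (the route works inline on request.model), but the prefixes, suffixes, and their ordering match the hunk above:

# Hypothetical helper mirroring the prefix-then-suffix stripping above.
EXPRESS_PREFIX = "[EXPRESS] "
PAY_PREFIX = "[PAY]"
# "-encrypt-full" must be checked before "-encrypt".
SUFFIXES = ["-openai", "-auto", "-search", "-encrypt-full", "-encrypt", "-nothinking", "-max"]

def parse_model_name(model):
    """Return (base_model_name, is_express_model_request)."""
    is_express = model.startswith(EXPRESS_PREFIX)
    if is_express:
        model = model[len(EXPRESS_PREFIX):]
    if model.startswith(PAY_PREFIX):
        model = model[len(PAY_PREFIX):]
    for suffix in SUFFIXES:
        if model.endswith(suffix):
            model = model[:-len(suffix)]
            break
    return model, is_express

assert parse_model_name("[EXPRESS] gemini-2.0-flash-001-auto") == ("gemini-2.0-flash-001", True)
assert parse_model_name("[PAY]gemini-2.5-pro-encrypt-full") == ("gemini-2.5-pro", False)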
@@ -86,37 +99,65 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
 
     client_to_use = None
     express_api_keys_list = app_config.VERTEX_EXPRESS_API_KEY_VAL
-
-    #
-
+
+    # This client initialization logic is for Gemini models.
+    # OpenAI Direct models have their own client setup and will return before this.
+    if is_openai_direct_model:
+        # OpenAI Direct logic is self-contained and will return.
+        # If it doesn't return, it means we proceed to Gemini logic, which shouldn't happen
+        # if is_openai_direct_model is true. The main if/elif/else for model types handles this.
+        pass
+    elif is_express_model_request:
+        if not express_api_keys_list:
+            error_msg = f"Model '{request.model}' is an Express model and requires an Express API key, but none are configured."
+            print(f"ERROR: {error_msg}")
+            return JSONResponse(status_code=401, content=create_openai_error_response(401, error_msg, "authentication_error"))
+
+        print(f"INFO: Attempting Vertex Express Mode for model request: {request.model} (base: {base_model_name})")
         indexed_keys = list(enumerate(express_api_keys_list))
         random.shuffle(indexed_keys)
 
         for original_idx, key_val in indexed_keys:
             try:
                 client_to_use = genai.Client(vertexai=True, api_key=key_val)
-                print(f"INFO: Using Vertex Express Mode for model {base_model_name} with API key (original index: {original_idx}).")
+                print(f"INFO: Using Vertex Express Mode for model {request.model} (base: {base_model_name}) with API key (original index: {original_idx}).")
                 break # Successfully initialized client
             except Exception as e:
-                print(f"WARNING: Vertex Express Mode client init failed for API key (original index: {original_idx}): {e}. Trying next key
-                client_to_use = None # Ensure client_to_use is None
-
-    if client_to_use is None:
-        print(f"WARNING: All {len(express_api_keys_list)} Vertex Express API key(s) failed to initialize for model {base_model_name}. Falling back.")
+                print(f"WARNING: Vertex Express Mode client init failed for API key (original index: {original_idx}) for model {request.model}: {e}. Trying next key.")
+                client_to_use = None # Ensure client_to_use is None for this attempt
 
-
+        if client_to_use is None: # All configured Express keys failed
+            error_msg = f"All configured Express API keys failed to initialize for model '{request.model}'."
+            print(f"ERROR: {error_msg}")
+            return JSONResponse(status_code=500, content=create_openai_error_response(500, error_msg, "server_error"))
+
+    else: # Not an Express model request, therefore an SA credential model request for Gemini
+        print(f"INFO: Model '{request.model}' is an SA credential request for Gemini. Attempting SA credentials.")
         rotated_credentials, rotated_project_id = credential_manager_instance.get_random_credentials()
+
         if rotated_credentials and rotated_project_id:
             try:
                 client_to_use = genai.Client(vertexai=True, credentials=rotated_credentials, project=rotated_project_id, location="global")
-                print(f"INFO: Using
+                print(f"INFO: Using SA credential for Gemini model {request.model} (project: {rotated_project_id})")
             except Exception as e:
-
-
-
-
-
-
+                client_to_use = None # Ensure it's None on failure
+                error_msg = f"SA credential client initialization failed for Gemini model '{request.model}': {e}."
+                print(f"ERROR: {error_msg}")
+                return JSONResponse(status_code=500, content=create_openai_error_response(500, error_msg, "server_error"))
+        else: # No SA credentials available for an SA model request
+            error_msg = f"Model '{request.model}' requires SA credentials for Gemini, but none are available or loaded."
+            print(f"ERROR: {error_msg}")
+            return JSONResponse(status_code=401, content=create_openai_error_response(401, error_msg, "authentication_error"))
+
+    # If we reach here and client_to_use is still None, it means it's an OpenAI Direct Model,
+    # which handles its own client and responses.
+    # For Gemini models (Express or SA), client_to_use must be set, or an error returned above.
+    if not is_openai_direct_model and client_to_use is None:
+        # This case should ideally not be reached if the logic above is correct,
+        # as each path (Express/SA for Gemini) should either set client_to_use or return an error.
+        # This is a safeguard.
+        print(f"CRITICAL ERROR: Client for Gemini model '{request.model}' was not initialized, and no specific error was returned. This indicates a logic flaw.")
+        return JSONResponse(status_code=500, content=create_openai_error_response(500, "Critical internal server error: Gemini client not initialized.", "server_error"))
 
     encryption_instructions_placeholder = ["// Protocol Instructions Placeholder //"] # Actual instructions are in message_processing
     if is_openai_direct_model:
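The Express branch above implements a shuffle-and-retry pattern over the configured keys. Reduced to its shape, it looks like the sketch below; init_with_rotation is our name for it, and init_client stands in for genai.Client rather than matching its real signature:

import random

def init_with_rotation(api_keys, init_client):
    """Try keys in random order; return the first client that initializes, else None."""
    indexed_keys = list(enumerate(api_keys))
    random.shuffle(indexed_keys)  # spreads load and failures across keys
    for original_idx, key_val in indexed_keys:
        try:
            return init_client(key_val)
        except Exception as e:
            print(f"WARNING: key (original index: {original_idx}) failed: {e}. Trying next key.")
    return None  # the route maps this to a 500 response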
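Taken together, client selection in this file is a three-way dispatch on the parsed request. A compressed sketch of the control flow, where make_express_client and make_sa_client are hypothetical stand-ins for the two initialization paths above:

def select_client(is_openai_direct, is_express, make_express_client, make_sa_client):
    # OpenAI Direct builds its own client later in the route.
    if is_openai_direct:
        return None
    # Express models use rotated API keys; everything else uses SA credentials.
    client = make_express_client() if is_express else make_sa_client()
    if client is None:
        # Mirrors the route's safeguard: a Gemini request must end up with a client.
        raise RuntimeError("Gemini client not initialized")
    return client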
app/routes/models_api.py
CHANGED
@@ -56,7 +56,9 @@ async def list_models(fastapi_request: Request, api_key: str = Depends(get_api_k
     # Add base models and their variations
     for original_model_id in sorted(list(all_model_ids)):
         current_display_prefix = ""
-        if has_sa_creds and not has_express_key and EXPERIMENTAL_MARKER not in original_model_id:
+        # Only add PAY_PREFIX if the model is not already an EXPRESS model (which has its own prefix)
+        if not original_model_id.startswith("[EXPRESS]") and \
+           has_sa_creds and not has_express_key and EXPERIMENTAL_MARKER not in original_model_id:
             current_display_prefix = PAY_PREFIX
 
         base_display_id = f"{current_display_prefix}{original_model_id}"