Update model_logic.py
model_logic.py  (+220 −163)  CHANGED
@@ -2,6 +2,7 @@ import os
 import requests
 import json
 import logging
+import time  # Import time for retries

 logging.basicConfig(
     level=logging.INFO,
@@ -10,7 +11,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

 API_KEYS = {
-    "HUGGINGFACE": 'HF_TOKEN',
+    "HUGGINGFACE": 'HF_TOKEN',  # Note: HF_TOKEN is also for HF Hub, so maybe rename this in UI label?
     "GROQ": 'GROQ_API_KEY',
     "OPENROUTER": 'OPENROUTER_API_KEY',
     "TOGETHERAI": 'TOGETHERAI_API_KEY',
"COHERE": 'https://api.cohere.ai/v1/chat',
|
30 |
"XAI": 'https://api.x.ai/v1/chat/completions',
|
31 |
"OPENAI": 'https://api.openai.com/v1/chat/completions',
|
32 |
+
"GOOGLE": 'https://generativelanguage.googleapis.com/v1beta/models/', # Base URL, model ID added later
|
33 |
}
|
34 |
|
35 |
+
# Load model configuration from JSON
|
36 |
+
try:
|
37 |
+
with open("models.json", "r") as f:
|
38 |
+
MODELS_BY_PROVIDER = json.load(f)
|
39 |
+
logger.info("models.json loaded successfully.")
|
40 |
+
except FileNotFoundError:
|
41 |
+
logger.error("models.json not found. Using hardcoded fallback models.")
|
42 |
+
# Keep the hardcoded fallback as a safety measure
|
43 |
+
MODELS_BY_PROVIDER = {
|
44 |
+
"groq": {
|
45 |
+
"default": "llama3-8b-8192",
|
46 |
+
"models": {
|
47 |
+
"Llama 3 8B (Groq)": "llama3-8b-8192",
|
48 |
+
"Llama 3 70B (Groq)": "llama3-70b-8192",
|
49 |
+
"Mixtral 8x7B (Groq)": "mixtral-8x7b-32768",
|
50 |
+
"Gemma 7B (Groq)": "gemma-7b-it",
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"openrouter": {
|
54 |
+
"default": "nousresearch/llama-3-8b-instruct",
|
55 |
+
"models": {
|
56 |
+
"Nous Llama-3 8B Instruct (OpenRouter)": "nousresearch/llama-3-8b-instruct",
|
57 |
+
"Mistral 7B Instruct v0.2 (OpenRouter)": "mistralai/mistral-7b-instruct:free",
|
58 |
+
"Gemma 7B Instruct (OpenRouter)": "google/gemma-7b-it:free",
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"google": {
|
62 |
+
"default": "gemini-1.5-flash-latest",
|
63 |
+
"models": {
|
64 |
+
"Gemini 1.5 Flash (Latest)": "gemini-1.5-flash-latest",
|
65 |
+
"Gemini 1.5 Pro (Latest)": "gemini-1.5-pro-latest",
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"openai": {
|
69 |
+
"default": "gpt-3.5-turbo",
|
70 |
+
"models": {
|
71 |
+
"GPT-4o mini (OpenAI)": "gpt-4o-mini",
|
72 |
+
"GPT-3.5 Turbo (OpenAI)": "gpt-3.5-turbo",
|
73 |
+
}
|
74 |
+
},
|
75 |
+
# Add other providers here if needed for fallback
|
76 |
+
}
|
77 |
+
except json.JSONDecodeError:
|
78 |
+
logger.error("Error decoding models.json. Using hardcoded fallback models.")
|
79 |
+
# Keep the hardcoded fallback as a safety measure
|
80 |
+
MODELS_BY_PROVIDER = {
|
81 |
+
"groq": {
|
82 |
+
"default": "llama3-8b-8192",
|
83 |
+
"models": {
|
84 |
+
"Llama 3 8B (Groq)": "llama3-8b-8192",
|
85 |
+
"Llama 3 70B (Groq)": "llama3-70b-8192",
|
86 |
+
"Mixtral 8x7B (Groq)": "mixtral-8x7b-32768",
|
87 |
+
"Gemma 7B (Groq)": "gemma-7b-it",
|
88 |
+
}
|
89 |
+
},
|
90 |
+
"openrouter": {
|
91 |
+
"default": "nousresearch/llama-3-8b-instruct",
|
92 |
+
"models": {
|
93 |
+
"Nous Llama-3 8B Instruct (OpenRouter)": "nousresearch/llama-3-8b-instruct",
|
94 |
+
"Mistral 7B Instruct v0.2 (OpenRouter)": "mistralai/mistral-7b-instruct:free",
|
95 |
+
"Gemma 7B Instruct (OpenRouter)": "google/gemma-7b-it:free",
|
96 |
+
}
|
97 |
+
},
|
98 |
+
"google": {
|
99 |
+
"default": "gemini-1.5-flash-latest",
|
100 |
+
"models": {
|
101 |
+
"Gemini 1.5 Flash (Latest)": "gemini-1.5-flash-latest",
|
102 |
+
"Gemini 1.5 Pro (Latest)": "gemini-1.5-pro-latest",
|
103 |
+
}
|
104 |
+
},
|
105 |
+
"openai": {
|
106 |
+
"default": "gpt-3.5-turbo",
|
107 |
+
"models": {
|
108 |
+
"GPT-4o mini (OpenAI)": "gpt-4o-mini",
|
109 |
+
"GPT-3.5 Turbo (OpenAI)": "gpt-3.5-turbo",
|
110 |
+
}
|
111 |
+
},
|
112 |
+
# Add other providers here if needed for fallback
|
113 |
}
|
114 |
+
|
115 |
|
116 |
def _get_api_key(provider: str, ui_api_key_override: str = None) -> str:
|
117 |
if ui_api_key_override:
|
118 |
+
logger.debug(f"Using UI API key override for {provider}")
|
119 |
return ui_api_key_override.strip()
|
120 |
|
121 |
env_var_name = API_KEYS.get(provider.upper())
|
122 |
if env_var_name:
|
123 |
env_key = os.getenv(env_var_name)
|
124 |
if env_key:
|
125 |
+
logger.debug(f"Using env var {env_var_name} for {provider}")
|
126 |
return env_key.strip()
|
127 |
|
128 |
+
# Special case for Hugging Face, HF_TOKEN is common
|
129 |
if provider.lower() == 'huggingface':
|
130 |
hf_token = os.getenv("HF_TOKEN")
|
131 |
+
if hf_token:
|
132 |
+
logger.debug(f"Using HF_TOKEN env var for {provider}")
|
133 |
+
return hf_token.strip()
|
134 |
|
135 |
logger.warning(f"API Key not found for provider '{provider}'. Checked UI override and environment variable '{env_var_name or 'N/A'}'.")
|
136 |
return None
|
|
|
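For reference, the loader above expects models.json to mirror the fallback structure: a provider key mapping to a "default" model ID and a "models" dict of display name to model ID. A minimal sketch that writes such a file (the two provider entries are illustrative, taken from the fallback, not an exhaustive list):

    # Sketch only: produce a minimal models.json in the shape json.load() expects above.
    import json

    minimal_models = {
        "groq": {
            "default": "llama3-8b-8192",
            "models": {"Llama 3 8B (Groq)": "llama3-8b-8192"},
        },
        "google": {
            "default": "gemini-1.5-flash-latest",
            "models": {"Gemini 1.5 Flash (Latest)": "gemini-1.5-flash-latest"},
        },
    }

    with open("models.json", "w") as f:
        json.dump(minimal_models, f, indent=2)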
@@ -132,9 +145,11 @@ def get_default_model_for_provider(provider: str) -> str | None:
     models_dict = MODELS_BY_PROVIDER.get(provider.lower(), {}).get("models", {})
     default_model_id = MODELS_BY_PROVIDER.get(provider.lower(), {}).get("default")
     if default_model_id:
+        # Find the display name corresponding to the default model ID
         for display_name, model_id in models_dict.items():
             if model_id == default_model_id:
                 return display_name
+    # Fallback: If no default specified or found, return the first model in the sorted list
     if models_dict:
         return sorted(list(models_dict.keys()))[0]
     return None
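A quick illustration of the default-resolution logic in this hunk, using sample entries from the fallback table (the literal dict below is only for demonstration):

    # Sketch: map a provider's default model ID back to its display name.
    models_dict = {
        "Llama 3 8B (Groq)": "llama3-8b-8192",
        "Llama 3 70B (Groq)": "llama3-70b-8192",
    }
    default_model_id = "llama3-8b-8192"

    display = next((name for name, mid in models_dict.items() if mid == default_model_id), None)
    print(display)  # Llama 3 8B (Groq)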
@@ -164,6 +179,7 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
     headers = {}
     payload = {}
     request_url = base_url
+    timeout_seconds = 180  # Increased timeout

     logger.info(f"Calling {provider}/{model_display_name} (ID: {model_id}) stream...")

@@ -173,17 +189,27 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
             payload = {
                 "model": model_id,
                 "messages": messages,
-                "stream": True
+                "stream": True,
+                "temperature": 0.7,  # Add temperature
+                "max_tokens": 4096  # Add max_tokens
             }
             if provider_lower == "openrouter":
+                headers["HTTP-Referer"] = os.getenv("SPACE_HOST") or "https://github.com/your_username/ai-space-commander"  # Use space name
+                headers["X-Title"] = "Hugging Face Space Commander"  # Use project title

+            response = requests.post(request_url, headers=headers, json=payload, stream=True, timeout=timeout_seconds)
             response.raise_for_status()

             byte_buffer = b""
             for chunk in response.iter_content(chunk_size=8192):
+                # Check for potential HTTP errors during streaming
+                if response.status_code != 200:
+                    # Attempt to read error body if available
+                    error_body = response.text
+                    logger.error(f"HTTP Error during stream: {response.status_code}, Body: {error_body}")
+                    yield f"API HTTP Error ({response.status_code}) during stream: {error_body}"
+                    return  # Stop streaming on error
+
                 byte_buffer += chunk
                 while b'\n' in byte_buffer:
                     line, byte_buffer = byte_buffer.split(b'\n', 1)
@@ -191,7 +217,7 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
                     if decoded_line.startswith('data: '):
                         data = decoded_line[6:]
                         if data == '[DONE]':
-                            byte_buffer = b''
+                            byte_buffer = b''  # Clear buffer after DONE
                             break
                         try:
                             event_data = json.loads(data)
@@ -200,11 +226,13 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
                             if delta and delta.get("content"):
                                 yield delta["content"]
                         except json.JSONDecodeError:
+                            # Log warning but continue, partial data might be okay or next line fixes it
+                            logger.warning(f"Failed to decode JSON from stream line: {decoded_line.strip()}")
                         except Exception as e:
-                            logger.error(f"Error processing stream data: {e}, Data: {decoded_line}")
+                            logger.error(f"Error processing stream data: {e}, Data: {decoded_line.strip()}")
+            # Process any remaining data in the buffer after the loop
             if byte_buffer:
-                remaining_line = byte_buffer.decode('utf-8', errors='ignore')
+                remaining_line = byte_buffer.decode('utf-8', errors='ignore').strip()
                 if remaining_line.startswith('data: '):
                     data = remaining_line[6:]
                     if data != '[DONE]':
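The loop in the two hunks above assumes the OpenAI-compatible SSE framing, where each payload line looks like "data: {json}" and the stream finishes with "data: [DONE]". A self-contained sketch of that parsing step (sample_lines is fabricated input, not captured output):

    # Sketch: parse OpenAI-style streaming lines the same way the loop above does.
    import json

    sample_lines = [
        'data: {"choices": [{"delta": {"content": "Hel"}}]}',
        'data: {"choices": [{"delta": {"content": "lo"}}]}',
        'data: [DONE]',
    ]

    for decoded_line in sample_lines:
        if decoded_line.startswith('data: '):
            data = decoded_line[6:]
            if data == '[DONE]':
                break
            delta = json.loads(data)["choices"][0].get("delta", {})
            if delta.get("content"):
                print(delta["content"], end="")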
|
253 |
filtered_messages = []
|
254 |
for msg in messages:
|
255 |
if msg["role"] == "system":
|
256 |
+
# Google's API takes system instruction separately or expects a specific history format
|
257 |
+
# Let's extract the system instruction
|
258 |
system_instruction = msg["content"]
|
259 |
else:
|
260 |
+
# Map roles: 'user' -> 'user', 'assistant' -> 'model'
|
261 |
role = "model" if msg["role"] == "assistant" else msg["role"]
|
262 |
filtered_messages.append({"role": role, "parts": [{"text": msg["content"]}]})
|
263 |
|
264 |
+
# Ensure conversation history alternates roles correctly for Google
|
265 |
+
# Simple check: if last two roles are same, it's invalid.
|
266 |
+
for i in range(1, len(filtered_messages)):
|
267 |
+
if filtered_messages[i]["role"] == filtered_messages[i-1]["role"]:
|
268 |
+
yield f"Error: Google API requires alternating user/model roles in chat history. Please check prompt or history format."
|
269 |
+
return # Stop if history format is invalid
|
270 |
+
|
271 |
payload = {
|
272 |
"contents": filtered_messages,
|
273 |
+
"safetySettings": [ # Default safety settings to allow helpful but potentially sensitive code/instructions
|
274 |
{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
|
275 |
{"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
|
276 |
{"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
|
|
|
@@ -240,69 +278,68 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
             ],
             "generationConfig": {
                 "temperature": 0.7,
+                "maxOutputTokens": 4096  # Google's max_tokens equivalent
             }
         }
+        # System instruction is passed separately
         if system_instruction:
             payload["system_instruction"] = {"parts": [{"text": system_instruction}]}

         request_url = f"{base_url}{model_id}:streamGenerateContent"
+        # API key is passed as a query parameter for Google
         request_url = f"{request_url}?key={api_key}"
+        headers = {"Content-Type": "application/json"}  # Content-Type is still application/json

+        response = requests.post(request_url, headers=headers, json=payload, stream=True, timeout=timeout_seconds)
         response.raise_for_status()

         byte_buffer = b""
         for chunk in response.iter_content(chunk_size=8192):
-            decoded_line = decoded_line[6:].strip()
-            if not decoded_line: continue
-            try:
-                event_data_list = json.loads(f"[{decoded_line}]")
-                if not isinstance(event_data_list, list): event_data_list = [event_data_list]
-                for event_data in event_data_list:
-                    if not isinstance(event_data, dict): continue
-                    if event_data.get("candidates") and len(event_data["candidates"]) > 0:
-                        candidate = event_data["candidates"][0]
-                        if candidate.get("content") and candidate["content"].get("parts"):
-                            full_text_chunk = "".join(part.get("text", "") for part in candidate["content"]["parts"])
-                            if full_text_chunk:
-                                yield full_text_chunk
-            except json.JSONDecodeError:
-                logger.warning(f"Failed to decode JSON from Google stream chunk: {decoded_line}. Accumulating buffer.")
-                pass
-            except Exception as e:
-                logger.error(f"Error processing Google stream data: {e}, Data: {decoded_line}")
+            # Check for potential HTTP errors during streaming
+            if response.status_code != 200:
+                error_body = response.text
+                logger.error(f"HTTP Error during Google stream: {response.status_code}, Body: {error_body}")
+                yield f"API HTTP Error ({response.status_code}) during Google stream: {error_body}"
+                return  # Stop streaming on error
+
+            byte_buffer += chunk
+            # Google's streaming can send multiple JSON objects in one chunk, sometimes split by newlines
+            # Or just single JSON objects. They don't strictly follow the Server-Sent Events 'data:' format.
+            # We need to find JSON objects in the buffer.
+            json_decoder = json.JSONDecoder()
+            while byte_buffer:
+                try:
+                    # Attempt to decode a JSON object from the start of the buffer
+                    obj, idx = json_decoder.raw_decode(byte_buffer.decode('utf-8', errors='ignore').lstrip())  # lstrip to handle leading whitespace/newlines
+                    # If successful, process the object
+                    byte_buffer = byte_buffer[len(byte_buffer.decode('utf-8', errors='ignore').lstrip()[:idx]).encode('utf-8'):]  # Remove the decoded part from the buffer
+
+                    if obj.get("candidates") and len(obj["candidates"]) > 0:
+                        candidate = obj["candidates"][0]
+                        if candidate.get("content") and candidate["content"].get("parts"):
+                            full_text_chunk = "".join(part.get("text", "") for part in candidate["content"]["parts"])
+                            if full_text_chunk:
+                                yield full_text_chunk
+                    # Check for potential errors in the response object itself
+                    if obj.get("error"):
+                        error_details = obj["error"].get("message", str(obj["error"]))
+                        logger.error(f"Google API returned error in stream data: {error_details}")
+                        yield f"API Error (Google): {error_details}"
+                        return  # Stop streaming
+
+                except json.JSONDecodeError:
+                    # If raw_decode fails, it means the buffer doesn't contain a complete JSON object at the start.
+                    # Break the inner while loop and wait for more data.
+                    break
+                except Exception as e:
+                    logger.error(f"Error processing Google stream data object: {e}, Object: {obj}")
+                    # Decide if this is a fatal error or just a bad chunk
+                    # For now, log and continue might be okay for processing subsequent chunks.
+
+        # If loop finishes and buffer still has data, log it (incomplete data)
         if byte_buffer:
-            if remaining_line:
-                try:
-                    event_data_list = json.loads(f"[{remaining_line}]")
-                    if not isinstance(event_data_list, list): event_data_list = [event_data_list]
-                    for event_data in event_data_list:
-                        if not isinstance(event_data, dict): continue
-                        if event_data.get("candidates") and len(event_data["candidates"]) > 0:
-                            candidate = event_data["candidates"][0]
-                            if candidate.get("content") and candidate["content"].get("parts"):
-                                full_text_chunk = "".join(part.get("text", "") for part in candidate["content"]["parts"])
-                                if full_text_chunk:
-                                    yield full_text_chunk
-                except json.JSONDecodeError:
-                    logger.warning(f"Failed to decode final Google stream buffer JSON: {remaining_line}")
-                except Exception as e:
-                    logger.error(f"Error processing final Google stream buffer data: {e}, Data: {remaining_line}")
+            logger.warning(f"Remaining data in Google stream buffer after processing: {byte_buffer.decode('utf-8', errors='ignore')}")


     elif provider_lower == "cohere":
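The new Google branch leans on json.JSONDecoder().raw_decode, which returns the parsed object together with the index where decoding stopped; that pair is what makes incremental parsing of concatenated JSON possible. A self-contained sketch on a plain string buffer (the buffer contents are invented):

    # Sketch: pull successive JSON objects out of one string with raw_decode.
    import json

    buffer = '{"a": 1}\n{"b": 2}  {"c": 3}'
    decoder = json.JSONDecoder()

    while buffer.strip():
        stripped = buffer.lstrip()   # raw_decode does not skip leading whitespace itself
        obj, end = decoder.raw_decode(stripped)
        print(obj)                   # {'a': 1}, then {'b': 2}, then {'c': 3}
        buffer = stripped[end:]      # keep whatever follows the decoded object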
@@ -313,76 +350,93 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
         system_prompt_for_cohere = None
         current_message_for_cohere = ""

+        # Cohere requires a specific history format and separates system/preamble
+        # The last message is the "message", previous are "chat_history"
         temp_history = []
         for msg in messages:
             if msg["role"] == "system":
+                # If multiple system prompts, concatenate them for preamble
+                if system_prompt_for_cohere: system_prompt_for_cohere += "\n" + msg["content"]
+                else: system_prompt_for_cohere = msg["content"]
             elif msg["role"] == "user" or msg["role"] == "assistant":
                 temp_history.append(msg)

-        yield "Error: User message not found for Cohere API call."
+        if not temp_history:
+            yield "Error: No user message found for Cohere API call."
+            return
+        if temp_history[-1]["role"] != "user":
+            yield "Error: Last message must be from user for Cohere API call."
             return

+        current_message_for_cohere = temp_history[-1]["content"]
+        # Map roles: 'user' -> 'user', 'assistant' -> 'chatbot'
+        chat_history_for_cohere = [{"role": ("chatbot" if m["role"] == "assistant" else m["role"]), "message": m["content"]} for m in temp_history[:-1]]
+
         payload = {
             "model": model_id,
             "message": current_message_for_cohere,
             "stream": True,
-            "temperature": 0.7
+            "temperature": 0.7,
+            "max_tokens": 4096  # Add max_tokens
         }
         if chat_history_for_cohere:
             payload["chat_history"] = chat_history_for_cohere
         if system_prompt_for_cohere:
             payload["preamble"] = system_prompt_for_cohere

+        response = requests.post(request_url, headers=headers, json=payload, stream=True, timeout=timeout_seconds)
         response.raise_for_status()

         byte_buffer = b""
         for chunk in response.iter_content(chunk_size=8192):
+            # Check for potential HTTP errors during streaming
+            if response.status_code != 200:
+                error_body = response.text
+                logger.error(f"HTTP Error during Cohere stream: {response.status_code}, Body: {error_body}")
+                yield f"API HTTP Error ({response.status_code}) during Cohere stream: {error_body}"
+                return  # Stop streaming on error
+
             byte_buffer += chunk
-            while b'\n\n' in byte_buffer:
+            while b'\n\n' in byte_buffer:  # Cohere uses \n\n as event separator
                 event_chunk, byte_buffer = byte_buffer.split(b'\n\n', 1)
                 lines = event_chunk.strip().split(b'\n')
                 event_type = None
                 event_data = None

                 for l in lines:
+                    if l.strip() == b"": continue  # Skip blank lines within an event
                     if l.startswith(b"event: "): event_type = l[7:].strip().decode('utf-8', errors='ignore')
                     elif l.startswith(b"data: "):
                         try: event_data = json.loads(l[6:].strip().decode('utf-8', errors='ignore'))
                         except json.JSONDecodeError: logger.warning(f"Cohere: Failed to decode event data JSON: {l[6:].strip()}")
+                    else:
+                        # Log unexpected lines in event chunk
+                        logger.warning(f"Cohere: Unexpected line in event chunk: {l.decode('utf-8', errors='ignore').strip()}")

                 if event_type == "text-generation" and event_data and "text" in event_data:
                     yield event_data["text"]
                 elif event_type == "stream-end":
+                    logger.debug("Cohere stream-end event received.")
+                    byte_buffer = b''  # Clear buffer after stream-end
+                    break  # Exit the while loop
+                elif event_type == "error":
+                    error_msg = event_data.get("message", str(event_data)) if event_data else "Unknown Cohere stream error"
+                    logger.error(f"Cohere stream error event: {error_msg}")
+                    yield f"API Error (Cohere stream): {error_msg}"
+                    return  # Stop streaming on error
+
+        # Process any remaining data in the buffer after the loop
         if byte_buffer:
-            if event_chunk:
-                lines = event_chunk.split(b'\n')
-                event_type = None
-                event_data = None
-                for l in lines:
-                    if l.startswith(b"event: "): event_type = l[7:].strip().decode('utf-8', errors='ignore')
-                    elif l.startswith(b"data: "):
-                        try: event_data = json.loads(l[6:].strip().decode('utf-8', errors='ignore'))
-                        except json.JSONDecodeError: logger.warning(f"Cohere: Failed to decode final event data JSON: {l[6:].strip()}")
-                if event_type == "text-generation" and event_data and "text" in event_data:
-                    yield event_data["text"]
-                elif event_type == "stream-end":
-                    pass
+            logger.warning(f"Remaining data in Cohere stream buffer after processing: {byte_buffer.decode('utf-8', errors='ignore')}")


     elif provider_lower == "huggingface":
+        # Hugging Face Inference API often supports streaming for text-generation,
+        # but chat completion streaming format varies greatly model by model, if supported.
+        # Standard OpenAI-like streaming is not guaranteed.
+        # Let's provide a more informative message.
+        yield f"Error: Direct Hugging Face Inference API streaming for chat models is highly experimental and depends heavily on the specific model's implementation. Standard OpenAI-like streaming is NOT guaranteed. For better compatibility with HF models that support the OpenAI format, consider using the OpenRouter or TogetherAI providers and selecting the HF models listed there."
         return

     else:
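For clarity, the history mapping in the Cohere hunk above reshapes an OpenAI-style message list into the preamble, chat_history, and message fields it sends. A small worked example of that reshaping (the message contents are invented):

    # Sketch: reshape OpenAI-style messages into Cohere chat fields as the branch above does.
    messages = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello."},
        {"role": "user", "content": "Summarize our chat."},
    ]

    preamble = "\n".join(m["content"] for m in messages if m["role"] == "system")
    turns = [m for m in messages if m["role"] in ("user", "assistant")]
    message = turns[-1]["content"]  # the final user turn becomes "message"
    chat_history = [
        {"role": "chatbot" if m["role"] == "assistant" else m["role"], "message": m["content"]}
        for m in turns[:-1]
    ]

    print(preamble)      # You are terse.
    print(message)       # Summarize our chat.
    print(chat_history)  # [{'role': 'user', 'message': 'Hi'}, {'role': 'chatbot', 'message': 'Hello.'}]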
@@ -394,9 +448,12 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: str
         error_text = e.response.text if e.response is not None else 'No response text'
         logger.error(f"HTTP error during streaming for {provider}/{model_id}: {e}")
         yield f"API HTTP Error ({status_code}): {error_text}\nDetails: {e}"
+    except requests.exceptions.Timeout:
+        logger.error(f"Request Timeout after {timeout_seconds} seconds for {provider}/{model_id}.")
+        yield f"API Request Timeout: The request took too long to complete ({timeout_seconds} seconds)."
     except requests.exceptions.RequestException as e:
         logger.error(f"Request error during streaming for {provider}/{model_id}: {e}")
         yield f"API Request Error: Could not connect or receive response from {provider} ({e})"
     except Exception as e:
         logger.exception(f"Unexpected error during streaming for {provider}/{model_id}:")
-        yield f"An unexpected error occurred: {e}"
+        yield f"An unexpected error occurred during streaming: {e}"