broadfield-dev committed
Commit e8b8149 · verified · 1 Parent(s): 3103a1e

Update model_logic.py

Files changed (1)
  1. model_logic.py +36 -115
model_logic.py CHANGED
@@ -2,7 +2,7 @@ import os
  import requests
  import json
  import logging
- import time # Import time for retries
+ import time

  logging.basicConfig(
  level=logging.INFO,
@@ -11,7 +11,7 @@ logging.basicConfig(
  logger = logging.getLogger(__name__)

  API_KEYS = {
- "HUGGINGFACE": 'HF_TOKEN', # Note: HF_TOKEN is also for HF Hub, so maybe rename this in UI label?
+ "HUGGINGFACE": 'HF_TOKEN',
  "GROQ": 'GROQ_API_KEY',
  "OPENROUTER": 'OPENROUTER_API_KEY',
  "TOGETHERAI": 'TOGETHERAI_API_KEY',
@@ -29,17 +29,15 @@ API_URLS = {
  "COHERE": 'https://api.cohere.ai/v1/chat',
  "XAI": 'https://api.x.ai/v1/chat/completions',
  "OPENAI": 'https://api.openai.com/v1/chat/completions',
- "GOOGLE": 'https://generativelanguage.googleapis.com/v1beta/models/', # Base URL, model ID added later
+ "GOOGLE": 'https://generativelanguage.googleapis.com/v1beta/models/',
  }

- # Load model configuration from JSON
  try:
  with open("models.json", "r") as f:
  MODELS_BY_PROVIDER = json.load(f)
  logger.info("models.json loaded successfully.")
- except FileNotFoundError:
- logger.error("models.json not found. Using hardcoded fallback models.")
- # Keep the hardcoded fallback as a safety measure
+ except (FileNotFoundError, json.JSONDecodeError) as e:
+ logger.error(f"Error loading models.json: {e}. Using hardcoded fallback models.")
  MODELS_BY_PROVIDER = {
  "groq": {
  "default": "llama3-8b-8192",
@@ -70,47 +68,9 @@ except FileNotFoundError:
  "models": {
  "GPT-4o mini (OpenAI)": "gpt-4o-mini",
  "GPT-3.5 Turbo (OpenAI)": "gpt-3.5-turbo",
- }
- },
- # Add other providers here if needed for fallback
- }
- except json.JSONDecodeError:
- logger.error("Error decoding models.json. Using hardcoded fallback models.")
- # Keep the hardcoded fallback as a safety measure
- MODELS_BY_PROVIDER = {
- "groq": {
- "default": "llama3-8b-8192",
- "models": {
- "Llama 3 8B (Groq)": "llama3-8b-8192",
- "Llama 3 70B (Groq)": "llama3-70b-8192",
- "Mixtral 8x7B (Groq)": "mixtral-8x7b-32768",
- "Gemma 7B (Groq)": "gemma-7b-it",
- }
- },
- "openrouter": {
- "default": "nousresearch/llama-3-8b-instruct",
- "models": {
- "Nous Llama-3 8B Instruct (OpenRouter)": "nousresearch/llama-3-8b-instruct",
- "Mistral 7B Instruct v0.2 (OpenRouter)": "mistralai/mistral-7b-instruct:free",
- "Gemma 7B Instruct (OpenRouter)": "google/gemma-7b-it:free",
- }
- },
- "google": {
- "default": "gemini-1.5-flash-latest",
- "models": {
- "Gemini 1.5 Flash (Latest)": "gemini-1.5-flash-latest",
- "Gemini 1.5 Pro (Latest)": "gemini-1.5-pro-latest",
- }
- },
- "openai": {
- "default": "gpt-3.5-turbo",
- "models": {
- "GPT-4o mini (OpenAI)": "gpt-4o-mini",
- "GPT-3.5 Turbo (OpenAI)": "gpt-3.5-turbo",
  }
  },
- # Add other providers here if needed for fallback
- }
+ }


  def _get_api_key(provider: str, ui_api_key_override: str = None) -> str:
@@ -125,7 +85,6 @@ def _get_api_key(provider: str, ui_api_key_override: str = None) -> str:
  logger.debug(f"Using env var {env_var_name} for {provider}")
  return env_key.strip()

- # Special case for Hugging Face, HF_TOKEN is common
  if provider.lower() == 'huggingface':
  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
@@ -145,11 +104,9 @@ def get_default_model_for_provider(provider: str) -> str | None:
  models_dict = MODELS_BY_PROVIDER.get(provider.lower(), {}).get("models", {})
  default_model_id = MODELS_BY_PROVIDER.get(provider.lower(), {}).get("default")
  if default_model_id:
- # Find the display name corresponding to the default model ID
  for display_name, model_id in models_dict.items():
  if model_id == default_model_id:
  return display_name
- # Fallback: If no default specified or found, return the first model in the sorted list
  if models_dict:
  return sorted(list(models_dict.keys()))[0]
  return None
@@ -179,7 +136,7 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  headers = {}
  payload = {}
  request_url = base_url
- timeout_seconds = 180 # Increased timeout
+ timeout_seconds = 180

  logger.info(f"Calling {provider}/{model_display_name} (ID: {model_id}) stream...")

@@ -190,25 +147,23 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  "model": model_id,
  "messages": messages,
  "stream": True,
- "temperature": 0.7, # Add temperature
- "max_tokens": 4096 # Add max_tokens
+ "temperature": 0.7,
+ "max_tokens": 4096
  }
  if provider_lower == "openrouter":
- headers["HTTP-Referer"] = os.getenv("SPACE_HOST") or "https://github.com/your_username/ai-space-commander" # Use space name
- headers["X-Title"] = "Hugging Face Space Commander" # Use project title
+ headers["HTTP-Referer"] = os.getenv("SPACE_HOST") or "https://github.com/huggingface/ai-space-commander"
+ headers["X-Title"] = "Hugging Face Space Commander"

  response = requests.post(request_url, headers=headers, json=payload, stream=True, timeout=timeout_seconds)
  response.raise_for_status()

  byte_buffer = b""
  for chunk in response.iter_content(chunk_size=8192):
- # Check for potential HTTP errors during streaming
  if response.status_code != 200:
- # Attempt to read error body if available
  error_body = response.text
  logger.error(f"HTTP Error during stream: {response.status_code}, Body: {error_body}")
  yield f"API HTTP Error ({response.status_code}) during stream: {error_body}"
- return # Stop streaming on error
+ return

  byte_buffer += chunk
  while b'\n' in byte_buffer:
@@ -217,7 +172,7 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  if decoded_line.startswith('data: '):
  data = decoded_line[6:]
  if data == '[DONE]':
- byte_buffer = b'' # Clear buffer after DONE
+ byte_buffer = b''
  break
  try:
  event_data = json.loads(data)
@@ -226,11 +181,9 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  if delta and delta.get("content"):
  yield delta["content"]
  except json.JSONDecodeError:
- # Log warning but continue, partial data might be okay or next line fixes it
  logger.warning(f"Failed to decode JSON from stream line: {decoded_line.strip()}")
  except Exception as e:
  logger.error(f"Error processing stream data: {e}, Data: {decoded_line.strip()}")
- # Process any remaining data in the buffer after the loop
  if byte_buffer:
  remaining_line = byte_buffer.decode('utf-8', errors='ignore').strip()
  if remaining_line.startswith('data: '):
@@ -253,24 +206,20 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  filtered_messages = []
  for msg in messages:
  if msg["role"] == "system":
- # Google's API takes system instruction separately or expects a specific history format
- # Let's extract the system instruction
- system_instruction = msg["content"]
+ if system_instruction: system_instruction += "\n" + msg["content"]
+ else: system_instruction = msg["content"]
  else:
- # Map roles: 'user' -> 'user', 'assistant' -> 'model'
  role = "model" if msg["role"] == "assistant" else msg["role"]
  filtered_messages.append({"role": role, "parts": [{"text": msg["content"]}]})

- # Ensure conversation history alternates roles correctly for Google
- # Simple check: if last two roles are same, it's invalid.
  for i in range(1, len(filtered_messages)):
  if filtered_messages[i]["role"] == filtered_messages[i-1]["role"]:
  yield f"Error: Google API requires alternating user/model roles in chat history. Please check prompt or history format."
- return # Stop if history format is invalid
+ return

  payload = {
  "contents": filtered_messages,
- "safetySettings": [ # Default safety settings to allow helpful but potentially sensitive code/instructions
+ "safetySettings": [
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
  {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
  {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
@@ -278,66 +227,51 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  ],
  "generationConfig": {
  "temperature": 0.7,
- "maxOutputTokens": 4096 # Google's max_tokens equivalent
+ "maxOutputTokens": 4096
  }
  }
- # System instruction is passed separately
  if system_instruction:
  payload["system_instruction"] = {"parts": [{"text": system_instruction}]}

-
  request_url = f"{base_url}{model_id}:streamGenerateContent"
- # API key is passed as a query parameter for Google
  request_url = f"{request_url}?key={api_key}"
- headers = {"Content-Type": "application/json"} # Content-Type is still application/json
+ headers = {"Content-Type": "application/json"}

  response = requests.post(request_url, headers=headers, json=payload, stream=True, timeout=timeout_seconds)
  response.raise_for_status()

  byte_buffer = b""
+ json_decoder = json.JSONDecoder()
  for chunk in response.iter_content(chunk_size=8192):
- # Check for potential HTTP errors during streaming
  if response.status_code != 200:
  error_body = response.text
  logger.error(f"HTTP Error during Google stream: {response.status_code}, Body: {error_body}")
  yield f"API HTTP Error ({response.status_code}) during Google stream: {error_body}"
- return # Stop streaming on error
+ return

  byte_buffer += chunk
- # Google's streaming can send multiple JSON objects in one chunk, sometimes split by newlines
- # Or just single JSON objects. They don't strictly follow the Server-Sent Events 'data:' format.
- # We need to find JSON objects in the buffer.
- json_decoder = json.JSONDecoder()
- while byte_buffer:
+ decoded_buffer = byte_buffer.decode('utf-8', errors='ignore')
+ buffer_index = 0
+ while buffer_index < len(decoded_buffer):
  try:
- # Attempt to decode a JSON object from the start of the buffer
- obj, idx = json_decoder.raw_decode(byte_buffer.decode('utf-8', errors='ignore').lstrip()) # lstrip to handle leading whitespace/newlines
- # If successful, process the object
- byte_buffer = byte_buffer[len(byte_buffer.decode('utf-8', errors='ignore').lstrip()[:idx]).encode('utf-8'):] # Remove the decoded part from the buffer
-
+ obj, idx = json_decoder.raw_decode(decoded_buffer[buffer_index:].lstrip())
+ buffer_index += len(decoded_buffer[buffer_index:].lstrip()[:idx])
  if obj.get("candidates") and len(obj["candidates"]) > 0:
  candidate = obj["candidates"][0]
  if candidate.get("content") and candidate["content"].get("parts"):
  full_text_chunk = "".join(part.get("text", "") for part in candidate["content"]["parts"])
  if full_text_chunk:
  yield full_text_chunk
- # Check for potential errors in the response object itself
  if obj.get("error"):
  error_details = obj["error"].get("message", str(obj["error"]))
  logger.error(f"Google API returned error in stream data: {error_details}")
  yield f"API Error (Google): {error_details}"
- return # Stop streaming
-
+ return
  except json.JSONDecodeError:
- # If raw_decode fails, it means the buffer doesn't contain a complete JSON object at the start.
- # Break the inner while loop and wait for more data.
  break
  except Exception as e:
  logger.error(f"Error processing Google stream data object: {e}, Object: {obj}")
- # Decide if this is a fatal error or just a bad chunk
- # For now, log and continue might be okay for processing subsequent chunks.

- # If loop finishes and buffer still has data, log it (incomplete data)
  if byte_buffer:
  logger.warning(f"Remaining data in Google stream buffer after processing: {byte_buffer.decode('utf-8', errors='ignore')}")

@@ -350,12 +284,9 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  system_prompt_for_cohere = None
  current_message_for_cohere = ""

- # Cohere requires a specific history format and separates system/preamble
- # The last message is the "message", previous are "chat_history"
  temp_history = []
  for msg in messages:
  if msg["role"] == "system":
- # If multiple system prompts, concatenate them for preamble
  if system_prompt_for_cohere: system_prompt_for_cohere += "\n" + msg["content"]
  else: system_prompt_for_cohere = msg["content"]
  elif msg["role"] == "user" or msg["role"] == "assistant":
@@ -369,7 +300,6 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  return

  current_message_for_cohere = temp_history[-1]["content"]
- # Map roles: 'user' -> 'user', 'assistant' -> 'chatbot'
  chat_history_for_cohere = [{"role": ("chatbot" if m["role"] == "assistant" else m["role"]), "message": m["content"]} for m in temp_history[:-1]]

  payload = {
@@ -377,7 +307,7 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  "message": current_message_for_cohere,
  "stream": True,
  "temperature": 0.7,
- "max_tokens": 4096 # Add max_tokens
+ "max_tokens": 4096
  }
  if chat_history_for_cohere:
  payload["chat_history"] = chat_history_for_cohere
@@ -389,54 +319,45 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st

  byte_buffer = b""
  for chunk in response.iter_content(chunk_size=8192):
- # Check for potential HTTP errors during streaming
  if response.status_code != 200:
  error_body = response.text
  logger.error(f"HTTP Error during Cohere stream: {response.status_code}, Body: {error_body}")
  yield f"API HTTP Error ({response.status_code}) during Cohere stream: {error_body}"
- return # Stop streaming on error
+ return

  byte_buffer += chunk
- while b'\n\n' in byte_buffer: # Cohere uses \n\n as event separator
+ while b'\n\n' in byte_buffer:
  event_chunk, byte_buffer = byte_buffer.split(b'\n\n', 1)
  lines = event_chunk.strip().split(b'\n')
  event_type = None
  event_data = None

  for l in lines:
- if l.strip() == b"": continue # Skip blank lines within an event
+ if l.strip() == b"": continue
  if l.startswith(b"event: "): event_type = l[7:].strip().decode('utf-8', errors='ignore')
  elif l.startswith(b"data: "):
  try: event_data = json.loads(l[6:].strip().decode('utf-8', errors='ignore'))
  except json.JSONDecodeError: logger.warning(f"Cohere: Failed to decode event data JSON: {l[6:].strip()}")
  else:
- # Log unexpected lines in event chunk
  logger.warning(f"Cohere: Unexpected line in event chunk: {l.decode('utf-8', errors='ignore').strip()}")

-
  if event_type == "text-generation" and event_data and "text" in event_data:
  yield event_data["text"]
  elif event_type == "stream-end":
  logger.debug("Cohere stream-end event received.")
- byte_buffer = b'' # Clear buffer after stream-end
- break # Exit the while loop
+ byte_buffer = b''
+ break
  elif event_type == "error":
  error_msg = event_data.get("message", str(event_data)) if event_data else "Unknown Cohere stream error"
  logger.error(f"Cohere stream error event: {error_msg}")
  yield f"API Error (Cohere stream): {error_msg}"
- return # Stop streaming on error
+ return

- # Process any remaining data in the buffer after the loop
  if byte_buffer:
  logger.warning(f"Remaining data in Cohere stream buffer after processing: {byte_buffer.decode('utf-8', errors='ignore')}")

-
  elif provider_lower == "huggingface":
- # Hugging Face Inference API often supports streaming for text-generation,
- # but chat completion streaming format varies greatly model by model, if supported.
- # Standard OpenAI-like streaming is not guaranteed.
- # Let's provide a more informative message.
- yield f"Error: Direct Hugging Face Inference API streaming for chat models is highly experimental and depends heavily on the specific model's implementation. Standard OpenAI-like streaming is NOT guaranteed. For better compatibility with HF models that support the OpenAI format, consider using the OpenRouter or TogetherAI providers and selecting the HF models listed there."
+ yield f"Error: Direct Hugging Face Inference API streaming for chat models is highly experimental and depends heavily on the specific model's implementation. For better compatibility with HF models that support the OpenAI format, consider using the OpenRouter or TogetherAI providers and selecting the HF models listed there."
  return

  else:
@@ -456,4 +377,4 @@ def generate_stream(provider: str, model_display_name: str, api_key_override: st
  yield f"API Request Error: Could not connect or receive response from {provider} ({e})"
  except Exception as e:
  logger.exception(f"Unexpected error during streaming for {provider}/{model_id}:")
- yield f"An unexpected error occurred during streaming: {e}"
+ yield f"An unexpected error occurred during streaming: {e}"