Sachi Wagaarachchi committed
Commit 0b7ba67 · 1 Parent(s): 191c0de

debug: updated the pipeline

Files changed (4)
  1. src/app.py +64 -23
  2. src/chat_logic.py +98 -19
  3. src/models.py +45 -13
  4. src/utils.py +115 -17
src/app.py CHANGED
@@ -5,15 +5,15 @@ from src.vector_db import VectorDBHandler
 import logging
 import spaces
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 # Initialize components
 model_manager = ModelManager()
 vector_db = VectorDBHandler()
 chat_processor = ChatProcessor(model_manager, vector_db)
 
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
 @spaces.GPU
 def respond(
     message,
@@ -24,28 +24,68 @@ def respond(
     temperature: float = 0.7,
     top_p: float = 0.9,
     top_k: int = 50,
-    repetition_penalty: float = 1.2
+    repetition_penalty: float = 1.2,
+    use_direct_pipeline: bool = False
 ):
-    """Process chat using the ChatProcessor with streaming support"""
+    """
+    Process chat using the ChatProcessor with streaming support.
+
+    Args:
+        message: The user message
+        history: Chat history as list of (user, assistant) message pairs
+        model_name: Name of the model to use
+        system_message: System prompt to guide the model's behavior
+        max_new_tokens: Maximum number of tokens to generate
+        temperature: Sampling temperature
+        top_p: Nucleus sampling parameter
+        top_k: Top-k sampling parameter
+        repetition_penalty: Penalty for token repetition
+        use_direct_pipeline: Whether to use the direct pipeline method
+
+    Yields:
+        Generated response tokens for streaming UI
+    """
     try:
-        # Process chat through ChatProcessor
-        response_generator = chat_processor.process_chat(
-            message=message,
-            history=history,
-            model_name=model_name,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty
-        )
-
-        # Stream response tokens
-        response = ""
-        for token in response_generator:
-            response += token
+        if use_direct_pipeline:
+            # Use the direct pipeline method (non-streaming)
+            generation_config = {
+                "max_new_tokens": max_new_tokens,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "repetition_penalty": repetition_penalty,
+                "do_sample": True
+            }
+
+            response = chat_processor.generate_with_pipeline(
+                message=message,
+                history=history,
+                model_name=model_name,
+                generation_config=generation_config,
+                system_prompt=system_message
+            )
+
             yield response
+        else:
+            # Use the streaming method
+            response_generator = chat_processor.process_chat(
+                message=message,
+                history=history,
+                model_name=model_name,
+                temperature=temperature,
+                max_new_tokens=max_new_tokens,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                system_prompt=system_message
+            )
 
+            # Stream response tokens
+            response = ""
+            for token in response_generator:
+                response += token
+                yield response
+
     except Exception as e:
         logger.error(f"Chat response error: {str(e)}")
         yield f"Error: {str(e)}"
@@ -65,7 +105,8 @@ demo = gr.ChatInterface(
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"),
         gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-k"),
-        gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition penalty")
+        gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition penalty"),
+        gr.Checkbox(value=False, label="Use direct pipeline (non-streaming)")
     ],
 )
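
As a quick illustration (a hypothetical driver, not part of the commit), the two branches of respond can be exercised outside Gradio; the keyword names follow the docstring above, and the (user, assistant) tuple history format is an assumption carried over from the ChatInterface wiring:

# Hypothetical smoke test; assumes the components in src/app.py are importable
# and a context where the @spaces.GPU-decorated function can be called directly.
history = [("Hi", "Hello! How can I help?")]

# Streaming branch: respond() repeatedly yields the accumulated response text.
last = ""
for partial in respond("Summarise Qwen3 in one line.", history,
                       model_name="Qwen3-8B", system_message="You are helpful.",
                       max_new_tokens=64, use_direct_pipeline=False):
    last = partial
print(last)

# Direct-pipeline branch: a single yield containing the full response.
print(next(respond("Summarise Qwen3 in one line.", history,
                   model_name="Qwen3-8B", system_message="You are helpful.",
                   max_new_tokens=64, use_direct_pipeline=True)))
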
src/chat_logic.py CHANGED
@@ -1,6 +1,11 @@
 from transformers import TextIteratorStreamer
 import threading
-from src.utils import format_prompt
+from src.utils import (
+    preprocess_chat_input,
+    format_prompt,
+    prepare_generation_inputs,
+    postprocess_response
+)
 import logging
 
 class ChatProcessor:
@@ -9,35 +14,56 @@ class ChatProcessor:
         self.model_manager = model_manager
         self.vector_db = vector_db
         self.logger = logging.getLogger(__name__)
 
     def process_chat(self, message, history, model_name, temperature=0.7,
                      max_new_tokens=512, top_p=0.9, top_k=50, repetition_penalty=1.2,
                      system_prompt=""):
-        """Process chat input and generate streaming response"""
+        """
+        Process chat input and generate streaming response.
+
+        This method handles the complete chat processing pipeline:
+        1. Pre-processing: Format the input with history and system prompt
+        2. Model inference: Generate a response using the specified model
+        3. Post-processing: Stream the response tokens
+
+        Args:
+            message (str): The current user message
+            history (list): List of tuples containing (user_message, assistant_message) pairs
+            model_name (str): Name of the model to use
+            temperature (float): Sampling temperature
+            max_new_tokens (int): Maximum number of tokens to generate
+            top_p (float): Nucleus sampling parameter
+            top_k (int): Top-k sampling parameter
+            repetition_penalty (float): Penalty for token repetition
+            system_prompt (str): Optional system prompt to guide the model's behavior
+
+        Yields:
+            str: Response tokens as they are generated
+        """
         try:
+            # 1. PRE-PROCESSING
             # Get model pipeline
             pipe = self.model_manager.get_pipeline(model_name)
 
             # Format prompt with history and tokenizer
             prompt = format_prompt(message, history, pipe.tokenizer, system_prompt)
 
-            # Set up streamer
+            # Set up streamer for token-by-token generation
             streamer = TextIteratorStreamer(
                 pipe.tokenizer,
                 skip_prompt=True,
                 skip_special_tokens=True
             )
-
-            # Get full tokenizer output
-            tokenized_inputs = pipe.tokenizer(prompt, return_tensors="pt")
-
-            # Determine model device
-            device = pipe.model.device
 
-            # Move all tensors to the correct device
-            inputs_on_device = {k: v.to(device) for k, v in tokenized_inputs.items()}
+            # Prepare tokenized inputs
+            inputs_on_device = prepare_generation_inputs(
+                prompt,
+                pipe.tokenizer,
+                pipe.model.device
+            )
 
-            # Prepare generation kwargs with attention_mask
+            # 2. MODEL INFERENCE
+            # Prepare generation parameters
             generate_kwargs = {
                 "input_ids": inputs_on_device["input_ids"],
                 "attention_mask": inputs_on_device["attention_mask"],
@@ -49,19 +75,72 @@ class ChatProcessor:
                 "streamer": streamer
             }
 
-            # Start generation thread
+            # Start generation in a separate thread
             thread = threading.Thread(target=pipe.model.generate, kwargs=generate_kwargs)
             thread.start()
 
-            # Stream response
+            # 3. POST-PROCESSING
+            # Stream response tokens
             response = ""
             for token in streamer:
+                # Accumulate tokens for the complete response
                 response += token
+                # Yield each token for streaming UI
                 yield token
 
-            # Update history (handled by Gradio UI)
-            return response
+            # Return the complete response
+            return postprocess_response(response)
 
         except Exception as e:
             self.logger.error(f"Chat processing error: {str(e)}")
             yield f"Error: {str(e)}"
+
+    def generate_with_pipeline(self, message, history, model_name, generation_config=None, system_prompt=""):
+        """
+        Alternative method that uses the Hugging Face pipeline directly.
+
+        This method demonstrates a more direct use of the pipeline API.
+
+        Args:
+            message (str): The current user message
+            history (list): List of tuples containing (user_message, assistant_message) pairs
+            model_name (str): Name of the model to use
+            generation_config (dict): Configuration for text generation
+            system_prompt (str): Optional system prompt to guide the model's behavior
+
+        Returns:
+            str: The generated response
+        """
+        try:
+            # Get model pipeline
+            pipe = self.model_manager.get_pipeline(model_name)
+
+            # Pre-process: Format messages for the pipeline
+            messages = preprocess_chat_input(message, history, system_prompt)
+
+            # Set default generation config if not provided
+            if generation_config is None:
+                generation_config = {
+                    "max_new_tokens": 512,
+                    "temperature": 0.7,
+                    "top_p": 0.9,
+                    "top_k": 50,
+                    "repetition_penalty": 1.2,
+                    "do_sample": True
+                }
+
+            # Direct pipeline inference
+            response = pipe(
+                messages,
+                **generation_config
+            )
+
+            # Post-process the response
+            if isinstance(response, list):
+                return postprocess_response(response[0]["generated_text"])
+            else:
+                return postprocess_response(response["generated_text"])
+
+        except Exception as e:
+            self.logger.error(f"Pipeline generation error: {str(e)}")
+            return f"Error: {str(e)}"
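
One detail worth noting in process_chat: the final `return postprocess_response(response)` inside a generator does not yield an extra item; it only becomes the StopIteration value, and Gradio simply consumes the yielded tokens. A minimal sketch (hypothetical helper, not part of the commit) of how a non-Gradio caller could recover both the streamed text and that return value:

def collect_stream(token_generator):
    """Consume a token generator and also capture its return value."""
    tokens = []
    while True:
        try:
            tokens.append(next(token_generator))
        except StopIteration as stop:
            # stop.value holds whatever the generator returned, e.g.
            # postprocess_response(response) in process_chat, or None if it
            # exited through the error path.
            return "".join(tokens), stop.value
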
src/models.py CHANGED
@@ -1,5 +1,41 @@
 from transformers import pipeline
 import logging
+from functools import lru_cache
+
+# Global cache for pipelines to ensure they're initialized only once
+_PIPELINE_CACHE = {}
+
+@lru_cache(maxsize=5)
+def get_pipeline(model_id, task="text-generation"):
+    """
+    Get or create a model pipeline with caching.
+
+    This function is cached using lru_cache to ensure efficient reuse.
+
+    Args:
+        model_id (str): The Hugging Face model ID
+        task (str): The pipeline task (default: "text-generation")
+
+    Returns:
+        The pipeline object
+    """
+    cache_key = f"{model_id}_{task}"
+
+    if cache_key in _PIPELINE_CACHE:
+        return _PIPELINE_CACHE[cache_key]
+
+    logger = logging.getLogger(__name__)
+    logger.info(f"Loading model: {model_id} for task: {task}")
+
+    pipe = pipeline(
+        task,
+        model=model_id,
+        device_map="auto"
+    )
+
+    _PIPELINE_CACHE[cache_key] = pipe
+    return pipe
+
 
 class ModelManager:
     """Manages loading and caching of Qwen models"""
@@ -8,23 +44,19 @@ class ModelManager:
             "Qwen3-14B": "Qwen/Qwen3-14B",
             "Qwen3-8B": "Qwen/Qwen3-8B"
         }
-        self._pipelines = {}
         self.logger = logging.getLogger(__name__)
 
     def get_pipeline(self, model_name):
         """Get or create a model pipeline"""
-        if model_name in self._pipelines:
-            return self._pipelines[model_name]
-
         try:
             model_id = self.models[model_name]
-            self.logger.info(f"Loading model: {model_id}")
-            pipe = pipeline(
-                "text-generation",
-                model=model_id,
-                device_map="auto"
-            )
-            self._pipelines[model_name] = pipe
-            return pipe
+            return get_pipeline(model_id)
+        except KeyError:
+            raise ValueError(f"Model {model_name} not found in available models")
+
+    def get_model_id(self, model_name):
+        """Get the model ID for a given model name"""
+        try:
+            return self.models[model_name]
         except KeyError:
             raise ValueError(f"Model {model_name} not found in available models")
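
Design note: `@lru_cache(maxsize=5)` already memoizes the module-level get_pipeline on the (model_id, task) argument pair, so the manual _PIPELINE_CACHE dict is belt-and-braces; either mechanism alone keeps each pipeline loaded only once. A minimal sketch of the lru_cache-only variant (a hypothetical simplification, not what the commit does):

from functools import lru_cache
from transformers import pipeline

@lru_cache(maxsize=5)
def get_pipeline(model_id: str, task: str = "text-generation"):
    # lru_cache keys on (model_id, task); repeat calls with the same
    # arguments return the already-loaded pipeline object.
    return pipeline(task, model=model_id, device_map="auto")
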
src/utils.py CHANGED
@@ -1,7 +1,18 @@
-def format_prompt(message, history, tokenizer, system_prompt=""):
-    """Format message and history into a prompt for Qwen models
+"""
+Utility functions for pre-processing and post-processing in the chat application.
+"""
+
+def preprocess_chat_input(message, history, system_prompt=""):
+    """
+    Pre-process chat input to prepare it for the model.
 
-    Uses tokenizer.apply_chat_template if available, otherwise falls back to manual formatting.
+    Args:
+        message (str): The current user message
+        history (list): List of tuples containing (user_message, assistant_message) pairs
+        system_prompt (str): Optional system prompt to guide the model's behavior
+
+    Returns:
+        dict: Formatted messages in the format expected by the tokenizer
     """
     # Convert history from tuples to dict format expected by apply_chat_template
     formatted_history = []
@@ -12,21 +23,108 @@ def format_prompt(message, history, tokenizer, system_prompt=""):
     # Add current message
     formatted_history.append({"role": "user", "content": message})
 
-    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+    # Add system message if provided
+    if system_prompt.strip():
         messages = [{"role": "system", "content": system_prompt.strip()}] + formatted_history
-        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
+    else:
+        messages = formatted_history
+
+    return messages
+
+
+def format_prompt(message, history, tokenizer, system_prompt=""):
+    """
+    Format message and history into a prompt for Qwen models.
+
+    Uses tokenizer.apply_chat_template if available, otherwise falls back to manual formatting.
+
+    Args:
+        message (str): The current user message
+        history (list): List of tuples containing (user_message, assistant_message) pairs
+        tokenizer: The model tokenizer
+        system_prompt (str): Optional system prompt to guide the model's behavior
+
+    Returns:
+        str: Formatted prompt ready for the model
+    """
+    # Get pre-processed messages
+    messages = preprocess_chat_input(message, history, system_prompt)
+
+    # Apply chat template if available
+    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+        return tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=True
+        )
     else:
         # Fallback for base LMs without chat template
-        prompt = ""
-        if system_prompt.strip():
-            prompt = system_prompt.strip() + "\n"
-
-        for msg in formatted_history:
-            if msg['role'] == 'user':
-                prompt += f"<|User|>: {msg['content'].strip()}\n"
-            elif msg['role'] == 'assistant':
-                prompt += f"<|Assistant|>: {msg['content'].strip()}\n"
-
-        if not prompt.strip().endswith("<|Assistant|>:"):
-            prompt += "<|Assistant|>:"
-        return prompt
+        return format_prompt_fallback(messages)
+
+
+def format_prompt_fallback(messages):
+    """
+    Fallback prompt formatting for models without chat templates.
+
+    Args:
+        messages (list): List of message dictionaries with role and content
+
+    Returns:
+        str: Formatted prompt string
+    """
+    prompt = ""
+
+    # Add system message if present
+    if messages and messages[0]['role'] == 'system':
+        prompt = messages[0]['content'].strip() + "\n"
+        messages = messages[1:]
+
+    # Add conversation history
+    for msg in messages:
+        if msg['role'] == 'user':
+            prompt += f"<|User|>: {msg['content'].strip()}\n"
+        elif msg['role'] == 'assistant':
+            prompt += f"<|Assistant|>: {msg['content'].strip()}\n"
+
+    # Add final assistant prompt if needed
+    if not prompt.strip().endswith("<|Assistant|>:"):
+        prompt += "<|Assistant|>:"
+
+    return prompt
+
+
+def prepare_generation_inputs(prompt, tokenizer, device):
+    """
+    Prepare tokenized inputs for model generation.
+
+    Args:
+        prompt (str): The formatted prompt
+        tokenizer: The model tokenizer
+        device: The device to place tensors on
+
+    Returns:
+        dict: Tokenized inputs ready for model generation
+    """
+    # Tokenize the prompt
+    tokenized_inputs = tokenizer(prompt, return_tensors="pt")
+
+    # Move tensors to the correct device
+    inputs_on_device = {k: v.to(device) for k, v in tokenized_inputs.items()}
+
+    return inputs_on_device
+
+
+def postprocess_response(response):
+    """
+    Post-process the model's response.
+
+    Args:
+        response (str): The raw model response
+
+    Returns:
+        str: The processed response
+    """
+    # Currently just returns the response as-is
+    # This function can be expanded for additional post-processing steps
+    return response
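
To make the new helpers concrete, a small illustrative run with toy inputs (hypothetical, assuming the unchanged history-conversion loop builds the usual user/assistant dicts; expected output shown in comments):

from src.utils import preprocess_chat_input, format_prompt_fallback

messages = preprocess_chat_input(
    "How are you?",
    [("Hello", "Hi there!")],
    system_prompt="You are a helpful assistant."
)
# messages:
# [{"role": "system", "content": "You are a helpful assistant."},
#  {"role": "user", "content": "Hello"},
#  {"role": "assistant", "content": "Hi there!"},
#  {"role": "user", "content": "How are you?"}]

print(format_prompt_fallback(messages))
# You are a helpful assistant.
# <|User|>: Hello
# <|Assistant|>: Hi there!
# <|User|>: How are you?
# <|Assistant|>:
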