gpt-oss-20b-mutlilingual-reasoning

Running

App Files Files Community

Tonic committed 29 days ago

Commit

757241b

1 Parent(s): 46248f1

attempts lora adapter and streaming

Browse files

Files changed (2) hide show

app.py +49 -74
app_alternative.py +159 -0

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import gradio as gr
@@ -29,42 +29,20 @@ except Exception as e:
     print(f"❌ Error loading model: {e}")
     raise e
-class LoRAPipeline:
-    def __init__(self, model, tokenizer):
-        self.model = model
-        self.tokenizer = tokenizer
-    def __call__(self, messages, **kwargs):
-        prompt = self.format_messages(messages)
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                **kwargs
-            )
-        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response = generated_text[len(prompt):]
-        return response
-    def format_messages(self, messages):
-        """Format messages into a prompt string"""
-        formatted = ""
-        for message in messages:
-            role = message["role"]
-            content = message["content"]
-            if role == "system":
-                formatted += f"System: {content}\n"
-            elif role == "user":
-                formatted += f"User: {content}\n"
-            elif role == "assistant":
-                formatted += f"Assistant: {content}\n"
-        formatted += "Assistant: "
-        return formatted
-# Create the pipeline
-pipe = LoRAPipeline(model, tokenizer)
 def format_conversation_history(chat_history):
     messages = []
@@ -83,7 +61,13 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
     processed_history = format_conversation_history(chat_history)
     messages = system_message + processed_history + [new_message]
-    # Generate response using the LoRA pipeline
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
@@ -92,47 +76,38 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
         "pad_token_id": tokenizer.eos_token_id,
     }
-    # For streaming, we'll generate token by token
-    prompt = pipe.format_messages(messages)
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    # Generate with streaming
-    full_response = ""
-    current_length = inputs["input_ids"].shape[1]
-    with torch.no_grad():
-        for i in range(max_new_tokens):
-            # Generate one token at a time
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=1,
-                do_sample=True,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                repetition_penalty=repetition_penalty,
-                pad_token_id=tokenizer.eos_token_id,
-                use_cache=True
-            )
-            # Get the new token
-            new_token = outputs[0][-1].unsqueeze(0).unsqueeze(0)
-            # Decode the new token
-            new_text = tokenizer.decode(new_token[0], skip_special_tokens=True)
-            if new_text:
-                full_response += new_text
-                yield full_response
-            # Update inputs for next iteration
-            inputs = {"input_ids": torch.cat([inputs["input_ids"], new_token], dim=1)}
-            # Check for end of generation
-            if new_token.item() == tokenizer.eos_token_id:
-                break
 demo = gr.ChatInterface(
     fn=generate_response,

+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
 import torch
 from threading import Thread
 import gradio as gr
     print(f"❌ Error loading model: {e}")
     raise e
+def format_messages(messages):
+    """Render chat messages as a plain-text prompt.
+    Each message with role "system", "user" or "assistant" becomes one
+    "<Label>: <content>" line; messages with any other role are skipped.
+    The result always ends with "Assistant: " so the model continues as
+    the assistant.
+    """
+    labels = {"system": "System", "user": "User", "assistant": "Assistant"}
+    lines = []
+    for entry in messages:
+        speaker = entry["role"]
+        text = entry["content"]
+        label = labels.get(speaker)
+        if label is not None:
+            lines.append(f"{label}: {text}\n")
+    # Trailing cue for the model's next turn.
+    lines.append("Assistant: ")
+    return "".join(lines)
 def format_conversation_history(chat_history):
     messages = []
     processed_history = format_conversation_history(chat_history)
     messages = system_message + processed_history + [new_message]
+    # Format the prompt
+    prompt = format_messages(messages)
+    # Create streamer for proper streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Prepare generation kwargs
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
         "pad_token_id": tokenizer.eos_token_id,
+        "streamer": streamer,
+        "use_cache": True
     }
+    # Tokenize input
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs={**inputs, **generation_kwargs})
+    thread.start()
+    # Stream the response
+    thinking = ""
+    final = ""
+    started_final = False
+    for chunk in streamer:
+        if not started_final:
+            if "assistantfinal" in chunk.lower():
+                split_parts = re.split(r'assistantfinal', chunk, maxsplit=1)
+                thinking += split_parts[0]
+                final += split_parts[1]
+                started_final = True
+            else:
+                thinking += chunk
+        else:
+            final += chunk
+        clean_thinking = re.sub(r'^analysis\s*', '', thinking).strip()
+        clean_final = final.strip()
+        formatted = f"<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
+        yield formatted
 demo = gr.ChatInterface(
     fn=generate_response,

app_alternative.py ADDED Viewed

	@@ -0,0 +1,159 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import torch
+from threading import Thread
+import gradio as gr
+import spaces
+import re
+from peft import PeftModel
+# Load the base model and tokenizer, then try to wrap the model with the LoRA adapter.
+# This runs at import time. A failure loading the base model or tokenizer is
+# printed and re-raised; a failure loading only the adapter falls back to the
+# plain base model.
+try:
+    base_model = AutoModelForCausalLM.from_pretrained(
+        "openai/gpt-oss-20b",
+        torch_dtype="auto",
+        device_map="auto",
+        # NOTE(review): confirm this Hub kernel id. The gpt-oss guides appear to
+        # reference "kernels-community/vllm-flash-attn3"; left unchanged here.
+        attn_implementation="kernel-community/vllm-flash-attention3"
+    )
+    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
+    # Load the LoRA adapter
+    try:
+        model = PeftModel.from_pretrained(base_model, "Tonic/gpt-oss-20b-multilingual-reasoner")
+        print("✅ LoRA model loaded successfully!")
+    except Exception as lora_error:
+        # Deliberate best-effort: the app still serves the base model without the adapter.
+        print(f"⚠️ LoRA adapter failed to load: {lora_error}")
+        print("🔄 Falling back to base model...")
+        model = base_model
+except Exception as e:
+    print(f"❌ Error loading model: {e}")
+    # Bare raise re-raises the exception currently being handled.
+    raise
+def format_messages(messages):
+    """Render chat messages as a plain-text prompt.
+    Each message with role "system", "user" or "assistant" becomes one
+    "<Label>: <content>" line; messages with any other role are skipped.
+    The result always ends with "Assistant: " so the model continues as
+    the assistant.
+    """
+    labels = {"system": "System", "user": "User", "assistant": "Assistant"}
+    lines = []
+    for entry in messages:
+        speaker = entry["role"]
+        text = entry["content"]
+        label = labels.get(speaker)
+        if label is not None:
+            lines.append(f"{label}: {text}\n")
+    # Trailing cue for the model's next turn.
+    lines.append("Assistant: ")
+    return "".join(lines)
+def format_conversation_history(chat_history):
+    """Reduce chat history items to plain {"role", "content"} dicts.
+    List-valued content is collapsed to the "text" entry of its first
+    element when that entry exists, otherwise to str() of the whole list.
+    Any other content value is passed through unchanged.
+    """
+    def _flatten(payload):
+        if not isinstance(payload, list):
+            return payload
+        if payload and "text" in payload[0]:
+            return payload[0]["text"]
+        return str(payload)
+    return [
+        {"role": turn["role"], "content": _flatten(turn["content"])}
+        for turn in chat_history
+    ]
+# Marker separating the reasoning text from the final answer in decoded output.
+# Matched case-insensitively so detection and splitting always agree.
+_FINAL_MARKER = re.compile(r"assistantfinal", re.IGNORECASE)
+def _render_thinking_and_final(full_response):
+    """Return the chat markdown for the text generated so far.
+    Text before the first "assistantfinal" marker is shown inside a
+    collapsible "Thinking Process" section (with a leading "analysis"
+    prefix removed); text after it is shown as the answer. Without a
+    marker, everything is treated as thinking and the answer is empty.
+    """
+    parts = _FINAL_MARKER.split(full_response, maxsplit=1)
+    thinking = parts[0]
+    final = parts[1] if len(parts) > 1 else ""
+    clean_thinking = re.sub(r'^analysis\s*', '', thinking).strip()
+    clean_final = final.strip()
+    return f"<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
+@spaces.GPU(duration=60)
+def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
+    """Stream a chat reply as progressively longer markdown strings.
+    Builds a prompt from the optional system prompt, the prior history and
+    the new user message, then generates in chunks of up to 50 tokens and
+    yields the formatted response after each chunk that changes the text.
+    Args:
+        input_data: The new user message.
+        chat_history: Prior turns as dicts with "role" and "content".
+        max_new_tokens: Upper bound on the number of generated tokens.
+        system_prompt: Optional system message; skipped when empty.
+        temperature, top_p, top_k, repetition_penalty: Sampling settings
+            forwarded to model.generate.
+    Yields:
+        Markdown containing the thinking section and the final answer so far.
+    """
+    new_message = {"role": "user", "content": input_data}
+    system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
+    processed_history = format_conversation_history(chat_history)
+    messages = system_message + processed_history + [new_message]
+    # NOTE(review): the prompt is a hand-written "Role: text" transcript;
+    # consider tokenizer.apply_chat_template if the model expects its own template.
+    prompt = format_messages(messages)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    prompt_length = inputs["input_ids"].shape[1]
+    # range() needs an int; UI sliders may deliver a float.
+    max_new_tokens = int(max_new_tokens)
+    chunk_size = 50  # Generate 50 tokens at a time
+    previous_text = ""
+    with torch.no_grad():
+        for i in range(0, max_new_tokens, chunk_size):
+            current_max_tokens = min(chunk_size, max_new_tokens - i)
+            tokens_before = inputs["input_ids"].shape[1]
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=current_max_tokens,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                pad_token_id=tokenizer.eos_token_id,
+                use_cache=True
+            )
+            # Decode everything generated so far in one pass: decoding each chunk
+            # separately can break characters whose tokens straddle a chunk boundary.
+            full_response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+            if full_response != previous_text:
+                previous_text = full_response
+                yield _render_thinking_and_final(full_response)
+            # A short chunk means generation stopped on its own (any stop token,
+            # not only tokenizer.eos_token_id), so do not keep generating past it.
+            generated_count = outputs.shape[1] - tokens_before
+            if generated_count < current_max_tokens or outputs[0][-1].item() == tokenizer.eos_token_id:
+                break
+            # Continue from the full sequence; with a single unpadded sequence
+            # every position is attended, so the mask is all ones.
+            inputs = {"input_ids": outputs, "attention_mask": torch.ones_like(outputs)}
+# Chat UI wired to generate_response. The additional inputs are passed to the
+# callback after the message and history, in the order listed here:
+# max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty.
+demo = gr.ChatInterface(
+    fn=generate_response,
+    additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
+        gr.Textbox(
+            label="System Prompt",
+            value="You are a helpful assistant. Reasoning: medium",
+            lines=4,
+            placeholder="Change system prompt"
+        ),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
+        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
+    ],
+    # NOTE(review): examples are dicts with a "text" key although multimodal=False
+    # below; confirm this is the shape the installed Gradio version expects.
+    examples=[
+        [{"text": "Explain Newton laws clearly and concisely"}],
+        [{"text": "Write a Python function to calculate the Fibonacci sequence"}],
+        [{"text": "What are the benefits of open weight AI models"}],
+    ],
+    cache_examples=False,
+    type="messages",
+    # NOTE(review): the description text below opens a double quote before
+    # "Reasoning: high." and never closes it.
+    description="""
+# 🙋🏻‍♂️Welcome to 🌟Tonic's gpt-oss-20b Multilingual Reasoner Demo !
+Wait couple of seconds initially. You can adjust reasoning level in the system prompt like "Reasoning: high.
+    """,
+    fill_height=True,
+    textbox=gr.Textbox(
+        label="Query Input",
+        placeholder="Type your prompt"
+    ),
+    stop_btn="Stop Generation",
+    multimodal=False,
+    theme=gr.themes.Soft()
+)
+# Launch the app (with share=True) only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(share=True)