Update app.py
app.py CHANGED
@@ -4,42 +4,128 @@ import torch
 import time
 import spaces

+# Model configurations
 MODELS = {
     "Athena-R3X 8B": "Spestly/Athena-R3X-8B",
     "Athena-R3X 4B": "Spestly/Athena-R3X-4B",
-
+    "Athena-R3 7B": "Spestly/Athena-R3-7B",
+    "Athena-3 3B": "Spestly/Athena-3-3B",
+    "Athena-3 7B": "Spestly/Athena-3-7B",
+    "Athena-3 14B": "Spestly/Athena-3-14B",
+    "Athena-2 1.5B": "Spestly/Athena-2-1.5B",
+    "Athena-1 3B": "Spestly/Athena-1-3B",
+    "Athena-1 7B": "Spestly/Athena-1-7B"
 }

 @spaces.GPU
 def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
-
-
-
+    """Generate response using ZeroGPU - all CUDA operations happen here"""
+    print(f"🚀 Loading {model_id}...")
+    start_time = time.time()
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    load_time = time.time() - start_time
+    print(f"✅ Model loaded in {load_time:.2f}s")
+
+    # Build messages in proper chat format (OpenAI-style messages)
+    messages = []
+    system_prompt = (
+        "You are Athena, a helpful, harmless, and honest AI assistant. "
+        "You provide clear, accurate, and concise responses to user questions. "
+        "You are knowledgeable across many domains and always aim to be respectful and helpful. "
+        "You are finetuned by Aayan Mishra"
+    )
+    messages.append({"role": "system", "content": system_prompt})
+
+    # Add conversation history (OpenAI-style)
+    for msg in conversation:
+        if msg["role"] in ("user", "assistant"):
+            messages.append({"role": msg["role"], "content": msg["content"]})
+
+    # Add current user message
+    messages.append({"role": "user", "content": user_message})
+
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt")
+    device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    generation_start = time.time()
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_length,
+            temperature=temperature,
+            do_sample=True,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id
+        )
+    generation_time = time.time() - generation_start
+    response = tokenizer.decode(
+        outputs[0][inputs['input_ids'].shape[-1]:],
+        skip_special_tokens=True
+    ).strip()
+    return response, load_time, generation_time

 def respond(history, message, model_name, max_length, temperature):
+    """Main function for custom Chatbot interface"""
     if not message.strip():
-
+        history = history + [["user", message], ["assistant", "Please enter a message"]]
+        return history, ""
     model_id = MODELS.get(model_name, MODELS["Athena-R3X 8B"])
     try:
-
+        # Format history for Athena
+        formatted_history = []
+        for i in range(0, len(history), 2):
+            if i < len(history):
+                user_msg = history[i][1] if history[i][0] == "user" else ""
+                assistant_msg = history[i+1][1] if i+1 < len(history) and history[i+1][0] == "assistant" else ""
+                if user_msg:
+                    formatted_history.append({"role": "user", "content": user_msg})
+                if assistant_msg:
+                    formatted_history.append({"role": "assistant", "content": assistant_msg})
+        response, load_time, generation_time = generate_response(
+            model_id, formatted_history, message, max_length, temperature
+        )
         history = history + [["user", message], ["assistant", response]]
         return history, ""
     except Exception as e:
         history = history + [["user", message], ["assistant", f"Error: {str(e)}"]]
         return history, ""

-
+css = """
+.message {
+    padding: 10px;
+    margin: 5px;
+    border-radius: 10px;
+}
+"""
+
+theme = gr.themes.Monochrome()
+
+with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
     gr.Markdown("# 🚀 Athena Playground Chat")
     gr.Markdown("*Powered by HuggingFace ZeroGPU*")

-    chatbot = gr.Chatbot(height=500)
+    chatbot = gr.Chatbot(height=500, label="Athena", avatar="🤖")
     state = gr.State([])  # chat history

     with gr.Row():
-        user_input = gr.Textbox(label="Your message", scale=8)
+        user_input = gr.Textbox(label="Your message", scale=8, autofocus=True)
         send_btn = gr.Button(value="Send", scale=1)

-    #
+    # --- Configuration controls at the bottom ---
     gr.Markdown("### ⚙️ Model & Generation Settings")
     with gr.Row():
         model_choice = gr.Dropdown(
@@ -59,11 +145,11 @@ with gr.Blocks() as demo:
             info="Higher values = more creative responses"
         )

-    def
+    def chat_submit(history, message, model_name, max_length, temperature):
         return respond(history, message, model_name, max_length, temperature)

     send_btn.click(
-
+        chat_submit,
         inputs=[state, user_input, model_choice, max_length, temperature],
         outputs=[chatbot, user_input]
     )
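
For reference, the generation path this commit introduces — build OpenAI-style messages, render them with the tokenizer's chat template, generate, then decode only the newly produced tokens — can be exercised on its own. A minimal sketch, assuming a small chat-tuned checkpoint; the model id below is illustrative and not one of the Athena checkpoints:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative small chat model, not an Athena checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is ZeroGPU?"},
]
# Render the message list into the prompt format the model was trained on
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.7, top_p=0.9)

# Slice off the prompt so only the assistant's new tokens are decoded
reply = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
print(reply)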
|