Spaces:

techindia2025
/

medbot_2

Running on Zero

App Files Files Community

techindia2025 committed on May 21

Commit

1cf7fb2

verified ·

1 Parent(s): c7b27cc

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -52

app.py CHANGED Viewed

@@ -1,65 +1,80 @@
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import spaces
-# Model name
-model_name = "medalpaca/medalpaca-7b"
-# Load tokenizer and model globally for efficiency
-print(f"CUDA available: {torch.cuda.is_available()}")
-if torch.cuda.is_available():
-    print(f"GPU device count: {torch.cuda.device_count()}")
-    print(f"GPU device name: {torch.cuda.get_device_name(0)}")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",  # Use GPU if available
-    load_in_8bit=torch.cuda.is_available()  # 8-bit quantization for GPU
-)
-def format_prompt(message, chat_history):
-    prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
-    if chat_history:
-        prompt += "Previous conversation:\n"
-    for turn in chat_history:
-        user_message, assistant_message = turn
-        prompt += f"Human: {user_message}\nAssistant: {assistant_message}\n\n"
-    prompt += f"Human: {message}\nAssistant:"
-    return prompt
-@spaces.GPU  # <--- This is REQUIRED for ZeroGPU!
-def generate_response(message, chat_history):
-    prompt = format_prompt(message, chat_history)
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    with torch.no_grad():
-        generation_output = model.generate(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-        )
-    full_output = tokenizer.decode(generation_output[0], skip_special_tokens=True)
-    response = full_output.split("Assistant:")[-1].strip()
-    chat_history.append((message, response))
-    return "", chat_history
-with gr.Blocks(css="footer {visibility: hidden}") as demo:
-    gr.Markdown("# MedAlpaca Medical Chatbot")
-    gr.Markdown("A specialized medical chatbot powered by MedAlpaca-7B.")
-    gr.Markdown("Ask medical questions and get responses from a model trained on medical data.")
-    chatbot = gr.Chatbot(type="messages")
-    msg = gr.Textbox(placeholder="Type your medical question here...")
-    clear = gr.Button("Clear")
-    msg.submit(generate_response, [msg, chatbot], [msg, chatbot])  # Pass GPU-decorated function!
-    clear.click(lambda: None, None, chatbot, queue=False)
 if __name__ == "__main__":
-    print("Starting Gradio app...")
-    demo.launch(server_name="0.0.0.0")

 import gradio as gr
 import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Model registry: maps the label offered in the UI dropdown to the Hugging Face
+# Hub repository ID that load_model() passes to from_pretrained().
+MODELS = {
+    "TinyLlama-1.1B": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "Llama-2-7b": "meta-llama/Llama-2-7b-chat-hf"
+}
+# Module-level caches keyed by the MODELS label. load_model() fills them on
+# first use and serves later calls for the same label from here.
+loaded_models = {}
+loaded_tokenizers = {}
+def load_model(model_name):
+    """Return the ``(model, tokenizer)`` pair for ``model_name``.
+
+    ``model_name`` is a key of ``MODELS``. The first call for a given key
+    loads the model and tokenizer and stores them in ``loaded_models`` and
+    ``loaded_tokenizers``; later calls return the cached objects.
+    An unknown key raises ``KeyError`` from the ``MODELS`` lookup.
+    """
+    # Cache hit: nothing to load.
+    if model_name in loaded_models:
+        return loaded_models[model_name], loaded_tokenizers[model_name]
+    print(f"Loading {model_name}...")
+    hub_id = MODELS[model_name]
+    new_tokenizer = AutoTokenizer.from_pretrained(hub_id)
+    new_model = AutoModelForCausalLM.from_pretrained(
+        hub_id,
+        torch_dtype="auto",
+        device_map="auto"  # Use GPU if available
+    )
+    loaded_models[model_name] = new_model
+    loaded_tokenizers[model_name] = new_tokenizer
+    print(f"{model_name} loaded successfully!")
+    return new_model, new_tokenizer
+# Pre-load the smaller model at import time so it is already in the
+# loaded_models cache when generate_response() first asks for it.
+print("Pre-loading TinyLlama model...")
+load_model("TinyLlama-1.1B")
+@spaces.GPU  # Required by ZeroGPU!
+def generate_response(message, history, model_choice):
+    """Generate the assistant's reply to ``message`` with the selected model.
+
+    Args:
+        message: The latest user message.
+        history: Earlier turns of the conversation, either as
+            ``(user, assistant)`` pairs or as ``{"role", "content"}``
+            message dicts.
+        model_choice: Key of ``MODELS`` naming the model to use.
+
+    Returns:
+        The newly generated text with surrounding whitespace stripped.
+    """
+    # Load the selected model if not already loaded
+    model, tokenizer = load_model(model_choice)
+    # Rebuild the plain-text transcript the model is prompted with.
+    turns = []
+    for turn in history:
+        if isinstance(turn, dict):
+            # "messages"-style history: one dict per utterance. Unpacking it
+            # like a pair would yield the dict keys, not the text.
+            speaker = "User" if turn.get("role") == "user" else "Assistant"
+            turns.append(f"{speaker}: {turn.get('content', '')}\n")
+        else:
+            human, assistant = turn
+            turns.append(f"User: {human}\nAssistant: {assistant}\n")
+    prompt = "".join(turns) + f"User: {message}\nAssistant:"
+    # Generate the response
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(
+        input_ids=inputs["input_ids"],
+        # Pass the mask explicitly instead of leaving generate() to infer it.
+        attention_mask=inputs["attention_mask"],
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+    )
+    # Decode only the tokens produced after the prompt.
+    prompt_length = inputs["input_ids"].shape[1]
+    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+    return response.strip()
+# Create the Gradio interface: a model selector above a chat panel.
+with gr.Blocks() as demo:
+    gr.Markdown("# LLM Chatbot")
+    gr.Markdown("Choose between TinyLlama-1.1B and Llama-2-7b models for your conversation.")
+    with gr.Row():
+        model_dropdown = gr.Dropdown(
+            choices=list(MODELS.keys()),
+            value="TinyLlama-1.1B",
+            label="Select Model"
+        )
+    # Hand the @spaces.GPU-decorated function to Gradio directly rather than
+    # through a pass-through lambda. The dropdown's current value arrives as
+    # its third argument (model_choice).
+    chatbot = gr.ChatInterface(
+        fn=generate_response,
+        additional_inputs=[model_dropdown],
+    )
 if __name__ == "__main__":
+    # Start the Gradio app with default launch settings when run as a script.
+    demo.launch()