techindia2025 committed on
Commit
1cf7fb2
·
verified ·
1 Parent(s): c7b27cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -52
app.py CHANGED
@@ -1,65 +1,80 @@
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import spaces
 
5
 
6
# Hugging Face checkpoint served by this app.
model_name = "medalpaca/medalpaca-7b"

# Report the accelerator situation once at startup so the logs show which
# branch of the loading configuration below was taken.
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU device count: {torch.cuda.device_count()}")
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")

# Tokenizer and model are module-level globals: they are loaded a single time
# at import and shared by every request handler.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision when CUDA is present, full precision otherwise.
    torch_dtype=torch.float16 if _cuda_ready else torch.float32,
    device_map="auto",  # Use GPU if available
    load_in_8bit=_cuda_ready,  # 8-bit quantization for GPU
)
 
 
 
 
 
 
 
 
 
 
 
22
 
23
def format_prompt(message, chat_history):
    """Build the instruction-style prompt sent to the model.

    The prompt starts with a fixed instruction preamble. When ``chat_history``
    is non-empty, a "Previous conversation:" section lists each earlier turn
    as a ``Human:`` / ``Assistant:`` pair. The prompt always ends with the new
    ``message`` followed by an open ``Assistant:`` cue.

    Args:
        message: The user's new message.
        chat_history: Iterable of ``(user_message, assistant_message)`` pairs;
            any falsy value (``None``, ``[]``) means "no history".

    Returns:
        The full prompt as a single string.
    """
    pieces = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
    ]
    if chat_history:
        pieces.append("Previous conversation:\n")
        pieces.extend(
            f"Human: {asked}\nAssistant: {answered}\n\n"
            for asked, answered in chat_history
        )
    pieces.append(f"Human: {message}\nAssistant:")
    return "".join(pieces)
32
 
33
@spaces.GPU  # <--- This is REQUIRED for ZeroGPU!
def generate_response(message, chat_history):
    """Generate the assistant's reply and append the turn to the chat history.

    Args:
        message: The user's new message.
        chat_history: List of ``(user_message, assistant_message)`` tuples for
            the conversation so far; ``None`` is treated as an empty history.

    Returns:
        A ``("", chat_history)`` tuple: the empty string clears the input
        textbox and ``chat_history`` now includes the new turn.
    """
    if chat_history is None:
        chat_history = []

    prompt = format_prompt(message, chat_history)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "Assistant:" would cut off any reply that happens to
    # contain that marker itself.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(
        generation_output[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    chat_history.append((message, response))
    return "", chat_history
50
 
51
# Chat UI: a history pane, a text box that submits on Enter, and a button
# that clears the history.
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# MedAlpaca Medical Chatbot")
    gr.Markdown("A specialized medical chatbot powered by MedAlpaca-7B.")
    gr.Markdown("Ask medical questions and get responses from a model trained on medical data.")

    # The handlers in this file build and consume history as
    # (user, assistant) tuples, so the Chatbot must use the tuple format;
    # type="messages" would expect role/content dicts instead.
    chatbot = gr.Chatbot(type="tuples")
    msg = gr.Textbox(placeholder="Type your medical question here...")
    clear = gr.Button("Clear")

    # generate_response returns ("", history): the first output clears the
    # textbox, the second refreshes the chat pane.
    msg.submit(generate_response, [msg, chatbot], [msg, chatbot])  # Pass GPU-decorated function!
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    print("Starting Gradio app...")
    demo.launch(server_name="0.0.0.0")
 
1
  import gradio as gr
 
 
2
  import spaces
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
# Selectable models: UI display name -> Hugging Face checkpoint id.
MODELS: dict[str, str] = {
    "TinyLlama-1.1B": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "Llama-2-7b": "meta-llama/Llama-2-7b-chat-hf",
}

# Process-wide caches keyed by the display names above, so each model and
# tokenizer is loaded at most once.
loaded_models: dict[str, object] = {}
loaded_tokenizers: dict[str, object] = {}
 
 
14
 
15
def load_model(model_name):
    """Return the ``(model, tokenizer)`` pair for ``model_name``, loading on first use.

    Args:
        model_name: A key of ``MODELS`` (the display name shown in the UI).

    Returns:
        Tuple ``(model, tokenizer)`` taken from the module-level caches.

    Raises:
        KeyError: If ``model_name`` is not cached and is not a key of ``MODELS``.
    """
    # Fast path: this model has been loaded before.
    if model_name in loaded_models:
        return loaded_models[model_name], loaded_tokenizers[model_name]

    print(f"Loading {model_name}...")
    checkpoint = MODELS[model_name]

    tok = AutoTokenizer.from_pretrained(checkpoint)
    net = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",  # Use GPU if available
    )

    # Populate both caches only after both loads have succeeded.
    loaded_models[model_name] = net
    loaded_tokenizers[model_name] = tok
    print(f"{model_name} loaded successfully!")

    return net, tok


# Pre-load the smaller model to start with
print("Pre-loading TinyLlama model...")
load_model("TinyLlama-1.1B")
 
 
 
 
 
 
37
 
38
@spaces.GPU  # Required by ZeroGPU!
def generate_response(message, history, model_choice):
    """Generate a reply to ``message`` from the selected model.

    Args:
        message: The user's new message.
        history: Earlier turns, iterated as ``(user, assistant)`` pairs.
            NOTE(review): this assumes tuple-style chat history; confirm it
            matches the format the ChatInterface is configured to pass.
        model_choice: Key of ``MODELS`` naming the model to use; it is loaded
            on first use via ``load_model``.

    Returns:
        The newly generated text with surrounding whitespace stripped.
    """
    model, tokenizer = load_model(model_choice)

    # Flatten the conversation into a plain "User:/Assistant:" transcript
    # that ends with an open "Assistant:" cue for the model to complete.
    turns = [f"User: {human}\nAssistant: {assistant}\n" for human, assistant in history]
    turns.append(f"User: {message}\nAssistant:")
    prompt = "".join(turns)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the tokenizer's mask explicitly so generate() does not have to
        # infer it from the pad token.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # The output sequence starts with the prompt tokens; keep only what
    # follows.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()
 
 
 
 
61
 
62
# Create the Gradio interface: a model selector above a chat panel.
with gr.Blocks() as demo:
    gr.Markdown("# LLM Chatbot")
    gr.Markdown("Choose between TinyLlama-1.1B and Llama-2-7b models for your conversation.")

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(MODELS),
            value="TinyLlama-1.1B",
            label="Select Model",
        )

    # generate_response already has the (message, history, model_choice)
    # signature, so it is passed directly rather than through a lambda.
    # The dropdown value arrives as the extra model_choice argument.
    chatbot = gr.ChatInterface(
        fn=generate_response,
        additional_inputs=[model_dropdown],
    )

if __name__ == "__main__":
    demo.launch()