import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "thrishala/mental_health_chatbot"

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        max_memory={"cpu": "15GB"},
        offload_folder="offload",
    )
    model.to(device)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.model_max_length = 512  # Cap the maximum prompt length

    # The text-generation pipeline was too slow, so we call the model
    # directly in generate_text() instead of using pipe().
    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     num_return_sequences=1,
    #     do_sample=False,
    #     truncation=True,
    #     max_new_tokens=128,
    # )
except Exception as e:
    print(f"Error loading model: {e}")
    exit()


def generate_text(prompt, max_new_tokens=128):
    # Move the input to the same device as the model
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():  # Disable gradients during inference
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding; set True for sampling
            eos_token_id=tokenizer.eos_token_id,  # Stop generation at the EOS token
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep only the new assistant response: drop the echoed prompt and
    # anything after the next "User:" turn.
    bot_response = generated_text[len(prompt):].split("User:")[0].strip()
    return bot_response


def respond(
    message,
    history,
    system_message,
    max_tokens,
):
    # Construct the prompt with clear turn separation
    prompt = f"{system_message}\n"
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    try:
        # Previously generated with pipe(); replaced by generate_text() for speed.
        bot_response = generate_text(prompt, max_tokens)
        yield bot_response
    except Exception as e:
        print(f"Error during generation: {e}")
        yield "An error occurred."


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly and helpful mental health chatbot.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()