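Example inference script. It requires Unsloth (`pip install unsloth`) and a CUDA-capable GPU, since the model is loaded in 4-bit and the inputs are moved to `cuda`: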
```python
# Import the Unsloth loader; it returns the tokenizer alongside the model
from unsloth import FastLanguageModel

# Define the MedQA prompt
medqa_prompt = """You are a medical QA system. Answer the following medical question clearly and in detail with complete sentences.
### Question:
{}
### Answer:
"""

# Load the model and tokenizer using Unsloth
model_name = "Vijayendra/Phi4-MedQA"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,         # Use default precision
    load_in_4bit=True,  # Enable 4-bit quantization
    device_map="auto",  # Automatically map the model to available devices
)

# Enable faster inference
FastLanguageModel.for_inference(model)

# Prepare the medical question
medical_question = "What are the common symptoms of diabetes?"  # Replace with your medical question
inputs = tokenizer(
    [medqa_prompt.format(medical_question)],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024,
).to("cuda")  # Ensure inputs are on the GPU

# Generate the output
outputs = model.generate(
    **inputs,
    max_new_tokens=512,  # Allow for detailed responses
    use_cache=True,      # Speeds up generation
)

# Decode the response and extract the generated answer
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer_text = response.split("### Answer:")[1].strip() if "### Answer:" in response else response.strip()

print(f"Question: {medical_question}")
print(f"Answer: {answer_text}")
```