import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model to load from the Hugging Face Hub
model_id = "Writer/Palmyra-Med-70B-32k"


# Cache the model and tokenizer across reruns
# (st.cache_resource replaces the deprecated st.cache(allow_output_mutation=True))
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    return tokenizer, model


tokenizer, model = load_model()

# Define the Streamlit app
st.title("Medical Query Model")
st.write(
    "You are interacting with a highly knowledgeable medical model. "
    "Enter your medical question below:"
)

user_input = st.text_area("Your Question")

if st.button("Get Response"):
    if user_input:
        # Build the chat-formatted prompt for the model
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a highly knowledgeable and experienced expert in the "
                    "healthcare and biomedical field, possessing extensive medical "
                    "knowledge and practical expertise."
                ),
            },
            {"role": "user", "content": user_input},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        # Generation settings (do_sample defaults to False, so decoding is greedy)
        gen_conf = {
            "max_new_tokens": 256,
            "eos_token_id": [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|im_end|>"),  # ChatML end-of-turn token
            ],
            "temperature": 0.0,
            "top_p": 0.9,
        }

        # Generate a response and decode only the newly generated tokens
        with torch.no_grad():
            output_id = model.generate(input_ids, **gen_conf)
        output_text = tokenizer.decode(
            output_id[0][input_ids.shape[1]:], skip_special_tokens=True
        )

        st.write("Response:")
        st.write(output_text)
    else:
        st.warning("Please enter a question.")
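
# Usage note (a minimal sketch, assuming this script is saved as app.py and the
# streamlit, transformers, accelerate, and flash-attn packages are installed):
#
#   streamlit run app.py
#
# Loading the 70B checkpoint in float16 means on the order of 140 GB of weights,
# which device_map="auto" shards across the available GPUs via accelerate;
# attn_implementation="flash_attention_2" additionally requires a CUDA GPU
# supported by flash-attn.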