# Hugging Face Spaces page chrome captured with the source scrape
# ("Spaces", "Sleeping", "Sleeping") — commented out so the file is valid Python.
# Standard library
import time

# Third-party
import faiss
import gradio as gr
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# ── Load data & embedding model ─────────────────────────────────────────────
# retrieval_corpus.csv must align positionally with the FAISS index:
# index.search() returns row positions that are fed to df.iloc in
# retrieve_top_k below.
df = pd.read_csv("retrieval_corpus.csv")
index = faiss.read_index("faiss_index.bin")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ── Quantized BioMedLM with CPU offload ─────────────────────────────────────
model_id = "stanford-crfm/BioMedLM"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    # Let fp32 modules spill to CPU RAM when the accelerator can't hold them.
    llm_int8_enable_fp32_cpu_offload=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# BioMedLM (GPT-2 style) ships without a pad token; reuse EOS so the
# tokenizer can pad batches in generate_local_answer.
tokenizer.pad_token = tokenizer.eos_token
generation_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # NOTE(review): load_in_8bit together with an all-CPU device_map is
    # unusual — bitsandbytes int8 kernels normally need a CUDA device;
    # confirm this combination actually loads in the target environment.
    device_map={"": "cpu"},
)
def retrieve_top_k(q, k=5):
    """Return the ``k`` corpus rows most similar to query string ``q``.

    Embeds the query with the sentence-transformer, searches the FAISS
    index, and returns a copy of the matching ``df`` rows with an added
    ``score`` column holding the FAISS distances (their semantics depend
    on the index metric).
    """
    query_emb = embedding_model.encode([q]).astype("float32")
    distances, positions = index.search(query_emb, k)
    hits = df.iloc[positions[0]].copy()
    hits["score"] = distances[0]
    return hits
def build_prompt(q, docs):
    """Assemble the instruction prompt for the generator.

    Fix: the retrieved context (``ctx``) was built but never interpolated
    into the returned template (the template had been redacted to a
    placeholder), so the model would have seen neither the question nor
    the evidence. The template now includes both.

    ``docs`` is a DataFrame with a ``text`` column, as returned by
    ``retrieve_top_k``.
    """
    ctx = "\n".join(f"- {d['text']}" for _, d in docs.iterrows())
    return (
        "[INST] <<SYS>>\n"
        "You are a helpful biomedical assistant. Answer the question using "
        "only the context below.\n"
        "<</SYS>>\n\n"
        f"Context:\n{ctx}\n\n"
        f"Question: {q} [/INST]"
    )
def generate_local_answer(prompt, max_new_tokens=512):
    """Greedily generate a continuation of ``prompt`` on CPU.

    Returns the full decoded sequence (prompt + completion) with special
    tokens stripped. Wall-clock generation time is printed for rough
    profiling. (The former function-local ``import time`` is hoisted to
    the top of the file.)
    """
    device = torch.device("cpu")
    start = time.time()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    # Greedy decoding (do_sample=False, num_beams=1): deterministic and
    # the cheapest option for CPU inference.
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        out = generation_model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
        )
    print(f"Gen time: {time.time()-start:.2f}s")
    return tokenizer.decode(out[0], skip_special_tokens=True)
def _answer(question):
    """Gradio callback: retrieve context, build the prompt, generate."""
    docs = retrieve_top_k(question)
    return generate_local_answer(build_prompt(question, docs))


iface = gr.Interface(fn=_answer, inputs="text", outputs="text")
iface.launch()