import os

# Point the Hugging Face cache at writable storage before the HF libraries
# are imported (huggingface_hub resolves HF_HOME at import time).
os.environ["HF_HOME"] = "/tmp/cache"

import torch
from huggingface_hub import login
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Authenticate with a Hugging Face token, if one is configured.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(hf_token)

# Pick a dtype for CPU/GPU compatibility: bfloat16 on GPU, float32 on CPU.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# Load model and tokenizer (Cerebras BTLM-3B-8K, chat variant).
model_name = "cerebras/btlm-3b-8k-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True,  # BTLM's modeling code lives in the repo, not in transformers
)
# Build the text-generation pipeline. Device placement and dtype were already
# handled by from_pretrained above, so only pad_token_id needs setting here:
# the BTLM tokenizer defines no pad token, so we reuse the EOS token.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)
def generate_chat_completion(message: str, history: list = None):
    """
    Generate an assistant reply for `message`.

    `history` is an optional list of {'role': str, 'content': str} dicts.
    It is used to reconstruct the full prompt, and the updated history
    (including the new user and assistant turns) is returned.
    """
    history = history or []

    # Rebuild the conversation as a plain-text prompt, one turn per line.
    prompt = ""
    for msg in history:
        prompt += f"{msg['role'].capitalize()}: {msg['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    output = generator(
        prompt,
        max_new_tokens=256,
        temperature=0.7,  # slightly lower temperature for more coherent replies
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
    )

    # The pipeline returns the prompt plus the completion; keep only the new
    # text. Slicing off the prompt is safer than str.replace, which would
    # also strip any repeated occurrence of the prompt inside the output.
    generated = output[0]["generated_text"]
    reply = generated[len(prompt):].strip()

    # Append the new exchange to the running history.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": reply})
    return history
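

# Example usage: a minimal sketch, not part of the Space's entry point.
# Running this module directly exercises generate_chat_completion over a
# short two-turn exchange and prints the accumulated history.
if __name__ == "__main__":
    history = generate_chat_completion("Hello! Who are you?")
    history = generate_chat_completion("What can you help me with?", history)
    for turn in history:
        print(f"{turn['role'].capitalize()}: {turn['content']}")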