import os

# Point the Hugging Face cache at writable storage before the HF libraries
# are imported (huggingface_hub resolves HF_HOME at import time).
os.environ["HF_HOME"] = "/tmp/cache"

import torch
from huggingface_hub import login
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Authenticate with a Hugging Face token, if one is configured.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(hf_token)

# Pick a dtype for CPU/GPU compatibility: bfloat16 on GPU, float32 on CPU.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# Load model and tokenizer (Cerebras BTLM-3B-8K, chat variant).
model_name = "cerebras/btlm-3b-8k-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True,  # BTLM's modeling code lives in the repo, not in transformers
)
# Build the text-generation pipeline. Device placement and dtype were already
# handled by from_pretrained above, so only pad_token_id needs setting here:
# the BTLM tokenizer defines no pad token, so we reuse the EOS token.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)
def generate_chat_completion(message: str, history: list = None):
    """
    Generate an assistant reply for `message`.

    `history` is an optional list of {'role': str, 'content': str} dicts.
    It is used to reconstruct the full prompt, and the updated history
    (including the new user and assistant turns) is returned.
    """
    history = history or []

    # Rebuild the conversation as a plain-text prompt, one turn per line.
    prompt = ""
    for msg in history:
        prompt += f"{msg['role'].capitalize()}: {msg['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    output = generator(
        prompt,
        max_new_tokens=256,
        temperature=0.7,  # slightly lower temperature for more coherent replies
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
    )

    # The pipeline returns the prompt plus the completion; keep only the new
    # text. Slicing off the prompt is safer than str.replace, which would
    # also strip any repeated occurrence of the prompt inside the output.
    generated = output[0]["generated_text"]
    reply = generated[len(prompt):].strip()

    # Append the new exchange to the running history.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": reply})
    return history
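

# Example usage: a minimal sketch, not part of the Space's entry point.
# Running this module directly exercises generate_chat_completion over a
# short two-turn exchange and prints the accumulated history.
if __name__ == "__main__":
    history = generate_chat_completion("Hello! Who are you?")
    history = generate_chat_completion("What can you help me with?", history)
    for turn in history:
        print(f"{turn['role'].capitalize()}: {turn['content']}")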