# Hugging Face Space: CPU-only chat demo for Ministral-3b-instruct (GGUF via llama.cpp).
# NOTE(review): the original "Spaces: / Sleeping / Sleeping" lines were scraped
# UI status text from the Space's web page, not part of the source file.
import os

import requests
import gradio as gr
from llama_cpp import Llama

# Remote location and local cache path of the quantized GGUF model weights.
MODEL_URL = "https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF/resolve/main/Ministral-3b-instruct.Q4_1.gguf?download=true"  # truncated for clarity
MODEL_PATH = "Ministral-3b-instruct.Q4_1.gguf"

# Download the model once; skip when a local copy already exists.
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    # Stream to disk in chunks so the multi-GB file never sits in memory.
    # timeout= guards against a stalled connection hanging the Space forever
    # (the original call had no timeout, i.e. it could block indefinitely).
    with requests.get(MODEL_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(MODEL_PATH, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Model downloaded.")
# Inference settings tuned for the Space's CPU-only hardware.
_LLAMA_KWARGS = {
    "model_path": MODEL_PATH,
    "n_ctx": 4096,         # reduced context window size
    "n_threads": 2,        # reduced thread count for CPU use
    "n_gpu_layers": 0,     # no layers offloaded: CPU-only
    "chat_format": "chatml",
}
llm = Llama(**_LLAMA_KWARGS)
def chat_interface(message, history):
    """Gradio ChatInterface callback: answer ``message`` given the chat so far.

    Args:
        message: The user's latest utterance.
        history: Prior turns as (user_msg, assistant_msg) pairs, or None on
            the first turn.

    Returns:
        The assistant's reply text only. ``gr.ChatInterface`` manages the
        history itself and expects the callback to return just the reply
        string — the original ``return reply, history`` tuple would have been
        rendered verbatim in the chat window.
    """
    if history is None:
        history = []
    # Rebuild the whole conversation as role-tagged dicts for the chatml format.
    chat_prompt = []
    for user_msg, bot_msg in history:
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})
    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    reply = response["choices"][0]["message"]["content"]
    return reply
# Build the chat UI and serve it; launch() blocks until the server stops.
demo = gr.ChatInterface(fn=chat_interface, title="Ministral 3B Chat")
demo.launch()