# Spaces: Sleeping
import contextlib
import os

import gradio as gr
import requests
from llama_cpp import Llama
# Where the GGUF weights are fetched from. The built-in default is a signed
# link (it carries X-Amz-Expires=3600 and a fixed Expires timestamp), so it is
# presumably rejected once that window has passed. Set the MODEL_URL
# environment variable to supply a current link without editing this file.
# NOTE(review): the original listing annotated this literal as
# "truncated for clarity" - confirm it is the complete URL.
MODEL_URL = os.environ.get(
    "MODEL_URL",
    "https://cas-bridge.xethub.hf.co/xet-bridge-us/680f85f7f3cf2673404f897f/d6cd862a0b513dd35067121f0413e369b37aa9cacde526e31fc016c68ce0d305?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250503%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250503T191219Z&X-Amz-Expires=3600&X-Amz-Signature=2721d6c78a7db9b4d9b0ed1452ce3017d82b3392a190ae72b3755d873bc6c3df&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=67e6d6e1648b9f55980fa892&response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27Qwen3-4B-128K-Q4_K_M.gguf%3B+filename%3D%22Qwen3-4B-128K-Q4_K_M.gguf%22%3B&x-id=GetObject&Expires=1746303139&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NjMwMzEzOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82ODBmODVmN2YzY2YyNjczNDA0Zjg5N2YvZDZjZDg2MmEwYjUxM2RkMzUwNjcxMjFmMDQxM2UzNjliMzdhYTljYWNkZTUyNmUzMWZjMDE2YzY4Y2UwZDMwNSoifV19&Signature=lxo1PdbqSDERIivvWyQ88rq6oOx0DF5aefiy1LClNYXtBexSV476eNjAEl0wwE1nf7rW%7EcAIctjqzl%7ElVTEFSCosTEACuRrgqtGjyP94xL0xBFMDv1lqJ6E5UFokq0FTRNNd84Xt3GthKDMYanseX9kGpHBNpCwvlx-BRrNF6cbd2XWcCcmetP-NUrHtmoEHY89LVAFb72EXx7edlsOIMOWGzOYcgHI-IfuM2U4m%7Epquxo429CjrEbUi9xpQIQFVBvWHThIJN2LT5NqXpUpPyXswEJyuQDZ0sDTgBEh5gdHtwAWf0cF0i285VgitHIty8eYLEeYNu7J%7EqEDBu7RrYQ__&Key-Pair-Id=K2L8F4GPSG1IFC",
)
# Local file name the weights are stored under (relative to the working directory).
MODEL_PATH = "Qwen3-4B-128K-Q4_K_M.gguf"
# Download the model on first start; later starts reuse the file on disk.
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    # Stream into a sibling ".part" file and rename it only after the whole
    # transfer succeeded. Writing straight to MODEL_PATH would let an
    # interrupted download leave a truncated file that the existence check
    # above accepts on the next start.
    partial_path = MODEL_PATH + ".part"
    try:
        # (connect, read) timeouts so a stalled connection cannot block
        # start-up indefinitely.
        with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                # 1 MiB chunks: far fewer write calls than 8 KiB for a
                # multi-gigabyte file.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Atomic rename: MODEL_PATH only ever names a complete file.
        os.replace(partial_path, MODEL_PATH)
    finally:
        # On failure, drop the partial file; after a successful rename it no
        # longer exists and this is a no-op.
        with contextlib.suppress(FileNotFoundError):
            os.remove(partial_path)
    print("Model downloaded.")
# Instantiate the llama.cpp model once at import time; the chat handler
# reuses this single instance for every request.
_LLAMA_SETTINGS = {
    "model_path": MODEL_PATH,
    "n_ctx": 8192,
    "n_threads": 4,
    "n_gpu_layers": 20,  # Adjust for HF GPU environment
    "chat_format": "chatml",
}
llm = Llama(**_LLAMA_SETTINGS)
def chat_interface(message, history):
    """Produce the assistant's reply to ``message`` given the earlier turns.

    Args:
        message: The newest user message.
        history: Earlier turns of the conversation, or ``None`` for a fresh
            chat. Each item may be a ``(user_msg, bot_msg)`` pair or a
            ``{"role": ..., "content": ...}`` dict; both are converted to the
            role/content messages passed to ``llm.create_chat_completion``.

    Returns:
        The reply text only. ``gr.ChatInterface`` uses the return value as the
        bot message and maintains the history itself, so returning a
        ``(reply, history)`` tuple would make the tuple the message.
    """
    chat_prompt = []
    # ``history`` belongs to the caller: read it, never mutate it.
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role is not None and content is not None:
                chat_prompt.append({"role": role, "content": content})
            continue
        user_msg, bot_msg = turn
        # Either half of a pair can be None (e.g. a turn with no reply yet);
        # skip those instead of sending a message with no content.
        if user_msg is not None:
            chat_prompt.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})

    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    return response["choices"][0]["message"]["content"]
# Wrap chat_interface in Gradio's chat UI and start serving it.
demo = gr.ChatInterface(fn=chat_interface, title="Qwen3-4B Chat")
demo.launch()