Spaces:

Staticaliza
/

Zero-5

Running

App Files Files Community

Zero-5 / app.py

Staticaliza

Update app.py

0cb24f7 verified 8 months ago

raw

history blame

3.46 kB

	# Imports
	import gradio as gr
	import spaces
	import os
	import random
	import threading

	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	# Configuration
	# Hugging Face access token taken from the environment; None when unset.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Hub repository and the GGUF file inside it that gets loaded.
	REPO = "Novaciano/Llama-3.2_1b_Uncensored_RP_Aesir_GGUF"
	FILE = "Llama-3.2_1b_Uncensored_RP_Aesir.gguf"

	# Seconds after which generate() stops a running generation.
	TIMEOUT = 60

	# Upper bound (inclusive) for randomly drawn seeds; equals 2**53 - 1.
	MAX_SEED = 9007199254740991

	# Fetch the model file from the Hub first, then hand its local path to llama.cpp.
	_model_file = hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN)

	model = Llama(
	    model_path=_model_file,
	    n_ctx=32768,
	    n_threads=4,
	    n_batch=512,
	    n_gpu_layers=0,
	    verbose=True,
	)

	def get_seed(seed):
	    """Resolve the user-supplied seed into an integer.

	    Args:
	        seed: Seed value from the UI. A string made only of decimal digits
	            (surrounding whitespace is ignored) or a non-negative ``int`` is
	            used as given. Anything else (blank, ``None``, non-numeric text)
	            requests a random seed.

	    Returns:
	        The parsed seed, or a random integer in ``[0, MAX_SEED]``.
	    """
	    # Accept real integers so programmatic callers need not stringify them.
	    # bool is excluded because it is an int subclass but never a meaningful seed.
	    if isinstance(seed, int) and not isinstance(seed, bool) and seed >= 0:
	        return seed

	    if isinstance(seed, str):
	        text = seed.strip()
	        # isdecimal() rather than isdigit(): isdigit() also accepts characters
	        # such as "²" that int() cannot parse, which would raise ValueError.
	        if text.isdecimal():
	            return int(text)

	    return random.randint(0, MAX_SEED)

	def generate(prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed):
	    """Stream a completion for ``prompt``, yielding the text accumulated so far.

	    Args:
	        prompt: Text passed to the model as the completion prompt.
	        temperature: Sampling temperature, forwarded unchanged.
	        top_p: Top-p value, forwarded unchanged.
	        top_k: Top-k value; converted to ``int`` before use.
	        repetition_penalty: Forwarded as the ``repeat_penalty`` argument.
	        max_tokens: Maximum number of tokens; converted to ``int`` before use.
	        seed: Raw seed value, resolved through ``get_seed``.

	    Yields:
	        The full text generated so far after each streamed chunk. If more than
	        ``TIMEOUT`` seconds have passed when a chunk arrives, the error message
	        is yielded instead and generation stops.
	    """
	    print("[GENERATE] Model is generating...")

	    parameters = {
	        "prompt": prompt,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": int(top_k),
	        "repeat_penalty": repetition_penalty,
	        "max_tokens": int(max_tokens),
	        "seed": get_seed(seed),
	        "stream": True,
	    }

	    print("Parameters:", parameters)

	    # The timer flips the event after TIMEOUT seconds. The flag is only polled
	    # when a chunk arrives, so a single stalled chunk is not interrupted.
	    timed_out = threading.Event()
	    timer = threading.Timer(TIMEOUT, timed_out.set)
	    timer.start()

	    try:
	        # With stream=True the result is consumed chunk by chunk below.
	        stream = model.create_completion(**parameters)
	        buffer = ""
	        for item in stream:
	            if timed_out.is_set():
	                raise TimeoutError("[ERROR] Generation timed out.")
	            buffer += item["choices"][0]["text"]
	            print(item)
	            yield buffer
	        # Logged only once the stream has actually been drained.
	        print("[GENERATE] Model has generated.")
	    except TimeoutError as e:
	        yield str(e)
	    finally:
	        # Runs on success, timeout, error, and when the consumer closes the
	        # generator early, so the timer thread never outlives the request.
	        timer.cancel()

	# NOTE(review): the GPU decorator is left disabled; presumably intentional
	# because the model is loaded with n_gpu_layers=0. Confirm before re-enabling.
	# @spaces.GPU(duration=15)
	def gpu():
	    """Placeholder that performs no work and returns ``None``."""
	    return None

	# Initialize
	# NOTE(review): these look like placeholders. Each is used as both the link
	# text and the link target in the header below; confirm the intended URLs.
	model_base = "Any"
	model_quant = "Any Quant"

	with gr.Blocks() as demo:
	    # Header lines, rendered as separate Markdown components in this order.
	    for _text in (
	        "# 👁️‍🗨️ LM",
	        "• ⚡ A text generation inference for any quant models.",
	        "• ⚠️ WARNING! The inference is very slow due to the model being HUGE; it takes about 10 seconds before it starts generating. Please avoid high max token parameters and sending large amounts of text. Note it uses CPU because running it on GPU overloads the model.",
	        f"• 🔗 Link to models: [{model_base}]({model_base}) (BASE), [{model_quant}]({model_quant}) (QUANT)",
	    ):
	        gr.Markdown(_text)

	    prompt = gr.Textbox(label="Enter your prompt", lines=4)
	    output = gr.Textbox(label="Model output", lines=10)

	    # Sampling controls, collapsed by default.
	    with gr.Accordion("⚙️ Configurations", open=False):
	        temperature = gr.Slider(label="🌡️ Temperature", minimum=0.0, maximum=2.0, step=0.01, value=1.0)
	        top_p = gr.Slider(label="🧲 Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.95)
	        top_k = gr.Slider(label="📊 Top K", minimum=1, maximum=2048, step=1, value=50)
	        repetition_penalty = gr.Slider(label="📚 Repetition Penalty", minimum=0.0, maximum=2.0, step=0.01, value=1.2)
	        max_tokens = gr.Slider(label="⏳ Max New Tokens", minimum=1, maximum=2048, step=1, value=256)
	        seed = gr.Textbox(label="🌱 Seed (Blank for random)", value="", lines=1)

	    generate_button = gr.Button("Generate")

	    # The list order matches generate()'s parameter order.
	    _generation_inputs = [prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed]
	    generate_button.click(
	        fn=generate,
	        inputs=_generation_inputs,
	        outputs=output,
	    )

	demo.launch()