# Imports
import gradio as gr
import spaces
import os
import random
import threading
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Variables
HF_TOKEN = os.environ.get("HF_TOKEN")
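# HF_TOKEN is read from the Space secrets; hf_hub_download only needs it if the
# model repo is gated or private.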
REPO = "Novaciano/Llama-3.2_1b_Uncensored_RP_Aesir_GGUF"
FILE = "Llama-3.2_1b_Uncensored_RP_Aesir.gguf"
TIMEOUT = 60
MAX_SEED = 9007199254740991
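# Download the GGUF from the Hub (cached after the first run) and load it with
# llama.cpp: n_ctx=32768 reserves a 32K-token context window, and n_gpu_layers=0
# keeps every layer on the CPU, matching the warning in the UI below.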
model = Llama(
    model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN),
    n_ctx=32768,
    n_threads=4,
    n_batch=512,
    n_gpu_layers=0,
    verbose=True
)
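# Use the seed as-is when the textbox holds a plain integer; otherwise draw a
# random one in [0, MAX_SEED]. MAX_SEED is 2**53 - 1, presumably chosen as the
# largest integer JavaScript represents exactly.
# e.g. get_seed("42") -> 42, get_seed("") -> random seed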
def get_seed(seed):
    if seed and seed.strip().isdigit():
        return int(seed.strip())
    else:
        return random.randint(0, MAX_SEED)
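# Stream completion chunks while a watchdog enforces a wall-clock limit:
# threading.Timer sets `event` after TIMEOUT seconds, the loop checks the flag
# between chunks, and the growing buffer is yielded so Gradio can update the
# output box incrementally.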
def generate(prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed):
    print("[GENERATE] Model is generating...")
    parameters = {
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": int(top_k),
        "repeat_penalty": repetition_penalty,
        "max_tokens": int(max_tokens),
        "seed": get_seed(seed),
        "stream": True
    }
    print("Parameters:", parameters)
    event = threading.Event()
    timer = threading.Timer(TIMEOUT, event.set)
    timer.start()
    try:
        output = model.create_completion(**parameters)
        print("[GENERATE] Model has generated.")
        buffer = ""
        for item in output:
            if event.is_set():
                raise TimeoutError("[ERROR] Generation timed out.")
            buffer += item["choices"][0]["text"]
            print(item)
            yield buffer
    except TimeoutError as e:
        yield str(e)
    finally:
        timer.cancel()
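# No-op stub for ZeroGPU: re-enabling the @spaces.GPU decorator below would
# request a GPU slot for 15 seconds per call, but it stays disabled while
# inference is CPU-only.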
# @spaces.GPU(duration=15)
def gpu():
    return
# Initialize
model_base = "Any"
model_quant = "Any Quant"
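# UI: prompt and streaming output boxes plus the sampling controls, wired to
# generate(). model_base and model_quant above are placeholder link targets.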
with gr.Blocks() as demo:
    gr.Markdown("# 👁️‍🗨️ LM")
    gr.Markdown("• ⚡ A text generation inference for any quantized model.")
    gr.Markdown("• ⚠️ **WARNING!** Inference is very slow because the model is **HUGE**; it takes about 10 seconds before generation starts. Please avoid high max-token settings and sending large amounts of text. Note it runs on CPU, because running it on GPU overloads the model.")
    gr.Markdown(f"• 🔗 Link to models: [{model_base}]({model_base}) (BASE), [{model_quant}]({model_quant}) (QUANT)")
    prompt = gr.Textbox(lines=4, label="Enter your prompt")
    output = gr.Textbox(lines=10, label="Model output")
    with gr.Accordion("⚙️ Configurations", open=False):
        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.01, label="🌡️ Temperature")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="🧲 Top P")
        top_k = gr.Slider(minimum=1, maximum=2048, value=50, step=1, label="🔝 Top K")
        repetition_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.2, step=0.01, label="🔁 Repetition Penalty")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="⏳ Max New Tokens")
        seed = gr.Textbox(lines=1, label="🌱 Seed (Blank for random)", value="")
    generate_button = gr.Button("Generate")
    generate_button.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed],
        outputs=output,
    )
demo.launch()