Create app.py

app.py
ADDED
# Imports
import gradio as gr
import os
import random
import threading

import spaces

from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Variables
HF_TOKEN = os.environ.get("HF_TOKEN")

REPO = "bartowski/Mistral-Small-Instruct-2409-GGUF"
FILE = "Mistral-Small-Instruct-2409-Q6_K_L.gguf"

TIMEOUT = 30  # Seconds allowed between streamed chunks before generation is aborted.

MAX_SEED = 9007199254740991  # 2**53 - 1, the largest integer JavaScript represents exactly.
DEFAULT_INPUT = ""
DEFAULT_SYSTEM = "Statical is a totally normal human named 'Statical'. Statical responds very briefly, truthfully, and accurately."

TAG_USER = "USER"
TAG_ASSISTANT = "STATICAL"

DEFAULT_SEPARATOR = ","
DEFAULT_STOP_SEQUENCES = f"{TAG_USER}:,{TAG_ASSISTANT}:,</s>"

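# For illustration (a sketch given the tags above), a one-turn history plus a new
# message renders into a prompt shaped like:
#   {system}\nUSER: hi\nSTATICAL: hello</s>\nUSER: {input}\nSTATICAL:
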
# Load the GGUF weights from the Hub; n_gpu_layers=0 keeps inference entirely on CPU.
model = Llama(model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN), n_ctx=32768, n_threads=48, n_batch=512, n_gpu_layers=0, verbose=True)

# Functions
def get_seed(seed):
    # Use the given seed when it parses as an integer, otherwise draw a random one.
    seed = (seed or "").strip()
    if seed.isdigit():
        return int(seed)
    else:
        return random.randint(0, MAX_SEED)

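# Usage sketch: get_seed("42") returns 42; get_seed("") or get_seed(None) returns
# a random integer in [0, MAX_SEED].
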
def generate(input=DEFAULT_INPUT, history=None, system=DEFAULT_SYSTEM, stream=False, temperature=1, top_p=0.95, top_k=50, rep_p=1.2, max_tokens=64, seed=None, separator=DEFAULT_SEPARATOR, stop_sequences=DEFAULT_STOP_SEQUENCES):
    print("[GENERATE] Model is generating...")

    # Rebuild the running transcript from the chat history, closing each
    # assistant turn with </s> to match the stop sequences above.
    memory = ""
    for item in history or []:
        if item[0]:
            memory += f"{TAG_USER}: {item[0].strip()}\n"
        if item[1]:
            memory += f"{TAG_ASSISTANT}: {item[1].strip()}</s>\n"
    prompt = f"{system.strip()}\n{memory}{TAG_USER}: {input.strip()}\n{TAG_ASSISTANT}: "

    print(prompt)

    parameters = {
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": rep_p,
        "max_tokens": max_tokens,
        "stop": [seq.strip() for seq in stop_sequences.split(separator)] if stop_sequences else [],
        "seed": get_seed(seed),
        "stream": stream
    }

    # Watchdog: a timer sets this event if no chunk arrives within TIMEOUT seconds.
    event = threading.Event()
    timer = None

    try:
        output = model.create_completion(**parameters)
        print("[GENERATE] Model has generated.")
        if stream:
            buffer = ""
            timer = threading.Timer(TIMEOUT, event.set)
            timer.start()
            for item in output:
                if event.is_set():
                    raise TimeoutError("[ERROR] Generation timed out.")
                buffer += item["choices"][0]["text"]
                yield buffer
                # Reset the watchdog after every streamed chunk.
                timer.cancel()
                timer = threading.Timer(TIMEOUT, event.set)
                timer.start()
        else:
            yield output["choices"][0]["text"]
    except TimeoutError as e:
        yield str(e)
    finally:
        # Guarded cancel: the timer only exists on the streaming path.
        if timer is not None:
            timer.cancel()

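# Usage sketch (outside Gradio, assuming the model above is loaded): consume the
# generator directly; each yield is the full text accumulated so far.
#   for partial in generate("Hello", [], DEFAULT_SYSTEM, stream=True, max_tokens=32, seed="1"):
#       print(partial)
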
@spaces.GPU(duration=15)
def gpu():
    # No-op placeholder for the Spaces GPU decorator; inference itself runs on CPU.
    return

# Initialize
theme = gr.themes.Default(
    primary_hue="violet",
    secondary_hue="indigo",
    neutral_hue="zinc",
    spacing_size="sm",
    radius_size="lg",
    font=[gr.themes.GoogleFont('Kanit'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
    font_mono=[gr.themes.GoogleFont('Kanit'), 'ui-monospace', 'Consolas', 'monospace'],
).set(background_fill_primary='*neutral_50', background_fill_secondary='*neutral_100')

model_base = "https://huggingface.co/alpindale/WizardLM-2-8x22B"
model_quant = "https://huggingface.co/MaziyarPanahi/WizardLM-2-8x22B-GGUF"

with gr.Blocks(theme=theme) as main:
    with gr.Column():
        gr.Markdown("# 👁️‍🗨️ WizardLM")
        gr.Markdown("• ⚡ A text generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
        gr.Markdown("• ⚠️ WARNING! Inference is very slow because the model is huge: it takes about 10 seconds before generation starts. Please avoid high max-token settings and very long inputs. It runs on CPU because I cannot figure out how to run it on GPU without overloading the model.")
        gr.Markdown(f"• 🔗 Links to the models: {model_base} (BASE), {model_quant} (QUANT)")

    with gr.Column():
        gr.ChatInterface(
            fn=generate,
            additional_inputs_accordion=gr.Accordion(label="⚙️ Configurations", open=False, render=False),
            additional_inputs=[
                gr.Textbox(lines=1, value=DEFAULT_SYSTEM, label="🪄 System", render=False),
                gr.Checkbox(label="⚡ Stream", value=True, render=False),
                gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="🌡️ Temperature", render=False),
                gr.Slider(minimum=0.01, maximum=0.99, step=0.01, value=0.95, label="🧲 Top P", render=False),
                gr.Slider(minimum=1, maximum=2048, step=1, value=50, label="🔝 Top K", render=False),
                gr.Slider(minimum=0.01, maximum=2, step=0.01, value=1.2, label="🔁 Repetition Penalty", render=False),
                gr.Slider(minimum=1, maximum=2048, step=1, value=256, label="⏳ Max New Tokens", render=False),
                gr.Textbox(lines=1, value="", label="🌱 Seed (Blank for random)", render=False),
                gr.Textbox(lines=1, value=DEFAULT_SEPARATOR, label="🏷️ Stop Sequences Separator", render=False),
                gr.Textbox(lines=1, value=DEFAULT_STOP_SEQUENCES, label="🛑 Stop Sequences (Blank for none)", render=False),
            ]
        )

main.launch(show_api=False)
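
# Note: if streamed responses stall under concurrent load, enabling Gradio's queue is
# the usual remedy (a sketch, not part of the original app): main.queue().launch(show_api=False)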