Staticaliza committed
Commit 278edce · verified · 1 Parent(s): ca6ece3

Update app.py

Files changed (1)
  1. app.py +61 -99
app.py CHANGED
@@ -1,13 +1,7 @@
-# Imports
 import gradio as gr
 import os
 import random
-import time
-import spaces
-import torch
 import threading
-import socket
-import requests
 
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
@@ -15,124 +9,92 @@ from huggingface_hub import hf_hub_download
 # Variables
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-REPO = "bartowski/Mistral-Small-Instruct-2409-GGUF"
-FILE = "Mistral-Small-Instruct-2409-Q6_K_L.gguf"
+REPO = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
+FILE = "smollm2-1.7b-instruct-q4_k_m.gguf"
 
 TIMEOUT = 30
 
 MAX_SEED = 9007199254740991
-DEFAULT_INPUT = ""
-DEFAULT_SYSTEM = "Statical is a totally normal human named 'Statical'. Statical responds very briefly, truthfully, and accurately."
 
-TAG_USER = "USER"
-TAG_ASSISTANT = "STATICAL"
-
-DEFAULT_SEPARATOR = ","
-DEFAULT_STOP_SEQUENCES = f"{TAG_USER}:,{TAG_ASSISTANT}:,</s>"
-
-model = Llama(model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN), n_ctx=32768, n_threads=48, n_batch=512, n_gpu_layers=0, verbose=True)
-
-# Functions
+model = Llama(
+    model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN),
+    n_ctx=32768,
+    n_threads=48,
+    n_batch=512,
+    n_gpu_layers=0,
+    verbose=True
+)
+
 def get_seed(seed):
-    seed = seed.strip()
-    if seed.isdigit():
-        return int(seed)
+    if seed and seed.strip().isdigit():
+        return int(seed.strip())
     else:
         return random.randint(0, MAX_SEED)
 
-def generate(input=DEFAULT_INPUT, history=[], system=DEFAULT_SYSTEM, stream=False, temperature=1, top_p=0.95, top_k=50, rep_p=1.2, max_tokens=64, seed=None, separator=DEFAULT_SEPARATOR, stop_sequences=DEFAULT_STOP_SEQUENCES):
+def generate(prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed):
     print("[GENERATE] Model is generating...")
-
-    memory = ""
-    for item in history:
-        if item[0]:
-            memory += f"{TAG_USER}: {item[0].strip()}\n"
-        if item[1]:
-            memory += f"{TAG_ASSISTANT}: {item[1].strip()}</s>\n"
-    prompt = f"{system.strip()}\n{memory}{TAG_USER}: {input.strip()}\n{TAG_ASSISTANT}: "
-
-    print(prompt)
-
+
     parameters = {
         "prompt": prompt,
         "temperature": temperature,
         "top_p": top_p,
-        "top_k": top_k,
-        "repeat_penalty": rep_p,
-        "max_tokens": max_tokens,
-        "stop": [seq.strip() for seq in stop_sequences.split(separator)] if stop_sequences else [],
+        "top_k": int(top_k),
+        "repeat_penalty": repetition_penalty,
+        "max_tokens": int(max_tokens),
        "seed": get_seed(seed),
-        "stream": stream
+        "stream": True
     }
 
+    print("Parameters:", parameters)
+
     event = threading.Event()
+    timer = threading.Timer(TIMEOUT, event.set)
+    timer.start()
 
     try:
         output = model.create_completion(**parameters)
         print("[GENERATE] Model has generated.")
-        if stream:
-            buffer = ""
-            timer = threading.Timer(TIMEOUT, event.set)
-            timer.start()
-            try:
-                for _, item in enumerate(output):
-                    if event.is_set():
-                        raise TimeoutError("[ERROR] Generation timed out.")
-                    buffer += item["choices"][0]["text"]
-                    yield buffer
-                    timer.cancel()
-                    timer = threading.Timer(TIMEOUT, event.set)
-                    timer.start()
-            finally:
-                timer.cancel()
-        else:
-            yield output["choices"][0]["text"]
+        buffer = ""
+        try:
+            for _, item in enumerate(output):
+                if event.is_set():
+                    raise TimeoutError("[ERROR] Generation timed out.")
+                buffer += item["choices"][0]["text"]
+                yield buffer
+        finally:
+            timer.cancel()
     except TimeoutError as e:
         yield str(e)
     finally:
         timer.cancel()
 
-@spaces.GPU(duration=15)
-def gpu():
-    return
-
 # Initialize
-theme = gr.themes.Default(
-    primary_hue="violet",
-    secondary_hue="indigo",
-    neutral_hue="zinc",
-    spacing_size="sm",
-    radius_size="lg",
-    font=[gr.themes.GoogleFont('Kanit'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
-    font_mono=[gr.themes.GoogleFont('Kanit'), 'ui-monospace', 'Consolas', 'monospace'],
-).set(background_fill_primary='*neutral_50', background_fill_secondary='*neutral_100')
-
-model_base = "https://huggingface.co/MaziyarPanahi/WizardLM-2-8x22B-GGUF" # [::-1]
-model_quant = "https://huggingface.co/alpindale/WizardLM-2-8x22B" # [::-1]
-
-with gr.Blocks(theme=theme) as main:
-    with gr.Column():
-        gr.Markdown("# 👁️‍🗨️ WizardLM")
-        gr.Markdown("⠀⠀• ⚡ A text generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
-        gr.Markdown("⠀⠀• ⚠️ WARNING! The inference is very slow due to the model being HUGE; it takes 10 seconds before it starts generating; please avoid high max token parameters and sending large amounts of text; note it uses CPU because I cannot figure out how to run it in GPU without overloading the model.")
-        gr.Markdown(f"⠀⠀• 🔗 Link to models: {model_base} (BASE), {model_quant} (QUANT)")
-
-    with gr.Column():
-        gr.ChatInterface(
-            fn=generate,
-            additional_inputs_accordion=gr.Accordion(label="⚙️ Configurations", open=False, render=False),
-            additional_inputs=[
-                gr.Textbox(lines=1, value=DEFAULT_SYSTEM, label="🪄 System", render=False),
-                gr.Checkbox(label="⚡ Stream", value=True, render=False),
-                gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="🌡️ Temperature", render=False),
-                gr.Slider(minimum=0.01, maximum=0.99, step=0.01, value=0.95, label="🧲 Top P", render=False),
-                gr.Slider(minimum=1, maximum=2048, step=1, value=50, label="📊 Top K", render=False),
-                gr.Slider(minimum=0.01, maximum=2, step=0.01, value=1.2, label="📚 Repetition Penalty", render=False),
-                gr.Slider(minimum=1, maximum=2048, step=1, value=256, label="⏳ Max New Tokens", render=False),
-                gr.Textbox(lines=1, value="", label="🌱 Seed (Blank for random)", render=False),
-                gr.Textbox(lines=1, value=DEFAULT_SEPARATOR, label="🏷️ Stop Sequences Separator", render=False),
-                gr.Textbox(lines=1, value=DEFAULT_STOP_SEQUENCES, label="🛑 Stop Sequences (Blank for none)", render=False),
-            ]
-        )
-
-main.launch(show_api=False)
+model_base = "Any"
+model_quant = "Any Quant"
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 👁️‍🗨️ WizardLM")
+    gr.Markdown("• ⚡ A text generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
+    gr.Markdown("• ⚠️ **WARNING!** The inference is very slow due to the model being **HUGE**; it takes about 10 seconds before it starts generating. Please avoid high max token parameters and sending large amounts of text. Note it uses CPU because running it on GPU overloads the model.")
+    gr.Markdown(f"• 🔗 Link to models: [{model_base}]({model_base}) (BASE), [{model_quant}]({model_quant}) (QUANT)")
+
+    prompt = gr.Textbox(lines=4, label="Enter your prompt")
+    output = gr.Textbox(lines=10, label="Model output")
+
+    with gr.Accordion("⚙️ Configurations", open=False):
+        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.01, label="🌡️ Temperature")
+        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="🧲 Top P")
+        top_k = gr.Slider(minimum=1, maximum=2048, value=50, step=1, label="📊 Top K")
+        repetition_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.2, step=0.01, label="📚 Repetition Penalty")
+        max_tokens = gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="⏳ Max New Tokens")
+        seed = gr.Textbox(lines=1, label="🌱 Seed (Blank for random)", value="")
+
+    generate_button = gr.Button("Generate")
+
+    generate_button.click(
+        fn=generate,
+        inputs=[prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed],
+        outputs=output,
+    )
+
+demo.launch()
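
The heart of the rewritten generate() is a watchdog: one threading.Timer flips an Event after TIMEOUT seconds, and the streaming loop checks that Event on every chunk, so a stalled completion yields a timeout message instead of hanging the request. Below is a minimal, self-contained sketch of that pattern; slow_tokens() is a hypothetical stand-in for model.create_completion(stream=True), which yields chunks shaped like {"choices": [{"text": ...}]}.

import threading
import time

TIMEOUT = 30  # seconds allowed for the whole stream, matching the committed constant

def slow_tokens():
    # Hypothetical stand-in for a streaming llama.cpp completion.
    for text in ["Hello", ",", " world", "!"]:
        time.sleep(0.1)
        yield {"choices": [{"text": text}]}

def stream_with_timeout():
    event = threading.Event()
    timer = threading.Timer(TIMEOUT, event.set)  # fires once, flips the event
    timer.start()
    buffer = ""
    try:
        for item in slow_tokens():
            if event.is_set():  # the timer fired while we waited on the model
                raise TimeoutError("[ERROR] Generation timed out.")
            buffer += item["choices"][0]["text"]
            yield buffer  # emit the accumulated text so far
    except TimeoutError as e:
        yield str(e)
    finally:
        timer.cancel()  # cancelling an already-fired timer is a harmless no-op

for partial in stream_with_timeout():
    print(partial)

Because generate() is a generator, passing it to generate_button.click(...) makes Gradio re-render the output Textbox on every yield, which is what turns the accumulated buffer into a live-updating stream. Note the single timer bounds total generation time, not the gap between chunks; the pre-rewrite code instead re-armed the timer after each chunk to get a per-chunk timeout.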