import os

# Install the BitNet-enabled transformers fork *before* importing transformers,
# so this process actually imports the patched version.
os.system("pip install git+https://github.com/shumingma/transformers.git")

import threading

import torch
import torch._dynamo
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
import spaces

# Fall back to eager execution instead of raising on torch.compile errors.
torch._dynamo.config.suppress_errors = True

model_id = "microsoft/bitnet-b1.58-2B-4T"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load in float32 with all weights pinned to CPU via device_map;
# no further .to() call is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    device_map={"": "cpu"},
)

@spaces.GPU(duration=15)
def gpu():
    # Stub kept for Spaces' ZeroGPU scheduler; actual inference runs on CPU.
    print("[GPU] | GPU maintained.")

def respond_simple(message: str, max_tokens: int, temperature: float, top_p: float):
    inputs = tokenizer(message, return_tensors="pt").to("cpu")
    # The streamer yields decoded text chunks while generation runs in a
    # background thread.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = threading.Thread(target=model.generate, kwargs={
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_tokens),  # sliders may deliver floats
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True
    })
    thread.start()
    # Yield the growing completion so Gradio can update the output box live.
    output = ""
    for chunk in streamer:
        output += chunk
        yield output
    thread.join()
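
# Quick sanity check outside the UI (a sketch; the prompt and settings are
# illustrative, not part of the original app). respond_simple is a generator,
# so drain it to get the final text:
#
#   final = ""
#   for partial in respond_simple("The capital of France is", 32, 0.7, 0.95):
#       final = partial
#   print(final)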

# Minimal completion UI: the sliders map one-to-one onto the sampling
# parameters of respond_simple.
with gr.Blocks() as demo:
    gr.Markdown("## bitnet-b1.58-2B-4T completion")
    tok = gr.Slider(1, 8192, value=2048, step=1, label="max new tokens")
    temp = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top-p")
    inp = gr.Textbox(label="prompt", lines=2)
    out = gr.Textbox(label="completion", lines=10)
    # Pressing Enter in the prompt box streams the completion into `out`.
    inp.submit(respond_simple, [inp, tok, temp, top_p], out)

if __name__ == "__main__":
    demo.launch()