import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
)
from threading import Thread
import gradio as gr

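# Create new tensors (including the tokenizer's outputs) on the GPU by default
# when one is available, so inputs land on the same device as the model.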
if torch.cuda.is_available():
    torch.set_default_device("cuda")

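# Load Phi-2 in float16 on GPU (float32 on CPU); trust_remote_code is required
# for the custom modeling code shipped with the checkpoint.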
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)


class Phi2StoppingCriteria(StoppingCriteria):
    """Stop generation once Phi-2 starts an "Exercise(s)" section or emits EOS."""

    def __init__(self):
        stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
        # Tokenize each stop string once, up front, rather than on every step.
        self.stop_token_ids = [
            tokenizer(stop, add_special_tokens=False).input_ids for stop in stop_list
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # input_ids has shape (batch, seq_len); stop as soon as the tail of the
        # generated sequence matches any of the stop sequences.
        return any(
            input_ids[0, -len(ids):].tolist() == ids for ids in self.stop_token_ids
        )


stopping_criteria = StoppingCriteriaList([Phi2StoppingCriteria()])


def generate(prompt, max_new_tokens):
    inputs = tokenizer(prompt, return_tensors="pt")
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    # The streamer wraps the tokenizer and yields decoded text as it is produced.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        stopping_criteria=stopping_criteria,
    )
    # Run generation on a background thread so tokens can be streamed to the UI
    # as they arrive, instead of blocking until the full completion is ready.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output


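# Gradio UI: a prompt box plus a max-token budget, with the text output
# streamed incrementally from the generate() generator above.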
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Number(value=100, label="max new tokens", maximum=500),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            75,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''def print_prime(n):
   """
   Print all primes between 1 and n
   """\n''',
            100,
        ],
        ["User: How does sleep affect mood?\nAI:", 125],
        ["Who was Ada Lovelace?", 100],
        ["Explain the concept of skip lists.", 125],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing language model with only 2.7B parameters.",
)


if __name__ == "__main__":
    demo.launch(show_api=False)