import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Prefer the GPU when one is available; tensors created afterwards
# (including the tokenizer's output) default to the CUDA device.
if torch.cuda.is_available():
    torch.set_default_device("cuda")

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)


def generate(prompt, length):
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    # max_length counts the prompt tokens too, so clamp it to at least the
    # prompt length. gr.Number yields a float, hence the int() cast.
    input_token_len = inputs["input_ids"].shape[-1]
    outputs = model.generate(**inputs, max_length=max(int(length), input_token_len))
    return tokenizer.batch_decode(outputs)[0]


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Number(value=100, label="max length", maximum=500),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            75,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''def print_prime(n):
   """
   Print all primes between 1 and n
   """\n''',
            100,
        ],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing model with only 2.7B parameters.",
)

if __name__ == "__main__":
    demo.launch(show_api=False)
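
# A minimal smoke test, assuming the weights downloaded and fit in memory:
# generate() can be called directly, bypassing the Gradio UI (the prompt
# and length below are arbitrary illustrative values):
#
#   print(generate("Write a haiku about the sea.", 64))
#
# When run as a script (e.g. `python app.py`), Gradio serves the interface
# locally, at http://127.0.0.1:7860 by default.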