Spaces:

halme
/

ID2223-lab2

Sleeping

File size: 2,938 Bytes

1f45017
 
acbac18
bcb8b37
 
 
1f45017
 
 
 
50cd3df
de8a86b
1f45017
 
de8a86b
1f45017
 
 
 
 
 
 
 
 
 
bcb8b37
1f45017
de8a86b
1f45017
 
 
de8a86b
 
bcb8b37
de8a86b
 
 
 
bcb8b37
 
 
 
de8a86b
bcb8b37
de8a86b
acbac18
de8a86b
 
 
 
 
 
 
 
 
 
f7ea956
de8a86b
 
 
f7ea956
de8a86b
 
1f45017

import functools
import threading

import gradio as gr
from huggingface_hub import InferenceClient
#from unsloth import FastLanguageModel
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer


"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
#client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
#client = InferenceClient("halme/id2223_lora_model")


_MODEL_ID = "halme/id2223_lora_model"  # fine-tuned LoRA adapter repo


@functools.lru_cache(maxsize=1)
def _load_model_and_tokenizer():
    """Load the PEFT model and its tokenizer once and reuse them afterwards.

    Loading is expensive, so the result is cached for the lifetime of the
    process instead of being repeated for every chat message.
    """
    model = AutoPeftModelForCausalLM.from_pretrained(_MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
    return model, tokenizer


def _build_messages(message, history, system_message):
    """Convert the system prompt, past turns and new message to chat messages."""
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        # Either side of a turn may be empty/None; skip those entries.
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p,):
    """Stream the model's reply to ``message`` as progressively longer text.

    Args:
        message: The new user message.
        history: Previous ``(user, assistant)`` turns of the conversation.
        system_message: System prompt placed at the start of the conversation.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Yields:
        The reply accumulated so far, as a string, each time the streamer
        delivers a new piece of decoded text.
    """
    # Local import, mirroring how the streamer was imported here originally.
    from transformers import TextIteratorStreamer

    messages = _build_messages(message, history, system_message)
    model, tokenizer = _load_model_and_tokenizer()

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to(model.device)

    # TextIteratorStreamer exposes decoded text as an iterator (TextStreamer
    # only prints to stdout), so the chunks can be forwarded to the caller.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "input_ids": inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_tokens),
        "do_sample": True,  # temperature / top_p only apply when sampling
        "temperature": float(temperature),
        "top_p": float(top_p),
        "use_cache": True,
    }

    # generate() blocks until completion, so it runs on a worker thread while
    # this generator consumes the streamer.
    worker = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    worker.join()


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Extra controls shown alongside the chat box. They are listed in the same
# order as the trailing parameters of `respond`:
# system_message, max_tokens, temperature, top_p.
_system_message_box = gr.Textbox(
    label="System message",
    value="You are a friendly Chatbot.",
)
_max_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=2048,
    step=1,
    value=512,
)
_temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.7,
)
_top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.95,
)

# Chat UI wired to `respond`.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()