Spaces:

asdc
/

temporal_expression_normalization

Runtime error

File size: 2,458 Bytes

6ed0fd3
8b99594
 
d86d51d
 
d5bcccc
6ed0fd3
 
 
 
678e600
d86d51d
6ef7550
52e27c2
 
2589510
1719f05
d86d51d
 
 
c588f84
d5bcccc
d86d51d
 
8b99594
 
 
 
 
 
 
 
d86d51d
 
 
 
 
 
 
6ed0fd3
 
 
 
 
 
 
 
 
 
d86d51d
 
 
 
 
 
 
6ed0fd3
d86d51d
 
 
6ed0fd3
 
d86d51d
 
 
 
 
6ed0fd3

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

from peft import PeftModel
import torch
import os 

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

# Model setup: read the access token, then load the tokenizer, the base model
# (4-bit quantized), the PEFT adapter on top of it, and finally wrap everything
# in a text-generation pipeline used by the chat handler.

# Token read from the environment; it is forwarded to every Hub download below.
API_KEY = os.environ.get("llama_ACCESS_TOKEN")
BASE_MODEL = "meta-llama/Meta-Llama-3-8B"
PEFT_ADAPTER = "asdc/Llama-3-8B-multilingual-temporal-expression-normalization"

# 4-bit NF4 quantization settings. These must be handed to the call that loads
# the *base* weights: quantization happens while those weights are loaded, so
# passing the config to the adapter loader (as was done before) had no effect
# and the base model stayed in full fp16.
# NOTE(review): 4-bit loading relies on the `bitsandbytes` package being
# available at runtime -- confirm it is listed in the Space requirements.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=API_KEY)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=nf4_config,
    torch_dtype=torch.float16,
    device_map="auto",
    token=API_KEY,
)

# Attach the fine-tuned adapter weights to the already-quantized base model.
model = PeftModel.from_pretrained(base_model, PEFT_ADAPTER, token=API_KEY)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Produce the assistant's reply to ``message`` for the chat UI.

    The prompt is a plain-text transcript: the system message on the first
    line, then one ``User:`` / ``Assistant:`` line per non-empty entry of
    ``history``, and finally the new user message followed by an open
    ``Assistant:`` cue for the model to complete.

    Args:
        message: The latest user message.
        history: Earlier ``(user, assistant)`` turns; falsy entries are skipped.
        system_message: Text placed at the very top of the prompt.
        max_tokens: Forwarded to the pipeline as ``max_new_tokens``.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling threshold forwarded to the pipeline.

    Yields:
        A single string: the generated text with the prompt prefix removed.
    """
    transcript = [system_message + "\n"]
    for user_turn, assistant_turn in history:
        if user_turn:
            transcript.append(f"User: {user_turn}\n")
        if assistant_turn:
            transcript.append(f"Assistant: {assistant_turn}\n")
    transcript.append(f"User: {message}\nAssistant:")
    prompt = "".join(transcript)

    sampling_options = {
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        # Reuse the EOS token id as the padding id for generation.
        "pad_token_id": tokenizer.eos_token_id,
    }
    completions = pipe(prompt, **sampling_options)

    # The slice drops the first len(prompt) characters, i.e. it assumes the
    # returned text begins with the prompt itself.
    full_text = completions[0]["generated_text"]
    yield full_text[len(prompt):]


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Extra controls shown alongside the chat box. They are created in the same
# order in which `respond` receives them after (message, history):
# system message, max new tokens, temperature, top-p.
_system_message_box = gr.Textbox(
    value="You are a friendly Chatbot.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat UI that routes each user message (plus the extra inputs) to `respond`.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()