Spaces:

MudassirFayaz
/

testing

Runtime error

App Files Files Community

MudassirFayaz committed on Jun 12, 2024

Commit

4431147

verified ·

1 Parent(s): 997e24d

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -4

app.py CHANGED Viewed

@@ -1,19 +1,188 @@
-from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 base_model = AutoModelForCausalLM.from_pretrained(
     'meta-llama/Llama-2-7b-chat-hf',
     trust_remote_code=True,
     device_map="auto",
-    torch_dtype=torch.float16,   # optional if you have enough VRAM
 )
 tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
 model = PeftModel.from_pretrained(base_model, 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora')
 model = model.eval()
-if __name__ == "__app__":
     demo.queue(max_size=20).launch()

+import os
+from threading import Thread
+from typing import Iterator, List, Tuple
+import torch
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
+import gradio as gr
+from gradio import Blocks
+from transformers import TextIteratorStreamer
+# Load the base Llama-2 chat model and its tokenizer once, at import time.
 base_model = AutoModelForCausalLM.from_pretrained(
     'meta-llama/Llama-2-7b-chat-hf',
     trust_remote_code=True,
     device_map="auto",
+    torch_dtype=torch.float16,  # request float16 weights
 )
 tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
+# Wrap the base model with the FinGPT forecaster adapter (the repo name marks
+# it as a LoRA fine-tune) and switch the result to eval mode.
 model = PeftModel.from_pretrained(base_model, 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora')
 model = model.eval()
+# Generation limits:
+#   MAX_MAX_NEW_TOKENS      - upper bound of the "Max new tokens" slider.
+#   DEFAULT_MAX_NEW_TOKENS  - initial value of that slider.
+#   MAX_INPUT_TOKEN_LENGTH  - prompts longer than this many tokens are cut
+#                             down to their last tokens before generation;
+#                             overridable via the environment variable of the
+#                             same name (default 4096).
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# FastAPI application that carries the POST /chat/ route declared below.
+app = FastAPI()
+class ChatRequest(BaseModel):
+    message: str
+    chat_history: List[Tuple[str, str]] = []
+    system_prompt: str = ""
+    max_new_tokens: int = 1024
+    temperature: float = 0.6
+    top_p: float = 0.9
+    top_k: int = 50
+    repetition_penalty: float = 1.2
+@app.post("/chat/")
+async def chat(request: ChatRequest):
+    try:
+        response = await generate_response(
+            request.message,
+            request.chat_history,
+            request.system_prompt,
+            request.max_new_tokens,
+            request.temperature,
+            request.top_p,
+            request.top_k,
+            request.repetition_penalty
+        )
+        return {"response": response}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+async def generate_response(
+    message: str,
+    chat_history: List[Tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> str:
+    conversation = []
+    if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
+    for user, assistant in chat_history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+    input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "top_p": top_p,
+        "top_k": top_k,
+        "temperature": temperature,
+        "num_beams": 1,
+        "repetition_penalty": repetition_penalty,
+    }
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+    return "".join(outputs)
+# Gradio setup
+def generate(
+    message: str,
+    chat_history: List[Tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    return generate_response(
+        message,
+        chat_history,
+        system_prompt,
+        max_new_tokens,
+        temperature,
+        top_p,
+        top_k,
+        repetition_penalty
+    )
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(label="System prompt", lines=6),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+    ],
+)
+# Page layout: a title and short description, the chat interface built
+# above, and a license footer. Styling is read from style.css.
+# NOTE(review): in the code visible here the FastAPI `app` and its /chat/
+# route are never mounted or started; only the Gradio server is launched
+# below. Confirm how the HTTP API is meant to be served.
+with Blocks(css="style.css") as demo:
+    gr.Markdown("# Llama-2 7B Chat")
+    gr.Markdown("""
+    This Space demonstrates the Llama-2 7B Chat model by Meta, fine-tuned for chat instructions.
+    Feel free to chat with the model here or use the API to integrate it into your applications.
+    """)
+    chat_interface.render()
+    gr.Markdown("---")
+    gr.Markdown("This demo is governed by the original [license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt).")
+# Start the Gradio server only when run as a script; the request queue is
+# created with max_size=20.
+if __name__ == "__main__":
     demo.queue(max_size=20).launch()