Spaces:

ivpich
/

t-lite

Runtime error

App Files Files Community

ivpich committed on Aug 2, 2024

Commit

23dcf34

verified ·

1 Parent(s): 5426f12

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -22

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ PLACEHOLDER = """
 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -30,7 +29,7 @@ h3 {
 }
 """
-device = "cuda" # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
@@ -39,53 +38,56 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
     ignore_mismatched_sizes=True)
 @spaces.GPU()
 def stream_chat(
-    message: str,
-    history: list,
-    temperature: float = 0.3,
-    max_new_tokens: int = 1024,
-    top_p: float = 1.0,
-    top_k: int = 20,
     penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
-    conversation = []
     for prompt, answer in history:
         conversation.extend([
-            {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
-    input_text=tokenizer.apply_chat_template(conversation, tokenize=False)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids=inputs,
-        max_new_tokens = max_new_tokens,
-        do_sample = False if temperature == 0 else True,
-        top_p = top_p,
-        top_k = top_k,
-        temperature = temperature,
         streamer=streamer,
-        pad_token_id = 10,
     )
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         yield buffer
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
@@ -97,6 +99,12 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             gr.Slider(
                 minimum=0,
                 maximum=1,
@@ -149,4 +157,4 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
 if __name__ == "__main__":
-    demo.launch()

 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
 }
 """
+device = "cuda"  # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
     ignore_mismatched_sizes=True)
 @spaces.GPU()
 def stream_chat(
+    message: str,
+    history: list,
+    system_prompt: str,
+    temperature: float = 0.3,
+    max_new_tokens: int = 1024,
+    top_p: float = 1.0,
+    top_k: int = 20,
     penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
+    print(f'system_prompt: {system_prompt}')
+    conversation = [{"role": "system", "content": system_prompt}]
     for prompt, answer in history:
         conversation.extend([
+            {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
+    input_text = tokenizer.apply_chat_template(conversation, tokenize=False)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
+        input_ids=inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
         streamer=streamer,
+        pad_token_id=10,
     )
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         yield buffer
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
+            gr.Textbox(
+                lines=2,
+                placeholder="Enter system prompt here...",
+                label="System Prompt",
+                render=True,
+            ),
             gr.Slider(
                 minimum=0,
                 maximum=1,
 if __name__ == "__main__":
+    demo.launch()