Build error
Update app.py
app.py CHANGED
@@ -11,25 +11,30 @@ import spaces
 import time
 import subprocess
 
+# Install the flash-attn library; skip the CUDA build.
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
 
+# Get the Hugging Face token
 token = os.environ["HF_TOKEN"]
 
-
+# Load the microsoft/Phi-3-mini-128k-instruct model and tokenizer
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-3-mini-128k-instruct",
     token=token,
     trust_remote_code=True,
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+
+# Set the end-of-sequence token IDs
 terminators = [
     tok.eos_token_id,
 ]
 
+# Load the model on the GPU if available, otherwise on the CPU
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -38,37 +43,46 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
 
+# Run the chat function on a Spaces GPU; up to 60 seconds of GPU time per call.
 @spaces.GPU(duration=60)
 def chat(message, history, temperature, do_sample, max_tokens):
+    # Convert the chat history into the proper format
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
         if item[1] is not None:
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
+
+    # Process the input with the tokenizer
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tok([messages], return_tensors="pt").to(device)
+
+    # Stream the model output with a TextIteratorStreamer
     streamer = TextIteratorStreamer(
         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
     )
+
+    # Set the generation parameters
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
+        max_new_tokens=max_tokens,  # maximum number of new tokens to generate
+        do_sample=True,  # whether to sample
+        temperature=temperature,  # higher values increase diversity
+        eos_token_id=terminators,  # end-of-sequence token IDs
     )
 
+    # Don't sample when temperature is 0
     if temperature == 0:
        generate_kwargs["do_sample"] = False
 
+    # Start model generation on a separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
+    # Yield the generated text iteratively
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
@@ -76,11 +90,10 @@ def chat(message, history, temperature, do_sample, max_tokens):
 
         yield partial_text
 
-
+# Create the conversational interface with Gradio's ChatInterface
 demo = gr.ChatInterface(
     fn=chat,
     examples=[["Write me a poem about Machine Learning."]],
-    # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
@@ -102,4 +115,6 @@ demo = gr.ChatInterface(
     title="Chat With LLMs",
     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
-
+
+# Launch the Gradio interface
+demo.launch()
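
Note: the diff begins at line 11, so the import block above the first hunk is not shown; only "import spaces" is visible in the hunk header. A plausible reconstruction of those unshown lines, inferred purely from the names the visible code uses, would be:

# Hypothetical reconstruction of the unshown app.py imports (lines 1-10);
# every name below appears in the diffed code.
import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

import spaces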
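For reference, gr.ChatInterface in the tuple format this code assumes passes history as a list of [user_message, assistant_reply] pairs. A minimal sketch of how the chat generator streams a reply (hypothetical values; Gradio normally supplies these arguments itself):

# Hypothetical direct call to chat(); the growing partial response is printed
# on each iteration, mirroring what the UI displays while streaming.
history = [
    ["Hello!", "Hi there, how can I help?"],
]
for partial in chat(
    "Write me a poem about Machine Learning.",
    history,
    temperature=0.7,
    do_sample=True,
    max_tokens=256,
):
    print(partial)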