multi chatbot
- app-old.py +78 -0
- app.py +76 -7
- local.ipynb → debug.ipynb +30 -58
- local-requirements.txt → finetuning-requirements.txt +0 -0
app-old.py
ADDED
@@ -0,0 +1,78 @@
+from llama_cpp import Llama
+import gradio as gr
+
+llm = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+    filename="unsloth.Q4_K_M.gguf",
+)
+
+llm2 = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+    filename="unsloth.F16.gguf",
+)
+
+def predict(message, history):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+        yield response
+
+
+def predict2(message, history):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+        yield response
+
+
+chat1 = gr.ChatInterface(predict, title="4-bit")
+chat2 = gr.ChatInterface(predict2, title="16-bit")
+chat3 = gr.ChatInterface(predict2, title="16-bit")
+
+def update_chat(value):
+    if value == "4-bit":
+        chat1.render(visible=True)
+        chat2.render(visible=False)
+        chat3.render(visible=False)
+    elif value == "16-bit":
+        chat1.render(visible=False)
+        chat2.render(visible=True)
+        chat3.render(visible=False)
+    else:
+        chat1.render(visible=False)
+        chat2.render(visible=False)
+        chat3.render(visible=True)
+
+with gr.Blocks() as demo:
+
+    gr.Markdown("# Quantized Llama Comparison for Code Generation")
+    dropdown = gr.Dropdown(["4-bit", "16-bit", "32-bit"], label="Choose model version", value="4-bit")
+    dropdown.change(fn=update_chat, inputs=dropdown, outputs=[chat1, chat2, chat3])
+
+demo.launch()
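Note: app-old.py drives the dropdown by calling `.render(visible=...)` on already-built `ChatInterface` objects. For comparison, the more common Gradio pattern for switching between views from a dropdown is to wrap each view in a container and return `gr.update(visible=...)` from the callback. The sketch below is illustrative only (component names are made up) and is not part of this commit:

```python
import gradio as gr

# Minimal sketch: toggle between two pre-built views by returning
# gr.update(visible=...) from the dropdown callback.
with gr.Blocks() as demo:
    choice = gr.Dropdown(["4-bit", "16-bit"], value="4-bit", label="Model version")

    with gr.Column(visible=True) as col_4bit:
        gr.Markdown("4-bit chat UI would be built here")
    with gr.Column(visible=False) as col_16bit:
        gr.Markdown("16-bit chat UI would be built here")

    def switch(value):
        # One update per output component, in the same order as `outputs`.
        return (
            gr.update(visible=(value == "4-bit")),
            gr.update(visible=(value == "16-bit")),
        )

    choice.change(fn=switch, inputs=choice, outputs=[col_4bit, col_16bit])

if __name__ == "__main__":
    demo.launch()
```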
app.py
CHANGED
@@ -1,12 +1,19 @@
-from llama_cpp import Llama
 import gradio as gr
+from llama_cpp import Llama
 
+# Load models
 llm = Llama.from_pretrained(
-
-
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+    filename="unsloth.Q4_K_M.gguf",
+)
+
+llm2 = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+    filename="unsloth.F16.gguf",
 )
 
-
+# Define prediction functions
+def predict(message, history, model):
     messages = [{"role": "system", "content": "You are a helpful assistant."}]
     for user_message, bot_message in history:
         if user_message:
@@ -25,7 +32,69 @@ def predict(message, history):
             response += part
         yield response
 
-demo = gr.ChatInterface(predict)
 
-
-
+def predict2(message, history, model):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+        yield response
+
+def predict3(message, history, model):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+        yield response
+
+
+
+# Define ChatInterfaces
+io1 = gr.ChatInterface(predict, title="4-bit")
+io2 = gr.ChatInterface(predict2, title="8-bit")  # Placeholder
+io3 = gr.ChatInterface(predict3, title="16-bit")
+io4 = gr.ChatInterface(predict2, title="32-bit")  # Placeholder
+
+# Dropdown and visibility mapping
+chat_interfaces = {"4-bit": io1, "8-bit": io2, "16-bit": io3, "32-bit": io4}
+
+# Define UI
+with gr.Blocks() as demo:
+    gr.Markdown("# Quantized Llama Comparison for Code Generation")
+
+    with gr.Tab("4-bit"):
+        io1.render()
+    with gr.Tab("8-bit"):
+        io2.render()
+    with gr.Tab("16-bit"):
+        io3.render()
+    with gr.Tab("32-bit"):
+        io4.render()
+
+
+
+demo.launch()
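Side note: the new app.py repeats the same body in `predict`, `predict2`, and `predict3`, differing only in which `Llama` instance is called (the `model` argument is currently unused). A closure-based factory is one way to collapse that duplication; the following is only a sketch of that idea, not the committed code:

```python
def make_predict(model):
    """Build a streaming ChatInterface callback bound to one Llama instance."""
    def predict(message, history):
        messages = [{"role": "system", "content": "You are a helpful assistant."}]
        for user_message, bot_message in history:
            if user_message:
                messages.append({"role": "user", "content": user_message})
            if bot_message:
                messages.append({"role": "assistant", "content": bot_message})
        messages.append({"role": "user", "content": message})

        response = ""
        for chunk in model.create_chat_completion(stream=True, messages=messages):
            part = chunk["choices"][0]["delta"].get("content", None)
            if part:
                response += part
            yield response
    return predict

# e.g. io1 = gr.ChatInterface(make_predict(llm), title="4-bit")
#      io3 = gr.ChatInterface(make_predict(llm2), title="16-bit")
```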
local.ipynb → debug.ipynb
RENAMED
@@ -2,40 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
-     ]
-    }
-   ],
-   "source": [
-    "import transformers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c94c88bacc2c48cb8ce50e93d73e15eb",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "unsloth.Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
    {
     "name": "stderr",
     "output_type": "stream",
@@ -552,6 +521,8 @@
   ],
   "source": [
    "from llama_cpp import Llama\n",
+   "import gradio as gr\n",
+   "import time\n",
    "\n",
    "llm = Llama.from_pretrained(\n",
    "\trepo_id=\"Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m\",\n",
@@ -561,44 +532,45 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 4,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import time"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "
-    "llama_perf_context_print:
-    "llama_perf_context_print:
-    "llama_perf_context_print:
+    "Llama.generate: 35 prefix-match hit, remaining 1 prompt tokens to eval\n",
+    "llama_perf_context_print: load time = 406.81 ms\n",
+    "llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
+    "llama_perf_context_print: eval time = 0.00 ms / 31 runs ( 0.00 ms per token, inf tokens per second)\n",
+    "llama_perf_context_print: total time = 953.44 ms / 32 tokens\n"
    ]
   },
   {
-   "
-
-
-
-
-   " 'model': '/home/robert/.cache/huggingface/hub/models--Robzy--Llama-3.2-1B-Instruct-Finetuned-q4_k_m/snapshots/49dc2f37761bb04ce3513b70087676029ccd4f20/./unsloth.Q4_K_M.gguf',\n",
-   " 'choices': [{'index': 0,\n",
-   " 'message': {'role': 'assistant',\n",
-   " 'content': \"The tower is a prominent landmark in the capital of France, standing tall and proud in the heart of the city. It is a grandiose structure, with a sleek and modern design that reflects the country's rich history and architectural heritage. The tower is adorned with intricate details and ornate carvings, adding to its majestic appearance.\\n\\nThe tower is a marvel of engineering, with a sturdy foundation that allows it to stand tall for centuries. Its height is impressive, with a grand staircase that winds its way up to the top of the tower. The staircase is lined with elegant railings, providing a comfortable and safe path for visitors to ascend.\\n\\nThe tower is also home to a museum, showcasing a vast collection of art and artifacts from French history. The museum is a treasure trove of knowledge, with exhibits on everything from the Renaissance to the modern era. Visitors can explore the exhibits, learning about the country's rich cultural heritage.\\n\\nThe tower is a popular destination for tourists and locals alike, offering a unique and unforgettable experience. Visitors can take a guided tour of the tower, learning about its history and significance. The tower is also a popular spot for weddings and other special events, making it a beloved landmark in the city.\\n\\nOverall, the tower is a stunning and iconic landmark that reflects the best of French culture and architecture. Its grandeur and beauty make it a must-visit destination for anyone traveling to the capital of France.\"},\n",
-   " 'logprobs': None,\n",
-   " 'finish_reason': 'stop'}],\n",
-   " 'usage': {'prompt_tokens': 45, 'completion_tokens': 288, 'total_tokens': 333}}"
-   ]
-  },
-  "execution_count": 20,
-  "metadata": {},
-  "output_type": "execute_result"
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Tokens per second: 31.380398024839145\n"
+   ]
   }
  ],
  "source": [
-  "
-  "
-  "
-  "
+  "t0 = time.time()\n",
+  "res = llm.create_chat_completion(messages = [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}])\n",
+  "t1 = time.time()\n",
+  "\n",
+  "num_response_tokens = int(res['usage']['completion_tokens'])\n",
+  "tokens_per_second = num_response_tokens / (t1 - t0)\n",
+  "print(f\"Tokens per second: {tokens_per_second}\")"
  ]
 }
],
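The new debug.ipynb cell times a blocking `create_chat_completion` call and derives tokens/second from `res['usage']['completion_tokens']`. An alternative, shown below only as a rough sketch (it counts streamed chunks as a proxy for tokens and assumes the same `llm` object from the notebook), is to time the streaming path that app.py actually uses:

```python
import time

t0 = time.time()
n_chunks = 0
for chunk in llm.create_chat_completion(
    stream=True,
    messages=[{"role": "system", "content": "You are a helpful assistant."}],
):
    # Each streamed chunk carries at most one content delta; count it as ~1 token.
    if chunk["choices"][0]["delta"].get("content"):
        n_chunks += 1

elapsed = time.time() - t0
print(f"~{n_chunks / elapsed:.1f} streamed chunks/second over {elapsed:.2f} s")
```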
local-requirements.txt → finetuning-requirements.txt
RENAMED
File without changes