import streamlit as st
from llama_cpp_cuda_tensorcores import Llama
from huggingface_hub import hf_hub_download
import spaces
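# llama_cpp_cuda_tensorcores is a CUDA (tensor-core) build of llama-cpp-python;
# `spaces` supplies the @spaces.GPU decorator used on Hugging Face ZeroGPU Spaces
# to attach a GPU only for the duration of the decorated call.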
# Constants
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
SYSTEM_PROMPT = "You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability."
TOKEN_STOP = ["<|eot_id|>"]
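# Llama 3 chat-template fragments. The literal placeholders SYSTEM_PROMPT and
# USER_PROMPT inside these strings are substituted with str.replace() in
# ChatLLM.apply_chat_template below.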
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
}
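# TASK_PROMPT maps a task name to its system prompt; only "Assistant" is defined here.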
# ChatLLM handles model loading and chat response generation
class ChatLLM:
    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model
    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)
    def apply_chat_template(self, history, system_message):
        history = history or []
        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""
        return messages
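    # For example, a single-turn history produces a prompt of the form:
    #   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>\n
    #   <|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>\n
    #   <|start_header_id|>assistant<|end_header_id|>\n\n
    # The assistant header is left open so the model generates the reply;
    # generation stops at the <|eot_id|> token listed in TOKEN_STOP.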
    @spaces.GPU(duration=120)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        messages = self.apply_chat_template(history, system_message)
        history[-1][1] = ""
        if not self.llm:
            print("Loading model")
            self.load_cpp_model()
        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            history[-1][1] += answer
        return history
# Download model from Hugging Face
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)
# Model configuration
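# n_gpu_layers=-1 offloads all model layers to the GPU; 0 keeps inference on the CPU.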
config_model = {
    "model_path": model_path,
    "n_ctx": MAX_CONTEXT_LENGTH,
    "n_gpu_layers": -1 if CUDA else 0,
}
# Instantiate the chat model
llm_chat = ChatLLM(config_model)
# Streamlit UI
st.title("AI Chat Assistant")
# Initialize session state for the chat history and the input box
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
if "input_text" not in st.session_state:
st.session_state.input_text = ""
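# Session state persists across Streamlit reruns, so the conversation and the
# text box contents survive each interaction.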
# Callback that appends the user message and generates the model response
def chat_response():
    if st.session_state.input_text.strip():
        # User message
        history = st.session_state.chat_history
        history.append([st.session_state.input_text, ""])
        # Model response
        history = llm_chat.response(
            history=history,
            system_message=SYSTEM_PROMPT,
            max_tokens=100,  # Adjust token length as needed
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repeat_penalty=1.0,
        )
        st.session_state.chat_history = history
        st.session_state.input_text = ""
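# The on_change callback runs before the script reruns, so the cleared input box
# and the updated chat history are reflected on the next render.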
# Textbox for user input
st.text_input("You: ", key="input_text", on_change=chat_response)
# Display chat history
if st.session_state.chat_history:
    for user_msg, bot_resp in st.session_state.chat_history:
        st.markdown(f"**You:** {user_msg}")
        st.markdown(f"**Assistant:** {bot_resp}")
# Clear chat button
def clear_chat():
    st.session_state.chat_history = []
st.button("Clear History", on_click=clear_chat)