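"""Streamlit chat app serving Meta-Llama-3-70B-Instruct (GGUF Q3_K_L) through
llama.cpp with CUDA tensor cores, designed for a Hugging Face ZeroGPU Space."""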
import streamlit as st
from llama_cpp_cuda_tensorcores import Llama
from huggingface_hub import hf_hub_download
import spaces  # Hugging Face Spaces ZeroGPU helper

# Constants
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
SYSTEM_PROMPT = "You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability."
TOKEN_STOP = ["<|eot_id|>"]
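# Llama 3 chat-template fragments. The literal substrings "SYSTEM_PROMPT" and
# "USER_PROMPT" inside the strings below are placeholders that
# apply_chat_template() fills in at runtime; they are not the Python
# constants of the same name.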
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"

# Map of task names to system prompts (only "Assistant" is used in this app)
TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
}

# ChatLLM wraps model loading, prompt assembly, and streaming generation
class ChatLLM:
    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model

    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)

    def apply_chat_template(self, history, system_message):
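        """Flatten (user, assistant) history pairs into a single Llama 3
        prompt string. The final pair, whose assistant slot is still empty,
        ends with the assistant header so the model generates from there."""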
        history = history or []
        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""

        return messages

    # On ZeroGPU Spaces this decorator attaches a GPU for the duration of the
    # call (here up to 120 seconds) and releases it when the call returns.
    @spaces.GPU(duration=120)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        messages = self.apply_chat_template(history, system_message)

        # Reset the assistant slot of the latest turn; the streamed chunks
        # below are accumulated into it.
        history[-1][1] = ""

        # Lazily load the model on first use; it stays resident afterwards
        if not self.llm:
            print("Loading model")
            self.load_cpp_model()

        # stream=True makes llama.cpp yield partial completions chunk by chunk
        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            history[-1][1] += answer

        return history

# Download the GGUF weights from the Hugging Face Hub (cached locally, so
# subsequent runs reuse the downloaded file)
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)

# Model configuration
config_model = {
    "model_path": model_path,
    "n_ctx": MAX_CONTEXT_LENGTH,
    "n_gpu_layers": -1 if CUDA else 0,
}

# Instantiate the chat model once and cache it across Streamlit reruns;
# without caching, every interaction would re-create ChatLLM and reload
# the 70B weights from scratch.
@st.cache_resource
def get_llm_chat():
    return ChatLLM(config_model)

llm_chat = get_llm_chat()

# Streamlit UI
st.title("AI Chat Assistant")

# Initialize session state to store the chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

if "input_text" not in st.session_state:
    st.session_state.input_text = ""

# Callback invoked when the user submits a message
def chat_response():
    if st.session_state.input_text.strip():
        # User message
        history = st.session_state.chat_history
        history.append([st.session_state.input_text, ""])

        # Model response
        history = llm_chat.response(
            history=history,
            system_message=SYSTEM_PROMPT,
            max_tokens=100,  # Adjust token length as needed
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repeat_penalty=1.0,
        )
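        # Note: the response streams internally, but Streamlit only redraws
        # after this callback returns, so the UI shows the completed reply.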

        st.session_state.chat_history = history
        st.session_state.input_text = ""  # clearing widget state inside its own callback is allowed

# Textbox for user input
st.text_input("You: ", key="input_text", on_change=chat_response)

# Display chat history
if st.session_state.chat_history:
    for user_msg, bot_resp in st.session_state.chat_history:
        st.markdown(f"**You:** {user_msg}")
        st.markdown(f"**Assistant:** {bot_resp}")

# Clear chat button
def clear_chat():
    st.session_state.chat_history = []

st.button("Clear History", on_click=clear_chat)