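"""Streamlit chat front-end for Qwen/Qwen2.5-Coder-32B-Instruct.

Loads the model with a bitsandbytes quantization config across available
devices via device_map="auto", keeps the conversation in st.session_state,
and exposes generation settings (max tokens, temperature, top-p) in the
sidebar.

Launch with:  streamlit run <path to this file>
"""
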
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import datetime

# Page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="πŸ’¬",
    layout="wide"
)

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []

# Cache the model loading
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
    
    # Configure 4-bit NF4 quantization; the bnb_4bit_* options below only
    # take effect when load_in_4bit is set (they are not used in 8-bit mode)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, 
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    
    return tokenizer, model

# Main title
st.title("πŸ’¬ Qwen2.5-Coder Chat")

# Sidebar settings
with st.sidebar:
    st.header("Settings")
    
    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=4096,
        value=512,
        step=64,
        help="Maximum number of tokens to generate"
    )
    
    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )
    
    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )
    
    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model with error handling
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()

def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Generate response from the model"""
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                # Fall back to EOS if the tokenizer defines no pad token
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        
        # Decode only the newly generated tokens; slicing the decoded string
        # by len(prompt) is unreliable because skip_special_tokens can alter it
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return response
    
    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message to chat
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })
    
    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")
    
    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Prepare conversation history
            conversation = ""
            for msg in st.session_state.messages:
                if msg["role"] == "user":
                    conversation += f"Human: {msg['content']}\n"
                else:
                    conversation += f"Assistant: {msg['content']}\n"
            conversation += "Assistant:"
            
            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )
            
            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")
                
                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })
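
# Note: each turn re-sends the entire conversation to the model. For long
# sessions, consider truncating st.session_state.messages (or summarizing
# older turns) so the prompt stays within the model's context window.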