import os
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime

# Page configuration
st.set_page_config(
    page_title="πŸ’¬ Qwen2.5-Coder Chat",
    page_icon="πŸ’¬",
    layout="wide"
)

# Set the Hugging Face cache location explicitly for Hugging Face Spaces.
# HF_HOME supersedes the deprecated TRANSFORMERS_CACHE variable.
os.environ["HF_HOME"] = "/root/.cache/huggingface"

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []

# Cache model loading to prevent re-loading each session
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Smaller 3B model for efficiency

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, 
        trust_remote_code=True
    )

    # Device configuration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st.info(f"Using device: {device}")

    # Load model with optimizations for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32 if device == "cpu" else torch.float16,
        device_map="auto" if device == "cuda" else {"": device},
        trust_remote_code=True,
        low_cpu_mem_usage=True  # Reduce memory usage for CPU
    )
    
    return tokenizer, model
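
# A possible memory optimization (sketch, not enabled here): on a GPU Space
# with bitsandbytes installed, the model could instead be loaded in 8-bit,
# roughly halving its memory footprint:
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#       device_map="auto",
#       trust_remote_code=True,
#   )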

# Title
st.title("πŸ’¬ Qwen2.5-Coder Chat")

# Sidebar settings
with st.sidebar:
    st.header("Settings")
    
    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=1024,  # Lowered for CPU
        value=256,  # Default setting for CPU
        step=64,
        help="Maximum number of tokens to generate"
    )
    
    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=1.5,  # Capped: very high temperatures are rarely useful for code
        value=0.5,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )
    
    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.8,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )
    
    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model with caching
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()

# Response generation function
def generate_response(prompt, max_new_tokens=256, temperature=0.5, top_p=0.8):
    """Generate response from the model"""
    try:
        # Tokenize the input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                # Fall back to EOS if the tokenizer defines no pad token
                pad_token_id=(tokenizer.pad_token_id
                              if tokenizer.pad_token_id is not None
                              else tokenizer.eos_token_id),
                eos_token_id=tokenizer.eos_token_id,
            )
        
        # Decode only the newly generated tokens; slicing the decoded string by
        # len(prompt) is fragile because decoding does not always reproduce the
        # prompt text verbatim
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    
    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None
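
# A possible UX improvement (sketch, not wired in): stream tokens as they are
# generated with transformers' TextIteratorStreamer running generate() in a
# background thread, then render them incrementally via st.write_stream:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True,
#                                   skip_special_tokens=True)
#   Thread(target=model.generate,
#          kwargs={**inputs, "streamer": streamer,
#                  "max_new_tokens": max_new_tokens}).start()
#   st.write_stream(streamer)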

# Display conversation history
for message in st.session_state.messages[-5:]:  # Limit to last 5 messages for efficiency
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })
    
    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")
    
    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Build the prompt with the tokenizer's chat template; Qwen2.5 is
            # an instruct model, so its template gives far better results than
            # a hand-rolled "Human:/Assistant:" transcript. Only the last few
            # messages are sent to keep the context small.
            conversation = tokenizer.apply_chat_template(
                [{"role": m["role"], "content": m["content"]}
                 for m in st.session_state.messages[-3:]],
                tokenize=False,
                add_generation_prompt=True,
            )
            
            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )
            
            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")
                
                # Add response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })