File size: 10,684 Bytes
f76f5bc
 
 
 
d75f179
f76f5bc
949aa02
 
 
 
 
5ca330a
 
 
949aa02
dc45496
f76f5bc
949aa02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f76f5bc
 
949aa02
 
f76f5bc
dc45496
949aa02
 
 
 
dc45496
f76f5bc
 
 
 
 
 
 
 
 
 
 
 
 
 
949aa02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f76f5bc
 
949aa02
 
 
 
f76f5bc
949aa02
f76f5bc
 
 
949aa02
 
 
 
f76f5bc
 
 
 
 
 
d75f179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f76f5bc
 
949aa02
f76f5bc
5ca330a
f76f5bc
5ca330a
f76f5bc
949aa02
5ca330a
 
 
f76f5bc
 
949aa02
 
 
5ca330a
949aa02
 
 
 
f76f5bc
 
949aa02
f76f5bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5eb33d
 
 
f76f5bc
949aa02
 
f76f5bc
 
 
 
 
 
949aa02
 
f76f5bc
 
 
 
 
 
949aa02
 
f76f5bc
 
 
 
 
 
949aa02
f76f5bc
 
 
 
 
949aa02
f76f5bc
 
 
 
 
 
 
949aa02
 
 
f76f5bc
 
949aa02
f76f5bc
 
 
949aa02
f76f5bc
 
 
 
949aa02
 
f76f5bc
 
 
5ca330a
 
949aa02
5ca330a
f76f5bc
5ca330a
f76f5bc
bd126c6
f76f5bc
949aa02
f76f5bc
 
 
 
 
 
 
d75f179
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import json
import logging
from typing import Dict, List, Optional

import gradio as gr
import spaces  # NOTE: kept ahead of torch/transformers, matching the original import order
import torch
from transformers import pipeline

# Cache of loaded text-generation pipelines, keyed by model id (the values of
# AVAILABLE_MODELS). Filled lazily by initialize_model.
model_cache = {}

# Display name -> model id handed to transformers.pipeline as `model=`.
# The keys populate the model dropdown in create_interface, in this order.
AVAILABLE_MODELS = {
    "Nous-1-4B": "apexion-ai/Nous-1-4B",
    "Nous-1-8B": "apexion-ai/Nous-1-8B",
    "Nous-1-2B": "apexion-ai/Nous-1-2B",
}

@spaces.GPU
def initialize_model(model_name):
    """Return the text-generation pipeline for ``model_name``, loading it on first use.

    Loaded pipelines are stored in the module-level ``model_cache`` dict keyed
    by model id, so later calls for the same model reuse the existing object.

    Args:
        model_name: A key of ``AVAILABLE_MODELS`` (e.g. ``"Nous-1-4B"``).

    Returns:
        The ``transformers`` pipeline object for the requested model.

    Raises:
        ValueError: If ``model_name`` is not a key of ``AVAILABLE_MODELS``.
        Exception: Whatever ``pipeline`` raises if the CPU fallback load
            also fails.
    """
    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")

    model_id = AVAILABLE_MODELS[model_name]

    cached = model_cache.get(model_id)
    if cached is not None:
        return cached

    # Arguments shared by both load attempts.
    # NOTE: trust_remote_code=True lets the model repository run its own code;
    # acceptable here only because model ids come from the fixed
    # AVAILABLE_MODELS table, never from user input.
    shared_kwargs = {"model": model_id, "trust_remote_code": True}

    try:
        loaded = pipeline(
            "text-generation",
            torch_dtype=torch.float16,
            device_map="auto",
            **shared_kwargs,
        )
    except Exception:
        # Deliberate best-effort: retry on CPU in full precision, but record
        # why the first attempt failed instead of hiding it.
        logging.getLogger(__name__).warning(
            "Loading %s with device_map='auto' failed; retrying on CPU",
            model_id,
            exc_info=True,
        )
        loaded = pipeline(
            "text-generation",
            torch_dtype=torch.float32,
            device_map="cpu",
            **shared_kwargs,
        )

    model_cache[model_id] = loaded
    return loaded

def _render_plain_prompt(messages):
    """Flatten chat messages into a ``User:``/``Assistant:`` transcript prompt."""
    turns = [
        f"{'User' if turn['role'] == 'user' else 'Assistant'}: {turn['content']}\n"
        for turn in messages
    ]
    return "".join(turns) + "Assistant:"


def _extract_reply(response):
    """Pull the assistant's text out of a text-generation pipeline result."""
    if isinstance(response, list) and response:
        generated = response[0]['generated_text']
    else:
        generated = str(response)

    # Chat-style output is a list of message dicts; the reply is the last one.
    if isinstance(generated, list):
        return generated[-1]['content']

    # Plain-text output: keep only what follows the last "Assistant:" marker.
    reply = str(generated).strip()
    if "Assistant:" in reply:
        reply = reply.split("Assistant:")[-1].strip()
    return reply


@spaces.GPU
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate the assistant's reply to ``message`` using the selected model.

    Args:
        message: The current user message.
        history: Earlier turns as ``[user_text, assistant_text]`` pairs;
            ``assistant_text`` may be ``None``/empty. ``None`` is treated as
            an empty history.
        model_name: A key of ``AVAILABLE_MODELS``.
        max_length: Maximum number of tokens to generate for the reply.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The reply text. Model-loading and generation failures are reported as
        an ``"Error ..."`` string rather than raised.
    """
    try:
        model_pipe = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"

    # Rebuild the conversation as role/content messages, oldest first.
    messages = []
    for user_msg, assistant_msg in history or []:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    try:
        generation_kwargs = {
            # max_new_tokens bounds only the reply. The previous max_length
            # also counted the prompt, so a long history could leave little
            # or no room for the answer.
            "max_new_tokens": int(max_length),
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True,
            "pad_token_id": model_pipe.tokenizer.eos_token_id,
            "return_full_text": False,
        }

        try:
            # Preferred path: hand the pipeline the chat messages directly.
            response = model_pipe(messages, **generation_kwargs)
        except Exception:
            # Some models may not support the messages format; fall back to
            # a plain-text transcript prompt.
            logging.getLogger(__name__).warning(
                "Chat-format generation failed for %s; retrying with a plain-text prompt",
                model_name,
                exc_info=True,
            )
            response = model_pipe(_render_plain_prompt(messages), **generation_kwargs)

        return _extract_reply(response)

    except Exception as e:
        return f"Error generating response: {str(e)}"

@spaces.GPU
def generate(
    model: str,
    user_input: str,
    history: Optional[str] = "",
    temperature: float = 0.7,
    system_prompt: Optional[str] = "",
    max_tokens: int = 512
):
    """
    API endpoint for LLM generation

    Args:
        model: Model name to use (Nous-1-2B, Nous-1-4B, or Nous-1-8B)
        user_input: Current user message/input
        history: JSON string of conversation history in format [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}].
            An already-parsed list of such dicts is accepted too. History that
            cannot be parsed is ignored.
        temperature: Temperature for generation (0.1-2.0)
        system_prompt: System prompt to guide the model; when given it is
            prepended to the user input as text.
        max_tokens: Maximum tokens to generate (1-8192); forwarded to
            generate_response as its ``max_length`` argument.

    Returns:
        Generated response from the model, or an "Error ..." string when the
        model name is unknown or loading/generation fails.
    """

    # Validate model
    if model not in AVAILABLE_MODELS:
        return f"Error: Model {model} not available. Available models: {list(AVAILABLE_MODELS.keys())}"

    # Model loading is left to generate_response, which reports load failures
    # with the same "Error loading model ..." message.

    # Obtain the history as a list of role/content dicts.
    history_list = []
    if isinstance(history, str):
        if history.strip():
            try:
                history_list = json.loads(history)
            except ValueError:
                # Best-effort: continue without history, but leave a trace.
                logging.getLogger(__name__).warning(
                    "Ignoring conversation history that is not valid JSON"
                )
    elif history:
        history_list = history
    if not isinstance(history_list, list):
        history_list = []

    # Fold the flat message list into [user, assistant] pairs, the format
    # generate_response expects. A new user message closes the previous pair.
    gradio_history = []
    current_pair = [None, None]
    for msg in history_list:
        if not (isinstance(msg, dict) and "role" in msg and "content" in msg):
            continue
        if msg["role"] == "user":
            if current_pair[0] is not None:
                gradio_history.append([current_pair[0], current_pair[1]])
            current_pair = [msg["content"], None]
        elif msg["role"] == "assistant":
            current_pair[1] = msg["content"]
    if current_pair[0] is not None:
        gradio_history.append([current_pair[0], current_pair[1]])

    # Add system prompt to user input if provided
    final_user_input = user_input
    if system_prompt and system_prompt.strip():
        final_user_input = f"System: {system_prompt}\n\nUser: {user_input}"

    # top_p is fixed at 0.9 for API calls.
    return generate_response(final_user_input, gradio_history, model, max_tokens, temperature, 0.9)

# Create the Gradio interface
def create_interface():
    """Build the Gradio Blocks chat UI and return it without launching.

    The layout has a model dropdown, a chatbot pane, a message box with
    Send / Clear buttons, and an "Advanced Settings" accordion holding the
    max-length, temperature and top-p sliders. Sending a message first
    appends it to the chat, then fills in the reply via generate_response.

    Returns:
        The assembled ``gr.Blocks`` app.
    """
    with gr.Blocks(title="Multi-Model Chat", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🚀 Nous-1 Model Chat Interface

        Chat with the Nous-1 models by Apexion AI.

        **Available Models:**
        - Nous-1-4B (4 billion parameters)
        - Nous-1-8B (8 billion parameters)
        - Nous-1-2B (2 billion parameters)
        """)

        with gr.Row():
            model_selector = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value="Nous-1-4B",
                label="Select Model",
                info="Choose which model to use for generation"
            )

        chatbot = gr.Chatbot(
            height=400,
            placeholder="Select a model and start chatting...",
            label="Chat"
        )

        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=8192,
                value=2048,
                step=50,
                label="Max Length",
                info="Maximum length of generated response"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )

        # Event handlers
        def user_message(message, history):
            """Clear the textbox and append the new message as an open turn."""
            # The chat value can be None (the clear button sets it to None).
            history = history or []
            # Ignore blank submissions instead of sending an empty prompt.
            if not message or not message.strip():
                return "", history
            return "", history + [[message, None]]

        def bot_response(history, model_name, max_len, temp, nucleus_p):
            """Fill in the reply for the last turn if it has none yet."""
            # Only answer a turn that is still open, so a blank submit does
            # not regenerate and overwrite an existing reply.
            if history and not history[-1][1]:
                latest_message = history[-1][0]
                history[-1][1] = generate_response(
                    latest_message,
                    history[:-1],
                    model_name,
                    max_len,
                    temp,
                    nucleus_p
                )
            return history

        def model_changed(model_name):
            """Update the chatbot placeholder to name the selected model."""
            return gr.update(placeholder=f"Chat with {model_name}...")

        # Wire up the events: Enter in the textbox and the Send button share
        # the same two-step chain (echo the message, then generate the reply).
        generation_inputs = [chatbot, model_selector, max_length, temperature, top_p]
        for trigger in (msg.submit, submit_btn.click):
            trigger(user_message, [msg, chatbot], [msg, chatbot]).then(
                bot_response, generation_inputs, chatbot
            )

        clear_btn.click(lambda: None, None, chatbot, queue=False)

        model_selector.change(model_changed, model_selector, chatbot)

        gr.Markdown("""
        ---

        ### About the Nous-1 Models
        **Nous-1-2B**: 2 billion parameter model by Apexion AI, designed for fast inference

        **Nous-1-4B**: 4 billion parameter model by Apexion AI, optimised for efficient conversation and text generation

        **Nous-1-8B**: 8 billion parameter model by Apexion AI, offering enhanced capabilities and better performance for complex tasks

        All models are designed for conversational AI and support various text generation tasks. The 8B model provides more sophisticated responses but requires more computational resources.

        This Space uses ZeroGPU for efficient GPU allocation across all model sizes.
        """)

    return demo

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo = create_interface()
    # Start the Gradio server; share=True requests a public share link.
    # NOTE(review): nothing API-specific is passed to launch(), and `generate`
    # is not attached to any event in create_interface - confirm how the API
    # endpoint is meant to be exposed.
    demo.launch(share=True)