Debito committed on
Commit 477a0e7 · verified · 1 parent: f88e557

Update app.py

Files changed (1)
app.py +795 -795
app.py CHANGED
@@ -1,795 +1,795 @@
#!/usr/bin/env python3
# Cache bust: 2025-08-03-v2
"""
renamed from app_real.py - Production-Ready Mamba Encoder Swarm Demo
Combines real model functionality with rich UI and comprehensive error handling
"""
import gradio as gr
import torch
import numpy as np
import time
import json
import logging
import os
import psutil
from typing import Optional, Dict, Any, Tuple
from datetime import datetime
from transformers import AutoTokenizer, AutoConfig

# Set up comprehensive logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('mamba_swarm_demo.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class MambaSwarmDemo:
    """Production-ready Mamba Swarm Demo with fallback capabilities"""

    def __init__(self, model_path: str = "./", fallback_mode: bool = False):
        self.model = None
        self.tokenizer = None
        self.config = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_path = model_path
        self.fallback_mode = fallback_mode
        self.model_loaded = False

        # Performance tracking
        self.stats = {
            'total_requests': 0,
            'successful_generations': 0,
            'failed_generations': 0,
            'avg_generation_time': 0.0,
            'total_tokens_generated': 0
        }

        # Domain mappings for intelligent routing
        self.domain_keywords = {
            'medical': ['medical', 'health', 'doctor', 'patient', 'disease', 'treatment', 'symptom', 'diagnosis'],
            'legal': ['legal', 'law', 'court', 'judge', 'contract', 'patent', 'lawsuit', 'attorney'],
            'code': ['code', 'python', 'programming', 'function', 'algorithm', 'software', 'debug', 'api'],
            'science': ['science', 'research', 'experiment', 'theory', 'physics', 'chemistry', 'biology'],
            'creative': ['story', 'creative', 'write', 'novel', 'poem', 'character', 'plot', 'narrative'],
            'business': ['business', 'marketing', 'strategy', 'finance', 'management', 'sales', 'revenue'],
            'general': ['explain', 'what', 'how', 'why', 'describe', 'tell', 'information']
        }

        self._initialize_model()
        logger.info(f"Demo initialized - Model loaded: {self.model_loaded}, Fallback mode: {self.fallback_mode}")

    def _initialize_model(self):
        """Initialize model with comprehensive error handling and fallback"""
        try:
            logger.info("Attempting to load Mamba Swarm model...")

            # Check if model files exist
            config_path = os.path.join(self.model_path, "config.json")
            if not os.path.exists(config_path) and not self.fallback_mode:
                logger.warning(f"Config file not found at {config_path}, enabling fallback mode")
                self.fallback_mode = True

            if not self.fallback_mode:
                # Try to load the real model
                self._load_real_model()
            else:
                # Initialize in fallback mode
                self._initialize_fallback_mode()

        except Exception as e:
            logger.error(f"Model initialization failed: {e}")
            logger.info("Falling back to simulation mode")
            self.fallback_mode = True
            self._initialize_fallback_mode()

    def _load_real_model(self):
        """Load the actual Mamba Swarm model"""
        try:
            # Import here to avoid dependency issues if not available
            from upload_to_hf import MambaSwarmForCausalLM

            # Load configuration
            self.config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
            logger.info(f"Loaded config: {self.config.__class__.__name__}")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            logger.info("Tokenizer loaded successfully")

            # Load model with memory optimization
            dtype = torch.float16 if self.device.type == "cuda" else torch.float32
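            # fp16 roughly halves GPU weight memory at some precision cost;
            # CPU inference stays in fp32, where half precision is poorly supported.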

            self.model = MambaSwarmForCausalLM.from_pretrained(
                self.model_path,
                config=self.config,
                torch_dtype=dtype,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            ).to(self.device)

            self.model.eval()
            self.model_loaded = True

            # Log model info
            num_params = sum(p.numel() for p in self.model.parameters())
            logger.info(f"Model loaded successfully on {self.device}")
            logger.info(f"Model parameters: {num_params:,} ({num_params/1e6:.1f}M)")

        except ImportError as e:
            logger.error(f"MambaSwarmForCausalLM not available: {e}")
            raise
        except Exception as e:
            logger.error(f"Real model loading failed: {e}")
            raise

    def _initialize_fallback_mode(self):
        """Initialize fallback/simulation mode"""
        logger.info("Initializing fallback simulation mode")

        # Create mock config
        self.config = type('MockConfig', (), {
            'max_mamba_encoders': 100,
            'd_model': 768,
            'vocab_size': 50257,
            'max_sequence_length': 2048
        })()

        # Create mock tokenizer
        class MockTokenizer:
            def __init__(self):
                self.pad_token_id = 0
                self.eos_token_id = 1
                self.pad_token = "[PAD]"
                self.eos_token = "[EOS]"

            def encode(self, text, return_tensors=None):
                # Simple word-based tokenization for simulation
                tokens = text.split()
                token_ids = [hash(token) % 1000 for token in tokens]
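                # Note: hash() is salted per process (PYTHONHASHSEED), so these
                # IDs are not stable across runs; acceptable for simulation only.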
                if return_tensors == "pt":
                    return torch.tensor([token_ids])
                return token_ids

            def decode(self, token_ids, skip_special_tokens=True):
                # Mock decoding
                return f"Generated response for {len(token_ids)} tokens"

        self.tokenizer = MockTokenizer()

        # Create mock model
        class MockModel:
            def __init__(self, config):
                self.config = config
                self.num_active_encoders = 5

            def set_active_encoders(self, num):
                self.num_active_encoders = min(num, self.config.max_mamba_encoders)

            def eval(self):
                pass

        self.model = MockModel(self.config)
        logger.info("Fallback mode initialized successfully")

    def _detect_domain(self, prompt: str) -> Tuple[str, float]:
        """Detect the domain of the prompt for intelligent routing"""
        prompt_lower = prompt.lower()
        domain_scores = {}

        for domain, keywords in self.domain_keywords.items():
            score = sum(1 for keyword in keywords if keyword in prompt_lower)
            if score > 0:
                domain_scores[domain] = score / len(keywords)

        if domain_scores:
            best_domain = max(domain_scores, key=domain_scores.get)
            confidence = domain_scores[best_domain]
            return best_domain, confidence

        return 'general', 0.5

    def _simulate_encoder_selection(self, prompt: str, num_encoders: int) -> Dict[str, Any]:
        """Simulate intelligent encoder selection based on domain"""
        domain, confidence = self._detect_domain(prompt)

        # Domain-specific encoder ranges (simulated)
        domain_ranges = {
            'medical': (1, 20),
            'legal': (21, 40),
            'code': (41, 60),
            'science': (61, 80),
            'creative': (81, 95),
            'business': (96, 100),
            'general': (1, 100)
        }
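        # These ranges partition encoder IDs 1-100 into specialist pools;
        # 'general' prompts may draw from the full pool.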

        start, end = domain_ranges.get(domain, (1, 100))
        available_encoders = list(range(start, min(end + 1, 101)))

        # Select encoders based on prompt complexity and domain
        prompt_complexity = min(len(prompt.split()) / 10, 3.0)  # Complexity factor
        optimal_count = min(max(int(num_encoders * (1 + prompt_complexity)), 3), 25)
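        # Worked example: a 20-word prompt with num_encoders=8 gives complexity
        # 2.0, so int(8 * 3.0) = 24 encoders, which sits inside the [3, 25] clamp.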

        if len(available_encoders) >= optimal_count:
            selected = np.random.choice(available_encoders, size=optimal_count, replace=False)
        else:
            # Fewer encoders available than requested; wrap in an array so the
            # .tolist() below works on both branches (plain lists have no .tolist()).
            selected = np.array(available_encoders)

        selected_encoders = sorted(selected.tolist())

        # Generate confidence scores
        base_confidence = max(0.6, confidence)
        confidence_scores = np.random.normal(base_confidence, 0.1, len(selected_encoders))
        confidence_scores = np.clip(confidence_scores, 0.5, 0.98).tolist()

        return {
            'selected_encoders': selected_encoders,
            'confidence_scores': confidence_scores,
            'detected_domain': domain,
            'domain_confidence': confidence,
            'total_active': len(selected_encoders)
        }

    def _simulate_generation(self, prompt: str, routing_info: Dict, max_length: int) -> str:
        """Generate sophisticated simulated responses based on domain"""
        domain = routing_info['detected_domain']

        domain_responses = {
            'medical': f"""Based on medical literature and current research, regarding "{prompt[:50]}...":

This condition/topic involves multiple factors including genetic predisposition, environmental influences, and lifestyle factors. Key considerations include:

• Proper medical evaluation is essential
• Individual symptoms may vary significantly
• Treatment approaches should be personalized
• Regular monitoring is typically recommended

**Important**: This information is for educational purposes only. Please consult with qualified healthcare professionals for personalized medical advice and treatment recommendations.""",

            'legal': f"""From a legal perspective on "{prompt[:50]}...":

The legal framework surrounding this matter involves several key considerations:

• Jurisdictional requirements and applicable statutes
• Precedent cases and regulatory guidelines
• Compliance obligations and reporting requirements
• Risk assessment and mitigation strategies

**Disclaimer**: This information is for general informational purposes only and does not constitute legal advice. Consult with qualified legal professionals for specific legal matters.""",

            'code': f"""Here's a comprehensive solution for "{prompt[:50]}...":

```python
def optimized_solution(input_data):
    \"\"\"
    Efficient implementation with error handling
    Time complexity: O(n log n)
    Space complexity: O(n)
    \"\"\"
    try:
        # Input validation
        if not input_data:
            raise ValueError("Input data cannot be empty")

        # Core algorithm implementation
        result = process_data(input_data)

        # Additional optimizations
        result = optimize_output(result)

        return result

    except Exception as e:
        logger.error(f"Processing error: {{e}}")
        return None

def process_data(data):
    # Implementation details here
    return processed_data

def optimize_output(data):
    # Performance optimizations
    return optimized_data
```

**Key Features:**
• Error handling and input validation
• Optimized performance characteristics
• Comprehensive documentation
• Production-ready implementation""",

            'science': f"""Scientific Analysis of "{prompt[:50]}...":

Based on current scientific understanding and peer-reviewed research:

**Theoretical Framework:**
The underlying principles involve complex interactions between multiple variables, governed by established scientific laws and emerging theories.

**Methodology:**
• Systematic observation and data collection
• Controlled experimental design
• Statistical analysis and validation
• Peer review and reproducibility testing

**Current Research:**
Recent studies indicate significant progress in understanding the mechanisms involved, with several promising avenues for future investigation.

**Implications:**
These findings have broad applications across multiple disciplines and may lead to significant advances in the field.""",

            'creative': f"""**{prompt[:30]}...**

The story unfolds in a world where imagination meets reality, where every character carries the weight of their dreams and the burden of their choices.

*Chapter 1: The Beginning*

In the quiet moments before dawn, when the world holds its breath between night and day, our tale begins. The protagonist stands at the threshold of an adventure that will challenge everything they thought they knew about themselves and the world around them.

The narrative weaves through layers of meaning, exploring themes of identity, purpose, and the delicate balance between hope and reality. Each scene is crafted with careful attention to emotional resonance and character development.

*As the story progresses, we discover that the true journey is not external but internal: a transformation of the soul that mirrors the changing landscape of the world itself.*

**Themes Explored:**
• Personal growth and self-discovery
• The power of resilience and determination
• The complexity of human relationships
• The intersection of dreams and reality""",

            'business': f"""**Strategic Analysis: {prompt[:50]}...**

**Executive Summary:**
This comprehensive analysis examines the strategic implications and market opportunities related to the identified business challenge.

**Market Assessment:**
• Current market size and growth projections
• Competitive landscape analysis
• Key trends and disruption factors
• Customer segment identification

**Strategic Recommendations:**
1. **Short-term actions** (0-6 months)
   - Immediate market positioning
   - Resource allocation optimization
   - Risk mitigation strategies

2. **Medium-term initiatives** (6-18 months)
   - Strategic partnerships and alliances
   - Product/service development
   - Market expansion opportunities

3. **Long-term vision** (18+ months)
   - Innovation and R&D investment
   - Scalability and sustainability
   - Market leadership positioning

**Financial Projections:**
Based on conservative estimates, implementation of these strategies could result in significant ROI and market share growth.""",

            'general': f"""**Comprehensive Response to: "{prompt[:50]}..."**

Thank you for your inquiry. Based on available knowledge and expertise from {routing_info['total_active']} specialized domains, here's a comprehensive analysis:

**Key Points:**
• The topic involves multiple interconnected factors that require careful consideration
• Current understanding is based on established principles and ongoing research
• Practical applications vary depending on specific context and requirements
• Best practices emphasize a balanced, evidence-based approach

**Detailed Analysis:**
The subject matter encompasses several important dimensions that merit thorough examination. Each aspect contributes to a deeper understanding of the overall concept and its implications.

**Practical Considerations:**
Implementation requires careful planning, adequate resources, and ongoing monitoring to ensure optimal outcomes. Success factors include stakeholder engagement, clear communication, and adaptive management strategies.

**Conclusion:**
This analysis provides a foundation for informed decision-making while acknowledging the complexity and nuanced nature of the topic."""
        }

        return domain_responses.get(domain, domain_responses['general'])

    def generate_text(self, prompt: str, max_length: int = 100, temperature: float = 0.7,
                      top_p: float = 0.9, num_encoders: int = 5, show_routing: bool = True) -> Tuple[str, str]:
        """
        Generate text with comprehensive error handling and routing information

        Returns:
            Tuple of (generated_text, routing_info_display)
        """
        start_time = time.time()

        # Update statistics
        self.stats['total_requests'] += 1

        try:
            if not prompt.strip():
                return "Please enter a prompt.", ""

            # Simulate routing decision
            routing_info = self._simulate_encoder_selection(prompt, num_encoders)

            if self.model_loaded and not self.fallback_mode:
                # Real model generation
                response = self._generate_real(prompt, max_length, temperature, top_p, num_encoders)
            else:
                # Simulated generation with sophisticated responses
                response = self._simulate_generation(prompt, routing_info, max_length)

            # Calculate performance metrics
            generation_time = time.time() - start_time
            estimated_tokens = len(response.split())

            # Update statistics
            self.stats['successful_generations'] += 1
            self.stats['total_tokens_generated'] += estimated_tokens

            # Update average generation time
            total_successful = self.stats['successful_generations']
            prev_avg = self.stats['avg_generation_time']
            self.stats['avg_generation_time'] = (prev_avg * (total_successful - 1) + generation_time) / total_successful
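            # Incremental mean: equivalent to avg += (t - avg) / n, so no
            # per-request history needs to be stored.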

            # Generate routing display
            routing_display = ""
            if show_routing:
                routing_display = self._create_routing_display(routing_info, generation_time, estimated_tokens)

            logger.info(f"Generated {estimated_tokens} tokens in {generation_time:.2f}s")
            return response, routing_display

        except Exception as e:
            self.stats['failed_generations'] += 1
            error_msg = f"Error generating response: {str(e)}"
            logger.error(error_msg)
            return error_msg, ""

    def _generate_real(self, prompt: str, max_length: int, temperature: float,
                       top_p: float, num_encoders: int) -> str:
        """Generate using real model"""
        try:
            # Encode input
            inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)

            # Adjust number of active encoders
            if hasattr(self.model, 'set_active_encoders'):
                self.model.set_active_encoders(min(num_encoders, self.config.max_mamba_encoders))

            # Generate with memory optimization
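            # Note: max_length counts prompt tokens plus new tokens; HF models
            # often use max_new_tokens instead when only the continuation should be capped.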
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_length=min(max_length, getattr(self.config, 'max_sequence_length', 2048)),
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=True
                )

            # Decode output
            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Remove input prompt from output
            response = generated_text[len(prompt):].strip()
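            # Assumes the decoded text begins with the prompt verbatim; if the
            # tokenizer normalizes whitespace, this slice may clip a few characters.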

            return response if response else "Generated response was empty."

        except torch.cuda.OutOfMemoryError:
            logger.error("CUDA out of memory during generation")
            return "Error: GPU memory insufficient. Try reducing max_length or num_encoders."
        except Exception as e:
            logger.error(f"Real generation error: {e}")
            return f"Generation error: {str(e)}"

    def _create_routing_display(self, routing_info: Dict, generation_time: float,
                                estimated_tokens: int) -> str:
        """Create rich routing information display"""
        return f"""
## 🧠 Intelligent Routing Analysis

**🎯 Domain Detection:**
- **Primary Domain**: {routing_info['detected_domain'].title()}
- **Confidence**: {routing_info['domain_confidence']:.1%}
- **Specialization Level**: {'High' if routing_info['domain_confidence'] > 0.7 else 'Medium' if routing_info['domain_confidence'] > 0.4 else 'General'}

**⚡ Encoder Activation:**
- **Active Encoders**: {routing_info['total_active']}/{self.config.max_mamba_encoders}
- **Selection Strategy**: Domain-optimized routing
- **Load Distribution**: Balanced across specialized encoders

**🔢 Selected Encoder IDs:**
{', '.join(map(str, routing_info['selected_encoders'][:15]))}{'...' if len(routing_info['selected_encoders']) > 15 else ''}

**📊 Performance Metrics:**
- **Generation Time**: {generation_time:.2f}s
- **Estimated Tokens**: {estimated_tokens}
- **Tokens/Second**: {estimated_tokens/generation_time:.1f}
- **Model Mode**: {'Real Model' if self.model_loaded and not self.fallback_mode else 'Simulation'}

**🎚️ Confidence Scores (Top 5):**
{', '.join([f'{score:.3f}' for score in routing_info['confidence_scores'][:5]])}{'...' if len(routing_info['confidence_scores']) > 5 else ''}

**💡 Optimization Notes:**
- Encoder selection optimized for domain: {routing_info['detected_domain']}
- Dynamic load balancing across {routing_info['total_active']} active encoders
- Confidence-weighted aggregation applied
"""

    def get_model_info(self) -> str:
        """Get comprehensive model information"""
        if not self.model:
            return "Model not initialized"

        # Get system information
        memory_info = psutil.virtual_memory()
        gpu_info = "N/A"
        if torch.cuda.is_available():
            gpu_info = f"{torch.cuda.get_device_name(0)} ({torch.cuda.get_device_properties(0).total_memory // 1024**3}GB)"

        return f"""
**🤖 Mamba Encoder Swarm Model Information**

**Model Configuration:**
- **Status**: {'✅ Loaded' if self.model_loaded else '⚠️ Simulation Mode'}
- **Active Encoders**: {getattr(self.model, 'num_active_encoders', 'N/A')}
- **Max Encoders**: {self.config.max_mamba_encoders}
- **Model Dimension**: {self.config.d_model}
- **Vocabulary Size**: {self.config.vocab_size:,}
- **Max Sequence Length**: {getattr(self.config, 'max_sequence_length', 'N/A')}

**System Information:**
- **Device**: {self.device} {f'({gpu_info})' if gpu_info != 'N/A' else ''}
- **RAM Usage**: {memory_info.percent:.1f}% ({memory_info.used // 1024**3}GB / {memory_info.total // 1024**3}GB)
- **PyTorch**: {torch.__version__}

**Performance Statistics:**
- **Total Requests**: {self.stats['total_requests']}
- **Successful**: {self.stats['successful_generations']}
- **Failed**: {self.stats['failed_generations']}
- **Success Rate**: {(self.stats['successful_generations'] / max(self.stats['total_requests'], 1) * 100):.1f}%
- **Avg Generation Time**: {self.stats['avg_generation_time']:.2f}s
- **Total Tokens Generated**: {self.stats['total_tokens_generated']:,}

**Fallback Mode**: {'⚠️ Active' if self.fallback_mode else '✅ Disabled'}
"""

    def get_system_status(self) -> Dict[str, Any]:
        """Get system status for monitoring"""
        return {
            'model_loaded': self.model_loaded,
            'fallback_mode': self.fallback_mode,
            'device': str(self.device),
            'stats': self.stats.copy(),
            'timestamp': datetime.now().isoformat()
        }


def create_production_demo() -> gr.Blocks:
    """Create production-ready Gradio interface"""

    # Initialize demo with fallback capability
    try:
        demo_instance = MambaSwarmDemo(model_path="./", fallback_mode=False)
    except Exception as e:
        logger.warning(f"Primary initialization failed: {e}")
        demo_instance = MambaSwarmDemo(model_path="./", fallback_mode=True)

    def generate_response(prompt, max_length, temperature, top_p, num_encoders, show_routing):
        return demo_instance.generate_text(prompt, max_length, temperature, top_p, num_encoders, show_routing)

    def show_model_info():
        return demo_instance.get_model_info()

    def refresh_model_info():
        return demo_instance.get_model_info()

    # Create interface
    with gr.Blocks(
        title="Mamba Encoder Swarm - Production Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px;
            margin: auto;
        }
        .model-info {
            background-color: #f8f9fa;
            border-radius: 8px;
            padding: 15px;
            margin: 10px 0;
        }
        .routing-info {
            background-color: #e8f4fd;
            border-radius: 8px;
            padding: 15px;
            margin: 10px 0;
        }
        """
    ) as demo:

        # Header
        gr.Markdown("""
        # 🐍 Mamba Encoder Swarm - Production Demo

        **Advanced Language Model with Dynamic Routing & Intelligent Encoder Selection**

        Experience the power of up to 100 specialized Mamba encoders with intelligent domain-aware routing,
        comprehensive error handling, and production-ready performance monitoring.
        """)

        # Status indicator
        with gr.Row():
            with gr.Column(scale=1):
                status_indicator = gr.Markdown(
                    f"**Status**: {'🟢 Real Model' if demo_instance.model_loaded and not demo_instance.fallback_mode else '🟡 Simulation Mode'}"
                )

        with gr.Row():
            # Left column - input and controls
            with gr.Column(scale=2):
                prompt_input = gr.Textbox(
                    label="📝 Input Prompt",
                    placeholder="Enter your prompt here... (e.g., 'Explain quantum computing', 'Write a Python function', 'Analyze market trends')",
                    lines=4,
                    max_lines=8
                )

                with gr.Accordion("⚙️ Generation Parameters", open=False):
                    with gr.Row():
                        max_length = gr.Slider(
                            label="Max Length",
                            minimum=50,
                            maximum=1000,
                            value=200,
                            step=25,
                            info="Maximum number of tokens to generate"
                        )
                        temperature = gr.Slider(
                            label="Temperature",
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            info="Controls randomness (lower = more focused)"
                        )

                    with gr.Row():
                        top_p = gr.Slider(
                            label="Top-p (Nucleus Sampling)",
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            info="Probability mass for nucleus sampling"
                        )
                        num_encoders = gr.Slider(
                            label="Target Active Encoders",
                            minimum=1,
                            maximum=25,
                            value=8,
                            step=1,
                            info="Preferred number of encoders to activate"
                        )

                show_routing = gr.Checkbox(
                    label="Show Routing Information",
                    value=True,
                    info="Display detailed routing and performance metrics"
                )

                generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")

            # Right column - output and information
            with gr.Column(scale=3):
                response_output = gr.Textbox(
                    label="📄 Generated Response",
                    lines=12,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True
                )

                routing_output = gr.Markdown(
                    label="🔍 Routing & Performance Analysis",
                    visible=True,
                    elem_classes=["routing-info"]
                )

        # Model information section
        with gr.Accordion("🤖 Model Information & Statistics", open=False):
            with gr.Row():
                model_info_display = gr.Markdown(
                    value=show_model_info(),
                    elem_classes=["model-info"]
                )
                refresh_info_btn = gr.Button("🔄 Refresh Info", size="sm")

        # Examples section
        with gr.Accordion("💡 Example Prompts", open=True):
            gr.Markdown("### Try these examples to see domain-specific routing in action:")

            examples = [
                ["Explain the process of photosynthesis in detail", 300, 0.7, 0.9, 10, True],
                ["Write a Python function to implement binary search with error handling", 250, 0.5, 0.8, 8, True],
                ["What are the early symptoms of Type 2 diabetes?", 200, 0.6, 0.9, 12, True],
                ["Analyze the legal implications of AI-generated content", 350, 0.7, 0.9, 15, True],
                ["Write a creative short story about a time-traveling scientist", 400, 0.9, 0.95, 12, True],
                ["Develop a marketing strategy for a sustainable fashion startup", 300, 0.8, 0.9, 10, True],
                ["How does quantum entanglement work and what are its applications?", 350, 0.6, 0.9, 15, True]
            ]

            gr.Examples(
                examples=examples,
                inputs=[prompt_input, max_length, temperature, top_p, num_encoders, show_routing],
                outputs=[response_output, routing_output],
                fn=generate_response,
                cache_examples=False,
                label="Click any example to load it"
            )

        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt_input, max_length, temperature, top_p, num_encoders, show_routing],
            outputs=[response_output, routing_output],
            api_name="generate"
        )

        refresh_info_btn.click(
            fn=refresh_model_info,
            outputs=model_info_display
        )

        # Footer
        gr.Markdown("""
        ---
        ### 🏗️ Architecture Overview

        **🧠 Intelligent Routing System**
        - Domain detection based on prompt analysis
        - Dynamic encoder selection optimized for content type
        - Load balancing across specialized encoder pools

        **🔧 Production Features**
        - Comprehensive error handling and fallback modes
        - Real-time performance monitoring and statistics
        - Memory optimization and CUDA support
        - Detailed logging and debugging capabilities

        **📊 Specialized Domains**
        - **Medical & Healthcare** • **Legal & Regulatory** • **Code & Technical**
        - **Science & Research** • **Creative Writing** • **Business & Finance**

        Built with ❤️ using Gradio, PyTorch, and the Mamba architecture
        """)

    return demo


if __name__ == "__main__":
    # Create and launch production demo
    try:
        demo = create_production_demo()

        # Launch with production settings
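        # Note (assumption): targets Gradio 4.x, where the legacy enable_queue
        # and show_tips launch kwargs no longer exist; queueing is configured
        # via demo.queue() instead, so those flags are dropped below.
        demo.queue()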
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # Set to True for public sharing
            debug=False,
            show_error=True,
            quiet=False,
            favicon_path=None,
            ssl_verify=False,
            max_threads=10
        )

    except Exception as e:
        logger.error(f"Failed to launch demo: {e}")
        print(f"❌ Demo launch failed: {e}")
        print("Please check the logs for more details.")