entropy25 committed on
Commit
0c511f2
·
verified ·
1 Parent(s): 061ab6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +614 -1032
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import torch
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import plotly.graph_objects as go
5
  import plotly.express as px
6
  from plotly.subplots import make_subplots
@@ -18,16 +18,8 @@ from functools import lru_cache, wraps
18
  from dataclasses import dataclass
19
  from typing import List, Dict, Optional, Tuple, Any, Callable
20
  from contextlib import contextmanager
21
- import nltk
22
- from nltk.corpus import stopwords
23
- import langdetect
24
- import pandas as pd
25
  import gc
26
-
27
- # Advanced analysis imports
28
- import shap
29
- import lime
30
- from lime.lime_text import LimeTextExplainer
31
 
32
  # Configuration
33
  @dataclass
@@ -39,45 +31,52 @@ class Config:
39
  CACHE_SIZE: int = 128
40
  BATCH_PROCESSING_SIZE: int = 8
41
 
42
- # Supported languages and models
43
- SUPPORTED_LANGUAGES = {
44
- 'auto': 'Auto Detect',
45
- 'en': 'English',
46
- 'zh': 'Chinese',
47
- 'es': 'Spanish',
48
- 'fr': 'French',
49
- 'de': 'German',
50
- 'sv': 'Swedish'
 
51
  }
52
 
 
53
  MODELS = {
54
- 'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
55
- 'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
56
- 'zh': "uer/roberta-base-finetuned-dianping-chinese"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
 
59
- # Color themes for Plotly
60
- THEMES = {
61
- 'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
62
- 'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
63
- 'dark': {'pos': '#66BB6A', 'neg': '#EF5350', 'neu': '#FFA726'},
64
- 'rainbow': {'pos': '#9C27B0', 'neg': '#E91E63', 'neu': '#FF5722'}
65
  }
66
 
67
  config = Config()
68
-
69
- # Logging setup
70
- logging.basicConfig(level=logging.INFO)
71
  logger = logging.getLogger(__name__)
72
 
73
- # Initialize NLTK
74
- try:
75
- nltk.download('stopwords', quiet=True)
76
- nltk.download('punkt', quiet=True)
77
- STOP_WORDS = set(stopwords.words('english'))
78
- except:
79
- STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
80
-
81
  # Decorators and Context Managers
82
  def handle_errors(default_return=None):
83
  """Centralized error handling decorator"""
@@ -92,166 +91,113 @@ def handle_errors(default_return=None):
92
  return wrapper
93
  return decorator
94
 
95
- @contextmanager
96
- def memory_cleanup():
97
- """Context manager for memory cleanup"""
98
- try:
99
- yield
100
- finally:
101
- gc.collect()
102
-
103
  class ThemeContext:
104
  """Theme management context"""
105
  def __init__(self, theme: str = 'default'):
106
  self.theme = theme
107
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
108
 
109
- # Enhanced Model Manager with Multi-language Support
110
  class ModelManager:
111
  """Multi-language model manager with lazy loading"""
112
  _instance = None
 
 
 
 
113
 
114
  def __new__(cls):
115
  if cls._instance is None:
116
  cls._instance = super().__new__(cls)
117
- cls._instance._initialized = False
118
  return cls._instance
119
 
120
- def __init__(self):
121
- if not self._initialized:
122
- self.models = {}
123
- self.tokenizers = {}
124
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
125
- self._load_default_models()
126
- self._initialized = True
127
-
128
- def _load_default_models(self):
129
- """Load default models"""
130
- try:
131
- # Load multilingual model as default
132
- model_name = config.MODELS['multilingual']
133
- self.tokenizers['default'] = AutoTokenizer.from_pretrained(model_name)
134
- self.models['default'] = AutoModelForSequenceClassification.from_pretrained(model_name)
135
- self.models['default'].to(self.device)
136
- logger.info(f"Default model loaded: {model_name}")
137
-
138
- # Load Chinese model
139
- zh_model_name = config.MODELS['zh']
140
- self.tokenizers['zh'] = AutoTokenizer.from_pretrained(zh_model_name)
141
- self.models['zh'] = AutoModelForSequenceClassification.from_pretrained(zh_model_name)
142
- self.models['zh'].to(self.device)
143
- logger.info(f"Chinese model loaded: {zh_model_name}")
144
-
145
- except Exception as e:
146
- logger.error(f"Failed to load models: {e}")
147
- raise
148
 
149
- def get_model(self, language='en'):
150
- """Get model for specific language"""
151
- if language == 'zh':
152
- return self.models['zh'], self.tokenizers['zh']
153
- return self.models['default'], self.tokenizers['default']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  @staticmethod
156
  def detect_language(text: str) -> str:
157
- """Detect text language"""
158
- try:
159
- detected = langdetect.detect(text)
160
- language_mapping = {
161
- 'zh-cn': 'zh',
162
- 'zh-tw': 'zh'
163
- }
164
- detected = language_mapping.get(detected, detected)
165
- return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
166
- except:
167
- return 'en'
 
 
168
 
169
- # Simplified Text Processing
170
  class TextProcessor:
171
  """Optimized text processing with multi-language support"""
172
-
173
  @staticmethod
174
  @lru_cache(maxsize=config.CACHE_SIZE)
175
- def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
176
- """Clean text with language awareness"""
177
- text = text.strip()
178
-
179
- # Don't clean Chinese text aggressively
180
- if re.search(r'[\u4e00-\u9fff]', text):
181
- return text
182
-
183
- text = text.lower()
184
-
185
- if remove_numbers:
186
- text = re.sub(r'\d+', '', text)
187
-
188
- if remove_punctuation:
189
- text = re.sub(r'[^\w\s]', '', text)
190
-
191
- words = text.split()
192
- cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) >= config.MIN_WORD_LENGTH]
193
- return ' '.join(cleaned_words)
194
-
195
- @staticmethod
196
- def extract_keywords(text: str, top_k: int = 5) -> List[str]:
197
- """Extract keywords with language support"""
198
- if re.search(r'[\u4e00-\u9fff]', text):
199
- # Chinese text processing
200
- words = re.findall(r'[\u4e00-\u9fff]+', text)
201
- all_chars = ''.join(words)
202
- char_freq = Counter(all_chars)
203
- return [char for char, _ in char_freq.most_common(top_k)]
204
- else:
205
- # Other languages
206
- cleaned = TextProcessor.clean_text(text)
207
- words = cleaned.split()
208
- word_freq = Counter(words)
209
- return [word for word, _ in word_freq.most_common(top_k)]
210
-
211
- @staticmethod
212
- def parse_batch_input(text: str) -> List[str]:
213
- """Parse batch input from textarea"""
214
- lines = text.strip().split('\n')
215
- return [line.strip() for line in lines if line.strip()]
216
 
217
- # Enhanced History Manager
218
  class HistoryManager:
219
- """Enhanced history management with filtering"""
220
  def __init__(self):
221
  self._history = []
222
 
223
  def add(self, entry: Dict):
224
- """Add entry with timestamp"""
225
- entry['timestamp'] = datetime.now().isoformat()
226
- self._history.append(entry)
227
  if len(self._history) > config.MAX_HISTORY_SIZE:
228
  self._history = self._history[-config.MAX_HISTORY_SIZE:]
229
 
230
- def add_batch(self, entries: List[Dict]):
231
- """Add multiple entries"""
232
- for entry in entries:
233
- self.add(entry)
234
-
235
  def get_all(self) -> List[Dict]:
236
  return self._history.copy()
237
 
238
- def get_recent(self, n: int = 10) -> List[Dict]:
239
- return self._history[-n:] if self._history else []
240
-
241
- def filter_by(self, sentiment: str = None, language: str = None,
242
- min_confidence: float = None) -> List[Dict]:
243
- """Filter history by criteria"""
244
- filtered = self._history
245
-
246
- if sentiment:
247
- filtered = [h for h in filtered if h['sentiment'] == sentiment]
248
- if language:
249
- filtered = [h for h in filtered if h.get('language', 'en') == language]
250
- if min_confidence:
251
- filtered = [h for h in filtered if h['confidence'] >= min_confidence]
252
-
253
- return filtered
254
-
255
  def clear(self) -> int:
256
  count = len(self._history)
257
  self._history.clear()
@@ -259,404 +205,207 @@ class HistoryManager:
259
 
260
  def size(self) -> int:
261
  return len(self._history)
262
-
263
- def get_stats(self) -> Dict:
264
- """Get comprehensive statistics"""
265
- if not self._history:
266
- return {}
267
-
268
- sentiments = [item['sentiment'] for item in self._history]
269
- confidences = [item['confidence'] for item in self._history]
270
- languages = [item.get('language', 'en') for item in self._history]
271
-
272
- return {
273
- 'total_analyses': len(self._history),
274
- 'positive_count': sentiments.count('Positive'),
275
- 'negative_count': sentiments.count('Negative'),
276
- 'neutral_count': sentiments.count('Neutral'),
277
- 'avg_confidence': np.mean(confidences),
278
- 'max_confidence': np.max(confidences),
279
- 'min_confidence': np.min(confidences),
280
- 'languages_detected': len(set(languages)),
281
- 'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
282
- }
283
 
284
- # Core Sentiment Analysis Engine (Modified - removed attention analysis)
285
  class SentimentEngine:
286
- """Multi-language sentiment analysis engine"""
287
-
288
  def __init__(self):
289
  self.model_manager = ModelManager()
 
290
 
291
- @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
292
- def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
293
- """Analyze single text with basic features"""
294
- if not text.strip():
295
- raise ValueError("Empty text provided")
296
-
297
- # Detect language
298
- if language == 'auto':
299
- detected_lang = self.model_manager.detect_language(text)
300
- else:
301
- detected_lang = language
302
-
303
- # Get appropriate model
304
- model, tokenizer = self.model_manager.get_model(detected_lang)
305
-
306
- # Preprocessing
307
- options = preprocessing_options or {}
308
- processed_text = text
309
- if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
310
- processed_text = TextProcessor.clean_text(
311
- text,
312
- options.get('remove_punctuation', True),
313
- options.get('remove_numbers', False)
314
- )
315
-
316
- # Tokenize and analyze
317
- inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
318
- truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
319
-
320
- with torch.no_grad():
321
- outputs = model(**inputs)
322
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
323
-
324
- # Handle different model outputs
325
- if len(probs) == 3: # negative, neutral, positive
326
- sentiment_idx = np.argmax(probs)
327
- sentiment_labels = ['Negative', 'Neutral', 'Positive']
328
- sentiment = sentiment_labels[sentiment_idx]
329
- confidence = float(probs[sentiment_idx])
330
-
331
- result = {
332
- 'sentiment': sentiment,
333
- 'confidence': confidence,
334
- 'neg_prob': float(probs[0]),
335
- 'neu_prob': float(probs[1]),
336
- 'pos_prob': float(probs[2]),
337
- 'has_neutral': True
338
- }
339
- else: # negative, positive
340
- pred = np.argmax(probs)
341
- sentiment = "Positive" if pred == 1 else "Negative"
342
- confidence = float(probs[pred])
343
-
344
- result = {
345
- 'sentiment': sentiment,
346
- 'confidence': confidence,
347
- 'neg_prob': float(probs[0]),
348
- 'pos_prob': float(probs[1]),
349
- 'neu_prob': 0.0,
350
- 'has_neutral': False
351
- }
352
-
353
- # Extract basic keywords
354
- keywords = TextProcessor.extract_keywords(text, 10)
355
- keyword_tuples = [(word, 0.1) for word in keywords] # Simple keyword extraction
356
-
357
- # Add metadata
358
- result.update({
359
- 'language': detected_lang,
360
- 'keywords': keyword_tuples,
361
- 'word_count': len(text.split()),
362
- 'char_count': len(text)
363
- })
364
-
365
- return result
366
-
367
- @handle_errors(default_return=[])
368
- def analyze_batch(self, texts: List[str], language: str = 'auto',
369
- preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
370
- """Optimized batch processing"""
371
- if len(texts) > config.BATCH_SIZE_LIMIT:
372
- texts = texts[:config.BATCH_SIZE_LIMIT]
373
-
374
- results = []
375
- batch_size = config.BATCH_PROCESSING_SIZE
376
-
377
- for i in range(0, len(texts), batch_size):
378
- batch = texts[i:i+batch_size]
379
-
380
- if progress_callback:
381
- progress_callback((i + len(batch)) / len(texts))
382
-
383
- for text in batch:
384
- try:
385
- result = self.analyze_single(text, language, preprocessing_options)
386
- result['batch_index'] = len(results)
387
- result['text'] = text[:100] + '...' if len(text) > 100 else text
388
- result['full_text'] = text
389
- results.append(result)
390
- except Exception as e:
391
- results.append({
392
- 'sentiment': 'Error',
393
- 'confidence': 0.0,
394
- 'error': str(e),
395
- 'batch_index': len(results),
396
- 'text': text[:100] + '...' if len(text) > 100 else text,
397
- 'full_text': text
398
- })
399
-
400
- return results
401
-
402
- # Advanced Analysis Engine (NEW)
403
- class AdvancedAnalysisEngine:
404
- """Advanced analysis using SHAP and LIME"""
405
-
406
- def __init__(self):
407
- self.model_manager = ModelManager()
408
-
409
- def create_prediction_function(self, model, tokenizer, device):
410
- """Create prediction function for LIME/SHAP"""
411
- def predict_proba(texts):
412
- results = []
413
- for text in texts:
414
- inputs = tokenizer(text, return_tensors="pt", padding=True,
415
- truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
416
- with torch.no_grad():
417
- outputs = model(**inputs)
418
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
419
- results.append(probs)
420
- return np.array(results)
421
- return predict_proba
422
-
423
- @handle_errors(default_return=("Analysis failed", None, None))
424
- def analyze_with_shap(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
425
- """Perform SHAP analysis"""
426
- if not text.strip():
427
- return "Please enter text for analysis", None, {}
428
-
429
- # Detect language and get model
430
- if language == 'auto':
431
- detected_lang = self.model_manager.detect_language(text)
432
- else:
433
- detected_lang = language
434
-
435
- model, tokenizer = self.model_manager.get_model(detected_lang)
436
-
437
- # Create prediction function
438
- predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
439
-
440
  try:
441
- # Initialize SHAP explainer
442
- explainer = shap.Explainer(predict_fn, tokenizer)
443
-
444
- # Get SHAP values
445
- shap_values = explainer([text])
446
 
447
- # Extract token importance
448
- tokens = shap_values.data[0]
449
- values = shap_values.values[0]
 
450
 
451
- # Create visualization data
452
- if len(values.shape) > 1:
453
- # Multi-class case
454
- pos_values = values[:, -1] if values.shape[1] == 3 else values[:, 1]
455
- else:
456
- pos_values = values
457
-
458
- # Create SHAP plot
459
- fig = go.Figure()
460
-
461
- colors = ['red' if v < 0 else 'green' for v in pos_values]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
- fig.add_trace(go.Bar(
464
- x=list(range(len(tokens))),
465
- y=pos_values,
466
- text=tokens,
467
- textposition='outside',
468
- marker_color=colors,
469
- name='SHAP Values'
470
- ))
471
 
472
- fig.update_layout(
473
- title="SHAP Analysis - Token Importance",
474
- xaxis_title="Token Index",
475
- yaxis_title="SHAP Value",
476
- height=500,
477
- xaxis=dict(tickmode='array', tickvals=list(range(len(tokens))), ticktext=tokens)
478
- )
479
 
480
- # Create analysis summary
481
- analysis_data = {
482
- 'method': 'SHAP',
483
- 'language': detected_lang,
484
- 'total_tokens': len(tokens),
485
- 'positive_influence': sum(1 for v in pos_values if v > 0),
486
- 'negative_influence': sum(1 for v in pos_values if v < 0),
487
- 'most_important_tokens': [(tokens[i], float(pos_values[i]))
488
- for i in np.argsort(np.abs(pos_values))[-5:]]
489
  }
490
 
491
- summary_text = f"""
492
- **SHAP Analysis Results:**
493
- - **Language:** {detected_lang.upper()}
494
- - **Total Tokens:** {analysis_data['total_tokens']}
495
- - **Positive Influence Tokens:** {analysis_data['positive_influence']}
496
- - **Negative Influence Tokens:** {analysis_data['negative_influence']}
497
- - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
498
- """
499
-
500
- return summary_text, fig, analysis_data
501
 
502
  except Exception as e:
503
- logger.error(f"SHAP analysis failed: {e}")
504
- return f"SHAP analysis failed: {str(e)}", None, {}
505
 
506
- @handle_errors(default_return=("Analysis failed", None, None))
507
- def analyze_with_lime(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
508
- """Perform LIME analysis"""
509
  if not text.strip():
510
- return "Please enter text for analysis", None, {}
511
-
512
- # Detect language and get model
513
- if language == 'auto':
514
- detected_lang = self.model_manager.detect_language(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  else:
516
- detected_lang = language
 
517
 
518
- model, tokenizer = self.model_manager.get_model(detected_lang)
 
519
 
520
- # Create prediction function
521
- predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- try:
524
- # Initialize LIME explainer
525
- explainer = LimeTextExplainer(class_names=['Negative', 'Neutral', 'Positive'])
526
-
527
- # Get LIME explanation
528
- exp = explainer.explain_instance(text, predict_fn, num_features=20)
529
-
530
- # Extract feature importance
531
- lime_data = exp.as_list()
532
-
533
- # Create visualization
534
- words = [item[0] for item in lime_data]
535
- scores = [item[1] for item in lime_data]
536
-
537
- fig = go.Figure()
538
-
539
- colors = ['red' if s < 0 else 'green' for s in scores]
540
-
541
- fig.add_trace(go.Bar(
542
- y=words,
543
- x=scores,
544
- orientation='h',
545
- marker_color=colors,
546
- text=[f'{s:.3f}' for s in scores],
547
- textposition='auto',
548
- name='LIME Importance'
549
- ))
550
-
551
- fig.update_layout(
552
- title="LIME Analysis - Feature Importance",
553
- xaxis_title="Importance Score",
554
- yaxis_title="Words/Phrases",
555
- height=500
556
- )
557
-
558
- # Create analysis summary
559
- analysis_data = {
560
- 'method': 'LIME',
561
- 'language': detected_lang,
562
- 'features_analyzed': len(lime_data),
563
- 'positive_features': sum(1 for _, score in lime_data if score > 0),
564
- 'negative_features': sum(1 for _, score in lime_data if score < 0),
565
- 'feature_importance': lime_data
566
- }
567
-
568
- summary_text = f"""
569
- **LIME Analysis Results:**
570
- - **Language:** {detected_lang.upper()}
571
- - **Features Analyzed:** {analysis_data['features_analyzed']}
572
- - **Positive Features:** {analysis_data['positive_features']}
573
- - **Negative Features:** {analysis_data['negative_features']}
574
- - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
575
- """
576
-
577
- return summary_text, fig, analysis_data
578
 
579
- except Exception as e:
580
- logger.error(f"LIME analysis failed: {e}")
581
- return f"LIME analysis failed: {str(e)}", None, {}
582
-
583
- # Advanced Plotly Visualization System (Updated - removed attention visualization)
584
- class PlotlyVisualizer:
585
- """Enhanced Plotly visualizations"""
586
-
587
- @staticmethod
588
- @handle_errors(default_return=None)
589
- def create_sentiment_gauge(result: Dict, theme: ThemeContext) -> go.Figure:
590
- """Create animated sentiment gauge"""
591
- colors = theme.colors
592
-
593
- if result.get('has_neutral', False):
594
- # Three-way gauge
595
- fig = go.Figure(go.Indicator(
596
- mode="gauge+number+delta",
597
- value=result['pos_prob'] * 100,
598
- domain={'x': [0, 1], 'y': [0, 1]},
599
- title={'text': f"Sentiment: {result['sentiment']}"},
600
- delta={'reference': 50},
601
- gauge={
602
- 'axis': {'range': [None, 100]},
603
- 'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
604
- 'steps': [
605
- {'range': [0, 33], 'color': colors['neg']},
606
- {'range': [33, 67], 'color': colors['neu']},
607
- {'range': [67, 100], 'color': colors['pos']}
608
- ],
609
- 'threshold': {
610
- 'line': {'color': "red", 'width': 4},
611
- 'thickness': 0.75,
612
- 'value': 90
613
- }
614
- }
615
- ))
616
- else:
617
- # Two-way gauge
618
- fig = go.Figure(go.Indicator(
619
- mode="gauge+number",
620
- value=result['confidence'] * 100,
621
- domain={'x': [0, 1], 'y': [0, 1]},
622
- title={'text': f"Confidence: {result['sentiment']}"},
623
- gauge={
624
- 'axis': {'range': [None, 100]},
625
- 'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
626
- 'steps': [
627
- {'range': [0, 50], 'color': "lightgray"},
628
- {'range': [50, 100], 'color': "gray"}
629
- ]
630
- }
631
- ))
632
 
633
- fig.update_layout(height=400, font={'size': 16})
634
- return fig
 
 
 
635
 
636
  @staticmethod
637
  @handle_errors(default_return=None)
638
- def create_probability_bars(result: Dict, theme: ThemeContext) -> go.Figure:
639
- """Create probability bar chart"""
640
- colors = theme.colors
641
-
642
- if result.get('has_neutral', False):
643
- labels = ['Negative', 'Neutral', 'Positive']
644
- values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
645
- bar_colors = [colors['neg'], colors['neu'], colors['pos']]
646
- else:
647
- labels = ['Negative', 'Positive']
648
- values = [result['neg_prob'], result['pos_prob']]
649
- bar_colors = [colors['neg'], colors['pos']]
 
 
 
 
 
 
 
 
650
 
651
  fig = go.Figure(data=[
652
- go.Bar(x=labels, y=values, marker_color=bar_colors,
653
- text=[f'{v:.3f}' for v in values], textposition='outside')
 
 
 
 
 
654
  ])
655
 
656
  fig.update_layout(
657
  title="Sentiment Probabilities",
 
658
  yaxis_title="Probability",
659
- height=400,
 
 
660
  showlegend=False
661
  )
662
 
@@ -664,160 +413,171 @@ class PlotlyVisualizer:
664
 
665
  @staticmethod
666
  @handle_errors(default_return=None)
667
- def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
668
- """Create basic keyword chart"""
669
- if not keywords:
670
- fig = go.Figure()
671
- fig.add_annotation(text="No keywords extracted",
672
- xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
673
- fig.update_layout(height=400, title="Keywords")
674
- return fig
675
-
676
- words = [word for word, score in keywords]
677
- scores = [score for word, score in keywords]
678
-
679
- color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
680
-
681
- fig = go.Figure(data=[
682
- go.Bar(
683
- y=words,
684
- x=scores,
685
- orientation='h',
686
- marker_color=color,
687
- text=[f'{score:.3f}' for score in scores],
688
- textposition='auto'
689
- )
690
- ])
691
 
692
  fig.update_layout(
693
- title=f"Top Keywords ({sentiment})",
694
- xaxis_title="Frequency Score",
695
- yaxis_title="Keywords",
696
- height=400,
697
- showlegend=False
698
  )
699
 
700
  return fig
701
 
702
  @staticmethod
703
  @handle_errors(default_return=None)
704
- def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
705
- """Create batch analysis summary"""
706
- colors = theme.colors
707
-
708
- # Count sentiments
709
- sentiments = [r['sentiment'] for r in results if 'sentiment' in r and r['sentiment'] != 'Error']
710
- sentiment_counts = Counter(sentiments)
711
-
712
- # Create pie chart
713
- fig = go.Figure(data=[go.Pie(
714
- labels=list(sentiment_counts.keys()),
715
- values=list(sentiment_counts.values()),
716
- marker_colors=[colors.get(s.lower()[:3], '#999999') for s in sentiment_counts.keys()],
717
- textinfo='label+percent',
718
- hole=0.3
719
- )])
 
 
 
720
 
721
  fig.update_layout(
722
- title=f"Batch Analysis Summary ({len(results)} texts)",
723
- height=400
 
 
 
 
724
  )
725
 
726
  return fig
727
 
728
  @staticmethod
729
  @handle_errors(default_return=None)
730
- def create_confidence_distribution(results: List[Dict]) -> go.Figure:
731
- """Create confidence distribution plot"""
732
- confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']
733
-
734
- if not confidences:
735
- return go.Figure()
736
 
737
- fig = go.Figure(data=[go.Histogram(
738
- x=confidences,
739
- nbinsx=20,
740
- marker_color='skyblue',
741
- opacity=0.7
742
- )])
743
-
744
- fig.update_layout(
745
- title="Confidence Distribution",
746
- xaxis_title="Confidence Score",
747
- yaxis_title="Frequency",
748
- height=400
749
- )
750
-
751
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
 
753
  @staticmethod
754
  @handle_errors(default_return=None)
755
- def create_history_dashboard(history: List[Dict], theme: ThemeContext) -> go.Figure:
756
- """Create comprehensive history dashboard"""
757
- if len(history) < 2:
758
- return go.Figure()
759
-
760
- # Create subplots
761
  fig = make_subplots(
762
  rows=2, cols=2,
763
- subplot_titles=['Sentiment Timeline', 'Confidence Distribution',
764
- 'Language Distribution', 'Sentiment Summary'],
765
- specs=[[{"secondary_y": False}, {"secondary_y": False}],
766
- [{"type": "pie"}, {"type": "bar"}]]
767
  )
768
 
769
- # Extract data
770
- indices = list(range(len(history)))
771
- pos_probs = [item.get('pos_prob', 0) for item in history]
772
- confidences = [item['confidence'] for item in history]
773
- sentiments = [item['sentiment'] for item in history]
774
- languages = [item.get('language', 'en') for item in history]
775
-
776
- # Sentiment timeline
777
- colors_map = {'Positive': theme.colors['pos'], 'Negative': theme.colors['neg'], 'Neutral': theme.colors['neu']}
778
- colors = [colors_map.get(s, '#999999') for s in sentiments]
779
 
780
  fig.add_trace(
781
- go.Scatter(x=indices, y=pos_probs, mode='lines+markers',
782
- marker=dict(color=colors, size=8),
783
- name='Positive Probability'),
784
  row=1, col=1
785
  )
786
 
787
- # Confidence distribution
 
788
  fig.add_trace(
789
- go.Histogram(x=confidences, nbinsx=10, name='Confidence'),
790
  row=1, col=2
791
  )
792
 
793
- # Language distribution
794
- lang_counts = Counter(languages)
 
 
 
 
 
795
  fig.add_trace(
796
- go.Pie(labels=list(lang_counts.keys()), values=list(lang_counts.values()),
797
- name="Languages"),
 
798
  row=2, col=1
799
  )
800
 
801
- # Sentiment summary
802
- sent_counts = Counter(sentiments)
803
- sent_colors = [colors_map.get(k, '#999999') for k in sent_counts.keys()]
804
- fig.add_trace(
805
- go.Bar(x=list(sent_counts.keys()), y=list(sent_counts.values()),
806
- marker_color=sent_colors),
807
- row=2, col=2
 
808
  )
809
 
810
- fig.update_layout(height=800, showlegend=False)
811
  return fig
812
 
813
- # Universal Data Handler
814
  class DataHandler:
815
- """Enhanced data operations"""
816
 
817
  @staticmethod
818
  @handle_errors(default_return=(None, "Export failed"))
819
  def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
820
- """Export data with comprehensive information"""
821
  if not data:
822
  return None, "No data to export"
823
 
@@ -826,21 +586,18 @@ class DataHandler:
826
 
827
  if format_type == 'csv':
828
  writer = csv.writer(temp_file)
829
- writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
830
- 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Keywords', 'Word_Count'])
831
  for entry in data:
832
- keywords_str = "|".join([f"{word}:{score:.3f}" for word, score in entry.get('keywords', [])])
833
  writer.writerow([
834
  entry.get('timestamp', ''),
835
  entry.get('text', ''),
836
  entry.get('sentiment', ''),
837
  f"{entry.get('confidence', 0):.4f}",
838
- entry.get('language', 'en'),
839
  f"{entry.get('pos_prob', 0):.4f}",
840
  f"{entry.get('neg_prob', 0):.4f}",
841
- f"{entry.get('neu_prob', 0):.4f}",
842
- keywords_str,
843
- entry.get('word_count', 0)
844
  ])
845
  elif format_type == 'json':
846
  json.dump(data, temp_file, indent=2, ensure_ascii=False)
@@ -851,26 +608,27 @@ class DataHandler:
851
  @staticmethod
852
  @handle_errors(default_return="")
853
  def process_file(file) -> str:
854
- """Process uploaded files"""
855
  if not file:
856
  return ""
857
-
858
  content = file.read().decode('utf-8')
859
 
860
  if file.name.endswith('.csv'):
 
861
  csv_file = io.StringIO(content)
862
  reader = csv.reader(csv_file)
863
  try:
864
- next(reader) # Skip header
865
  texts = []
866
  for row in reader:
867
  if row and row[0].strip():
868
  text = row[0].strip().strip('"')
869
- if text:
870
  texts.append(text)
871
  return '\n'.join(texts)
872
- except:
873
- lines = content.strip().split('\n')[1:]
874
  texts = []
875
  for line in lines:
876
  if line.strip():
@@ -878,271 +636,171 @@ class DataHandler:
878
  if text:
879
  texts.append(text)
880
  return '\n'.join(texts)
881
-
882
  return content
883
 
884
- # Main Application Class
885
  class SentimentApp:
886
- """Main multilingual sentiment analysis application"""
887
 
888
  def __init__(self):
889
  self.engine = SentimentEngine()
890
- self.advanced_engine = AdvancedAnalysisEngine() # NEW
891
  self.history = HistoryManager()
892
  self.data_handler = DataHandler()
893
 
894
  # Multi-language examples
895
  self.examples = [
896
- ["This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout."],
897
- ["The film was disappointing with poor character development and a confusing storyline."],
898
- ["这部电影真的很棒!演技精湛,情节引人入胜。"], # Chinese
899
- ["Esta película fue increíble, me encantó la cinematografía."], # Spanish
900
- ["Ce film était magnifique, j'ai adoré la réalisation."], # French
901
  ]
902
 
903
- @handle_errors(default_return=("Please enter text", None, None, None))
904
- def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
905
- remove_punct: bool, remove_nums: bool):
906
- """Single text analysis with basic visualizations (removed attention analysis)"""
907
  if not text.strip():
908
- return "Please enter text", None, None, None
909
 
910
- # Map display names to language codes
911
- language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
912
- language_code = language_map.get(language, 'auto')
913
 
914
- preprocessing_options = {
915
- 'clean_text': clean_text,
916
- 'remove_punctuation': remove_punct,
917
- 'remove_numbers': remove_nums
918
- }
 
919
 
920
- with memory_cleanup():
921
- result = self.engine.analyze_single(text, language_code, preprocessing_options)
922
-
923
- # Add to history
924
- history_entry = {
925
- 'text': text[:100] + '...' if len(text) > 100 else text,
926
- 'full_text': text,
927
- 'sentiment': result['sentiment'],
928
- 'confidence': result['confidence'],
929
- 'pos_prob': result.get('pos_prob', 0),
930
- 'neg_prob': result.get('neg_prob', 0),
931
- 'neu_prob': result.get('neu_prob', 0),
932
- 'language': result['language'],
933
- 'keywords': result['keywords'],
934
- 'word_count': result['word_count'],
935
- 'analysis_type': 'single'
936
- }
937
- self.history.add(history_entry)
938
-
939
- # Create visualizations
940
- theme_ctx = ThemeContext(theme)
941
- gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
942
- bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
943
- keyword_fig = PlotlyVisualizer.create_keyword_chart(result['keywords'], result['sentiment'], theme_ctx)
944
-
945
- # Create comprehensive result text
946
- keywords_str = ", ".join([f"{word}({score:.3f})" for word, score in result['keywords'][:5]])
947
-
948
- info_text = f"""
949
- **Analysis Results:**
950
- - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
951
- - **Language:** {result['language'].upper()}
952
- - **Keywords:** {keywords_str}
953
- - **Statistics:** {result['word_count']} words, {result['char_count']} characters
954
- """
955
-
956
- return info_text, gauge_fig, bars_fig, keyword_fig
957
-
958
- @handle_errors(default_return=("Please enter texts", None, None, None))
959
- def analyze_batch(self, batch_text: str, language: str, theme: str,
960
- clean_text: bool, remove_punct: bool, remove_nums: bool):
961
- """Enhanced batch analysis"""
962
- if not batch_text.strip():
963
- return "Please enter texts (one per line)", None, None, None
964
 
965
- # Parse batch input
966
- texts = TextProcessor.parse_batch_input(batch_text)
 
 
967
 
968
- if len(texts) > config.BATCH_SIZE_LIMIT:
969
- return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None
 
 
 
970
 
971
- if not texts:
972
- return "No valid texts found", None, None, None
 
 
 
 
 
973
 
974
- # Map display names to language codes
975
- language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
976
- language_code = language_map.get(language, 'auto')
977
 
978
- preprocessing_options = {
979
- 'clean_text': clean_text,
980
- 'remove_punctuation': remove_punct,
981
- 'remove_numbers': remove_nums
982
- }
983
 
984
- with memory_cleanup():
985
- results = self.engine.analyze_batch(texts, language_code, preprocessing_options)
986
-
987
- # Add to history
988
- batch_entries = []
989
- for result in results:
990
- if 'error' not in result:
991
- entry = {
992
- 'text': result['text'],
993
- 'full_text': result['full_text'],
994
- 'sentiment': result['sentiment'],
995
- 'confidence': result['confidence'],
996
- 'pos_prob': result.get('pos_prob', 0),
997
- 'neg_prob': result.get('neg_prob', 0),
998
- 'neu_prob': result.get('neu_prob', 0),
999
- 'language': result['language'],
1000
- 'keywords': result['keywords'],
1001
- 'word_count': result['word_count'],
1002
- 'analysis_type': 'batch',
1003
- 'batch_index': result['batch_index']
1004
- }
1005
- batch_entries.append(entry)
1006
-
1007
- self.history.add_batch(batch_entries)
1008
-
1009
- # Create visualizations
1010
- theme_ctx = ThemeContext(theme)
1011
- summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
1012
- confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)
1013
-
1014
- # Create results DataFrame
1015
- df_data = []
1016
- for result in results:
1017
- if 'error' in result:
1018
- df_data.append({
1019
- 'Index': result['batch_index'] + 1,
1020
- 'Text': result['text'],
1021
- 'Sentiment': 'Error',
1022
- 'Confidence': 0.0,
1023
- 'Language': 'Unknown',
1024
- 'Error': result['error']
1025
- })
1026
- else:
1027
- keywords_str = ', '.join([word for word, _ in result['keywords'][:3]])
1028
- df_data.append({
1029
- 'Index': result['batch_index'] + 1,
1030
- 'Text': result['text'],
1031
- 'Sentiment': result['sentiment'],
1032
- 'Confidence': f"{result['confidence']:.3f}",
1033
- 'Language': result['language'].upper(),
1034
- 'Keywords': keywords_str
1035
- })
1036
-
1037
- df = pd.DataFrame(df_data)
1038
-
1039
- # Create summary text
1040
- successful_results = [r for r in results if 'error' not in r]
1041
- error_count = len(results) - len(successful_results)
1042
-
1043
- if successful_results:
1044
- sentiment_counts = Counter([r['sentiment'] for r in successful_results])
1045
- avg_confidence = np.mean([r['confidence'] for r in successful_results])
1046
- languages = Counter([r['language'] for r in successful_results])
1047
-
1048
- summary_text = f"""
1049
- **Batch Analysis Summary:**
1050
- - **Total Texts:** {len(texts)}
1051
- - **Successful:** {len(successful_results)}
1052
- - **Errors:** {error_count}
1053
- - **Average Confidence:** {avg_confidence:.3f}
1054
- - **Sentiments:** {dict(sentiment_counts)}
1055
- - **Languages Detected:** {dict(languages)}
1056
- """
1057
- else:
1058
- summary_text = f"All {len(texts)} texts failed to analyze."
1059
-
1060
- return summary_text, df, summary_fig, confidence_fig
1061
-
1062
- # NEW: Advanced analysis methods
1063
- @handle_errors(default_return=("Please enter text", None))
1064
- def analyze_with_shap(self, text: str, language: str):
1065
- """Perform SHAP analysis"""
1066
- language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
1067
- language_code = language_map.get(language, 'auto')
1068
-
1069
- return self.advanced_engine.analyze_with_shap(text, language_code)
1070
-
1071
- @handle_errors(default_return=("Please enter text", None))
1072
- def analyze_with_lime(self, text: str, language: str):
1073
- """Perform LIME analysis"""
1074
- language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
1075
- language_code = language_map.get(language, 'auto')
1076
 
1077
- return self.advanced_engine.analyze_with_lime(text, language_code)
 
 
1078
 
1079
  @handle_errors(default_return=(None, "No history available"))
1080
  def plot_history(self, theme: str = 'default'):
1081
- """Plot comprehensive history analysis"""
1082
  history = self.history.get_all()
1083
  if len(history) < 2:
1084
  return None, f"Need at least 2 analyses for trends. Current: {len(history)}"
1085
 
1086
  theme_ctx = ThemeContext(theme)
1087
 
1088
- with memory_cleanup():
1089
- fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
1090
- stats = self.history.get_stats()
1091
-
1092
- stats_text = f"""
1093
- **History Statistics:**
1094
- - **Total Analyses:** {stats.get('total_analyses', 0)}
1095
- - **Positive:** {stats.get('positive_count', 0)}
1096
- - **Negative:** {stats.get('negative_count', 0)}
1097
- - **Neutral:** {stats.get('neutral_count', 0)}
1098
- - **Average Confidence:** {stats.get('avg_confidence', 0):.3f}
1099
- - **Languages:** {stats.get('languages_detected', 0)}
1100
- - **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}
1101
- """
1102
-
1103
- return fig, stats_text
1104
-
1105
- @handle_errors(default_return=("No data available",))
1106
- def get_history_status(self):
1107
- """Get current history status"""
1108
- stats = self.history.get_stats()
1109
- if not stats:
1110
- return "No analyses performed yet"
1111
-
1112
- return f"""
1113
- **Current Status:**
1114
- - **Total Analyses:** {stats['total_analyses']}
1115
- - **Recent Sentiment Distribution:**
1116
- * Positive: {stats['positive_count']}
1117
- * Negative: {stats['negative_count']}
1118
- * Neutral: {stats['neutral_count']}
1119
- - **Average Confidence:** {stats['avg_confidence']:.3f}
1120
- - **Languages Detected:** {stats['languages_detected']}
1121
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1122
 
1123
- # Gradio Interface (Updated with Advanced Analysis tab)
1124
  def create_interface():
1125
- """Create comprehensive Gradio interface with Advanced Analysis tab"""
1126
  app = SentimentApp()
1127
 
1128
- with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
1129
- gr.Markdown("# 🌍 Advanced Multilingual Sentiment Analyzer")
1130
- gr.Markdown("AI-powered sentiment analysis with support for multiple languages, advanced visualizations, and explainable AI features")
1131
 
1132
  with gr.Tab("Single Analysis"):
1133
  with gr.Row():
1134
  with gr.Column():
1135
  text_input = gr.Textbox(
1136
- label="Enter Text for Analysis",
1137
- placeholder="Enter your text in any supported language...",
1138
  lines=5
1139
  )
1140
-
1141
  with gr.Row():
1142
- language_selector = gr.Dropdown(
1143
- choices=list(config.SUPPORTED_LANGUAGES.values()),
1144
- value="Auto Detect",
1145
- label="Language"
 
 
 
 
 
 
 
 
1146
  )
1147
  theme_selector = gr.Dropdown(
1148
  choices=list(config.THEMES.keys()),
@@ -1150,218 +808,142 @@ def create_interface():
1150
  label="Theme"
1151
  )
1152
 
1153
- with gr.Row():
1154
- clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
1155
- remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1156
- remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1157
-
1158
- analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
1159
-
1160
  gr.Examples(
1161
  examples=app.examples,
1162
  inputs=text_input,
1163
- cache_examples=False
1164
  )
1165
 
1166
  with gr.Column():
1167
- result_output = gr.Textbox(label="Analysis Results", lines=8)
1168
 
1169
  with gr.Row():
1170
- gauge_plot = gr.Plot(label="Sentiment Gauge")
1171
- probability_plot = gr.Plot(label="Probability Distribution")
1172
 
1173
  with gr.Row():
1174
- keyword_plot = gr.Plot(label="Basic Keywords")
1175
-
1176
- # NEW: Advanced Analysis Tab
1177
- with gr.Tab("Advanced Analysis"):
1178
- gr.Markdown("## 🔬 Explainable AI Analysis")
1179
- gr.Markdown("Use SHAP and LIME to understand which words and phrases most influence the sentiment prediction.")
1180
-
1181
- with gr.Row():
1182
- with gr.Column():
1183
- advanced_text_input = gr.Textbox(
1184
- label="Enter Text for Advanced Analysis",
1185
- placeholder="Enter text to analyze with SHAP and LIME...",
1186
- lines=6
1187
- )
1188
-
1189
- advanced_language = gr.Dropdown(
1190
- choices=list(config.SUPPORTED_LANGUAGES.values()),
1191
- value="Auto Detect",
1192
- label="Language"
1193
- )
1194
-
1195
- with gr.Row():
1196
- shap_btn = gr.Button("SHAP Analysis", variant="primary")
1197
- lime_btn = gr.Button("LIME Analysis", variant="secondary")
1198
-
1199
- gr.Markdown("""
1200
- **Analysis Methods:**
1201
- - **SHAP**: Shows token-level importance scores
1202
- - **LIME**: Explains predictions by perturbing input features
1203
- """)
1204
-
1205
- with gr.Column():
1206
- advanced_results = gr.Textbox(label="Analysis Summary", lines=10)
1207
-
1208
- with gr.Row():
1209
- advanced_plot = gr.Plot(label="Feature Importance Visualization")
1210
 
1211
  with gr.Tab("Batch Analysis"):
1212
  with gr.Row():
1213
  with gr.Column():
1214
- file_upload = gr.File(
1215
- label="Upload File (CSV/TXT)",
1216
- file_types=[".csv", ".txt"]
1217
- )
1218
  batch_input = gr.Textbox(
1219
- label="Batch Input (one text per line)",
1220
- placeholder="Enter multiple texts, one per line...",
1221
- lines=10
1222
  )
1223
-
1224
- with gr.Row():
1225
- batch_language = gr.Dropdown(
1226
- choices=list(config.SUPPORTED_LANGUAGES.values()),
1227
- value="Auto Detect",
1228
- label="Language"
1229
- )
1230
- batch_theme = gr.Dropdown(
1231
- choices=list(config.THEMES.keys()),
1232
- value="default",
1233
- label="Theme"
1234
- )
1235
-
1236
- with gr.Row():
1237
- batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
1238
- batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1239
- batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1240
-
1241
- with gr.Row():
1242
- load_file_btn = gr.Button("Load File")
1243
- analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")
1244
 
1245
  with gr.Column():
1246
- batch_summary = gr.Textbox(label="Batch Summary", lines=8)
1247
- batch_results_df = gr.Dataframe(
1248
- label="Detailed Results",
1249
- headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
1250
- datatype=["number", "str", "str", "str", "str", "str"]
1251
- )
 
 
 
 
 
 
 
 
 
1252
 
1253
- with gr.Row():
1254
- batch_plot = gr.Plot(label="Batch Analysis Summary")
1255
- confidence_dist_plot = gr.Plot(label="Confidence Distribution")
1256
 
1257
- with gr.Tab("History & Analytics"):
1258
  with gr.Row():
1259
- with gr.Column():
1260
- with gr.Row():
1261
- refresh_history_btn = gr.Button("Refresh History")
1262
- clear_history_btn = gr.Button("Clear History", variant="stop")
1263
- status_btn = gr.Button("Get Status")
1264
-
1265
- history_theme = gr.Dropdown(
1266
- choices=list(config.THEMES.keys()),
1267
- value="default",
1268
- label="Dashboard Theme"
1269
- )
1270
-
1271
- with gr.Row():
1272
- export_csv_btn = gr.Button("Export CSV")
1273
- export_json_btn = gr.Button("Export JSON")
1274
-
1275
- with gr.Column():
1276
- history_status = gr.Textbox(label="History Status", lines=8)
1277
-
1278
- history_dashboard = gr.Plot(label="History Analytics Dashboard")
1279
 
1280
  with gr.Row():
1281
- csv_download = gr.File(label="CSV Download", visible=True)
1282
- json_download = gr.File(label="JSON Download", visible=True)
1283
-
1284
- # Event Handlers
1285
-
1286
- # Single Analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1287
  analyze_btn.click(
1288
  app.analyze_single,
1289
- inputs=[text_input, language_selector, theme_selector,
1290
- clean_text_cb, remove_punct_cb, remove_nums_cb],
1291
- outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1292
- )
1293
-
1294
- # Advanced Analysis (NEW)
1295
- shap_btn.click(
1296
- app.analyze_with_shap,
1297
- inputs=[advanced_text_input, advanced_language],
1298
- outputs=[advanced_results, advanced_plot]
1299
  )
1300
 
1301
- lime_btn.click(
1302
- app.analyze_with_lime,
1303
- inputs=[advanced_text_input, advanced_language],
1304
- outputs=[advanced_results, advanced_plot]
1305
- )
1306
-
1307
- # Batch Analysis
1308
- load_file_btn.click(
1309
- app.data_handler.process_file,
1310
- inputs=file_upload,
1311
  outputs=batch_input
1312
  )
1313
 
1314
- analyze_batch_btn.click(
1315
- app.analyze_batch,
1316
- inputs=[batch_input, batch_language, batch_theme,
1317
- batch_clean_cb, batch_punct_cb, batch_nums_cb],
1318
- outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
1319
  )
1320
 
1321
- # History & Analytics
1322
- refresh_history_btn.click(
1323
- app.plot_history,
1324
- inputs=history_theme,
1325
- outputs=[history_dashboard, history_status]
1326
  )
1327
 
1328
- clear_history_btn.click(
1329
  lambda: f"Cleared {app.history.clear()} entries",
1330
  outputs=history_status
1331
  )
1332
 
1333
  status_btn.click(
1334
- app.get_history_status,
1335
  outputs=history_status
1336
  )
1337
 
1338
- export_csv_btn.click(
1339
  lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
1340
- outputs=[csv_download, history_status]
1341
  )
1342
 
1343
- export_json_btn.click(
1344
  lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
1345
- outputs=[json_download, history_status]
1346
  )
1347
 
1348
  return demo
1349
 
1350
  # Application Entry Point
1351
  if __name__ == "__main__":
1352
- logging.basicConfig(
1353
- level=logging.INFO,
1354
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1355
- )
1356
-
1357
- try:
1358
- demo = create_interface()
1359
- demo.launch(
1360
- share=True,
1361
- server_name="0.0.0.0",
1362
- server_port=7860,
1363
- show_error=True
1364
- )
1365
- except Exception as e:
1366
- logger.error(f"Failed to launch application: {e}")
1367
- raise
 
1
  import torch
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
4
  import plotly.graph_objects as go
5
  import plotly.express as px
6
  from plotly.subplots import make_subplots
 
18
  from dataclasses import dataclass
19
  from typing import List, Dict, Optional, Tuple, Any, Callable
20
  from contextlib import contextmanager
 
 
 
 
21
  import gc
22
+ import base64
 
 
 
 
23
 
24
  # Configuration
25
  @dataclass
 
31
  CACHE_SIZE: int = 128
32
  BATCH_PROCESSING_SIZE: int = 8
33
 
34
+ # Visualization settings
35
+ FIGURE_WIDTH: int = 800
36
+ FIGURE_HEIGHT: int = 500
37
+ WORDCLOUD_SIZE: Tuple[int, int] = (800, 400)
38
+
39
+ THEMES = {
40
+ 'default': {'pos': '#4ecdc4', 'neg': '#ff6b6b'},
41
+ 'ocean': {'pos': '#0077be', 'neg': '#ff6b35'},
42
+ 'forest': {'pos': '#228b22', 'neg': '#dc143c'},
43
+ 'sunset': {'pos': '#ff8c00', 'neg': '#8b0000'}
44
  }
45
 
46
+ # Multi-language models
47
  MODELS = {
48
+ 'multilingual': {
49
+ 'name': 'cardiffnlp/twitter-xlm-roberta-base-sentiment',
50
+ 'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE']
51
+ },
52
+ 'english': {
53
+ 'name': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
54
+ 'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE']
55
+ },
56
+ 'chinese': {
57
+ 'name': 'uer/roberta-base-finetuned-chinanews-chinese',
58
+ 'labels': ['NEGATIVE', 'POSITIVE']
59
+ },
60
+ 'spanish': {
61
+ 'name': 'finiteautomata/beto-sentiment-analysis',
62
+ 'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE']
63
+ },
64
+ 'french': {
65
+ 'name': 'tblard/tf-allocine',
66
+ 'labels': ['NEGATIVE', 'POSITIVE']
67
+ }
68
  }
69
 
70
+ STOP_WORDS = {
71
+ 'en': {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should'},
72
+ 'zh': {'', '', '', '', '', '', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看'},
73
+ 'es': {'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del', 'los', 'las'},
74
+ 'fr': {'le', 'la', 'les', 'de', 'un', 'une', 'du', 'des', 'et', 'à', 'ce', 'il', 'que', 'qui', 'ne', 'se', 'pas', 'tout', 'être', 'avoir', 'sur', 'avec', 'par'},
 
75
  }
76
 
77
  config = Config()
 
 
 
78
  logger = logging.getLogger(__name__)
79
 
 
 
 
 
 
 
 
 
80
  # Decorators and Context Managers
81
  def handle_errors(default_return=None):
82
  """Centralized error handling decorator"""
 
91
  return wrapper
92
  return decorator
93
 
 
 
 
 
 
 
 
 
94
  class ThemeContext:
95
  """Theme management context"""
96
  def __init__(self, theme: str = 'default'):
97
  self.theme = theme
98
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
99
 
100
+ # Enhanced Model Manager for Multi-language Support
101
  class ModelManager:
102
  """Multi-language model manager with lazy loading"""
103
  _instance = None
104
+ _models = {}
105
+ _tokenizers = {}
106
+ _pipelines = {}
107
+ _device = None
108
 
109
  def __new__(cls):
110
  if cls._instance is None:
111
  cls._instance = super().__new__(cls)
 
112
  return cls._instance
113
 
114
+ @property
115
+ def device(self):
116
+ if self._device is None:
117
+ self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118
+ return self._device
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ def get_pipeline(self, model_key: str = 'multilingual'):
121
+ """Get or create sentiment analysis pipeline for specified model"""
122
+ if model_key not in self._pipelines:
123
+ try:
124
+ model_config = config.MODELS[model_key]
125
+ self._pipelines[model_key] = pipeline(
126
+ "sentiment-analysis",
127
+ model=model_config['name'],
128
+ tokenizer=model_config['name'],
129
+ device=0 if torch.cuda.is_available() else -1,
130
+ top_k=None
131
+ )
132
+ logger.info(f"Model {model_key} loaded successfully")
133
+ except Exception as e:
134
+ logger.error(f"Failed to load model {model_key}: {e}")
135
+ # Fallback to multilingual model
136
+ if model_key != 'multilingual':
137
+ return self.get_pipeline('multilingual')
138
+ raise
139
+ return self._pipelines[model_key]
140
+
141
+ def get_model_and_tokenizer(self, model_key: str = 'multilingual'):
142
+ """Get model and tokenizer for attention extraction"""
143
+ if model_key not in self._models:
144
+ try:
145
+ model_config = config.MODELS[model_key]
146
+ self._tokenizers[model_key] = AutoTokenizer.from_pretrained(model_config['name'])
147
+ self._models[model_key] = AutoModelForSequenceClassification.from_pretrained(model_config['name'])
148
+ self._models[model_key].to(self.device)
149
+ logger.info(f"Model and tokenizer {model_key} loaded for attention extraction")
150
+ except Exception as e:
151
+ logger.error(f"Failed to load model/tokenizer {model_key}: {e}")
152
+ if model_key != 'multilingual':
153
+ return self.get_model_and_tokenizer('multilingual')
154
+ raise
155
+ return self._models[model_key], self._tokenizers[model_key]
156
+
157
+ # Language Detection
158
+ class LanguageDetector:
159
+ """Simple language detection based on character patterns"""
160
 
161
  @staticmethod
162
  def detect_language(text: str) -> str:
163
+ """Detect language based on character patterns"""
164
+ # Chinese characters
165
+ if re.search(r'[\u4e00-\u9fff]', text):
166
+ return 'chinese'
167
+ # Spanish patterns
168
+ elif re.search(r'[ñáéíóúü]', text.lower()):
169
+ return 'spanish'
170
+ # French patterns
171
+ elif re.search(r'[àâäçéèêëïîôùûüÿ]', text.lower()):
172
+ return 'french'
173
+ # Default to English/Multilingual
174
+ else:
175
+ return 'multilingual'
176
 
177
+ # Simplified Core Classes
178
  class TextProcessor:
179
  """Optimized text processing with multi-language support"""
 
180
  @staticmethod
181
  @lru_cache(maxsize=config.CACHE_SIZE)
182
+ def clean_text(text: str, language: str = 'en') -> Tuple[str, ...]:
183
+ """Single-pass text cleaning with language-specific stop words"""
184
+ words = re.findall(r'\b\w{2,}\b', text.lower())
185
+ stop_words = config.STOP_WORDS.get(language, config.STOP_WORDS['en'])
186
+ return tuple(w for w in words if w not in stop_words and len(w) >= config.MIN_WORD_LENGTH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
 
188
  class HistoryManager:
189
+ """Simplified history management"""
190
  def __init__(self):
191
  self._history = []
192
 
193
  def add(self, entry: Dict):
194
+ self._history.append({**entry, 'timestamp': datetime.now().isoformat()})
 
 
195
  if len(self._history) > config.MAX_HISTORY_SIZE:
196
  self._history = self._history[-config.MAX_HISTORY_SIZE:]
197
 
 
 
 
 
 
198
  def get_all(self) -> List[Dict]:
199
  return self._history.copy()
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def clear(self) -> int:
202
  count = len(self._history)
203
  self._history.clear()
 
205
 
206
  def size(self) -> int:
207
  return len(self._history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
+ # Core Analysis Engine with Multi-language Support
210
  class SentimentEngine:
211
+ """Multi-language sentiment analysis with attention-based keyword extraction"""
 
212
  def __init__(self):
213
  self.model_manager = ModelManager()
214
+ self.language_detector = LanguageDetector()
215
 
216
+ def extract_key_words(self, text: str, model_key: str = 'multilingual', top_k: int = 10) -> List[Tuple[str, float]]:
217
+ """Extract contributing words using attention weights"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  try:
219
+ model, tokenizer = self.model_manager.get_model_and_tokenizer(model_key)
 
 
 
 
220
 
221
+ inputs = tokenizer(
222
+ text, return_tensors="pt", padding=True,
223
+ truncation=True, max_length=config.MAX_TEXT_LENGTH
224
+ ).to(self.model_manager.device)
225
 
226
+ # Get model outputs with attention weights
227
+ with torch.no_grad():
228
+ outputs = model(**inputs, output_attentions=True)
229
+ attention = outputs.attentions
230
+
231
+ # Use the last layer's attention, average over all heads
232
+ last_attention = attention[-1]
233
+ avg_attention = last_attention.mean(dim=1)
234
+
235
+ # Focus on attention to [CLS] token
236
+ cls_attention = avg_attention[0, 0, :]
237
+
238
+ # Get tokens and their attention scores
239
+ tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
240
+ attention_scores = cls_attention.cpu().numpy()
241
+
242
+ # Filter out special tokens and combine subword tokens
243
+ word_scores = {}
244
+ current_word = ""
245
+ current_score = 0.0
246
+
247
+ for i, (token, score) in enumerate(zip(tokens, attention_scores)):
248
+ if token in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
249
+ continue
250
+
251
+ if token.startswith('##') or token.startswith('▁'):
252
+ # Subword token
253
+ current_word += token[2:] if token.startswith('##') else token[1:]
254
+ current_score = max(current_score, score)
255
+ else:
256
+ # New word, save previous if exists
257
+ if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
258
+ word_scores[current_word.lower()] = current_score
259
+
260
+ current_word = token
261
+ current_score = score
262
 
263
+ # Don't forget the last word
264
+ if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
265
+ word_scores[current_word.lower()] = current_score
 
 
 
 
 
266
 
267
+ # Filter out stop words and sort by attention score
268
+ lang_code = 'zh' if model_key == 'chinese' else 'es' if model_key == 'spanish' else 'fr' if model_key == 'french' else 'en'
269
+ stop_words = config.STOP_WORDS.get(lang_code, config.STOP_WORDS['en'])
 
 
 
 
270
 
271
+ filtered_words = {
272
+ word: score for word, score in word_scores.items()
273
+ if word not in stop_words and len(word) >= config.MIN_WORD_LENGTH
 
 
 
 
 
 
274
  }
275
 
276
+ # Sort by attention score and return top_k
277
+ sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
278
+ return sorted_words[:top_k]
 
 
 
 
 
 
 
279
 
280
  except Exception as e:
281
+ logger.error(f"Key word extraction failed: {e}")
282
+ return []
283
 
284
+ @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
285
+ def analyze_single(self, text: str, model_key: str = None) -> Dict:
286
+ """Analyze single text with automatic language detection"""
287
  if not text.strip():
288
+ raise ValueError("Empty text")
289
+
290
+ # Auto-detect language if not specified
291
+ if model_key is None:
292
+ detected_lang = self.language_detector.detect_language(text)
293
+ model_key = detected_lang if detected_lang in config.MODELS else 'multilingual'
294
+
295
+ # Get sentiment analysis pipeline
296
+ classifier = self.model_manager.get_pipeline(model_key)
297
+ results = classifier(text)
298
+
299
+ # Process results based on model output format
300
+ if isinstance(results[0], list):
301
+ results = results[0]
302
+
303
+ # Map results to standardized format
304
+ sentiment_map = {'POSITIVE': 'Positive', 'NEGATIVE': 'Negative', 'NEUTRAL': 'Neutral'}
305
+
306
+ # Find positive and negative scores
307
+ pos_score = 0.0
308
+ neg_score = 0.0
309
+ neutral_score = 0.0
310
+
311
+ for result in results:
312
+ label = result['label']
313
+ score = result['score']
314
+
315
+ if 'POSITIVE' in label:
316
+ pos_score = score
317
+ elif 'NEGATIVE' in label:
318
+ neg_score = score
319
+ elif 'NEUTRAL' in label:
320
+ neutral_score = score
321
+
322
+ # Determine final sentiment
323
+ if pos_score > neg_score and pos_score > neutral_score:
324
+ sentiment = 'Positive'
325
+ confidence = pos_score
326
+ elif neg_score > pos_score and neg_score > neutral_score:
327
+ sentiment = 'Negative'
328
+ confidence = neg_score
329
  else:
330
+ sentiment = 'Neutral'
331
+ confidence = neutral_score
332
 
333
+ # Extract key contributing words
334
+ key_words = self.extract_key_words(text, model_key)
335
 
336
+ return {
337
+ 'sentiment': sentiment,
338
+ 'confidence': float(confidence),
339
+ 'pos_prob': float(pos_score),
340
+ 'neg_prob': float(neg_score),
341
+ 'neutral_prob': float(neutral_score),
342
+ 'key_words': key_words,
343
+ 'language': model_key
344
+ }
345
+
346
+ @handle_errors(default_return=[])
347
+ def analyze_batch(self, texts: List[str], model_key: str = None, progress_callback=None) -> List[Dict]:
348
+ """Optimized batch processing with key words"""
349
+ if len(texts) > config.BATCH_SIZE_LIMIT:
350
+ texts = texts[:config.BATCH_SIZE_LIMIT]
351
 
352
+ results = []
353
+
354
+ for i, text in enumerate(texts):
355
+ if progress_callback:
356
+ progress_callback((i + 1) / len(texts))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
+ result = self.analyze_single(text, model_key)
359
+ result['text'] = text[:50] + '...' if len(text) > 50 else text
360
+ result['full_text'] = text
361
+ results.append(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
+ return results
364
+
365
+ # Plotly Visualization System
366
class PlotFactory:
    """Factory for creating Plotly visualizations.

    All public builders are static methods wrapped in
    ``handle_errors(default_return=None)``; callers should therefore be ready
    to receive ``None`` instead of a figure.
    """

    # Colour used for the neutral class in every chart of this factory.
    NEUTRAL_COLOR = '#FFA500'

    @staticmethod
    def _sentiment_color(sentiment: str, theme: ThemeContext) -> str:
        """Map a sentiment label to its display colour.

        ``'Positive'`` and ``'Negative'`` use the theme's ``'pos'`` / ``'neg'``
        colours; every other label falls back to ``NEUTRAL_COLOR``.
        """
        if sentiment == 'Positive':
            return theme.colors['pos']
        if sentiment == 'Negative':
            return theme.colors['neg']
        return PlotFactory.NEUTRAL_COLOR

    @staticmethod
    @handle_errors(default_return=None)
    def create_sentiment_bars(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create a bar chart of the class probabilities of one analysis.

        Args:
            result: Analysis result; the keys ``'neg_prob'``, ``'neutral_prob'``
                and ``'pos_prob'`` are read when present.
            theme: Colour source for the positive and negative bars.

        Returns:
            Figure with one bar per class whose probability is present and
            greater than zero, ordered Negative, Neutral, Positive.
        """
        labels = []
        values = []
        colors = []

        # (result key, axis label, sentiment used to pick the colour)
        for key, label in (('neg_prob', "Negative"),
                           ('neutral_prob', "Neutral"),
                           ('pos_prob', "Positive")):
            if key in result and result[key] > 0:
                labels.append(label)
                values.append(result[key])
                colors.append(PlotFactory._sentiment_color(label, theme))

        fig = go.Figure(data=[
            go.Bar(
                x=labels,
                y=values,
                marker_color=colors,
                text=[f'{v:.3f}' for v in values],
                textposition='auto',
            )
        ])

        fig.update_layout(
            title="Sentiment Probabilities",
            xaxis_title="Sentiment",
            yaxis_title="Probability",
            yaxis=dict(range=[0, 1]),
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT,
            showlegend=False
        )

        # Without this return the chart would never reach the UI.
        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_confidence_gauge(confidence: float, sentiment: str, theme: ThemeContext) -> go.Figure:
        """Create a gauge showing the confidence of the predicted sentiment.

        Args:
            confidence: Value displayed by the gauge; the axis ends at 1.
            sentiment: Predicted label, used for the title and the bar colour.
            theme: Colour source for positive/negative predictions.

        Returns:
            Indicator figure with a delta relative to 0.5 and a red threshold
            marker at 0.9.
        """
        color = PlotFactory._sentiment_color(sentiment, theme)

        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=confidence,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': f"{sentiment} Confidence"},
            delta={'reference': 0.5},
            gauge={
                'axis': {'range': [None, 1]},
                'bar': {'color': color},
                'steps': [
                    {'range': [0, 0.5], 'color': "lightgray"},
                    {'range': [0.5, 1], 'color': "gray"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 0.9
                }
            }
        ))

        fig.update_layout(
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[go.Figure]:
        """Create a horizontal bar chart for key contributing words.

        Args:
            key_words: ``(word, score)`` pairs to plot.
            sentiment: Predicted label, used for the title and the bar colour.
            theme: Colour source for positive/negative predictions.

        Returns:
            The figure, or ``None`` when ``key_words`` is empty.
        """
        if not key_words:
            return None

        words = [word for word, _ in key_words]
        scores = [score for _, score in key_words]

        # Choose color based on sentiment
        color = PlotFactory._sentiment_color(sentiment, theme)

        fig = go.Figure(go.Bar(
            x=scores,
            y=words,
            orientation='h',
            marker_color=color,
            text=[f'{score:.3f}' for score in scores],
            textposition='auto',
        ))

        fig.update_layout(
            title=f'Top Contributing Words ({sentiment})',
            xaxis_title='Attention Weight',
            yaxis_title='Words',
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT,
            # Longest bar on top.
            yaxis={'categoryorder': 'total ascending'}
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_wordcloud_plot(text: str, sentiment: str, theme: ThemeContext) -> Optional[go.Figure]:
        """Render a word cloud of ``text`` as an image inside a Plotly figure.

        Args:
            text: Text to visualise; inputs with fewer than three
                whitespace-separated tokens are skipped.
            sentiment: Predicted label; selects the colormap (Greens for
                Positive, Reds for Negative, Blues otherwise) and the title.
            theme: Accepted for signature parity with the other builders; not
                used by this method.

        Returns:
            The figure, or ``None`` when the text is too short or the word
            cloud could not be generated (the failure is logged).
        """
        if len(text.split()) < 3:
            return None

        try:
            colormap = 'Greens' if sentiment == 'Positive' else 'Reds' if sentiment == 'Negative' else 'Blues'
            wc = WordCloud(
                width=config.WORDCLOUD_SIZE[0],
                height=config.WORDCLOUD_SIZE[1],
                background_color='white',
                colormap=colormap,
                max_words=30
            ).generate(text)

            # Convert to image
            img_array = wc.to_array()

            fig = go.Figure()
            fig.add_trace(go.Image(z=img_array))
            fig.update_layout(
                title=f'{sentiment} Word Cloud',
                xaxis={'visible': False},
                yaxis={'visible': False},
                width=config.FIGURE_WIDTH,
                height=config.FIGURE_HEIGHT,
                margin=dict(l=0, r=0, t=30, b=0)
            )

            return fig

        except Exception as e:
            logger.error(f"Word cloud generation failed: {e}")
            return None

    @staticmethod
    @handle_errors(default_return=None)
    def create_batch_analysis(results: List[Dict], theme: ThemeContext) -> Optional[go.Figure]:
        """Create the three-panel overview of a batch run.

        Panels: sentiment distribution (pie), confidence distribution
        (histogram) and the positive probability of every entry in input order
        (scatter spanning the bottom row, with a dashed guide at 0.5).

        Args:
            results: Per-text results; ``'sentiment'`` and ``'confidence'`` are
                required on every entry, ``'pos_prob'`` defaults to 0.
            theme: Colour source for positive/negative entries.

        Returns:
            The figure, or ``None`` when ``results`` is empty.
        """
        # Nothing to chart: returning None keeps the plot area empty instead
        # of rendering three blank panels.
        if not results:
            return None

        fig = make_subplots(
            rows=2, cols=2,
            # Exactly one title per subplot; the bottom chart spans both columns.
            subplot_titles=['Sentiment Distribution', 'Confidence Distribution',
                            'Sentiment Progression'],
            specs=[[{"type": "pie"}, {"type": "histogram"}],
                   [{"type": "scatter", "colspan": 2}, None]]
        )

        # Sentiment distribution (pie chart)
        sent_counts = Counter(r['sentiment'] for r in results)
        colors_pie = [PlotFactory._sentiment_color(s, theme) for s in sent_counts]

        fig.add_trace(
            go.Pie(labels=list(sent_counts.keys()), values=list(sent_counts.values()),
                   marker_colors=colors_pie, name="Sentiment"),
            row=1, col=1
        )

        # Confidence histogram
        confs = [r['confidence'] for r in results]
        fig.add_trace(
            go.Histogram(x=confs, nbinsx=8, marker_color='skyblue', name="Confidence"),
            row=1, col=2
        )

        # Sentiment progression
        pos_probs = [r.get('pos_prob', 0) for r in results]
        indices = list(range(len(results)))
        colors_scatter = [PlotFactory._sentiment_color(r['sentiment'], theme) for r in results]

        fig.add_trace(
            go.Scatter(x=indices, y=pos_probs, mode='markers',
                       marker=dict(color=colors_scatter, size=8),
                       name="Sentiment Progression"),
            row=2, col=1
        )

        # Add horizontal line at 0.5
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", row=2, col=1)

        fig.update_layout(
            height=800,
            width=1000,
            showlegend=False,
            title_text="Batch Analysis Results"
        )

        return fig
572
 
573
+ # Unified Data Handler (unchanged)
574
  class DataHandler:
575
+ """Handles all data operations"""
576
 
577
  @staticmethod
578
  @handle_errors(default_return=(None, "Export failed"))
579
  def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
580
+ """Universal data export"""
581
  if not data:
582
  return None, "No data to export"
583
 
 
586
 
587
  if format_type == 'csv':
588
  writer = csv.writer(temp_file)
589
+ writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Neutral_Prob', 'Language', 'Key_Words'])
 
590
  for entry in data:
 
591
  writer.writerow([
592
  entry.get('timestamp', ''),
593
  entry.get('text', ''),
594
  entry.get('sentiment', ''),
595
  f"{entry.get('confidence', 0):.4f}",
 
596
  f"{entry.get('pos_prob', 0):.4f}",
597
  f"{entry.get('neg_prob', 0):.4f}",
598
+ f"{entry.get('neutral_prob', 0):.4f}",
599
+ entry.get('language', ''),
600
+ "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
601
  ])
602
  elif format_type == 'json':
603
  json.dump(data, temp_file, indent=2, ensure_ascii=False)
 
608
  @staticmethod
609
  @handle_errors(default_return="")
610
  def process_file(file) -> str:
611
+ """Process uploaded file"""
612
  if not file:
613
  return ""
614
+
615
  content = file.read().decode('utf-8')
616
 
617
  if file.name.endswith('.csv'):
618
+ import io
619
  csv_file = io.StringIO(content)
620
  reader = csv.reader(csv_file)
621
  try:
622
+ next(reader)
623
  texts = []
624
  for row in reader:
625
  if row and row[0].strip():
626
  text = row[0].strip().strip('"')
627
+ if text:
628
  texts.append(text)
629
  return '\n'.join(texts)
630
+ except Exception as e:
631
+ lines = content.strip().split('\n')[1:]
632
  texts = []
633
  for line in lines:
634
  if line.strip():
 
636
  if text:
637
  texts.append(text)
638
  return '\n'.join(texts)
 
639
  return content
640
 
641
+ # Main Application with Multi-language Support
642
class SentimentApp:
    """Main application orchestrator with multi-language support.

    Connects the sentiment engine, the analysis history, the data handler and
    the plot factory, and exposes the callbacks used by the Gradio interface.
    """

    def __init__(self):
        self.engine = SentimentEngine()
        self.history = HistoryManager()
        self.data_handler = DataHandler()

        # Multi-language examples
        self.examples = [
            ["While the film's visual effects were undeniably impressive, the story lacked emotional weight, and the pacing felt inconsistent throughout."],
            ["这部电影的视觉效果令人印象深刻,但故事缺乏情感深度,节奏感也不够连贯。"],
            ["Aunque los efectos visuales de la película fueron innegablemente impresionantes, la historia carecía de peso emocional."],
            ["Bien que les effets visuels du film soient indéniablement impressionnants, l'histoire manquait de poids émotionnel."],
            ["An extraordinary achievement in filmmaking — the direction was masterful, the script was sharp, and every performance added depth and realism."]
        ]

    @handle_errors(default_return=("Please enter text", None, None, None, None))
    def analyze_single(self, text: str, model_key: str = 'multilingual', theme: str = 'default'):
        """Analyze one text, record it in the history and build its charts.

        Args:
            text: Text to analyze.
            model_key: Model selector forwarded to the engine.
            theme: Name of the colour theme used for the charts.

        Returns:
            Tuple ``(summary, probability_bars, confidence_gauge, word_cloud,
            keyword_chart)``. For missing or blank input the summary is
            ``"Please enter text"`` and every plot is ``None``.
        """
        # `not text` also covers None, which has no .strip().
        if not text or not text.strip():
            return "Please enter text", None, None, None, None

        result = self.engine.analyze_single(text, model_key)

        # Add to history
        self.history.add({
            'text': text[:100],
            'full_text': text,
            **result
        })

        # Create visualizations
        theme_ctx = ThemeContext(theme)

        prob_plot = PlotFactory.create_sentiment_bars(result, theme_ctx)
        gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
        cloud_plot = PlotFactory.create_wordcloud_plot(text, result['sentiment'], theme_ctx)
        keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)

        # Format result text with key words (only the first five are listed)
        key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
        result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
                       f"Language: {result['language']}\n"
                       f"Key Words: {key_words_str}")

        return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot

    @handle_errors(default_return=None)
    def analyze_batch(self, reviews: str, model_key: str = 'multilingual', progress=None, theme: str = 'default'):
        """Analyze newline-separated reviews and chart the batch.

        Args:
            reviews: Reviews, one per line; blank lines are ignored.
            model_key: Model selector forwarded to the engine.
            progress: Optional progress callback forwarded to the engine.
            theme: Name of the colour theme used for the chart.

        Returns:
            The batch overview figure, or ``None`` when fewer than two
            non-empty reviews were supplied.
        """
        if not reviews or not reviews.strip():
            return None

        texts = [r.strip() for r in reviews.split('\n') if r.strip()]
        if len(texts) < 2:
            return None

        results = self.engine.analyze_batch(texts, model_key, progress)

        # Add to history
        for result in results:
            self.history.add(result)

        # Create visualization
        theme_ctx = ThemeContext(theme)
        return PlotFactory.create_batch_analysis(results, theme_ctx)

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Plot analysis history using Plotly.

        Args:
            theme: Name of the colour theme used for the markers.

        Returns:
            Tuple ``(figure, status)``. With fewer than two history entries the
            figure is ``None`` and the status explains why.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        # Create subplots
        fig = make_subplots(
            rows=2, cols=1,
            subplot_titles=['Sentiment History', 'Confidence Over Time'],
            vertical_spacing=0.12
        )

        indices = list(range(len(history)))
        pos_probs = [item.get('pos_prob', 0) for item in history]
        # Tolerate entries without a confidence, as is done for pos_prob.
        confs = [item.get('confidence', 0) for item in history]

        # Sentiment trend. Neutral entries get the neutral colour used by the
        # batch chart; colouring them by `p > 0.5` alone would show them as
        # negative.
        colors = []
        for item, p in zip(history, pos_probs):
            if item.get('sentiment') == 'Neutral':
                colors.append('#FFA500')
            else:
                colors.append(theme_ctx.colors['pos'] if p > 0.5 else theme_ctx.colors['neg'])

        fig.add_trace(
            go.Scatter(
                x=indices,
                y=pos_probs,
                mode='markers+lines',
                marker=dict(color=colors, size=8),
                line=dict(color='gray', width=2),
                name='Sentiment Trend'
            ),
            row=1, col=1
        )

        # Add horizontal line at 0.5
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", row=1, col=1)

        # Confidence trend
        fig.add_trace(
            go.Bar(
                x=indices,
                y=confs,
                marker_color='lightblue',
                marker_line_color='navy',
                marker_line_width=1,
                name='Confidence'
            ),
            row=2, col=1
        )

        fig.update_layout(
            height=800,
            width=1000,
            showlegend=False,
            title_text="Analysis History"
        )

        fig.update_xaxes(title_text="Analysis Number", row=2, col=1)
        fig.update_yaxes(title_text="Positive Probability", row=1, col=1)
        fig.update_yaxes(title_text="Confidence", row=2, col=1)

        return fig, f"History: {len(history)} analyses"
773
 
774
+ # Gradio Interface Setup with Multi-language Support
775
def create_interface():
    """Build the Gradio Blocks UI and wire it to a fresh ``SentimentApp``.

    The interface has four tabs: single analysis, batch analysis, history and
    export, and a static model overview.

    Returns:
        The assembled ``gr.Blocks`` instance; the caller is expected to launch it.
    """
    app = SentimentApp()

    # (label, value) pairs offered by both model dropdowns.
    model_choices = [
        ('Auto-detect', 'multilingual'),
        ('Multilingual', 'multilingual'),
        ('English', 'english'),
        ('Chinese 中文', 'chinese'),
        ('Spanish Español', 'spanish'),
        ('French Français', 'french')
    ]

    with gr.Blocks(theme=gr.themes.Soft(), title="Multi-language Sentiment Analyzer") as demo:
        gr.Markdown("# 🌍 AI Multi-language Sentiment Analyzer")
        gr.Markdown("Advanced sentiment analysis supporting multiple languages with Plotly visualizations and key word extraction")

        # --- Tab 1: one text in, summary plus four charts out ---------------
        with gr.Tab("Single Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Review Text (Multiple Languages Supported)",
                        placeholder="Enter your review in any supported language...",
                        lines=5
                    )

                    with gr.Row():
                        analyze_btn = gr.Button("Analyze", variant="primary")
                        model_selector = gr.Dropdown(
                            choices=list(model_choices),
                            value="multilingual",
                            label="Language Model"
                        )
                        theme_selector = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    gr.Examples(
                        examples=app.examples,
                        inputs=text_input,
                        label="Multi-language Examples"
                    )

                with gr.Column():
                    result_output = gr.Textbox(label="Analysis Result", lines=4)

                    with gr.Row():
                        prob_plot = gr.Plot(label="Sentiment Probabilities")
                        gauge_plot = gr.Plot(label="Confidence Gauge")

                    with gr.Row():
                        wordcloud_plot = gr.Plot(label="Word Cloud")
                        keyword_plot = gr.Plot(label="Key Contributing Words")

        # --- Tab 2: many texts (typed or loaded from a file) -----------------
        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(label="Upload File", file_types=[".csv", ".txt"])
                    batch_input = gr.Textbox(
                        label="Reviews (one per line, mixed languages supported)",
                        lines=8,
                        placeholder="Enter multiple reviews, one per line...\nSupports mixed languages in the same batch!"
                    )

                with gr.Column():
                    load_btn = gr.Button("Load File")
                    with gr.Row():
                        batch_btn = gr.Button("Analyze Batch", variant="primary")
                        batch_model_selector = gr.Dropdown(
                            choices=list(model_choices),
                            value="multilingual",
                            label="Batch Model"
                        )

            batch_plot = gr.Plot(label="Batch Analysis Results")

        # --- Tab 3: trends over past analyses and file export ----------------
        with gr.Tab("History & Export"):
            with gr.Row():
                refresh_btn = gr.Button("Refresh History")
                clear_btn = gr.Button("Clear History", variant="stop")
                status_btn = gr.Button("Show Status")

            with gr.Row():
                csv_btn = gr.Button("Export CSV")
                json_btn = gr.Button("Export JSON")

            history_status = gr.Textbox(label="Status Information")
            history_plot = gr.Plot(label="History Trends")
            csv_file = gr.File(label="CSV Download", visible=True)
            json_file = gr.File(label="JSON Download", visible=True)

        # --- Tab 4: static documentation --------------------------------------
        with gr.Tab("Model Information"):
            gr.Markdown("""
            ## Supported Languages and Models

            | Language | Model | Description |
            |----------|-------|-------------|
            | **Multilingual** | XLM-RoBERTa | Supports 100+ languages automatically |
            | **English** | RoBERTa-base | Optimized for English text |
            | **Chinese 中文** | RoBERTa-Chinese | Specialized for Chinese language |
            | **Spanish Español** | BETO | Fine-tuned for Spanish sentiment |
            | **French Français** | tf-allocine | Trained on French movie reviews |

            ### Features:
            - **Automatic Language Detection**: The system can automatically detect the input language
            - **Attention-based Keywords**: Extract words that contribute most to sentiment prediction
            - **Interactive Visualizations**: Plotly-powered charts and graphs
            - **Batch Processing**: Analyze multiple texts at once
            - **Export Capabilities**: Save results in CSV or JSON format
            - **Multi-language Support**: Mix different languages in batch analysis
            """)

        # Event bindings
        analyze_btn.click(
            app.analyze_single,
            inputs=[text_input, model_selector, theme_selector],
            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
        )

        # Loading a file only fills the batch textbox; analysis is a separate click.
        load_btn.click(
            app.data_handler.process_file,
            inputs=file_upload,
            outputs=batch_input
        )

        batch_btn.click(
            app.analyze_batch,
            inputs=[batch_input, batch_model_selector],
            outputs=batch_plot
        )

        # The history chart reuses the theme chosen on the first tab.
        refresh_btn.click(
            lambda theme: app.plot_history(theme),
            inputs=theme_selector,
            outputs=[history_plot, history_status]
        )

        clear_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status
        )

        status_btn.click(
            lambda: f"History: {app.history.size()} entries | Available Models: {', '.join(config.MODELS.keys())}",
            outputs=history_status
        )

        csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_file, history_status]
        )

        json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_file, history_status]
        )

    return demo
939
 
940
  # Application Entry Point
941
if __name__ == "__main__":
    # Configure logging before the interface (and with it the app) is built.
    logging.basicConfig(level=logging.INFO)

    # Listen on all interfaces on port 7860, request a public share link and
    # surface errors in the UI.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }

    demo = create_interface()
    demo.launch(**launch_options)