entropy25 committed on
Commit a58e3ae · verified · 1 Parent(s): d3eb8f6

Update app.py

Files changed (1)
  1. app.py +1093 -745
app.py CHANGED
@@ -14,21 +14,25 @@ import io
  import tempfile
  from datetime import datetime
  import logging
- from functools import lru_cache
  from dataclasses import dataclass
- from typing import List, Dict, Optional, Tuple
  import nltk
  from nltk.corpus import stopwords
  import langdetect
  import pandas as pd

  # Configuration
  @dataclass
  class Config:
- MAX_HISTORY_SIZE: int = 500
- BATCH_SIZE_LIMIT: int = 30
  MAX_TEXT_LENGTH: int = 512
- CACHE_SIZE: int = 64

  # Supported languages and models
  SUPPORTED_LANGUAGES = {
@@ -47,7 +51,7 @@ class Config:
  'zh': "uer/roberta-base-finetuned-dianping-chinese"
  }

- # Color themes
  THEMES = {
  'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
  'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
@@ -69,16 +73,55 @@ try:
  except:
  STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

  class ModelManager:
- """Manages multiple language models"""
  def __init__(self):
- self.models = {}
- self.tokenizers = {}
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- self._load_default_model()

- def _load_default_model(self):
- """Load the default models"""
  try:
  # Load multilingual model as default
  model_name = config.MODELS['multilingual']
@@ -102,17 +145,13 @@ class ModelManager:
  """Get model for specific language"""
  if language == 'zh':
  return self.models['zh'], self.tokenizers['zh']
- elif language in ['en', 'auto'] or language not in config.SUPPORTED_LANGUAGES:
- return self.models['default'], self.tokenizers['default']
- return self.models['default'], self.tokenizers['default'] # Use multilingual for other languages

  @staticmethod
  def detect_language(text: str) -> str:
- """Detect text language properly"""
  try:
- # Use langdetect for all languages
  detected = langdetect.detect(text)
- # Map some common langdetect codes to our supported languages
  language_mapping = {
  'zh-cn': 'zh',
  'zh-tw': 'zh'
@@ -122,32 +161,80 @@ class ModelManager:
  except:
  return 'en'

- model_manager = ModelManager()

  class HistoryManager:
- """Enhanced history manager with more features"""
  def __init__(self):
  self._history = []

- def add_entry(self, entry: Dict):
  self._history.append(entry)
  if len(self._history) > config.MAX_HISTORY_SIZE:
  self._history = self._history[-config.MAX_HISTORY_SIZE:]

- def add_batch_entries(self, entries: List[Dict]):
- """Add multiple entries at once"""
  for entry in entries:
- self.add_entry(entry)

- def get_history(self) -> List[Dict]:
  return self._history.copy()

- def get_recent_history(self, n: int = 10) -> List[Dict]:
- """Get n most recent entries"""
  return self._history[-n:] if self._history else []

- def filter_history(self, sentiment: str = None, language: str = None,
- min_confidence: float = None) -> List[Dict]:
  """Filter history by criteria"""
  filtered = self._history

@@ -165,7 +252,11 @@ class HistoryManager:
  self._history.clear()
  return count

  def get_stats(self) -> Dict:
  if not self._history:
  return {}

@@ -182,72 +273,93 @@ class HistoryManager:
  'max_confidence': np.max(confidences),
  'min_confidence': np.min(confidences),
  'languages_detected': len(set(languages)),
- 'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en',
- 'avg_text_length': np.mean([len(item.get('full_text', '')) for item in self._history])
  }

- history_manager = HistoryManager()
-
- class TextProcessor:
- """Enhanced text processing"""

- @staticmethod
- @lru_cache(maxsize=config.CACHE_SIZE)
- def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
- """Clean text with options"""
- text = text.lower().strip()
-
- if remove_numbers:
- text = re.sub(r'\d+', '', text)
-
- if remove_punctuation:
- text = re.sub(r'[^\w\s]', '', text)
-
- words = text.split()
- cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) > 2]
- return ' '.join(cleaned_words)

- @staticmethod
- def extract_keywords(text: str, top_k: int = 5) -> List[str]:
- """Extract key words from text"""
- # For Chinese text, extract characters
- if re.search(r'[\u4e00-\u9fff]', text):
- words = re.findall(r'[\u4e00-\u9fff]+', text)
- all_chars = ''.join(words)
- char_freq = Counter(all_chars)
- return [char for char, _ in char_freq.most_common(top_k)]
- else:
- # For other languages, use word-based extraction
- cleaned = TextProcessor.clean_text(text)
- words = cleaned.split()
- word_freq = Counter(words)
- return [word for word, _ in word_freq.most_common(top_k)]
-
- @staticmethod
- def parse_batch_input(text: str) -> List[str]:
- """Parse batch input from textarea"""
- lines = text.strip().split('\n')
- return [line.strip() for line in lines if line.strip()]
-
- class SentimentAnalyzer:
- """Enhanced sentiment analysis"""

- @staticmethod
- def analyze_text(text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
- """Analyze single text with language support"""
  if not text.strip():
  raise ValueError("Empty text provided")

- # Detect language if auto
  if language == 'auto':
- detected_lang = model_manager.detect_language(text)
  else:
  detected_lang = language

  # Get appropriate model
- model, tokenizer = model_manager.get_model(detected_lang)

- # Preprocessing options - don't clean Chinese text
  options = preprocessing_options or {}
  processed_text = text
  if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
@@ -257,96 +369,110 @@ class SentimentAnalyzer:
  options.get('remove_numbers', False)
  )

- try:
- # Tokenize and analyze
- inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
- truncation=True, max_length=config.MAX_TEXT_LENGTH).to(model_manager.device)

- with torch.no_grad():
- outputs = model(**inputs)
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
-
- # Handle different model outputs
- if len(probs) == 3: # negative, neutral, positive
- sentiment_idx = np.argmax(probs)
- sentiment_labels = ['Negative', 'Neutral', 'Positive']
- sentiment = sentiment_labels[sentiment_idx]
- confidence = float(probs[sentiment_idx])
-
- result = {
- 'sentiment': sentiment,
- 'confidence': confidence,
- 'neg_prob': float(probs[0]),
- 'neu_prob': float(probs[1]),
- 'pos_prob': float(probs[2]),
- 'has_neutral': True
- }
- else: # negative, positive
- pred = np.argmax(probs)
- sentiment = "Positive" if pred == 1 else "Negative"
- confidence = float(probs[pred])
-
- result = {
- 'sentiment': sentiment,
- 'confidence': confidence,
- 'neg_prob': float(probs[0]),
- 'pos_prob': float(probs[1]),
- 'neu_prob': 0.0,
- 'has_neutral': False
- }

- # Add metadata
- result.update({
- 'language': detected_lang,
- 'keywords': TextProcessor.extract_keywords(text),
- 'word_count': len(text.split()),
- 'char_count': len(text)
- })

- return result

- except Exception as e:
- logger.error(f"Analysis failed: {e}")
- raise
-
- @staticmethod
- def analyze_batch(texts: List[str], language: str = 'auto',
- preprocessing_options: Dict = None) -> List[Dict]:
- """Analyze multiple texts"""
- results = []
- for i, text in enumerate(texts):
- try:
- result = SentimentAnalyzer.analyze_text(text, language, preprocessing_options)
- result['batch_index'] = i
- results.append(result)
- except Exception as e:
- # Add error result
- results.append({
- 'sentiment': 'Error',
- 'confidence': 0.0,
- 'error': str(e),
- 'batch_index': i,
- 'text': text
- })
  return results

  class PlotlyVisualizer:
- """Enhanced visualizations with Plotly"""

  @staticmethod
- def create_sentiment_gauge(result: Dict, theme: str = 'default') -> go.Figure:
- """Create an animated sentiment gauge"""
- colors = config.THEMES[theme]

- if result['has_neutral']:
  # Three-way gauge
  fig = go.Figure(go.Indicator(
- mode = "gauge+number+delta",
- value = result['pos_prob'] * 100,
- domain = {'x': [0, 1], 'y': [0, 1]},
- title = {'text': f"Sentiment: {result['sentiment']}"},
- delta = {'reference': 50},
- gauge = {
  'axis': {'range': [None, 100]},
  'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
  'steps': [
@@ -364,11 +490,11 @@ class PlotlyVisualizer:
  else:
  # Two-way gauge
  fig = go.Figure(go.Indicator(
- mode = "gauge+number",
- value = result['confidence'] * 100,
- domain = {'x': [0, 1], 'y': [0, 1]},
- title = {'text': f"Confidence: {result['sentiment']}"},
- gauge = {
  'axis': {'range': [None, 100]},
  'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
  'steps': [
@@ -382,11 +508,12 @@ class PlotlyVisualizer:
  return fig

  @staticmethod
- def create_probability_bars(result: Dict, theme: str = 'default') -> go.Figure:
  """Create probability bar chart"""
- colors = config.THEMES[theme]

- if result['has_neutral']:
  labels = ['Negative', 'Neutral', 'Positive']
  values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
  bar_colors = [colors['neg'], colors['neu'], colors['pos']]
@@ -396,10 +523,10 @@ class PlotlyVisualizer:
  bar_colors = [colors['neg'], colors['pos']]

  fig = go.Figure(data=[
- go.Bar(x=labels, y=values, marker_color=bar_colors, text=[f'{v:.3f}' for v in values])
  ])

- fig.update_traces(texttemplate='%{text}', textposition='outside')
  fig.update_layout(
  title="Sentiment Probabilities",
  yaxis_title="Probability",
@@ -408,14 +535,52 @@ class PlotlyVisualizer:
  )

  return fig
-
  @staticmethod
- def create_batch_summary(results: List[Dict], theme: str = 'default') -> go.Figure:
  """Create batch analysis summary"""
- colors = config.THEMES[theme]

  # Count sentiments
- sentiments = [r['sentiment'] for r in results if 'sentiment' in r]
  sentiment_counts = Counter(sentiments)

  # Create pie chart
@@ -433,8 +598,9 @@ class PlotlyVisualizer:
  )

  return fig
-
  @staticmethod
  def create_confidence_distribution(results: List[Dict]) -> go.Figure:
  """Create confidence distribution plot"""
  confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']
@@ -459,7 +625,8 @@ class PlotlyVisualizer:
  return fig

  @staticmethod
- def create_history_dashboard(history: List[Dict]) -> go.Figure:
  """Create comprehensive history dashboard"""
  if len(history) < 2:
  return go.Figure()
@@ -475,13 +642,15 @@ class PlotlyVisualizer:

  # Extract data
  indices = list(range(len(history)))
- pos_probs = [item['pos_prob'] for item in history]
  confidences = [item['confidence'] for item in history]
  sentiments = [item['sentiment'] for item in history]
  languages = [item.get('language', 'en') for item in history]

  # Sentiment timeline
- colors = ['#4CAF50' if s == 'Positive' else '#F44336' for s in sentiments]
  fig.add_trace(
  go.Scatter(x=indices, y=pos_probs, mode='lines+markers',
  marker=dict(color=colors, size=8),
@@ -505,33 +674,114 @@ class PlotlyVisualizer:

  # Sentiment summary
  sent_counts = Counter(sentiments)
  fig.add_trace(
  go.Bar(x=list(sent_counts.keys()), y=list(sent_counts.values()),
- marker_color=['#4CAF50' if k == 'Positive' else '#F44336' for k in sent_counts.keys()]),
  row=2, col=2
  )

  fig.update_layout(height=800, showlegend=False)
  return fig

- # Main application functions
- def analyze_single_text(text: str, language: str, theme: str, clean_text: bool,
  remove_punct: bool, remove_nums: bool):
- """Enhanced single text analysis"""
- try:
  if not text.strip():
- return "Please enter text", None, None
-
- # Map display names back to language codes
- language_map = {
- 'Auto Detect': 'auto',
- 'English': 'en',
- 'Chinese': 'zh',
- 'Spanish': 'es',
- 'French': 'fr',
- 'German': 'de',
- 'Swedish': 'sv'
- }
  language_code = language_map.get(language, 'auto')

  preprocessing_options = {
@@ -540,46 +790,48 @@ def analyze_single_text(text: str, language: str, theme: str, clean_text: bool,
  'remove_numbers': remove_nums
  }

- result = SentimentAnalyzer.analyze_text(text, language_code, preprocessing_options)
-
- # Add to history
- history_entry = {
- 'text': text[:100] + '...' if len(text) > 100 else text,
- 'full_text': text,
- 'sentiment': result['sentiment'],
- 'confidence': result['confidence'],
- 'pos_prob': result['pos_prob'],
- 'neg_prob': result['neg_prob'],
- 'neu_prob': result.get('neu_prob', 0),
- 'language': result['language'],
- 'timestamp': datetime.now().isoformat(),
- 'analysis_type': 'single'
- }
- history_manager.add_entry(history_entry)
-
- # Create visualizations
- gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme)
- bars_fig = PlotlyVisualizer.create_probability_bars(result, theme)
-
- # Create info text
- info_text = f"""
  **Analysis Results:**
  - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
  - **Language:** {result['language'].upper()}
- - **Keywords:** {', '.join(result['keywords'])}
- - **Stats:** {result['word_count']} words, {result['char_count']} characters
- """
-
- return info_text, gauge_fig, bars_fig
-
- except Exception as e:
- logger.error(f"Analysis failed: {e}")
- return f"Error: {str(e)}", None, None
-
- def analyze_batch_texts(batch_text: str, language: str, theme: str,
- clean_text: bool, remove_punct: bool, remove_nums: bool):
- """Batch text analysis"""
- try:
  if not batch_text.strip():
  return "Please enter texts (one per line)", None, None, None

@@ -592,16 +844,8 @@ def analyze_batch_texts(batch_text: str, language: str, theme: str,
  if not texts:
  return "No valid texts found", None, None, None

- # Map display names back to language codes
- language_map = {
- 'Auto Detect': 'auto',
- 'English': 'en',
- 'Chinese': 'zh',
- 'Spanish': 'es',
- 'French': 'fr',
- 'German': 'de',
- 'Swedish': 'sv'
- }
  language_code = language_map.get(language, 'auto')

  preprocessing_options = {
@@ -610,556 +854,660 @@ def analyze_batch_texts(batch_text: str, language: str, theme: str,
  'remove_numbers': remove_nums
  }

- # Analyze all texts
- results = SentimentAnalyzer.analyze_batch(texts, language_code, preprocessing_options)
-
- # Add to history
- batch_entries = []
- for i, (text, result) in enumerate(zip(texts, results)):
- if 'error' not in result:
- entry = {
- 'text': text[:100] + '...' if len(text) > 100 else text,
- 'full_text': text,
- 'sentiment': result['sentiment'],
- 'confidence': result['confidence'],
- 'pos_prob': result['pos_prob'],
- 'neg_prob': result['neg_prob'],
- 'neu_prob': result.get('neu_prob', 0),
- 'language': result['language'],
- 'timestamp': datetime.now().isoformat(),
- 'analysis_type': 'batch',
- 'batch_index': i
- }
- batch_entries.append(entry)
-
- history_manager.add_batch_entries(batch_entries)
-
- # Create visualizations
- summary_fig = PlotlyVisualizer.create_batch_summary(results, theme)
- confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)
-
- # Create results table
- df_data = []
- for i, (text, result) in enumerate(zip(texts, results)):
- if 'error' in result:
- df_data.append({
- 'Index': i+1,
- 'Text': text[:50] + '...' if len(text) > 50 else text,
- 'Sentiment': 'Error',
- 'Confidence': 0.0,
- 'Language': 'Unknown',
- 'Error': result['error']
- })
- else:
- df_data.append({
- 'Index': i+1,
- 'Text': text[:50] + '...' if len(text) > 50 else text,
- 'Sentiment': result['sentiment'],
- 'Confidence': f"{result['confidence']:.3f}",
- 'Language': result['language'].upper(),
- 'Keywords': ', '.join(result['keywords'][:3])
- })
-
- df = pd.DataFrame(df_data)
-
- # Summary info
- successful_results = [r for r in results if 'error' not in r]
- error_count = len(results) - len(successful_results)
-
- if successful_results:
- sentiment_counts = Counter([r['sentiment'] for r in successful_results])
- avg_confidence = np.mean([r['confidence'] for r in successful_results])

- summary_text = f"""
  **Batch Analysis Summary:**
  - **Total Texts:** {len(texts)}
  - **Successful:** {len(successful_results)}
  - **Errors:** {error_count}
  - **Average Confidence:** {avg_confidence:.3f}
  - **Sentiments:** {dict(sentiment_counts)}
- """
- else:
- summary_text = f"All {len(texts)} texts failed to analyze."

- return summary_text, df, summary_fig, confidence_fig

- except Exception as e:
- logger.error(f"Batch analysis failed: {e}")
- return f"Error: {str(e)}", None, None, None

- def analyze_advanced_text(text: str, language: str, theme: str, include_keywords: bool,
- keyword_count: int, min_confidence: float):
- """Advanced analysis with additional features"""
- try:
- if not text.strip():
- return "Please enter text", None, None
-
- # Map display names back to language codes
- language_map = {
- 'Auto Detect': 'auto',
- 'English': 'en',
- 'Chinese': 'zh',
- 'Spanish': 'es',
- 'French': 'fr',
- 'German': 'de',
- 'Swedish': 'sv'
- }
- language_code = language_map.get(language, 'auto')
-
- result = SentimentAnalyzer.analyze_text(text, language_code)
-
- # Advanced keyword extraction
- if include_keywords:
- result['keywords'] = TextProcessor.extract_keywords(text, keyword_count)
-
- # Confidence filtering
- meets_confidence = result['confidence'] >= min_confidence
-
- # Add to history
- history_entry = {
- 'text': text[:100] + '...' if len(text) > 100 else text,
- 'full_text': text,
- 'sentiment': result['sentiment'],
- 'confidence': result['confidence'],
- 'pos_prob': result['pos_prob'],
- 'neg_prob': result['neg_prob'],
- 'neu_prob': result.get('neu_prob', 0),
- 'language': result['language'],
- 'timestamp': datetime.now().isoformat(),
- 'analysis_type': 'advanced',
- 'meets_confidence_threshold': meets_confidence
- }
- history_manager.add_entry(history_entry)

- # Create visualizations
- gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme)
- bars_fig = PlotlyVisualizer.create_probability_bars(result, theme)

- # Create detailed info text
- confidence_status = "✅ High Confidence" if meets_confidence else "⚠️ Low Confidence"

- info_text = f"""
- **Advanced Analysis Results:**
- - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
- - **Confidence Status:** {confidence_status}
- - **Language:** {result['language'].upper()}
- - **Text Statistics:**
- - Words: {result['word_count']}
- - Characters: {result['char_count']}
- - Average word length: {result['char_count']/max(result['word_count'], 1):.1f}
- """

- if include_keywords:
- info_text += f"\n- **Top Keywords:** {', '.join(result['keywords'])}"

- if not meets_confidence:
- info_text += f"\n\n⚠️ **Note:** Confidence ({result['confidence']:.3f}) is below threshold ({min_confidence})"

- return info_text, gauge_fig, bars_fig

- except Exception as e:
- logger.error(f"Advanced analysis failed: {e}")
- return f"Error: {str(e)}", None, None
-
- def get_history_stats():
- """Get enhanced history statistics"""
- stats = history_manager.get_stats()
- if not stats:
- return "No analysis history available"

- return f"""
- **Comprehensive History Statistics:**

- **Analysis Counts:**
- - Total Analyses: {stats['total_analyses']}
- - Positive: {stats['positive_count']}
- - Negative: {stats['negative_count']}
- - Neutral: {stats['neutral_count']}
-
- **Confidence Metrics:**
- - Average Confidence: {stats['avg_confidence']:.3f}
- - Highest Confidence: {stats['max_confidence']:.3f}
- - Lowest Confidence: {stats['min_confidence']:.3f}
-
- **Language Statistics:**
- - Languages Detected: {stats['languages_detected']}
- - Most Common Language: {stats['most_common_language'].upper()}
-
- **Text Statistics:**
- - Average Text Length: {stats['avg_text_length']:.1f} characters
- """
-
- def filter_history_display(sentiment_filter: str, language_filter: str, min_confidence: float):
- """Display filtered history"""
- # Convert filters
- sentiment = sentiment_filter if sentiment_filter != "All" else None
- language = language_filter.lower() if language_filter != "All" else None
-
- filtered_history = history_manager.filter_history(
- sentiment=sentiment,
- language=language,
- min_confidence=min_confidence if min_confidence > 0 else None
  )

- if not filtered_history:
- return "No entries match the filter criteria", None
-
- # Create DataFrame for display
- df_data = []
- for entry in filtered_history[-20:]: # Show last 20 entries
- df_data.append({
- 'Timestamp': entry['timestamp'][:16], # YYYY-MM-DD HH:MM
- 'Text': entry['text'],
- 'Sentiment': entry['sentiment'],
- 'Confidence': f"{entry['confidence']:.3f}",
- 'Language': entry['language'].upper(),
- 'Type': entry.get('analysis_type', 'single')
- })
-
- df = pd.DataFrame(df_data)
-
- summary = f"""
- **Filtered Results:**
- - Found {len(filtered_history)} entries matching criteria
- - Showing most recent {min(20, len(filtered_history))} entries
- """
-
- return summary, df
-
- def plot_history_dashboard():
- """Create history dashboard"""
- history = history_manager.get_history()
- if len(history) < 2:
- return None, "Need at least 2 analyses for dashboard"
-
- fig = PlotlyVisualizer.create_history_dashboard(history)
- return fig, f"Dashboard showing {len(history)} analyses"
-
- def export_history_csv():
- """Export history to CSV"""
- history = history_manager.get_history()
- if not history:
- return None, "No history to export"
-
  try:
- df = pd.DataFrame(history)
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w')
- df.to_csv(temp_file.name, index=False)
- return temp_file.name, f"Exported {len(history)} entries to CSV"
  except Exception as e:
- return None, f"Export failed: {str(e)}"
-
- def export_history_excel():
- """Export history to Excel"""
- history = history_manager.get_history()
- if not history:
- return None, "No history to export"

- try:
- df = pd.DataFrame(history)
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
- df.to_excel(temp_file.name, index=False)
- return temp_file.name, f"Exported {len(history)} entries to Excel"
- except Exception as e:
- return None, f"Export failed: {str(e)}"
-
- def clear_all_history():
- """Clear analysis history"""
- count = history_manager.clear()
- return f"Cleared {count} entries from history"
-
- def get_recent_analyses():
- """Get recent analysis summary"""
- recent = history_manager.get_recent_history(10)
- if not recent:
- return "No recent analyses available"

- summary_text = "**Recent Analyses (Last 10):**\n\n"
- for i, entry in enumerate(recent, 1):
- summary_text += f"{i}. **{entry['sentiment']}** ({entry['confidence']:.3f}) - {entry['text']}\n"

- return summary_text
-
- # Sample data
- SAMPLE_TEXTS = [
- # Auto Detect
- ["The film had its moments, but overall it felt a bit too long and lacked emotional depth."],
-
- # English
- ["I was completely blown away by the movie — the performances were raw and powerful, and the story stayed with me long after the credits rolled."],
-
- # Chinese
- ["这部电影节奏拖沓,剧情老套,完全没有让我产生任何共鸣,是一次失望的观影体验。"],
-
- # Spanish
- ["Una obra maestra del cine contemporáneo, con actuaciones sobresalientes, un guion bien escrito y una dirección impecable."],
-
- # French
- ["Je m'attendais à beaucoup mieux. Le scénario était confus, les dialogues ennuyeux, et je me suis presque endormi au milieu du film."],
-
- # German
- ["Der Film war ein emotionales Erlebnis mit großartigen Bildern, einem mitreißenden Soundtrack und einer Geschichte, die zum Nachdenken anregt."],
-
- # Swedish
- ["Filmen var en besvikelse – tråkig handling, överdrivet skådespeleri och ett slut som inte gav något avslut alls."]
- ]
-
- BATCH_SAMPLE = """I love this product! It works perfectly.
- The service was terrible and slow.
- Not sure if I like it or not.
- Amazing quality and fast delivery!
- Could be better, but it's okay."""

  # Gradio Interface
- with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment Analyzer") as demo:
- gr.Markdown("# 🎭 Advanced Multilingual Sentiment Analyzer")
- gr.Markdown("Comprehensive sentiment analysis with batch processing, advanced analytics, and multilingual support")

- with gr.Tab("📝 Single Analysis"):
- with gr.Row():
- with gr.Column(scale=2):
- text_input = gr.Textbox(
- label="Text to Analyze",
- placeholder="Enter your text here... (supports multiple languages)",
- lines=4
- )
-
- with gr.Row():
- language_select = gr.Dropdown(
- choices=['Auto Detect', 'English', 'Chinese', 'Spanish', 'French', 'German', 'Swedish'],
- value='Auto Detect',
- label="Language"
  )
- theme_select = gr.Dropdown(
- choices=list(config.THEMES.keys()),
- value='default',
- label="Theme"
  )

- with gr.Row():
- clean_text = gr.Checkbox(label="Clean Text", value=False)
- remove_punct = gr.Checkbox(label="Remove Punctuation", value=True)
- remove_nums = gr.Checkbox(label="Remove Numbers", value=False)
-
- analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
-
- gr.Examples(
- examples=SAMPLE_TEXTS,
- inputs=text_input,
- label="Sample Texts (Multiple Languages)"
- )

- with gr.Column(scale=1):
- result_info = gr.Markdown("Enter text and click Analyze")
-
- with gr.Row():
- gauge_plot = gr.Plot(label="Sentiment Gauge")
- bars_plot = gr.Plot(label="Probability Distribution")
-
- with gr.Tab("📊 Batch Analysis"):
- with gr.Row():
- with gr.Column(scale=2):
- batch_input = gr.Textbox(
- label="Batch Text Input (One text per line)",
- placeholder="Enter multiple texts, one per line...",
- lines=8
- )
-
- with gr.Row():
- batch_language = gr.Dropdown(
- choices=['Auto Detect', 'English', 'Chinese', 'Spanish', 'French', 'German', 'Swedish'],
- value='Auto Detect',
- label="Language"
  )
- batch_theme = gr.Dropdown(
- choices=list(config.THEMES.keys()),
- value='default',
- label="Theme"
  )

- with gr.Row():
- batch_clean = gr.Checkbox(label="Clean Text", value=False)
- batch_remove_punct = gr.Checkbox(label="Remove Punctuation", value=True)
- batch_remove_nums = gr.Checkbox(label="Remove Numbers", value=False)
-
- batch_analyze_btn = gr.Button("🔍 Analyze Batch", variant="primary", size="lg")
-
- gr.Examples(
- examples=[[BATCH_SAMPLE]],
- inputs=batch_input,
- label="Sample Batch Input"
- )
-
- with gr.Column(scale=1):
- batch_summary = gr.Markdown("Enter texts and click Analyze Batch")
-
- with gr.Row():
- batch_results_table = gr.DataFrame(
- label="Detailed Results",
- interactive=False
- )
-
- with gr.Row():
- batch_summary_plot = gr.Plot(label="Sentiment Summary")
- batch_confidence_plot = gr.Plot(label="Confidence Distribution")
-
- with gr.Tab("🔬 Advanced Analysis"):
- with gr.Row():
- with gr.Column(scale=2):
- advanced_input = gr.Textbox(
- label="Text for Advanced Analysis",
- placeholder="Enter text for detailed analysis...",
- lines=4
- )
-
- with gr.Row():
- advanced_language = gr.Dropdown(
- choices=['Auto Detect', 'English', 'Chinese', 'Spanish', 'French', 'German', 'Swedish'],
- value='Auto Detect',
- label="Language"
  )
- advanced_theme = gr.Dropdown(
  choices=list(config.THEMES.keys()),
- value='default',
- label="Theme"
- )
-
- with gr.Row():
- include_keywords = gr.Checkbox(label="Extract Keywords", value=True)
- keyword_count = gr.Slider(
- minimum=3,
- maximum=10,
- value=5,
- step=1,
- label="Number of Keywords"
  )

- min_confidence_slider = gr.Slider(
- minimum=0.0,
- maximum=1.0,
- value=0.7,
- step=0.1,
- label="Minimum Confidence Threshold"
- )
-
- advanced_analyze_btn = gr.Button("🔬 Advanced Analyze", variant="primary", size="lg")

- with gr.Column(scale=1):
- advanced_result_info = gr.Markdown("Configure settings and click Advanced Analyze")
-
- with gr.Row():
- advanced_gauge_plot = gr.Plot(label="Sentiment Gauge")
- advanced_bars_plot = gr.Plot(label="Probability Distribution")
-
- with gr.Tab("📈 History & Analytics"):
- with gr.Row():
- with gr.Column():
- gr.Markdown("### 📊 Statistics")
- stats_btn = gr.Button("📈 Get Statistics")
- recent_btn = gr.Button("🕒 Recent Analyses")
- stats_output = gr.Markdown("Click 'Get Statistics' to view analysis history")

- with gr.Column():
- gr.Markdown("### 🔍 Filter History")
- with gr.Row():
- sentiment_filter = gr.Dropdown(
- choices=["All", "Positive", "Negative", "Neutral"],
- value="All",
- label="Filter by Sentiment"
- )
- language_filter = gr.Dropdown(
- choices=["All", "English", "Chinese", "Spanish", "French", "German", "Swedish"],
- value="All",
- label="Filter by Language"
- )
-
- confidence_filter = gr.Slider(
- minimum=0.0,
- maximum=1.0,
- value=0.0,
- step=0.1,
- label="Minimum Confidence"
- )
-
- filter_btn = gr.Button("🔍 Filter History")

- with gr.Row():
- dashboard_btn = gr.Button("📊 View Dashboard")
- clear_btn = gr.Button("🗑️ Clear History", variant="stop")

- with gr.Row():
- export_csv_btn = gr.Button("📄 Export CSV")
- export_excel_btn = gr.Button("📊 Export Excel")

- dashboard_plot = gr.Plot(label="Analytics Dashboard")

- with gr.Row():
- filtered_results = gr.Markdown("Use filters to view specific entries")
- filtered_table = gr.DataFrame(label="Filtered History", interactive=False)

- csv_file = gr.File(label="Download CSV Report")
- excel_file = gr.File(label="Download Excel Report")
- history_status = gr.Textbox(label="Status", interactive=False)
-
- # Event handlers
-
- # Single Analysis
- analyze_btn.click(
- analyze_single_text,
- inputs=[text_input, language_select, theme_select, clean_text, remove_punct, remove_nums],
- outputs=[result_info, gauge_plot, bars_plot]
- )
-
- # Batch Analysis
- batch_analyze_btn.click(
- analyze_batch_texts,
- inputs=[batch_input, batch_language, batch_theme, batch_clean, batch_remove_punct, batch_remove_nums],
- outputs=[batch_summary, batch_results_table, batch_summary_plot, batch_confidence_plot]
- )
-
- # Advanced Analysis
- advanced_analyze_btn.click(
- analyze_advanced_text,
- inputs=[advanced_input, advanced_language, advanced_theme, include_keywords, keyword_count, min_confidence_slider],
- outputs=[advanced_result_info, advanced_gauge_plot, advanced_bars_plot]
- )
-
- # History & Analytics
- stats_btn.click(
- get_history_stats,
- outputs=stats_output
- )
-
- recent_btn.click(
- get_recent_analyses,
- outputs=stats_output
- )
-
- filter_btn.click(
- filter_history_display,
- inputs=[sentiment_filter, language_filter, confidence_filter],
- outputs=[filtered_results, filtered_table]
- )
-
- dashboard_btn.click(
- plot_history_dashboard,
- outputs=[dashboard_plot, history_status]
- )
-
- export_csv_btn.click(
- export_history_csv,
- outputs=[csv_file, history_status]
- )
-
- export_excel_btn.click(
- export_history_excel,
- outputs=[excel_file, history_status]
- )

- clear_btn.click(
- clear_all_history,
- outputs=history_status
- )

  if __name__ == "__main__":
- demo.launch(share=True)
@@ -14,21 +14,25 @@ import io
  import tempfile
  from datetime import datetime
  import logging
+ from functools import lru_cache, wraps
  from dataclasses import dataclass
+ from typing import List, Dict, Optional, Tuple, Any, Callable
+ from contextlib import contextmanager
  import nltk
  from nltk.corpus import stopwords
  import langdetect
  import pandas as pd
+ import gc

  # Configuration
  @dataclass
  class Config:
+ MAX_HISTORY_SIZE: int = 1000
+ BATCH_SIZE_LIMIT: int = 50
  MAX_TEXT_LENGTH: int = 512
+ MIN_WORD_LENGTH: int = 2
+ CACHE_SIZE: int = 128
+ BATCH_PROCESSING_SIZE: int = 8

  # Supported languages and models
  SUPPORTED_LANGUAGES = {
@@ -47,7 +51,7 @@ class Config:
  'zh': "uer/roberta-base-finetuned-dianping-chinese"
  }

+ # Color themes for Plotly
  THEMES = {
  'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
  'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
@@ -69,16 +73,55 @@ try:
  except:
  STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

+ # Decorators and Context Managers
+ def handle_errors(default_return=None):
+ """Centralized error handling decorator"""
+ def decorator(func: Callable) -> Callable:
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except Exception as e:
+ logger.error(f"{func.__name__} failed: {e}")
+ return default_return if default_return is not None else f"Error: {str(e)}"
+ return wrapper
+ return decorator
+
+ @contextmanager
+ def memory_cleanup():
+ """Context manager for memory cleanup"""
+ try:
+ yield
+ finally:
+ gc.collect()
+
+ class ThemeContext:
+ """Theme management context"""
+ def __init__(self, theme: str = 'default'):
+ self.theme = theme
+ self.colors = config.THEMES.get(theme, config.THEMES['default'])
+
+
104
+ # Enhanced Model Manager with Multi-language Support
105
  class ModelManager:
106
+ """Multi-language model manager with lazy loading"""
107
+ _instance = None
108
+
109
+ def __new__(cls):
110
+ if cls._instance is None:
111
+ cls._instance = super().__new__(cls)
112
+ cls._instance._initialized = False
113
+ return cls._instance
114
+
115
  def __init__(self):
116
+ if not self._initialized:
117
+ self.models = {}
118
+ self.tokenizers = {}
119
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
120
+ self._load_default_models()
121
+ self._initialized = True
122
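
# Sketch of the singleton behavior added via __new__: constructing the manager
# twice yields the same object, so models load only once.
m1 = ModelManager()
m2 = ModelManager()
assert m1 is m2    # the _initialized flag makes the second __init__ a no-op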
 
+ def _load_default_models(self):
+ """Load default models"""
  try:
  # Load multilingual model as default
  model_name = config.MODELS['multilingual']
@@ -102,17 +145,13 @@ class ModelManager:
  """Get model for specific language"""
  if language == 'zh':
  return self.models['zh'], self.tokenizers['zh']
+ return self.models['default'], self.tokenizers['default']

  @staticmethod
  def detect_language(text: str) -> str:
+ """Detect text language"""
  try:
  detected = langdetect.detect(text)
  language_mapping = {
  'zh-cn': 'zh',
  'zh-tw': 'zh'
@@ -122,32 +161,80 @@ class ModelManager:
  except:
  return 'en'

+ # Simplified Text Processing
+ class TextProcessor:
+ """Optimized text processing with multi-language support"""
+
+ @staticmethod
+ @lru_cache(maxsize=config.CACHE_SIZE)
+ def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
+ """Clean text with language awareness"""
+ text = text.strip()
+
+ # Don't clean Chinese text aggressively
+ if re.search(r'[\u4e00-\u9fff]', text):
+ return text
+
+ text = text.lower()
+
+ if remove_numbers:
+ text = re.sub(r'\d+', '', text)
+
+ if remove_punctuation:
+ text = re.sub(r'[^\w\s]', '', text)
+
+ words = text.split()
+ cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) >= config.MIN_WORD_LENGTH]
+ return ' '.join(cleaned_words)
+
+ @staticmethod
+ def extract_keywords(text: str, top_k: int = 5) -> List[str]:
+ """Extract keywords with language support"""
+ if re.search(r'[\u4e00-\u9fff]', text):
+ # Chinese text processing
+ words = re.findall(r'[\u4e00-\u9fff]+', text)
+ all_chars = ''.join(words)
+ char_freq = Counter(all_chars)
+ return [char for char, _ in char_freq.most_common(top_k)]
+ else:
+ # Other languages
+ cleaned = TextProcessor.clean_text(text)
+ words = cleaned.split()
+ word_freq = Counter(words)
+ return [word for word, _ in word_freq.most_common(top_k)]
+
+ @staticmethod
+ def parse_batch_input(text: str) -> List[str]:
+ """Parse batch input from textarea"""
+ lines = text.strip().split('\n')
+ return [line.strip() for line in lines if line.strip()]
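
# Because clean_text is wrapped in lru_cache, identical calls are memoized on
# the full argument tuple. A sketch (assumes no earlier calls touched the cache):
TextProcessor.clean_text("The quick brown fox!", True, False)   # computed
TextProcessor.clean_text("The quick brown fox!", True, False)   # cache hit
print(TextProcessor.clean_text.cache_info().hits)               # 1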
 
+ # Enhanced History Manager
  class HistoryManager:
+ """Enhanced history management with filtering"""
  def __init__(self):
  self._history = []

+ def add(self, entry: Dict):
+ """Add entry with timestamp"""
+ entry['timestamp'] = datetime.now().isoformat()
  self._history.append(entry)
  if len(self._history) > config.MAX_HISTORY_SIZE:
  self._history = self._history[-config.MAX_HISTORY_SIZE:]

+ def add_batch(self, entries: List[Dict]):
+ """Add multiple entries"""
  for entry in entries:
+ self.add(entry)

+ def get_all(self) -> List[Dict]:
  return self._history.copy()

+ def get_recent(self, n: int = 10) -> List[Dict]:
  return self._history[-n:] if self._history else []

+ def filter_by(self, sentiment: str = None, language: str = None,
+ min_confidence: float = None) -> List[Dict]:
  """Filter history by criteria"""
  filtered = self._history

@@ -165,7 +252,11 @@ class HistoryManager:
  self._history.clear()
  return count

+ def size(self) -> int:
+ return len(self._history)
+
  def get_stats(self) -> Dict:
+ """Get comprehensive statistics"""
  if not self._history:
  return {}

@@ -182,72 +273,93 @@ class HistoryManager:
  'max_confidence': np.max(confidences),
  'min_confidence': np.min(confidences),
  'languages_detected': len(set(languages)),
+ 'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
  }

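
# The slice-on-overflow in add() keeps at most MAX_HISTORY_SIZE entries. The
# same bound can be had with a deque; a standard-library alternative, not what
# this commit uses:
from collections import deque
history = deque(maxlen=1000)    # oldest entries fall off automatically
history.append({'sentiment': 'Positive', 'confidence': 0.98})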
+ # Core Sentiment Analysis Engine
+ class SentimentEngine:
+ """Multi-language sentiment analysis engine"""

+ def __init__(self):
+ self.model_manager = ModelManager()

+ def extract_attention_keywords(self, text: str, language: str = 'auto', top_k: int = 10) -> List[Tuple[str, float]]:
+ """Extract keywords using attention weights"""
+ try:
+ if language == 'auto':
+ language = self.model_manager.detect_language(text)
+
+ model, tokenizer = self.model_manager.get_model(language)
+
+ inputs = tokenizer(
+ text, return_tensors="pt", padding=True,
+ truncation=True, max_length=config.MAX_TEXT_LENGTH
+ ).to(self.model_manager.device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, output_attentions=True)
+
+ if hasattr(outputs, 'attentions') and outputs.attentions:
+ # Use attention weights
+ attention = outputs.attentions[-1]
+ avg_attention = attention.mean(dim=1)[0, 0, :]
+
+ tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
+ attention_scores = avg_attention.cpu().numpy()
+
+ # Process tokens and scores
+ word_scores = {}
+ current_word = ""
+ current_score = 0.0
+
+ for token, score in zip(tokens, attention_scores):
+ if token in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>']:
+ continue
+
+ if token.startswith('##') or token.startswith('▁'):
+ current_word += token.replace('##', '').replace('▁', '')
+ current_score = max(current_score, score)
+ else:
+ if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+ word_scores[current_word.lower()] = current_score
+ current_word = token
+ current_score = score
+
+ if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+ word_scores[current_word.lower()] = current_score
+
+ # Filter and sort
+ filtered_words = {
+ word: score for word, score in word_scores.items()
+ if word not in STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
+ }
+
+ sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
+ return sorted_words[:top_k]
+
+ except Exception as e:
+ logger.error(f"Attention keyword extraction failed: {e}")
+
+ # Fallback to simple keyword extraction
+ keywords = TextProcessor.extract_keywords(text, top_k)
+ return [(word, 0.1) for word in keywords]
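
# What the attention pooling above computes, in isolation: the last layer's
# attention tensor has shape (batch, heads, seq, seq); averaging over heads and
# taking row 0 gives one score per token (how strongly the first, [CLS],
# position attends to it). Sketch with a dummy tensor in place of model output:
import torch
attention = torch.rand(1, 12, 6, 6)             # (batch, heads, seq, seq)
avg_attention = attention.mean(dim=1)[0, 0, :]  # [CLS] row: one score per token
print(avg_attention.shape)                      # torch.Size([6])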
 
+ @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
+ def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
+ """Analyze single text with enhanced features"""
  if not text.strip():
  raise ValueError("Empty text provided")

+ # Detect language
  if language == 'auto':
+ detected_lang = self.model_manager.detect_language(text)
  else:
  detected_lang = language

  # Get appropriate model
+ model, tokenizer = self.model_manager.get_model(detected_lang)

+ # Preprocessing
  options = preprocessing_options or {}
  processed_text = text
  if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
@@ -257,96 +369,110 @@ class SentimentAnalyzer:
  options.get('remove_numbers', False)
  )

+ # Tokenize and analyze
+ inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
+ truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
+
+ with torch.no_grad():
+ outputs = model(**inputs)
+ probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
+
+ # Handle different model outputs
+ if len(probs) == 3: # negative, neutral, positive
+ sentiment_idx = np.argmax(probs)
+ sentiment_labels = ['Negative', 'Neutral', 'Positive']
+ sentiment = sentiment_labels[sentiment_idx]
+ confidence = float(probs[sentiment_idx])

+ result = {
+ 'sentiment': sentiment,
+ 'confidence': confidence,
+ 'neg_prob': float(probs[0]),
+ 'neu_prob': float(probs[1]),
+ 'pos_prob': float(probs[2]),
+ 'has_neutral': True
+ }
+ else: # negative, positive
+ pred = np.argmax(probs)
+ sentiment = "Positive" if pred == 1 else "Negative"
+ confidence = float(probs[pred])

+ result = {
+ 'sentiment': sentiment,
+ 'confidence': confidence,
+ 'neg_prob': float(probs[0]),
+ 'pos_prob': float(probs[1]),
+ 'neu_prob': 0.0,
+ 'has_neutral': False
+ }
+
+ # Extract keywords
+ keywords = self.extract_attention_keywords(text, detected_lang)
+
+ # Add metadata
+ result.update({
+ 'language': detected_lang,
+ 'keywords': keywords,
+ 'word_count': len(text.split()),
+ 'char_count': len(text)
+ })
+
+ return result
+
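
# The branch above distinguishes 3-logit models ([neg, neu, pos]) from 2-logit
# models ([neg, pos]). Sketch with dummy logits instead of a real model:
import numpy as np
logits = np.array([0.1, 2.0])
probs = np.exp(logits) / np.exp(logits).sum()   # softmax over two logits
sentiment = "Positive" if np.argmax(probs) == 1 else "Negative"
print(sentiment, round(float(probs.max()), 3))  # Positive 0.87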
+ @handle_errors(default_return=[])
+ def analyze_batch(self, texts: List[str], language: str = 'auto',
+ preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
+ """Optimized batch processing"""
+ if len(texts) > config.BATCH_SIZE_LIMIT:
+ texts = texts[:config.BATCH_SIZE_LIMIT]
+
+ results = []
+ batch_size = config.BATCH_PROCESSING_SIZE
+
+ for i in range(0, len(texts), batch_size):
+ batch = texts[i:i+batch_size]
+
+ if progress_callback:
+ progress_callback((i + len(batch)) / len(texts))
+
+ for text in batch:
+ try:
+ result = self.analyze_single(text, language, preprocessing_options)
+ result['batch_index'] = len(results)
+ result['text'] = text[:100] + '...' if len(text) > 100 else text
+ result['full_text'] = text
+ results.append(result)
+ except Exception as e:
+ results.append({
+ 'sentiment': 'Error',
+ 'confidence': 0.0,
+ 'error': str(e),
+ 'batch_index': len(results),
+ 'text': text[:100] + '...' if len(text) > 100 else text,
+ 'full_text': text
+ })
+
  return results
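
# Sketch of the chunked loop used by analyze_batch, with the progress callback
# receiving a 0..1 fraction (hypothetical inputs):
texts = [f"text {n}" for n in range(20)]
for i in range(0, len(texts), 8):               # BATCH_PROCESSING_SIZE = 8
    chunk = texts[i:i + 8]
    print(f"progress: {(i + len(chunk)) / len(texts):.2f}")
# prints: progress: 0.40, 0.80, 1.00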
 
+ # Advanced Plotly Visualization System
  class PlotlyVisualizer:
+ """Enhanced Plotly visualizations"""

  @staticmethod
+ @handle_errors(default_return=None)
+ def create_sentiment_gauge(result: Dict, theme: ThemeContext) -> go.Figure:
+ """Create animated sentiment gauge"""
+ colors = theme.colors

+ if result.get('has_neutral', False):
  # Three-way gauge
  fig = go.Figure(go.Indicator(
+ mode="gauge+number+delta",
+ value=result['pos_prob'] * 100,
+ domain={'x': [0, 1], 'y': [0, 1]},
+ title={'text': f"Sentiment: {result['sentiment']}"},
+ delta={'reference': 50},
+ gauge={
  'axis': {'range': [None, 100]},
  'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
  'steps': [
@@ -364,11 +490,11 @@ class PlotlyVisualizer:
  else:
  # Two-way gauge
  fig = go.Figure(go.Indicator(
+ mode="gauge+number",
+ value=result['confidence'] * 100,
+ domain={'x': [0, 1], 'y': [0, 1]},
+ title={'text': f"Confidence: {result['sentiment']}"},
+ gauge={
  'axis': {'range': [None, 100]},
  'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
  'steps': [
@@ -382,11 +508,12 @@ class PlotlyVisualizer:
  return fig

  @staticmethod
+ @handle_errors(default_return=None)
+ def create_probability_bars(result: Dict, theme: ThemeContext) -> go.Figure:
  """Create probability bar chart"""
+ colors = theme.colors

+ if result.get('has_neutral', False):
  labels = ['Negative', 'Neutral', 'Positive']
  values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
  bar_colors = [colors['neg'], colors['neu'], colors['pos']]
@@ -396,10 +523,10 @@ class PlotlyVisualizer:
  bar_colors = [colors['neg'], colors['pos']]

  fig = go.Figure(data=[
+ go.Bar(x=labels, y=values, marker_color=bar_colors,
+ text=[f'{v:.3f}' for v in values], textposition='outside')
  ])

  fig.update_layout(
  title="Sentiment Probabilities",
  yaxis_title="Probability",
@@ -408,14 +535,52 @@ class PlotlyVisualizer:
  )

  return fig
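
# Minimal standalone rendering of the gauge pattern above, using the same
# plotly.graph_objects API that app.py imports as `go`:
import plotly.graph_objects as go
fig = go.Figure(go.Indicator(
    mode="gauge+number",
    value=87.0,                              # e.g. confidence * 100
    title={'text': "Confidence: Positive"},
    gauge={'axis': {'range': [None, 100]}}
))
# fig.show()  # renders in a browser when run locally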
+
  @staticmethod
+ @handle_errors(default_return=None)
+ def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
+ """Create keyword importance chart"""
+ if not keywords:
+ fig = go.Figure()
+ fig.add_annotation(text="No keywords extracted",
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
+ fig.update_layout(height=400, title="Keywords")
+ return fig
+
+ words = [word for word, score in keywords]
+ scores = [score for word, score in keywords]
+
+ color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
+
+ fig = go.Figure(data=[
+ go.Bar(
+ y=words,
+ x=scores,
+ orientation='h',
+ marker_color=color,
+ text=[f'{score:.3f}' for score in scores],
+ textposition='auto'
+ )
+ ])
+
+ fig.update_layout(
+ title=f"Top Keywords ({sentiment})",
+ xaxis_title="Attention Weight",
+ yaxis_title="Keywords",
+ height=400,
+ showlegend=False
+ )
+
+ return fig
+
  @staticmethod
+ @handle_errors(default_return=None)
+ def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
  """Create batch analysis summary"""
+ colors = theme.colors

  # Count sentiments
+ sentiments = [r['sentiment'] for r in results if 'sentiment' in r and r['sentiment'] != 'Error']
  sentiment_counts = Counter(sentiments)

  # Create pie chart
@@ -433,8 +598,9 @@ class PlotlyVisualizer:
  )

  return fig
+
  @staticmethod
+ @handle_errors(default_return=None)
  def create_confidence_distribution(results: List[Dict]) -> go.Figure:
  """Create confidence distribution plot"""
  confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']
@@ -459,7 +625,8 @@ class PlotlyVisualizer:
  return fig

  @staticmethod
+ @handle_errors(default_return=None)
+ def create_history_dashboard(history: List[Dict], theme: ThemeContext) -> go.Figure:
  """Create comprehensive history dashboard"""
  if len(history) < 2:
  return go.Figure()
@@ -475,13 +642,15 @@ class PlotlyVisualizer:

  # Extract data
  indices = list(range(len(history)))
+ pos_probs = [item.get('pos_prob', 0) for item in history]
  confidences = [item['confidence'] for item in history]
  sentiments = [item['sentiment'] for item in history]
  languages = [item.get('language', 'en') for item in history]

  # Sentiment timeline
+ colors_map = {'Positive': theme.colors['pos'], 'Negative': theme.colors['neg'], 'Neutral': theme.colors['neu']}
+ colors = [colors_map.get(s, '#999999') for s in sentiments]
+
  fig.add_trace(
  go.Scatter(x=indices, y=pos_probs, mode='lines+markers',
  marker=dict(color=colors, size=8),
@@ -505,33 +674,114 @@ class PlotlyVisualizer:

  # Sentiment summary
  sent_counts = Counter(sentiments)
+ sent_colors = [colors_map.get(k, '#999999') for k in sent_counts.keys()]
  fig.add_trace(
  go.Bar(x=list(sent_counts.keys()), y=list(sent_counts.values()),
+ marker_color=sent_colors),
  row=2, col=2
  )

  fig.update_layout(height=800, showlegend=False)
  return fig

+ # Universal Data Handler
+ class DataHandler:
+ """Enhanced data operations"""
+
+ @staticmethod
+ @handle_errors(default_return=(None, "Export failed"))
+ def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
+ """Export data with comprehensive information"""
+ if not data:
+ return None, "No data to export"
+
+ temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False,
+ suffix=f'.{format_type}', encoding='utf-8')
+
+ if format_type == 'csv':
+ writer = csv.writer(temp_file)
+ writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
+ 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Keywords', 'Word_Count'])
+ for entry in data:
+ keywords_str = "|".join([f"{word}:{score:.3f}" for word, score in entry.get('keywords', [])])
+ writer.writerow([
+ entry.get('timestamp', ''),
+ entry.get('text', ''),
+ entry.get('sentiment', ''),
+ f"{entry.get('confidence', 0):.4f}",
+ entry.get('language', 'en'),
+ f"{entry.get('pos_prob', 0):.4f}",
+ f"{entry.get('neg_prob', 0):.4f}",
+ f"{entry.get('neu_prob', 0):.4f}",
+ keywords_str,
+ entry.get('word_count', 0)
+ ])
+ elif format_type == 'json':
+ json.dump(data, temp_file, indent=2, ensure_ascii=False)
+
+ temp_file.close()
+ return temp_file.name, f"Exported {len(data)} entries"
+
+ @staticmethod
+ @handle_errors(default_return="")
+ def process_file(file) -> str:
+ """Process uploaded files"""
+ if not file:
+ return ""
+
+ content = file.read().decode('utf-8')
+
+ if file.name.endswith('.csv'):
+ csv_file = io.StringIO(content)
+ reader = csv.reader(csv_file)
+ try:
+ next(reader) # Skip header
+ texts = []
+ for row in reader:
+ if row and row[0].strip():
+ text = row[0].strip().strip('"')
+ if text:
+ texts.append(text)
+ return '\n'.join(texts)
+ except:
+ lines = content.strip().split('\n')[1:]
+ texts = []
+ for line in lines:
+ if line.strip():
+ text = line.strip().strip('"')
+ if text:
+ texts.append(text)
+ return '\n'.join(texts)
+
+ return content
+
+ # Main Application Class
759
+ class SentimentApp:
760
+ """Main multilingual sentiment analysis application"""
761
+
762
+ def __init__(self):
763
+ self.engine = SentimentEngine()
764
+ self.history = HistoryManager()
765
+ self.data_handler = DataHandler()
766
+
767
+ # Multi-language examples
768
+ self.examples = [
769
+ ["This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout."],
770
+ ["The film was disappointing with poor character development and a confusing storyline."],
771
+ ["这部电影真的很棒!演技精湛,情节引人入胜。"], # Chinese
772
+ ["Esta película fue increíble, me encantó la cinematografía."], # Spanish
773
+ ["Ce film était magnifique, j'ai adoré la réalisation."], # French
774
+ ]
775
+
+    @handle_errors(default_return=("Please enter text", None, None, None))
+    def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                        remove_punct: bool, remove_nums: bool):
+        """Single text analysis with enhanced visualizations"""
         if not text.strip():
+            return "Please enter text", None, None, None
+        
+        # Map display names to language codes
+        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
 
         preprocessing_options = {
             'clean_text': clean_text,
             'remove_punctuation': remove_punct,
             'remove_numbers': remove_nums
         }
 
+        with memory_cleanup():
+            result = self.engine.analyze_single(text, language_code, preprocessing_options)
+            
+            # Add to history (keep a 100-char preview; the full text is stored separately)
+            history_entry = {
+                'text': text[:100] + '...' if len(text) > 100 else text,
+                'full_text': text,
+                'sentiment': result['sentiment'],
+                'confidence': result['confidence'],
+                'pos_prob': result.get('pos_prob', 0),
+                'neg_prob': result.get('neg_prob', 0),
+                'neu_prob': result.get('neu_prob', 0),
+                'language': result['language'],
+                'keywords': result['keywords'],
+                'word_count': result['word_count'],
+                'analysis_type': 'single'
+            }
+            self.history.add(history_entry)
+            
+            # Create visualizations
+            theme_ctx = ThemeContext(theme)
+            gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
+            bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
+            keyword_fig = PlotlyVisualizer.create_keyword_chart(result['keywords'], result['sentiment'], theme_ctx)
+            
+            # Create comprehensive result text
+            keywords_str = ", ".join([f"{word}({score:.3f})" for word, score in result['keywords'][:5]])
+            
+            info_text = f"""
 **Analysis Results:**
 - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
 - **Language:** {result['language'].upper()}
+- **Keywords:** {keywords_str}
+- **Statistics:** {result['word_count']} words, {result['char_count']} characters
+"""
+        
+        return info_text, gauge_fig, bars_fig, keyword_fig
+    
+    @handle_errors(default_return=("Please enter texts", None, None, None))
+    def analyze_batch(self, batch_text: str, language: str, theme: str,
+                      clean_text: bool, remove_punct: bool, remove_nums: bool):
+        """Enhanced batch analysis"""
         if not batch_text.strip():
             return "Please enter texts (one per line)", None, None, None
 
         # Parse batch input
         texts = TextProcessor.parse_batch_input(batch_text)
 
         if len(texts) > config.BATCH_SIZE_LIMIT:
             return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None
 
         if not texts:
             return "No valid texts found", None, None, None
 
+        # Map display names to language codes
+        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
 
         preprocessing_options = {
             'clean_text': clean_text,
             'remove_punctuation': remove_punct,
             'remove_numbers': remove_nums
         }
 
+        with memory_cleanup():
+            results = self.engine.analyze_batch(texts, language_code, preprocessing_options)
+            
+            # Add to history
+            batch_entries = []
+            for result in results:
+                if 'error' not in result:
+                    entry = {
+                        'text': result['text'],
+                        'full_text': result['full_text'],
+                        'sentiment': result['sentiment'],
+                        'confidence': result['confidence'],
+                        'pos_prob': result.get('pos_prob', 0),
+                        'neg_prob': result.get('neg_prob', 0),
+                        'neu_prob': result.get('neu_prob', 0),
+                        'language': result['language'],
+                        'keywords': result['keywords'],
+                        'word_count': result['word_count'],
+                        'analysis_type': 'batch',
+                        'batch_index': result['batch_index']
+                    }
+                    batch_entries.append(entry)
+            
+            self.history.add_batch(batch_entries)
+            
+            # Create visualizations
+            theme_ctx = ThemeContext(theme)
+            summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
+            confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)
+            
+            # Create results DataFrame
+            df_data = []
+            for result in results:
+                if 'error' in result:
+                    df_data.append({
+                        'Index': result['batch_index'] + 1,
+                        'Text': result['text'],
+                        'Sentiment': 'Error',
+                        'Confidence': 0.0,
+                        'Language': 'Unknown',
+                        'Error': result['error']
+                    })
+                else:
+                    keywords_str = ', '.join([word for word, _ in result['keywords'][:3]])
+                    df_data.append({
+                        'Index': result['batch_index'] + 1,
+                        'Text': result['text'],
+                        'Sentiment': result['sentiment'],
+                        'Confidence': f"{result['confidence']:.3f}",
+                        'Language': result['language'].upper(),
+                        'Keywords': keywords_str
+                    })
+        
+        df = pd.DataFrame(df_data)
+        
+        # Create summary text
+        successful_results = [r for r in results if 'error' not in r]
+        error_count = len(results) - len(successful_results)
+        
+        if successful_results:
+            sentiment_counts = Counter([r['sentiment'] for r in successful_results])
+            avg_confidence = np.mean([r['confidence'] for r in successful_results])
+            languages = Counter([r['language'] for r in successful_results])
+            
+            summary_text = f"""
 **Batch Analysis Summary:**
 - **Total Texts:** {len(texts)}
 - **Successful:** {len(successful_results)}
 - **Errors:** {error_count}
 - **Average Confidence:** {avg_confidence:.3f}
 - **Sentiments:** {dict(sentiment_counts)}
+- **Languages Detected:** {dict(languages)}
+"""
+        else:
+            summary_text = f"All {len(texts)} texts failed to analyze."
+        
+        return summary_text, df, summary_fig, confidence_fig
+    
+    @handle_errors(default_return=(None, "No history available"))
+    def plot_history(self, theme: str = 'default'):
+        """Plot comprehensive history analysis"""
+        history = self.history.get_all()
+        if len(history) < 2:
+            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"
+        
+        theme_ctx = ThemeContext(theme)
+        
+        with memory_cleanup():
+            fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
+            stats = self.history.get_stats()
+            
+            stats_text = f"""
+**History Statistics:**
+- **Total Analyses:** {stats.get('total_analyses', 0)}
+- **Positive:** {stats.get('positive_count', 0)}
+- **Negative:** {stats.get('negative_count', 0)}
+- **Neutral:** {stats.get('neutral_count', 0)}
+- **Average Confidence:** {stats.get('avg_confidence', 0):.3f}
+- **Languages:** {stats.get('languages_detected', 0)}
+- **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}
+"""
+        
+        return fig, stats_text
+    
+    # This handler feeds a single Textbox, so the error fallback is a plain
+    # string rather than a one-element tuple
+    @handle_errors(default_return="No data available")
+    def get_history_status(self):
+        """Get current history status"""
+        stats = self.history.get_stats()
+        if not stats:
+            return "No analyses performed yet"
+        
+        return f"""
+**Current Status:**
+- **Total Analyses:** {stats['total_analyses']}
+- **Recent Sentiment Distribution:**
+  * Positive: {stats['positive_count']}
+  * Negative: {stats['negative_count']}
+  * Neutral: {stats['neutral_count']}
+- **Average Confidence:** {stats['avg_confidence']:.3f}
+- **Languages Detected:** {stats['languages_detected']}
+"""
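+
+# Illustrative sketch, not wired into the app: the Gradio handlers are plain
+# methods, so they can be exercised headlessly (assuming the models above
+# downloaded successfully). "Auto Detect" is the display name mapped to 'auto'.
+def _demo_headless_analysis():
+    app = SentimentApp()
+    info, gauge, bars, keywords = app.analyze_single(
+        "This movie was fantastic!", "Auto Detect", "default",
+        clean_text=False, remove_punct=False, remove_nums=False)
+    summary, df, summary_fig, conf_fig = app.analyze_batch(
+        "Great film!\nAwful pacing.", "Auto Detect", "default",
+        clean_text=False, remove_punct=False, remove_nums=False)
+    return info, summary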
 
+# Gradio Interface
+def create_interface():
+    """Create comprehensive Gradio interface"""
+    app = SentimentApp()
+    
+    with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
+        gr.Markdown("# 🌍 Advanced Multilingual Sentiment Analyzer")
+        gr.Markdown("AI-powered sentiment analysis with support for multiple languages, advanced visualizations, and explainable AI features")
+        
+        with gr.Tab("Single Analysis"):
+            with gr.Row():
+                with gr.Column():
+                    text_input = gr.Textbox(
+                        label="Enter Text for Analysis",
+                        placeholder="Enter your text in any supported language...",
+                        lines=5
+                    )
+                    
+                    with gr.Row():
+                        language_selector = gr.Dropdown(
+                            choices=list(config.SUPPORTED_LANGUAGES.values()),
+                            value="Auto Detect",
+                            label="Language"
+                        )
+                        theme_selector = gr.Dropdown(
+                            choices=list(config.THEMES.keys()),
+                            value="default",
+                            label="Theme"
+                        )
+                    
+                    with gr.Row():
+                        clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
+                        remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
+                        remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
+                    
+                    analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
+                    
+                    gr.Examples(
+                        examples=app.examples,
+                        inputs=text_input,
+                        cache_examples=False
+                    )
+                
+                with gr.Column():
+                    result_output = gr.Textbox(label="Analysis Results", lines=8)
+                    
+                    with gr.Row():
+                        gauge_plot = gr.Plot(label="Sentiment Gauge")
+                        probability_plot = gr.Plot(label="Probability Distribution")
+                    
+                    with gr.Row():
+                        keyword_plot = gr.Plot(label="Key Contributing Words")
+        
+        with gr.Tab("Batch Analysis"):
+            with gr.Row():
+                with gr.Column():
+                    file_upload = gr.File(
+                        label="Upload File (CSV/TXT)",
+                        file_types=[".csv", ".txt"]
+                    )
+                    batch_input = gr.Textbox(
+                        label="Batch Input (one text per line)",
+                        placeholder="Enter multiple texts, one per line...",
+                        lines=10
+                    )
+                    
+                    with gr.Row():
+                        batch_language = gr.Dropdown(
+                            choices=list(config.SUPPORTED_LANGUAGES.values()),
+                            value="Auto Detect",
+                            label="Language"
+                        )
+                        batch_theme = gr.Dropdown(
+                            choices=list(config.THEMES.keys()),
+                            value="default",
+                            label="Theme"
+                        )
+                    
+                    with gr.Row():
+                        batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
+                        batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
+                        batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
+                    
+                    with gr.Row():
+                        load_file_btn = gr.Button("Load File")
+                        analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")
+                
+                with gr.Column():
+                    batch_summary = gr.Textbox(label="Batch Summary", lines=8)
+                    batch_results_df = gr.Dataframe(
+                        label="Detailed Results",
+                        headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
+                        datatype=["number", "str", "str", "str", "str", "str"]
+                    )
+            
+            with gr.Row():
+                batch_plot = gr.Plot(label="Batch Analysis Summary")
+                confidence_dist_plot = gr.Plot(label="Confidence Distribution")
+        
+        with gr.Tab("History & Analytics"):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        refresh_history_btn = gr.Button("Refresh History")
+                        clear_history_btn = gr.Button("Clear History", variant="stop")
+                        status_btn = gr.Button("Get Status")
+                    
+                    history_theme = gr.Dropdown(
+                        choices=list(config.THEMES.keys()),
+                        value="default",
+                        label="Dashboard Theme"
+                    )
+                    
+                    with gr.Row():
+                        export_csv_btn = gr.Button("Export CSV")
+                        export_json_btn = gr.Button("Export JSON")
+                
+                with gr.Column():
+                    history_status = gr.Textbox(label="History Status", lines=8)
+            
+            history_dashboard = gr.Plot(label="History Analytics Dashboard")
+            
+            with gr.Row():
+                csv_download = gr.File(label="CSV Download", visible=True)
+                json_download = gr.File(label="JSON Download", visible=True)
+        
+        # Event Handlers
+        analyze_btn.click(
+            app.analyze_single,
+            inputs=[text_input, language_selector, theme_selector,
+                    clean_text_cb, remove_punct_cb, remove_nums_cb],
+            outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
+        )
+        
+        load_file_btn.click(
+            app.data_handler.process_file,
+            inputs=file_upload,
+            outputs=batch_input
+        )
+        
+        analyze_batch_btn.click(
+            app.analyze_batch,
+            inputs=[batch_input, batch_language, batch_theme,
+                    batch_clean_cb, batch_punct_cb, batch_nums_cb],
+            outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
+        )
+        
+        refresh_history_btn.click(
+            app.plot_history,
+            inputs=history_theme,
+            outputs=[history_dashboard, history_status]
+        )
+        
+        clear_history_btn.click(
+            lambda: f"Cleared {app.history.clear()} entries",
+            outputs=history_status
+        )
+        
+        status_btn.click(
+            app.get_history_status,
+            outputs=history_status
+        )
+        
+        export_csv_btn.click(
+            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
+            outputs=[csv_download, history_status]
+        )
+        
+        export_json_btn.click(
+            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
+            outputs=[json_download, history_status]
+        )
+    
+    return demo
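+
+# Illustrative sketch, not wired into the app: the export buttons above rely
+# on the Gradio pattern where a handler returning an N-tuple fills the N
+# components listed in `outputs`, in order. A hypothetical minimal version:
+def _demo_tuple_outputs():
+    with gr.Blocks() as blocks:
+        btn = gr.Button("Export")
+        file_out = gr.File()
+        status = gr.Textbox()
+        # One return value per component listed in `outputs`; None clears the File
+        btn.click(lambda: (None, "Nothing to export"),
+                  outputs=[file_out, status])
+    return blocks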
 
+# Application Entry Point
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
     )
 
     try:
+        demo = create_interface()
+        demo.launch(
+            share=True,
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True
+        )
     except Exception as e:
+        logger.error(f"Failed to launch application: {e}")
+        raise
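+
+# Note on the launch settings above: share=True requests a temporary public
+# *.gradio.live tunnel URL, and server_name="0.0.0.0" binds every network
+# interface. For a purely local run, a minimal alternative (an assumption,
+# not part of this commit) would be:
+#
+#     demo.launch(server_name="127.0.0.1", server_port=7860)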
 
 
 
 
 