entropy25 committed on
Commit
75cb992
·
verified ·
1 Parent(s): c74121f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1205 -419
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import torch
2
  import gradio as gr
3
- from transformers import BertTokenizer, BertForSequenceClassification
4
- import matplotlib.pyplot as plt
 
 
5
  import numpy as np
6
  from wordcloud import WordCloud
7
  from collections import Counter, defaultdict
@@ -16,6 +18,10 @@ from functools import lru_cache, wraps
16
  from dataclasses import dataclass
17
  from typing import List, Dict, Optional, Tuple, Any, Callable
18
  from contextlib import contextmanager
 
 
 
 
19
  import gc
20
 
21
  # Configuration
@@ -28,27 +34,45 @@ class Config:
28
  CACHE_SIZE: int = 128
29
  BATCH_PROCESSING_SIZE: int = 8
30
 
31
- # Visualization settings
32
- FIGURE_SIZE_SINGLE: Tuple[int, int] = (8, 5)
33
- FIGURE_SIZE_BATCH: Tuple[int, int] = (12, 8)
34
- WORDCLOUD_SIZE: Tuple[int, int] = (10, 5)
 
 
 
 
 
 
35
 
36
- THEMES = {
37
- 'default': {'pos': '#4ecdc4', 'neg': '#ff6b6b'},
38
- 'ocean': {'pos': '#0077be', 'neg': '#ff6b35'},
39
- 'forest': {'pos': '#228b22', 'neg': '#dc143c'},
40
- 'sunset': {'pos': '#ff8c00', 'neg': '#8b0000'}
41
  }
42
 
43
- STOP_WORDS = {
44
- 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
45
- 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
46
- 'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should'
 
 
47
  }
48
 
49
  config = Config()
 
 
 
50
  logger = logging.getLogger(__name__)
51
 
 
 
 
 
 
 
 
 
52
  # Decorators and Context Managers
53
  def handle_errors(default_return=None):
54
  """Centralized error handling decorator"""
@@ -64,13 +88,11 @@ def handle_errors(default_return=None):
64
  return decorator
65
 
66
  @contextmanager
67
- def managed_figure(*args, **kwargs):
68
- """Context manager for matplotlib figures to prevent memory leaks"""
69
- fig = plt.figure(*args, **kwargs)
70
  try:
71
- yield fig
72
  finally:
73
- plt.close(fig)
74
  gc.collect()
75
 
76
  class ThemeContext:
@@ -79,72 +101,152 @@ class ThemeContext:
79
  self.theme = theme
80
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
81
 
82
- # Lazy Model Manager
83
  class ModelManager:
84
- """Lazy loading model manager"""
85
  _instance = None
86
- _model = None
87
- _tokenizer = None
88
- _device = None
89
 
90
  def __new__(cls):
91
  if cls._instance is None:
92
  cls._instance = super().__new__(cls)
 
93
  return cls._instance
94
 
95
- @property
96
- def model(self):
97
- if self._model is None:
98
- self._load_model()
99
- return self._model
100
-
101
- @property
102
- def tokenizer(self):
103
- if self._tokenizer is None:
104
- self._load_model()
105
- return self._tokenizer
106
-
107
- @property
108
- def device(self):
109
- if self._device is None:
110
- self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
111
- return self._device
112
-
113
- def _load_model(self):
114
- """Load model and tokenizer"""
115
  try:
116
- self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
117
- self._tokenizer = BertTokenizer.from_pretrained("entropy25/sentimentanalysis")
118
- self._model = BertForSequenceClassification.from_pretrained("entropy25/sentimentanalysis")
119
- self._model.to(self._device)
120
- logger.info(f"Model loaded on {self._device}")
 
 
 
 
 
 
 
 
 
121
  except Exception as e:
122
- logger.error(f"Model loading failed: {e}")
123
  raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- # Simplified Core Classes
126
  class TextProcessor:
127
- """Optimized text processing"""
 
128
  @staticmethod
129
  @lru_cache(maxsize=config.CACHE_SIZE)
130
- def clean_text(text: str) -> Tuple[str, ...]:
131
- """Single-pass text cleaning"""
132
- words = re.findall(r'\b\w{3,}\b', text.lower())
133
- return tuple(w for w in words if w not in config.STOP_WORDS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
 
135
  class HistoryManager:
136
- """Simplified history management"""
137
  def __init__(self):
138
  self._history = []
139
 
140
  def add(self, entry: Dict):
141
- self._history.append({**entry, 'timestamp': datetime.now().isoformat()})
 
 
142
  if len(self._history) > config.MAX_HISTORY_SIZE:
143
  self._history = self._history[-config.MAX_HISTORY_SIZE:]
144
 
 
 
 
 
 
145
  def get_all(self) -> List[Dict]:
146
  return self._history.copy()
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def clear(self) -> int:
149
  count = len(self._history)
150
  self._history.clear()
@@ -152,107 +254,175 @@ class HistoryManager:
152
 
153
  def size(self) -> int:
154
  return len(self._history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- # Core Analysis Engine
157
  class SentimentEngine:
158
- """Streamlined sentiment analysis with attention-based keyword extraction"""
 
159
  def __init__(self):
160
  self.model_manager = ModelManager()
161
 
162
- def extract_key_words(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
163
- """Extract contributing words using BERT attention weights"""
164
  try:
165
- inputs = self.model_manager.tokenizer(
 
 
 
 
 
166
  text, return_tensors="pt", padding=True,
167
  truncation=True, max_length=config.MAX_TEXT_LENGTH
168
  ).to(self.model_manager.device)
169
 
170
- # Get model outputs with attention weights
171
  with torch.no_grad():
172
- outputs = self.model_manager.model(**inputs, output_attentions=True)
173
- attention = outputs.attentions # Tuple of attention tensors for each layer
174
-
175
- # Use the last layer's attention, average over all heads
176
- last_attention = attention[-1] # Shape: [batch_size, num_heads, seq_len, seq_len]
177
- avg_attention = last_attention.mean(dim=1) # Average over heads: [batch_size, seq_len, seq_len]
178
-
179
- # Focus on attention to [CLS] token (index 0) as it represents the whole sequence
180
- cls_attention = avg_attention[0, 0, :] # Attention from CLS to all tokens
181
 
182
- # Get tokens and their attention scores
183
- tokens = self.model_manager.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
184
- attention_scores = cls_attention.cpu().numpy()
185
-
186
- # Filter out special tokens and combine subword tokens
187
- word_scores = {}
188
- current_word = ""
189
- current_score = 0.0
190
-
191
- for i, (token, score) in enumerate(zip(tokens, attention_scores)):
192
- if token in ['[CLS]', '[SEP]', '[PAD]']:
193
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- if token.startswith('##'):
196
- # Subword token, add to current word
197
- current_word += token[2:]
198
- current_score = max(current_score, score) # Take max attention
199
- else:
200
- # New word, save previous if exists
201
  if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
202
  word_scores[current_word.lower()] = current_score
203
 
204
- current_word = token
205
- current_score = score
206
-
207
- # Don't forget the last word
208
- if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
209
- word_scores[current_word.lower()] = current_score
210
-
211
- # Filter out stop words and sort by attention score
212
- filtered_words = {
213
- word: score for word, score in word_scores.items()
214
- if word not in config.STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
215
- }
216
-
217
- # Sort by attention score and return top_k
218
- sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
219
- return sorted_words[:top_k]
220
-
221
  except Exception as e:
222
- logger.error(f"Key word extraction failed: {e}")
223
- return []
 
 
 
224
 
225
- @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
226
- def analyze_single(self, text: str) -> Dict:
227
- """Analyze single text with key word extraction"""
228
  if not text.strip():
229
- raise ValueError("Empty text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- inputs = self.model_manager.tokenizer(
232
- text, return_tensors="pt", padding=True,
233
- truncation=True, max_length=config.MAX_TEXT_LENGTH
234
- ).to(self.model_manager.device)
235
 
236
  with torch.no_grad():
237
- outputs = self.model_manager.model(**inputs)
238
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
239
 
240
- sentiment = "Positive" if probs[1] > probs[0] else "Negative"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
- # Extract key contributing words
243
- key_words = self.extract_key_words(text)
244
 
245
- return {
246
- 'sentiment': sentiment,
247
- 'confidence': float(probs.max()),
248
- 'pos_prob': float(probs[1]),
249
- 'neg_prob': float(probs[0]),
250
- 'key_words': key_words
251
- }
 
 
252
 
253
  @handle_errors(default_return=[])
254
- def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
255
- """Optimized batch processing with key words"""
 
256
  if len(texts) > config.BATCH_SIZE_LIMIT:
257
  texts = texts[:config.BATCH_SIZE_LIMIT]
258
 
@@ -265,185 +435,263 @@ class SentimentEngine:
265
  if progress_callback:
266
  progress_callback((i + len(batch)) / len(texts))
267
 
268
- inputs = self.model_manager.tokenizer(
269
- batch, return_tensors="pt", padding=True,
270
- truncation=True, max_length=config.MAX_TEXT_LENGTH
271
- ).to(self.model_manager.device)
272
-
273
- with torch.no_grad():
274
- outputs = self.model_manager.model(**inputs)
275
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
276
-
277
- for text, prob in zip(batch, probs):
278
- sentiment = "Positive" if prob[1] > prob[0] else "Negative"
279
- # Extract key words for each text in batch
280
- key_words = self.extract_key_words(text, top_k=5) # Fewer for batch processing
281
-
282
- results.append({
283
- 'text': text[:50] + '...' if len(text) > 50 else text,
284
- 'full_text': text,
285
- 'sentiment': sentiment,
286
- 'confidence': float(prob.max()),
287
- 'pos_prob': float(prob[1]),
288
- 'neg_prob': float(prob[0]),
289
- 'key_words': key_words
290
- })
291
 
292
  return results
293
 
294
- # Unified Visualization System
295
- class PlotFactory:
296
- """Factory for creating plots with proper memory management"""
297
 
298
  @staticmethod
299
  @handle_errors(default_return=None)
300
- def create_sentiment_bars(probs: np.ndarray, theme: ThemeContext) -> plt.Figure:
301
- """Create sentiment probability bars"""
302
- with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
303
- ax = fig.add_subplot(111)
304
- labels = ["Negative", "Positive"]
305
- colors = [theme.colors['neg'], theme.colors['pos']]
306
-
307
- bars = ax.bar(labels, probs, color=colors, alpha=0.8)
308
- ax.set_title("Sentiment Probabilities", fontweight='bold')
309
- ax.set_ylabel("Probability")
310
- ax.set_ylim(0, 1)
311
-
312
- # Add value labels
313
- for bar, prob in zip(bars, probs):
314
- ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.02,
315
- f'{prob:.3f}', ha='center', va='bottom', fontweight='bold')
316
-
317
- fig.tight_layout()
318
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  @staticmethod
321
  @handle_errors(default_return=None)
322
- def create_confidence_gauge(confidence: float, sentiment: str, theme: ThemeContext) -> plt.Figure:
323
- """Create confidence gauge"""
324
- with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
325
- ax = fig.add_subplot(111)
326
-
327
- # Create gauge
328
- theta = np.linspace(0, np.pi, 100)
329
- colors = [theme.colors['neg'] if i < 50 else theme.colors['pos'] for i in range(100)]
330
-
331
- for i in range(len(theta)-1):
332
- ax.fill_between([theta[i], theta[i+1]], [0, 0], [0.8, 0.8],
333
- color=colors[i], alpha=0.7)
334
-
335
- # Needle position
336
- pos = np.pi * (0.5 + (0.4 if sentiment == 'Positive' else -0.4) * confidence)
337
- ax.plot([pos, pos], [0, 0.6], 'k-', linewidth=6)
338
- ax.plot(pos, 0.6, 'ko', markersize=10)
339
-
340
- ax.set_xlim(0, np.pi)
341
- ax.set_ylim(0, 1)
342
- ax.set_title(f'{sentiment} - Confidence: {confidence:.3f}', fontweight='bold')
343
- ax.set_xticks([0, np.pi/2, np.pi])
344
- ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
345
- ax.axis('off')
346
-
347
- fig.tight_layout()
348
- return fig
349
 
350
  @staticmethod
351
  @handle_errors(default_return=None)
352
- def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
353
- """Create horizontal bar chart for key contributing words"""
354
- if not key_words:
355
- return None
356
-
357
- with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
358
- ax = fig.add_subplot(111)
359
-
360
- words = [word for word, score in key_words]
361
- scores = [score for word, score in key_words]
362
-
363
- # Choose color based on sentiment
364
- color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
365
-
366
- # Create horizontal bar chart
367
- bars = ax.barh(range(len(words)), scores, color=color, alpha=0.7)
368
- ax.set_yticks(range(len(words)))
369
- ax.set_yticklabels(words)
370
- ax.set_xlabel('Attention Weight')
371
- ax.set_title(f'Top Contributing Words ({sentiment})', fontweight='bold')
372
-
373
- # Add value labels on bars
374
- for i, (bar, score) in enumerate(zip(bars, scores)):
375
- ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2.,
376
- f'{score:.3f}', ha='left', va='center', fontsize=9)
377
-
378
- # Invert y-axis to show highest scoring word at top
379
- ax.invert_yaxis()
380
- ax.grid(axis='x', alpha=0.3)
381
- fig.tight_layout()
382
  return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
  @staticmethod
385
  @handle_errors(default_return=None)
386
- def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
387
- """Create word cloud"""
388
- if len(text.split()) < 3:
389
- return None
390
-
391
- colormap = 'Greens' if sentiment == 'Positive' else 'Reds'
392
- wc = WordCloud(width=800, height=400, background_color='white',
393
- colormap=colormap, max_words=30).generate(text)
394
-
395
- with managed_figure(figsize=config.WORDCLOUD_SIZE) as fig:
396
- ax = fig.add_subplot(111)
397
- ax.imshow(wc, interpolation='bilinear')
398
- ax.axis('off')
399
- ax.set_title(f'{sentiment} Word Cloud', fontweight='bold')
400
- fig.tight_layout()
401
- return fig
 
 
 
 
 
 
 
402
 
403
  @staticmethod
404
  @handle_errors(default_return=None)
405
- def create_batch_analysis(results: List[Dict], theme: ThemeContext) -> plt.Figure:
406
- """Create comprehensive batch visualization"""
407
- with managed_figure(figsize=config.FIGURE_SIZE_BATCH) as fig:
408
- gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)
409
-
410
- # Sentiment distribution
411
- ax1 = fig.add_subplot(gs[0, 0])
412
- sent_counts = Counter([r['sentiment'] for r in results])
413
- colors = [theme.colors['pos'], theme.colors['neg']]
414
- ax1.pie(sent_counts.values(), labels=sent_counts.keys(),
415
- autopct='%1.1f%%', colors=colors[:len(sent_counts)])
416
- ax1.set_title('Sentiment Distribution')
417
-
418
- # Confidence histogram
419
- ax2 = fig.add_subplot(gs[0, 1])
420
- confs = [r['confidence'] for r in results]
421
- ax2.hist(confs, bins=8, alpha=0.7, color='skyblue', edgecolor='black')
422
- ax2.set_title('Confidence Distribution')
423
- ax2.set_xlabel('Confidence')
424
-
425
- # Sentiment over time
426
- ax3 = fig.add_subplot(gs[1, :])
427
- pos_probs = [r['pos_prob'] for r in results]
428
- indices = range(len(results))
429
- colors_scatter = [theme.colors['pos'] if r['sentiment'] == 'Positive'
430
- else theme.colors['neg'] for r in results]
431
- ax3.scatter(indices, pos_probs, c=colors_scatter, alpha=0.7, s=60)
432
- ax3.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
433
- ax3.set_title('Sentiment Progression')
434
- ax3.set_xlabel('Review Index')
435
- ax3.set_ylabel('Positive Probability')
436
-
437
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
- # Unified Data Handler
440
  class DataHandler:
441
- """Handles all data operations"""
442
 
443
  @staticmethod
444
  @handle_errors(default_return=(None, "Export failed"))
445
  def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
446
- """Universal data export"""
447
  if not data:
448
  return None, "No data to export"
449
 
@@ -452,16 +700,21 @@ class DataHandler:
452
 
453
  if format_type == 'csv':
454
  writer = csv.writer(temp_file)
455
- writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Key_Words'])
 
456
  for entry in data:
 
457
  writer.writerow([
458
  entry.get('timestamp', ''),
459
  entry.get('text', ''),
460
  entry.get('sentiment', ''),
461
  f"{entry.get('confidence', 0):.4f}",
 
462
  f"{entry.get('pos_prob', 0):.4f}",
463
  f"{entry.get('neg_prob', 0):.4f}",
464
- "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
 
 
465
  ])
466
  elif format_type == 'json':
467
  json.dump(data, temp_file, indent=2, ensure_ascii=False)
@@ -469,31 +722,29 @@ class DataHandler:
469
  temp_file.close()
470
  return temp_file.name, f"Exported {len(data)} entries"
471
 
472
-
473
  @staticmethod
474
  @handle_errors(default_return="")
475
  def process_file(file) -> str:
476
- """Process uploaded file"""
477
  if not file:
478
  return ""
479
-
480
  content = file.read().decode('utf-8')
481
 
482
  if file.name.endswith('.csv'):
483
- import io
484
  csv_file = io.StringIO(content)
485
  reader = csv.reader(csv_file)
486
  try:
487
- next(reader)
488
  texts = []
489
  for row in reader:
490
  if row and row[0].strip():
491
  text = row[0].strip().strip('"')
492
- if text:
493
  texts.append(text)
494
  return '\n'.join(texts)
495
- except Exception as e:
496
- lines = content.strip().split('\n')[1:]
497
  texts = []
498
  for line in lines:
499
  if line.strip():
@@ -501,227 +752,762 @@ class DataHandler:
501
  if text:
502
  texts.append(text)
503
  return '\n'.join(texts)
 
504
  return content
505
 
506
- # Main Application
507
  class SentimentApp:
508
- """Main application orchestrator"""
509
 
510
  def __init__(self):
511
  self.engine = SentimentEngine()
512
  self.history = HistoryManager()
513
  self.data_handler = DataHandler()
514
 
515
- # Example data
516
  self.examples = [
517
- ["While the film's visual effects were undeniably impressive, the story lacked emotional weight, and the pacing felt inconsistent throughout."],
518
- ["An extraordinary achievement in filmmaking — the direction was masterful, the script was sharp, and every performance added depth and realism."],
519
- ["Despite a promising start, the film quickly devolved into a series of clichés, with weak character development and an ending that felt rushed and unearned."],
520
- ["A beautifully crafted story with heartfelt moments and a soundtrack that perfectly captured the emotional tone of each scene."],
521
- ["The movie was far too long, with unnecessary subplots and dull dialogue that made it difficult to stay engaged until the end."]
522
  ]
523
-
524
 
525
- @handle_errors(default_return=("Please enter text", None, None, None, None))
526
- def analyze_single(self, text: str, theme: str = 'default'):
527
- """Single text analysis with key words"""
 
528
  if not text.strip():
529
- return "Please enter text", None, None, None, None
530
 
531
- result = self.engine.analyze_single(text)
 
 
532
 
533
- # Add to history
534
- self.history.add({
535
- 'text': text[:100],
536
- 'full_text': text,
537
- **result
538
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
- # Create visualizations
541
  theme_ctx = ThemeContext(theme)
542
- probs = np.array([result['neg_prob'], result['pos_prob']])
543
 
544
- prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
545
- gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
546
- cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
547
- keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
- # Format result text with key words
550
- key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
551
- result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
552
- f"Key Words: {key_words_str}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
- return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
 
556
- @handle_errors(default_return=None)
557
- def analyze_batch(self, reviews: str, progress=None):
558
- """Batch analysis"""
559
- if not reviews.strip():
560
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
 
562
- texts = [r.strip() for r in reviews.split('\n') if r.strip()]
563
- if len(texts) < 2:
564
- return None
 
 
565
 
566
- results = self.engine.analyze_batch(texts, progress)
 
 
567
 
568
- # Add to history
569
- for result in results:
570
- self.history.add(result)
 
 
571
 
572
- # Create visualization
573
- theme_ctx = ThemeContext('default')
574
- return PlotFactory.create_batch_analysis(results, theme_ctx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
  @handle_errors(default_return=(None, "No history available"))
577
  def plot_history(self, theme: str = 'default'):
578
- """Plot analysis history"""
579
  history = self.history.get_all()
580
  if len(history) < 2:
581
  return None, f"Need at least 2 analyses for trends. Current: {len(history)}"
582
 
583
  theme_ctx = ThemeContext(theme)
584
 
585
- with managed_figure(figsize=(12, 8)) as fig:
586
- gs = fig.add_gridspec(2, 1, hspace=0.3)
587
-
588
- indices = list(range(len(history)))
589
- pos_probs = [item['pos_prob'] for item in history]
590
- confs = [item['confidence'] for item in history]
591
-
592
- # Sentiment trend
593
- ax1 = fig.add_subplot(gs[0, 0])
594
- colors = [theme_ctx.colors['pos'] if p > 0.5 else theme_ctx.colors['neg']
595
- for p in pos_probs]
596
- ax1.scatter(indices, pos_probs, c=colors, alpha=0.7, s=60)
597
- ax1.plot(indices, pos_probs, alpha=0.5, linewidth=2)
598
- ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
599
- ax1.set_title('Sentiment History')
600
- ax1.set_ylabel('Positive Probability')
601
- ax1.grid(True, alpha=0.3)
602
-
603
- # Confidence trend
604
- ax2 = fig.add_subplot(gs[1, 0])
605
- ax2.bar(indices, confs, alpha=0.7, color='lightblue', edgecolor='navy')
606
- ax2.set_title('Confidence Over Time')
607
- ax2.set_xlabel('Analysis Number')
608
- ax2.set_ylabel('Confidence')
609
- ax2.grid(True, alpha=0.3)
610
-
611
- fig.tight_layout()
612
- return fig, f"History: {len(history)} analyses"
 
 
 
 
 
 
613
 
614
- # Gradio Interface Setup
615
  def create_interface():
616
- """Create streamlined Gradio interface"""
617
  app = SentimentApp()
618
 
619
- with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
620
- gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
621
- gr.Markdown("Optimized sentiment analysis with advanced visualizations and key word extraction")
622
 
623
  with gr.Tab("Single Analysis"):
624
  with gr.Row():
625
  with gr.Column():
626
  text_input = gr.Textbox(
627
- label="Movie Review",
628
- placeholder="Enter your movie review...",
629
  lines=5
630
  )
 
631
  with gr.Row():
632
- analyze_btn = gr.Button("Analyze", variant="primary")
 
 
 
 
633
  theme_selector = gr.Dropdown(
634
  choices=list(config.THEMES.keys()),
635
  value="default",
636
  label="Theme"
637
  )
638
 
 
 
 
 
 
 
 
639
  gr.Examples(
640
  examples=app.examples,
641
- inputs=text_input
 
642
  )
643
 
644
  with gr.Column():
645
- result_output = gr.Textbox(label="Result", lines=3)
646
 
647
  with gr.Row():
648
- prob_plot = gr.Plot(label="Probabilities")
649
- gauge_plot = gr.Plot(label="Confidence")
650
 
651
  with gr.Row():
652
- wordcloud_plot = gr.Plot(label="Word Cloud")
653
  keyword_plot = gr.Plot(label="Key Contributing Words")
654
 
655
  with gr.Tab("Batch Analysis"):
656
  with gr.Row():
657
  with gr.Column():
658
- file_upload = gr.File(label="Upload File", file_types=[".csv", ".txt"])
 
 
 
659
  batch_input = gr.Textbox(
660
- label="Reviews (one per line)",
661
- lines=8
 
662
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
 
664
  with gr.Column():
665
- load_btn = gr.Button("Load File")
666
- batch_btn = gr.Button("Analyze Batch", variant="primary")
 
 
 
 
667
 
668
- batch_plot = gr.Plot(label="Batch Results")
 
 
669
 
670
- with gr.Tab("History & Export"):
671
  with gr.Row():
672
- refresh_btn = gr.Button("Refresh")
673
- clear_btn = gr.Button("Clear", variant="stop")
674
- status_btn = gr.Button("Status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
 
676
- with gr.Row():
677
- csv_btn = gr.Button("Export CSV")
678
- json_btn = gr.Button("Export JSON")
679
 
680
- history_status = gr.Textbox(label="Status")
681
- history_plot = gr.Plot(label="History Trends")
682
- csv_file = gr.File(label="CSV Download", visible=True)
683
- json_file = gr.File(label="JSON Download", visible=True)
684
 
685
- # Event bindings
686
  analyze_btn.click(
687
  app.analyze_single,
688
- inputs=[text_input, theme_selector],
689
- outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
 
690
  )
691
 
692
- load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
693
- batch_btn.click(app.analyze_batch, inputs=batch_input, outputs=batch_plot)
 
 
 
694
 
695
- refresh_btn.click(
696
- lambda theme: app.plot_history(theme),
697
- inputs=theme_selector,
698
- outputs=[history_plot, history_status]
 
699
  )
700
 
701
- clear_btn.click(
 
 
 
 
 
 
702
  lambda: f"Cleared {app.history.clear()} entries",
703
  outputs=history_status
704
  )
705
 
706
  status_btn.click(
707
- lambda: f"History: {app.history.size()} entries",
708
  outputs=history_status
709
  )
710
 
711
- csv_btn.click(
712
  lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
713
- outputs=[csv_file, history_status]
714
  )
715
 
716
- json_btn.click(
717
  lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
718
- outputs=[json_file, history_status]
719
  )
720
 
721
  return demo
722
 
723
  # Application Entry Point
724
  if __name__ == "__main__":
725
- logging.basicConfig(level=logging.INFO)
726
- demo = create_interface()
727
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ import plotly.graph_objects as go
5
+ import plotly.express as px
6
+ from plotly.subplots import make_subplots
7
  import numpy as np
8
  from wordcloud import WordCloud
9
  from collections import Counter, defaultdict
 
18
  from dataclasses import dataclass
19
  from typing import List, Dict, Optional, Tuple, Any, Callable
20
  from contextlib import contextmanager
21
+ import nltk
22
+ from nltk.corpus import stopwords
23
+ import langdetect
24
+ import pandas as pd
25
  import gc
26
 
27
  # Configuration
 
34
  CACHE_SIZE: int = 128
35
  BATCH_PROCESSING_SIZE: int = 8
36
 
37
+ # Supported languages and models
38
+ SUPPORTED_LANGUAGES = {
39
+ 'auto': 'Auto Detect',
40
+ 'en': 'English',
41
+ 'zh': 'Chinese',
42
+ 'es': 'Spanish',
43
+ 'fr': 'French',
44
+ 'de': 'German',
45
+ 'sv': 'Swedish'
46
+ }
47
 
48
+ MODELS = {
49
+ 'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
50
+ 'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
51
+ 'zh': "uer/roberta-base-finetuned-dianping-chinese"
 
52
  }
53
 
54
+ # Color themes for Plotly
55
+ THEMES = {
56
+ 'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
57
+ 'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
58
+ 'dark': {'pos': '#66BB6A', 'neg': '#EF5350', 'neu': '#FFA726'},
59
+ 'rainbow': {'pos': '#9C27B0', 'neg': '#E91E63', 'neu': '#FF5722'}
60
  }
61
 
62
config = Config()

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize NLTK stop words.
# The corpus lookup fails when the data could not be fetched or found, so fall
# back to a small built-in English list rather than failing at import time.
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    STOP_WORDS = set(stopwords.words('english'))
except Exception as e:
    # `except Exception` (not a bare `except:`) so KeyboardInterrupt and
    # SystemExit still propagate; log the cause so the degraded stop-word
    # list is visible instead of silent.
    logger.warning("NLTK stop words unavailable, using built-in fallback: %s", e)
    STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
75
+
76
  # Decorators and Context Managers
77
  def handle_errors(default_return=None):
78
  """Centralized error handling decorator"""
 
88
  return decorator
89
 
90
@contextmanager
def memory_cleanup():
    """Context manager that forces a garbage-collection pass on exit.

    Yields nothing. ``gc.collect()`` sits in the ``finally`` clause, so it
    runs whether the ``with`` body completes normally or raises; exceptions
    raised by the body are not suppressed.
    """
    try:
        yield
    finally:
        gc.collect()
97
 
98
  class ThemeContext:
 
101
  self.theme = theme
102
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
103
 
104
+ # Enhanced Model Manager with Multi-language Support
105
class ModelManager:
    """Singleton holding the sentiment models and tokenizers.

    Every ``ModelManager()`` call returns the same instance. The first
    construction picks the torch device and loads two checkpoints:
    ``config.MODELS['multilingual']`` under the key ``'default'`` and
    ``config.MODELS['zh']`` under the key ``'zh'``.
    """
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelManager() call even though __new__
        # hands back the shared instance, so guard the expensive setup.
        if not self._initialized:
            self.models = {}
            self.tokenizers = {}
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self._load_default_models()
            # Set only after a successful load so a failed load is retried
            # on the next construction.
            self._initialized = True

    def _load_model(self, key: str, model_name: str, label: str) -> None:
        """Load one tokenizer/model pair, move the model to the device, log it."""
        self.tokenizers[key] = AutoTokenizer.from_pretrained(model_name)
        self.models[key] = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.models[key].to(self.device)
        logger.info(f"{label} model loaded: {model_name}")

    def _load_default_models(self):
        """Load the multilingual (default) and Chinese models.

        Raises:
            Exception: whatever the loader raised; it is logged and re-raised
                so construction fails loudly.
        """
        try:
            # Multilingual model doubles as the default for every non-Chinese language
            self._load_model('default', config.MODELS['multilingual'], "Default")
            self._load_model('zh', config.MODELS['zh'], "Chinese")
        except Exception as e:
            logger.error(f"Failed to load models: {e}")
            raise

    def get_model(self, language='en'):
        """Return ``(model, tokenizer)`` for ``language``.

        Only ``'zh'`` has a dedicated model; any other value, including the
        default ``'en'``, gets the multilingual default model.
        """
        if language == 'zh':
            return self.models['zh'], self.tokenizers['zh']
        return self.models['default'], self.tokenizers['default']

    @staticmethod
    def detect_language(text: str) -> str:
        """Best-effort language detection.

        Args:
            text: Text to classify.

        Returns:
            A key of ``config.SUPPORTED_LANGUAGES``. ``'en'`` is returned for
            blank input, for unsupported languages, and when detection fails.
        """
        try:
            # Blank input carries nothing to detect; skip the library call.
            if not text.strip():
                return 'en'
            detected = langdetect.detect(text)
        except Exception:
            # Deliberate best effort: any detector failure (or a non-string
            # argument) falls back to English. `except Exception` instead of
            # a bare `except:` keeps KeyboardInterrupt/SystemExit propagating.
            return 'en'

        # Collapse the regional Chinese codes onto the single 'zh' key.
        detected = {'zh-cn': 'zh', 'zh-tw': 'zh'}.get(detected, detected)
        return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
163
 
164
+ # Simplified Text Processing
165
class TextProcessor:
    """Static helpers for cleaning text, picking keywords and splitting batches.

    Text containing any character in U+4E00-U+9FFF is treated as Chinese and
    takes a separate path in ``clean_text`` and ``extract_keywords``.
    """

    @staticmethod
    @lru_cache(maxsize=config.CACHE_SIZE)
    def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
        """Normalize ``text`` into a space-joined string of content words.

        Chinese text is only stripped of surrounding whitespace. Other text is
        lower-cased, optionally has digit runs and non-word/non-space
        characters removed, and then loses stop words and tokens shorter than
        ``config.MIN_WORD_LENGTH``. Results are memoized per argument tuple.
        """
        stripped = text.strip()

        # Chinese text is returned as-is apart from the strip above
        if re.search(r'[\u4e00-\u9fff]', stripped) is not None:
            return stripped

        normalized = stripped.lower()
        if remove_numbers:
            normalized = re.sub(r'\d+', '', normalized)
        if remove_punctuation:
            normalized = re.sub(r'[^\w\s]', '', normalized)

        kept = []
        for token in normalized.split():
            if len(token) < config.MIN_WORD_LENGTH or token in STOP_WORDS:
                continue
            kept.append(token)
        return ' '.join(kept)

    @staticmethod
    def extract_keywords(text: str, top_k: int = 5) -> List[str]:
        """Return up to ``top_k`` most frequent units of ``text``.

        For Chinese text the units are individual characters in the
        U+4E00-U+9FFF range; otherwise they are the words left after
        ``clean_text`` with its default options.
        """
        if re.search(r'[\u4e00-\u9fff]', text):
            # Joining the matched runs gives one string; Counter then counts characters
            units = ''.join(re.findall(r'[\u4e00-\u9fff]+', text))
        else:
            units = TextProcessor.clean_text(text).split()

        ranked = Counter(units).most_common(top_k)
        return [unit for unit, _count in ranked]

    @staticmethod
    def parse_batch_input(text: str) -> List[str]:
        """Split textarea input on newlines, dropping blank lines and outer whitespace."""
        candidates = (raw.strip() for raw in text.strip().split('\n'))
        return [line for line in candidates if line]
211
 
212
+ # Enhanced History Manager
213
class HistoryManager:
    """In-memory, size-capped log of analysis results with filtering and stats."""

    def __init__(self):
        self._history = []

    def add(self, entry: Dict):
        """Timestamp *entry* and append it to the history.

        The dict is modified in place (its ``'timestamp'`` key is set to
        ``datetime.now().isoformat()``) and stored by reference. When the
        history grows beyond ``config.MAX_HISTORY_SIZE`` the oldest entries
        are dropped.
        """
        entry['timestamp'] = datetime.now().isoformat()
        self._history.append(entry)
        if len(self._history) > config.MAX_HISTORY_SIZE:
            self._history = self._history[-config.MAX_HISTORY_SIZE:]

    def add_batch(self, entries: List[Dict]):
        """Add several entries, one ``add`` call per entry."""
        for entry in entries:
            self.add(entry)

    def get_all(self) -> List[Dict]:
        """Return a shallow copy of the whole history."""
        return self._history.copy()

    def get_recent(self, n: int = 10) -> List[Dict]:
        """Return the last *n* entries (oldest first) as a new list.

        A non-positive *n* yields an empty list. Without this guard
        ``history[-0:]`` would return the entire history.
        """
        if n <= 0:
            return []
        return self._history[-n:]

    def filter_by(self, sentiment: str = None, language: str = None,
                  min_confidence: float = None) -> List[Dict]:
        """Return the entries matching every criterion that is given.

        Args:
            sentiment: exact sentiment label to keep; falsy means no filter.
            language: language code to keep (entries without a ``'language'``
                key count as ``'en'``); falsy means no filter.
            min_confidence: lowest confidence to keep; ``None`` means no filter.

        Returns:
            A new list; the internal history list is never handed out.
        """
        # Always start from a copy so callers cannot mutate internal state
        # when no criterion is supplied.
        filtered = list(self._history)

        if sentiment:
            filtered = [h for h in filtered if h['sentiment'] == sentiment]
        if language:
            filtered = [h for h in filtered if h.get('language', 'en') == language]
        if min_confidence is not None:
            filtered = [h for h in filtered if h['confidence'] >= min_confidence]

        return filtered

    def clear(self) -> int:
        """Remove every entry and return how many were removed."""
        count = len(self._history)
        self._history.clear()
        return count

    def size(self) -> int:
        """Return the number of stored entries."""
        return len(self._history)

    def get_stats(self) -> Dict:
        """Return aggregate statistics, or ``{}`` when the history is empty."""
        if not self._history:
            return {}

        sentiments = [item['sentiment'] for item in self._history]
        confidences = [item['confidence'] for item in self._history]
        languages = [item.get('language', 'en') for item in self._history]

        return {
            'total_analyses': len(self._history),
            'positive_count': sentiments.count('Positive'),
            'negative_count': sentiments.count('Negative'),
            'neutral_count': sentiments.count('Neutral'),
            # Plain floats keep the stats dict JSON-serialisable
            'avg_confidence': float(np.mean(confidences)),
            'max_confidence': float(np.max(confidences)),
            'min_confidence': float(np.min(confidences)),
            'languages_detected': len(set(languages)),
            'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
        }
278
 
279
+ # Core Sentiment Analysis Engine
280
class SentimentEngine:
    """Multi-language sentiment analysis engine.

    Wraps a ``ModelManager`` and offers single-text analysis, batch analysis
    and attention-based keyword extraction.
    """

    # Skipped in addition to whatever the tokenizer itself reports as special.
    _FALLBACK_SPECIAL_TOKENS = ('[CLS]', '[SEP]', '[PAD]', '<s>', '</s>')

    def __init__(self):
        self.model_manager = ModelManager()

    @staticmethod
    def _merge_subword_scores(tokens, scores, special_tokens, min_length) -> Dict[str, float]:
        """Merge sub-word tokens into lower-cased words with their max score.

        WordPiece marks a *continuation* piece with a leading ``##`` while
        SentencePiece marks the *start* of a word with a leading ``▁``. An
        unmarked token therefore starts a new word under WordPiece but
        continues the current one under SentencePiece. The scheme is inferred
        from the presence of ``▁``-prefixed tokens.
        """
        specials = set(special_tokens)
        is_sentencepiece = any(t.startswith('▁') for t in tokens if t not in specials)

        word_scores: Dict[str, float] = {}

        def _commit(word, score):
            # Keep the highest score when a word occurs more than once
            if word and len(word) >= min_length:
                key = word.lower()
                word_scores[key] = max(score, word_scores.get(key, score))

        current_word, current_score = "", 0.0
        for token, raw_score in zip(tokens, scores):
            if token in specials:
                continue
            # Plain floats keep results JSON-serialisable (numpy float32 is not)
            score = float(raw_score)

            if token.startswith('##'):
                continues, piece = True, token[2:]
            elif token.startswith('▁'):
                continues, piece = False, token[1:]
            else:
                continues, piece = is_sentencepiece, token

            if continues and current_word:
                current_word += piece
                current_score = max(current_score, score)
            else:
                _commit(current_word, current_score)
                current_word, current_score = piece, score

        _commit(current_word, current_score)
        return word_scores

    def extract_attention_keywords(self, text: str, language: str = 'auto', top_k: int = 10) -> List[Tuple[str, float]]:
        """Extract keywords ranked by last-layer attention.

        Args:
            text: text to inspect.
            language: language code, or ``'auto'`` to detect it.
            top_k: maximum number of keywords returned.

        Returns:
            ``(word, score)`` pairs sorted by descending score. When the model
            yields no attentions, or anything fails, the frequency-based
            ``TextProcessor.extract_keywords`` is used with a fixed score
            of 0.1.
        """
        try:
            if language == 'auto':
                language = self.model_manager.detect_language(text)

            model, tokenizer = self.model_manager.get_model(language)

            inputs = tokenizer(
                text, return_tensors="pt", padding=True,
                truncation=True, max_length=config.MAX_TEXT_LENGTH
            ).to(self.model_manager.device)

            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True)

            attentions = getattr(outputs, 'attentions', None)
            if attentions:
                # Last layer, averaged over heads; [0, 0, :] selects the first
                # sequence and the attention row of its first token
                # (normally the classification token).
                attention_scores = attentions[-1].mean(dim=1)[0, 0, :].cpu().numpy()
                tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

                special_tokens = set(self._FALLBACK_SPECIAL_TOKENS)
                special_tokens.update(getattr(tokenizer, 'all_special_tokens', ()))

                word_scores = self._merge_subword_scores(
                    tokens, attention_scores, special_tokens, config.MIN_WORD_LENGTH
                )

                ranked = sorted(
                    ((word, score) for word, score in word_scores.items()
                     if word not in STOP_WORDS),
                    key=lambda pair: pair[1], reverse=True
                )
                return ranked[:top_k]

        except Exception as e:
            logger.error(f"Attention keyword extraction failed: {e}")

        # Fallback to simple keyword extraction
        keywords = TextProcessor.extract_keywords(text, top_k)
        return [(word, 0.1) for word in keywords]

    @staticmethod
    def _probabilities_to_result(probs) -> Dict:
        """Turn a probability vector into the sentiment part of a result.

        NOTE(review): labels are assigned by position, ``[negative, neutral,
        positive]`` for three outputs and ``[negative, positive]`` otherwise.
        Confirm this matches the ``id2label`` of the configured checkpoints.
        """
        best = int(np.argmax(probs))
        if len(probs) == 3:
            labels = ('Negative', 'Neutral', 'Positive')
            return {
                'sentiment': labels[best],
                'confidence': float(probs[best]),
                'neg_prob': float(probs[0]),
                'neu_prob': float(probs[1]),
                'pos_prob': float(probs[2]),
                'has_neutral': True
            }
        return {
            'sentiment': "Positive" if best == 1 else "Negative",
            'confidence': float(probs[best]),
            'neg_prob': float(probs[0]),
            'pos_prob': float(probs[1]),
            'neu_prob': 0.0,
            'has_neutral': False
        }

    @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
    def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
        """Analyse one text.

        Args:
            text: text to classify; must contain non-whitespace characters.
            language: language code, or ``'auto'`` to detect it.
            preprocessing_options: optional dict with the boolean keys
                ``clean_text``, ``remove_punctuation`` and ``remove_numbers``.

        Returns:
            Dict with ``sentiment``, ``confidence``, ``neg_prob``,
            ``neu_prob``, ``pos_prob``, ``has_neutral``, ``language``,
            ``keywords``, ``word_count`` and ``char_count``. Errors (including
            the ``ValueError`` raised for empty text) are handled by the
            ``handle_errors`` decorator.
        """
        if not text.strip():
            raise ValueError("Empty text provided")

        # Detect language
        if language == 'auto':
            detected_lang = self.model_manager.detect_language(text)
        else:
            detected_lang = language

        # Get appropriate model
        model, tokenizer = self.model_manager.get_model(detected_lang)

        # Preprocessing (never applied to text containing CJK ideographs)
        options = preprocessing_options or {}
        processed_text = text
        if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
            cleaned = TextProcessor.clean_text(
                text,
                options.get('remove_punctuation', True),
                options.get('remove_numbers', False)
            )
            # Cleaning can remove everything (e.g. a text made only of stop
            # words). Classifying an empty string would give a meaningless
            # label, so fall back to the raw text.
            processed_text = cleaned or text

        # Tokenize and analyze
        inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
                           truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

        result = self._probabilities_to_result(probs)

        # Keywords are extracted from the original, uncleaned text
        keywords = self.extract_attention_keywords(text, detected_lang)

        # Add metadata
        result.update({
            'language': detected_lang,
            'keywords': keywords,
            'word_count': len(text.split()),
            'char_count': len(text)
        })

        return result

    @handle_errors(default_return=[])
    def analyze_batch(self, texts: List[str], language: str = 'auto',
                      preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
        """Analyse several texts and return one result dict per text.

        At most ``config.BATCH_SIZE_LIMIT`` texts are processed. Every result
        carries ``batch_index``, ``text`` (preview of up to 100 characters)
        and ``full_text``. Failed texts become rows with
        ``sentiment == 'Error'`` and an ``error`` message.

        Args:
            texts: texts to analyse.
            language: language code, or ``'auto'``.
            preprocessing_options: forwarded to ``analyze_single``.
            progress_callback: optional callable receiving the fraction of
                texts dispatched so far.
        """
        if len(texts) > config.BATCH_SIZE_LIMIT:
            texts = texts[:config.BATCH_SIZE_LIMIT]

        # NOTE(review): the batching set-up lines were hidden in the reviewed
        # diff; they are reconstructed from the variables used below and from
        # config.BATCH_PROCESSING_SIZE. Confirm against the full file.
        results = []
        batch_size = config.BATCH_PROCESSING_SIZE

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]

            if progress_callback:
                progress_callback((i + len(batch)) / len(texts))

            for text in batch:
                try:
                    # Copy before annotating: on failure analyze_single
                    # presumably returns the decorator's shared default dict,
                    # which must not be mutated or aliased across rows.
                    result = dict(self.analyze_single(text, language, preprocessing_options))
                    if result.get('sentiment') == 'Unknown':
                        # The default lacks 'language' / 'word_count'; flag it
                        # so consumers treat it as an error row.
                        result['sentiment'] = 'Error'
                        result.setdefault('error', 'Analysis failed')
                except Exception as e:
                    result = {
                        'sentiment': 'Error',
                        'confidence': 0.0,
                        'error': str(e)
                    }

                result['batch_index'] = len(results)
                result['text'] = text[:100] + '...' if len(text) > 100 else text
                result['full_text'] = text
                results.append(result)

        return results
456
 
457
+ # Advanced Plotly Visualization System
458
class PlotlyVisualizer:
    """Builders for the Plotly figures shown in the single, batch and history views."""

    @staticmethod
    def _sentiment_color(sentiment: str, colors: Dict) -> str:
        """Return the theme color for *sentiment*.

        ``'Positive'`` maps to ``'pos'`` and ``'Neutral'`` to ``'neu'``;
        anything else maps to ``'neg'``.
        """
        if sentiment == 'Positive':
            return colors['pos']
        if sentiment == 'Neutral':
            return colors.get('neu', colors['neg'])
        return colors['neg']

    @staticmethod
    @handle_errors(default_return=None)
    def create_sentiment_gauge(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create a gauge for one analysis result.

        With a neutral class the gauge shows the positive probability on a
        three-band scale; otherwise it shows the confidence of the predicted
        label. Values are percentages.
        """
        colors = theme.colors
        bar_color = PlotlyVisualizer._sentiment_color(result['sentiment'], colors)

        if result.get('has_neutral', False):
            # Three-way gauge
            fig = go.Figure(go.Indicator(
                mode="gauge+number+delta",
                value=result['pos_prob'] * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': f"Sentiment: {result['sentiment']}"},
                delta={'reference': 50},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': bar_color},
                    'steps': [
                        {'range': [0, 33], 'color': colors['neg']},
                        {'range': [33, 67], 'color': colors['neu']},
                        {'range': [67, 100], 'color': colors['pos']}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 90
                    }
                }
            ))
        else:
            # Two-way gauge
            fig = go.Figure(go.Indicator(
                mode="gauge+number",
                value=result['confidence'] * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': f"Confidence: {result['sentiment']}"},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': bar_color},
                    'steps': [
                        {'range': [0, 50], 'color': "lightgray"},
                        {'range': [50, 100], 'color': "gray"}
                    ]
                }
            ))

        fig.update_layout(height=400, font={'size': 16})
        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_probability_bars(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create a bar chart of the class probabilities in *result*."""
        colors = theme.colors

        if result.get('has_neutral', False):
            labels = ['Negative', 'Neutral', 'Positive']
            values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
            bar_colors = [colors['neg'], colors['neu'], colors['pos']]
        else:
            labels = ['Negative', 'Positive']
            values = [result['neg_prob'], result['pos_prob']]
            bar_colors = [colors['neg'], colors['pos']]

        fig = go.Figure(data=[
            go.Bar(x=labels, y=values, marker_color=bar_colors,
                   text=[f'{v:.3f}' for v in values], textposition='outside')
        ])

        fig.update_layout(
            title="Sentiment Probabilities",
            yaxis_title="Probability",
            height=400,
            showlegend=False
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
        """Create a horizontal bar chart of ``(word, score)`` keywords.

        An empty *keywords* list yields a figure holding only a
        "No keywords extracted" annotation.
        """
        if not keywords:
            fig = go.Figure()
            fig.add_annotation(text="No keywords extracted",
                               xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
            fig.update_layout(height=400, title="Keywords")
            return fig

        words = [word for word, _score in keywords]
        scores = [score for _word, score in keywords]

        color = PlotlyVisualizer._sentiment_color(sentiment, theme.colors)

        fig = go.Figure(data=[
            go.Bar(
                y=words,
                x=scores,
                orientation='h',
                marker_color=color,
                text=[f'{score:.3f}' for score in scores],
                textposition='auto'
            )
        ])

        fig.update_layout(
            title=f"Top Keywords ({sentiment})",
            xaxis_title="Attention Weight",
            yaxis_title="Keywords",
            # Categories are drawn bottom-up by default; reverse the axis so
            # the first (highest-ranked) keyword appears at the top.
            yaxis={'autorange': 'reversed'},
            height=400,
            showlegend=False
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
        """Create a donut chart of sentiment counts; ``'Error'`` rows are excluded."""
        colors = theme.colors

        # Count sentiments
        sentiments = [r['sentiment'] for r in results if 'sentiment' in r and r['sentiment'] != 'Error']
        sentiment_counts = Counter(sentiments)

        # The first three lower-cased letters of a label ('pos', 'neg', 'neu')
        # are the theme color keys; other labels fall back to gray.
        fig = go.Figure(data=[go.Pie(
            labels=list(sentiment_counts.keys()),
            values=list(sentiment_counts.values()),
            marker_colors=[colors.get(s.lower()[:3], '#999999') for s in sentiment_counts.keys()],
            textinfo='label+percent',
            hole=0.3
        )])

        fig.update_layout(
            title=f"Batch Analysis Summary ({len(results)} texts)",
            height=400
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_confidence_distribution(results: List[Dict]) -> go.Figure:
        """Create a histogram of confidences; ``'Error'`` rows are excluded.

        Returns an empty figure when no confidence values remain.
        """
        confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']

        if not confidences:
            return go.Figure()

        fig = go.Figure(data=[go.Histogram(
            x=confidences,
            nbinsx=20,
            marker_color='skyblue',
            opacity=0.7
        )])

        fig.update_layout(
            title="Confidence Distribution",
            xaxis_title="Confidence Score",
            yaxis_title="Frequency",
            height=400
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_history_dashboard(history: List[Dict], theme: ThemeContext) -> go.Figure:
        """Create a 2x2 dashboard from history entries.

        Panels: positive-probability timeline, confidence histogram, language
        pie and sentiment bar chart. Fewer than two entries yield an empty
        figure.
        """
        if len(history) < 2:
            return go.Figure()

        # Create subplots
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Sentiment Timeline', 'Confidence Distribution',
                            'Language Distribution', 'Sentiment Summary'],
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"type": "pie"}, {"type": "bar"}]]
        )

        # Extract data
        indices = list(range(len(history)))
        pos_probs = [item.get('pos_prob', 0) for item in history]
        confidences = [item['confidence'] for item in history]
        sentiments = [item['sentiment'] for item in history]
        languages = [item.get('language', 'en') for item in history]

        # Sentiment timeline: one marker per entry, colored by its label
        colors_map = {'Positive': theme.colors['pos'], 'Negative': theme.colors['neg'], 'Neutral': theme.colors['neu']}
        colors = [colors_map.get(s, '#999999') for s in sentiments]

        fig.add_trace(
            go.Scatter(x=indices, y=pos_probs, mode='lines+markers',
                       marker=dict(color=colors, size=8),
                       name='Positive Probability'),
            row=1, col=1
        )

        # Confidence distribution
        fig.add_trace(
            go.Histogram(x=confidences, nbinsx=10, name='Confidence'),
            row=1, col=2
        )

        # Language distribution
        lang_counts = Counter(languages)
        fig.add_trace(
            go.Pie(labels=list(lang_counts.keys()), values=list(lang_counts.values()),
                   name="Languages"),
            row=2, col=1
        )

        # Sentiment summary
        sent_counts = Counter(sentiments)
        sent_colors = [colors_map.get(k, '#999999') for k in sent_counts.keys()]
        fig.add_trace(
            go.Bar(x=list(sent_counts.keys()), y=list(sent_counts.values()),
                   marker_color=sent_colors),
            row=2, col=2
        )

        fig.update_layout(height=800, showlegend=False)
        return fig
686
 
687
+ # Universal Data Handler
688
class DataHandler:
    """Export of analysis entries and import of uploaded text/CSV files."""

    @staticmethod
    def _json_default(value):
        """``json.dump`` hook: convert numpy scalars, reject everything else."""
        if isinstance(value, np.generic):
            return value.item()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Write *data* to a temporary file and return ``(path, message)``.

        Args:
            data: entries to export.
            format_type: ``'csv'`` or ``'json'``.

        Returns:
            ``(path, "Exported N entries")`` on success, or ``(None, reason)``
            when there is nothing to export or the format is not supported.
        """
        # Local imports keep this block independent of the file's import list
        import os
        import tempfile

        if not data:
            return None, "No data to export"
        if format_type not in ('csv', 'json'):
            # Previously an unknown format produced an empty file and a
            # success message.
            return None, f"Unsupported export format: {format_type}"

        # NOTE(review): the original temp-file creation line was hidden in
        # the reviewed diff; a named UTF-8 text file that outlives close()
        # satisfies every visible use (csv.writer, json.dump, .name).
        temp_file = tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix=f'.{format_type}',
            encoding='utf-8', newline=''
        )
        try:
            with temp_file:
                if format_type == 'csv':
                    writer = csv.writer(temp_file)
                    writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
                                     'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Keywords', 'Word_Count'])
                    for entry in data:
                        keywords_str = "|".join([f"{word}:{score:.3f}" for word, score in entry.get('keywords', [])])
                        writer.writerow([
                            entry.get('timestamp', ''),
                            entry.get('text', ''),
                            entry.get('sentiment', ''),
                            f"{entry.get('confidence', 0):.4f}",
                            entry.get('language', 'en'),
                            f"{entry.get('pos_prob', 0):.4f}",
                            f"{entry.get('neg_prob', 0):.4f}",
                            f"{entry.get('neu_prob', 0):.4f}",
                            keywords_str,
                            entry.get('word_count', 0)
                        ])
                else:
                    # Keyword scores may be numpy scalars, which json cannot
                    # encode without the default hook.
                    json.dump(data, temp_file, indent=2, ensure_ascii=False,
                              default=DataHandler._json_default)
        except Exception:
            # Do not leave a half-written file behind
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
            raise

        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Read an uploaded file and return its texts, one per line.

        *file* may be a file-like object with ``read()`` and ``name`` or a
        path string. For ``.csv`` files the header row is skipped and the
        first column of each row is used; any other file is returned
        unchanged.
        """
        if not file:
            return ""

        name = str(getattr(file, 'name', file))
        if hasattr(file, 'read'):
            raw = file.read()
        else:
            with open(name, 'rb') as handle:
                raw = handle.read()
        # utf-8-sig also strips a byte-order mark if one is present
        content = raw.decode('utf-8-sig') if isinstance(raw, (bytes, bytearray)) else raw

        if not name.endswith('.csv'):
            return content

        try:
            rows = list(csv.reader(io.StringIO(content)))
        except csv.Error:
            # NOTE(review): the original fallback line was hidden in the
            # reviewed diff; here each raw line is treated as one text.
            rows = [[line] for line in content.strip().split('\n')]

        texts = []
        for row in rows[1:]:  # Skip header
            if row and row[0].strip():
                text = row[0].strip().strip('"')
                if text:
                    texts.append(text)
        return '\n'.join(texts)
757
 
758
+ # Main Application Class
759
class SentimentApp:
    """Main multilingual sentiment analysis application.

    Connects the engine, the history and the visualizer to the callbacks used
    by the Gradio interface.
    """

    def __init__(self):
        self.engine = SentimentEngine()
        self.history = HistoryManager()
        self.data_handler = DataHandler()

        # Multi-language examples
        self.examples = [
            ["This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout."],
            ["The film was disappointing with poor character development and a confusing storyline."],
            ["这部电影真的很棒!演技精湛,情节引人入胜。"],  # Chinese
            ["Esta película fue increíble, me encantó la cinematografía."],  # Spanish
            ["Ce film était magnifique, j'ai adoré la réalisation."],  # French
        ]

    @staticmethod
    def _language_code(display_name: str) -> str:
        """Map a dropdown display name to its language code (``'auto'`` if unknown)."""
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        return language_map.get(display_name, 'auto')

    @staticmethod
    def _preprocessing_options(clean_text: bool, remove_punct: bool, remove_nums: bool) -> Dict:
        """Bundle the three checkbox values in the form the engine expects."""
        return {
            'clean_text': clean_text,
            'remove_punctuation': remove_punct,
            'remove_numbers': remove_nums
        }

    @staticmethod
    def _failed(result: Dict) -> bool:
        """Tell whether an engine result is an error or fallback row.

        Such rows lack ``language`` / ``word_count`` and must not be read as
        successful analyses.
        """
        return 'error' in result or result.get('sentiment') in ('Unknown', 'Error')

    @handle_errors(default_return=("Please enter text", None, None, None))
    def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                       remove_punct: bool, remove_nums: bool):
        """Analyse one text.

        Returns:
            ``(markdown summary, gauge figure, probability figure, keyword
            figure)``; the figures are ``None`` when there is nothing to show.
        """
        if not text.strip():
            return "Please enter text", None, None, None

        language_code = self._language_code(language)
        preprocessing_options = self._preprocessing_options(clean_text, remove_punct, remove_nums)

        with memory_cleanup():
            result = self.engine.analyze_single(text, language_code, preprocessing_options)

        if self._failed(result):
            # Report the failure explicitly instead of hitting a KeyError
            # that would surface as "Please enter text".
            return "Analysis failed. Please try again.", None, None, None

        # Add to history
        self.history.add({
            'text': text[:100] + '...' if len(text) > 100 else text,
            'full_text': text,
            'sentiment': result['sentiment'],
            'confidence': result['confidence'],
            'pos_prob': result.get('pos_prob', 0),
            'neg_prob': result.get('neg_prob', 0),
            'neu_prob': result.get('neu_prob', 0),
            'language': result['language'],
            'keywords': result['keywords'],
            'word_count': result['word_count'],
            'analysis_type': 'single'
        })

        # Create visualizations
        theme_ctx = ThemeContext(theme)
        gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
        bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
        keyword_fig = PlotlyVisualizer.create_keyword_chart(result['keywords'], result['sentiment'], theme_ctx)

        # Create comprehensive result text
        keywords_str = ", ".join([f"{word}({score:.3f})" for word, score in result['keywords'][:5]])

        info_text = "\n".join([
            "",
            "**Analysis Results:**",
            f"- **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)",
            f"- **Language:** {result['language'].upper()}",
            f"- **Keywords:** {keywords_str}",
            f"- **Statistics:** {result['word_count']} words, {result.get('char_count', len(text))} characters",
            "",
        ])

        return info_text, gauge_fig, bars_fig, keyword_fig

    @handle_errors(default_return=("Please enter texts", None, None, None))
    def analyze_batch(self, batch_text: str, language: str, theme: str,
                      clean_text: bool, remove_punct: bool, remove_nums: bool):
        """Analyse several texts given one per line.

        Returns:
            ``(summary text, results DataFrame, summary figure, confidence
            figure)``. Failed rows appear in the table but are not added to
            the history.
        """
        if not batch_text.strip():
            return "Please enter texts (one per line)", None, None, None

        # Parse batch input
        texts = TextProcessor.parse_batch_input(batch_text)

        if len(texts) > config.BATCH_SIZE_LIMIT:
            return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None

        if not texts:
            return "No valid texts found", None, None, None

        language_code = self._language_code(language)
        preprocessing_options = self._preprocessing_options(clean_text, remove_punct, remove_nums)

        with memory_cleanup():
            results = self.engine.analyze_batch(texts, language_code, preprocessing_options)

        successful_results = [r for r in results if not self._failed(r)]
        error_count = len(results) - len(successful_results)

        # Add to history
        self.history.add_batch([
            {
                'text': r['text'],
                'full_text': r['full_text'],
                'sentiment': r['sentiment'],
                'confidence': r['confidence'],
                'pos_prob': r.get('pos_prob', 0),
                'neg_prob': r.get('neg_prob', 0),
                'neu_prob': r.get('neu_prob', 0),
                'language': r['language'],
                'keywords': r['keywords'],
                'word_count': r['word_count'],
                'analysis_type': 'batch',
                'batch_index': r['batch_index']
            }
            for r in successful_results
        ])

        # Create visualizations
        theme_ctx = ThemeContext(theme)
        summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
        confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)

        # Create results DataFrame
        df_data = []
        for result in results:
            if self._failed(result):
                df_data.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': 'Error',
                    'Confidence': 0.0,
                    'Language': 'Unknown',
                    'Error': result.get('error', 'Analysis failed')
                })
            else:
                keywords_str = ', '.join([word for word, _ in result['keywords'][:3]])
                df_data.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': result['sentiment'],
                    'Confidence': f"{result['confidence']:.3f}",
                    'Language': result['language'].upper(),
                    'Keywords': keywords_str
                })

        df = pd.DataFrame(df_data)

        # Create summary text
        if successful_results:
            sentiment_counts = Counter([r['sentiment'] for r in successful_results])
            avg_confidence = np.mean([r['confidence'] for r in successful_results])
            languages = Counter([r['language'] for r in successful_results])

            summary_text = "\n".join([
                "",
                "**Batch Analysis Summary:**",
                f"- **Total Texts:** {len(texts)}",
                f"- **Successful:** {len(successful_results)}",
                f"- **Errors:** {error_count}",
                f"- **Average Confidence:** {avg_confidence:.3f}",
                f"- **Sentiments:** {dict(sentiment_counts)}",
                f"- **Languages Detected:** {dict(languages)}",
                "",
            ])
        else:
            summary_text = f"All {len(texts)} texts failed to analyze."

        return summary_text, df, summary_fig, confidence_fig

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Build the history dashboard.

        Returns:
            ``(figure, statistics text)``, or ``(None, message)`` when fewer
            than two analyses are stored.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        with memory_cleanup():
            fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
            stats = self.history.get_stats()

        stats_text = "\n".join([
            "",
            "**History Statistics:**",
            f"- **Total Analyses:** {stats.get('total_analyses', 0)}",
            f"- **Positive:** {stats.get('positive_count', 0)}",
            f"- **Negative:** {stats.get('negative_count', 0)}",
            f"- **Neutral:** {stats.get('neutral_count', 0)}",
            f"- **Average Confidence:** {stats.get('avg_confidence', 0):.3f}",
            f"- **Languages:** {stats.get('languages_detected', 0)}",
            f"- **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}",
            "",
        ])

        return fig, stats_text

    # The error default is a plain string, like the success path: the
    # callback feeds a single text output.
    @handle_errors(default_return="No data available")
    def get_history_status(self):
        """Return a text summary of the stored history."""
        stats = self.history.get_stats()
        if not stats:
            return "No analyses performed yet"

        return "\n".join([
            "",
            "**Current Status:**",
            f"- **Total Analyses:** {stats['total_analyses']}",
            "- **Recent Sentiment Distribution:**",
            f"  * Positive: {stats['positive_count']}",
            f"  * Negative: {stats['negative_count']}",
            f"  * Neutral: {stats['neutral_count']}",
            f"- **Average Confidence:** {stats['avg_confidence']:.3f}",
            f"- **Languages Detected:** {stats['languages_detected']}",
            "",
        ])
978
+
979
+ # Gradio Interface
980
+ def create_interface():
981
+ """Create comprehensive Gradio interface"""
982
+ app = SentimentApp()
983
+
984
+ with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
985
+ gr.Markdown("# 🌍 Advanced Multilingual Sentiment Analyzer")
986
+ gr.Markdown("AI-powered sentiment analysis with support for multiple languages, advanced visualizations, and explainable AI features")
987
+
988
+ with gr.Tab("Single Analysis"):
989
+ with gr.Row():
990
+ with gr.Column():
991
+ text_input = gr.Textbox(
992
+ label="Enter Text for Analysis",
993
+ placeholder="Enter your text in any supported language...",
994
+ lines=5
995
+ )
996
+
997
+ with gr.Row():
998
+ language_selector = gr.Dropdown(
999
+ choices=list(config.SUPPORTED_LANGUAGES.values()),
1000
+ value="Auto Detect",
1001
+ label="Language"
1002
+ )
1003
+ theme_selector = gr.Dropdown(
1004
+ choices=list(config.THEMES.keys()),
1005
+ value="default",
1006
+ label="Theme"
1007
+ )
1008
+
1009
+ with gr.Row():
1010
+ clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
1011
+ remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1012
+ remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1013
+
1014
+ analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
1015
+
1016
+ gr.Examples(
1017
+ examples=app.examples,
1018
+ inputs=text_input,
1019
+ cache_examples=False
1020
+ )
1021
+
1022
+ with gr.Column():
1023
+ result_output = gr.Textbox(label="Analysis Results", lines=8)
1024
+
1025
+ with gr.Row():
1026
+ gauge_plot = gr.Plot(label="Sentiment Gauge")
1027
+ probability_plot = gr.Plot(label="Probability Distribution")
1028
+
1029
+ with gr.Row():
1030
+ keyword_plot = gr.Plot(label="Key Contributing Words")
1031
+
1032
+ with gr.Tab("Batch Analysis"):
1033
+ with gr.Row():
1034
+ with gr.Column():
1035
+ file_upload = gr.File(
1036
+ label="Upload File (CSV/TXT)",
1037
+ file_types=[".csv", ".txt"]
1038
+ )
1039
+ batch_input = gr.Textbox(
1040
+ label="Batch Input (one text per line)",
1041
+ placeholder="Enter multiple texts, one per line...",
1042
+ lines=10
1043
+ )
1044
+
1045
+ with gr.Row():
1046
+ batch_language = gr.Dropdown(
1047
+ choices=list(config.SUPPORTED_LANGUAGES.values()),
1048
+ value="Auto Detect",
1049
+ label="Language"
1050
+ )
1051
+ batch_theme = gr.Dropdown(
1052
+ choices=list(config.THEMES.keys()),
1053
+ value="default",
1054
+ label="Theme"
1055
+ )
1056
+
1057
+ with gr.Row():
1058
+ batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
1059
+ batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1060
+ batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1061
+
1062
+ with gr.Row():
1063
+ load_file_btn = gr.Button("Load File")
1064
+ analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")
1065
+
1066
+ with gr.Column():
1067
+ batch_summary = gr.Textbox(label="Batch Summary", lines=8)
1068
+ batch_results_df = gr.Dataframe(
1069
+ label="Detailed Results",
1070
+ headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
1071
+ datatype=["number", "str", "str", "str", "str", "str"]
1072
+ )
1073
+
1074
+ with gr.Row():
1075
+ batch_plot = gr.Plot(label="Batch Analysis Summary")
1076
+ confidence_dist_plot = gr.Plot(label="Confidence Distribution")
1077
 
1078
+ with gr.Tab("History & Analytics"):
1079
+ with gr.Row():
1080
+ with gr.Column():
1081
+ with gr.Row():
1082
+ refresh_history_btn = gr.Button("Refresh History")
1083
+ clear_history_btn = gr.Button("Clear History", variant="stop")
1084
+ status_btn = gr.Button("Get Status")
1085
+
1086
+ history_theme = gr.Dropdown(
1087
+ choices=list(config.THEMES.keys()),
1088
+ value="default",
1089
+ label="Dashboard Theme"
1090
+ )
1091
+
1092
+ with gr.Row():
1093
+ export_csv_btn = gr.Button("Export CSV")
1094
+ export_json_btn = gr.Button("Export JSON")
1095
+
1096
+ with gr.Column():
1097
+ history_status = gr.Textbox(label="History Status", lines=8)
1098
+
1099
+ history_dashboard = gr.Plot(label="History Analytics Dashboard")
1100
+
1101
+ with gr.Row():
1102
+ csv_download = gr.File(label="CSV Download", visible=True)
1103
+ json_download = gr.File(label="JSON Download", visible=True)
1104
+
1105
+ # Event Handlers
1106
+ analyze_btn.click(
1107
+ app.analyze_single,
1108
+ inputs=[text_input, language_selector, theme_selector,
1109
+ clean_text_cb, remove_punct_cb, remove_nums_cb],
1110
+ outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1111
+ )
1112
+
1113
+ load_file_btn.click(
1114
+ app.data_handler.process_file,
1115
+ inputs=file_upload,
1116
+ outputs=batch_input
1117
+ )
1118
+
1119
+ analyze_batch_btn.click(
1120
+ app.analyze_batch,
1121
+ inputs=[batch_input, batch_language, batch_theme,
1122
+ batch_clean_cb, batch_punct_cb, batch_nums_cb],
1123
+ outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
1124
+ )
1125
 
1126
+ refresh_history_btn.click(
1127
+ app.plot_history,
1128
+ inputs=history_theme,
1129
+ outputs=[history_dashboard, history_status]
1130
+ )
1131
+
1132
+ clear_history_btn.click(
1133
+ lambda: f"Cleared {app.history.clear()} entries",
1134
+ outputs=history_status
1135
+ )
1136
+
1137
+ status_btn.click(
1138
+ app.get_history_status,
1139
+ outputs=history_status
1140
+ )
1141
+
1142
+ export_csv_btn.click(
1143
+ lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
1144
+ outputs=[csv_download, history_status]
1145
+ )
1146
+
1147
+ export_json_btn.click(
1148
+ lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
1149
+ outputs=[json_download, history_status]
1150
+ )
1151
 
1152
+ return demo
1153
+
1154
+ # Application Entry Point
1155
if __name__ == "__main__":
    # Configure logging first so that failures while building or launching
    # the interface are reported with timestamps and logger names.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    try:
        demo = create_interface()
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True
        )
    except Exception:
        # logger.exception records the full traceback (logger.error with an
        # f-string only kept the message); re-raise so the process still
        # exits with a failure.
        logger.exception("Failed to launch application")
        raise
1172
+
1173
@handle_errors(default_return=("Please enter texts", None, None, None))
def analyze_batch(self, batch_text: str, language: str, theme: str,
                  clean_text: bool, remove_punct: bool, remove_nums: bool):
    """Analyze a batch of texts and build the summary, table and charts.

    Args:
        batch_text: Raw batch input; split into individual texts by
            ``TextProcessor.parse_batch_input``.
        language: Display name of the language as listed in
            ``config.SUPPORTED_LANGUAGES``; unknown names fall back to 'auto'.
        theme: Theme name passed to ``ThemeContext``.
        clean_text: Forwarded to the engine as the 'clean_text' option.
        remove_punct: Forwarded as the 'remove_punctuation' option.
        remove_nums: Forwarded as the 'remove_numbers' option.

    Returns:
        A 4-tuple ``(summary_text, dataframe, summary_figure,
        confidence_figure)``. On invalid input the first element is a
        message and the remaining three are ``None``.
    """
    # Guard against None as well as blank input.
    if not batch_text or not batch_text.strip():
        return "Please enter texts (one per line)", None, None, None

    texts = TextProcessor.parse_batch_input(batch_text)

    if len(texts) > config.BATCH_SIZE_LIMIT:
        return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None

    if not texts:
        return "No valid texts found", None, None, None

    # Map display names back to language codes.
    language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
    language_code = language_map.get(language, 'auto')

    preprocessing_options = {
        'clean_text': clean_text,
        'remove_punctuation': remove_punct,
        'remove_numbers': remove_nums
    }

    with memory_cleanup():
        results = self.engine.analyze_batch(texts, language_code, preprocessing_options)

        successful_results = [r for r in results if 'error' not in r]
        error_count = len(results) - len(successful_results)

        # Only successfully analyzed texts are recorded in the history.
        batch_entries = [
            {
                'text': result['text'],
                'full_text': result['full_text'],
                'sentiment': result['sentiment'],
                'confidence': result['confidence'],
                'pos_prob': result.get('pos_prob', 0),
                'neg_prob': result.get('neg_prob', 0),
                'neu_prob': result.get('neu_prob', 0),
                'language': result['language'],
                'keywords': result['keywords'],
                'word_count': result['word_count'],
                'analysis_type': 'batch',
                'batch_index': result['batch_index']
            }
            for result in successful_results
        ]
        self.history.add_batch(batch_entries)

        # Create visualizations
        theme_ctx = ThemeContext(theme)
        summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
        confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)

        # Every row uses the same six columns as the results table headers.
        # Failed rows carry their error message in the Keywords column so the
        # frame never grows an extra 'Error' column or mixes float and string
        # confidences.
        columns = ["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"]
        rows = []
        for result in results:
            if 'error' in result:
                rows.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': 'Error',
                    'Confidence': f"{0.0:.3f}",
                    'Language': 'Unknown',
                    'Keywords': f"Error: {result['error']}"
                })
            else:
                keywords_str = ', '.join(word for word, _ in result['keywords'][:3])
                rows.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': result['sentiment'],
                    'Confidence': f"{result['confidence']:.3f}",
                    'Language': result['language'].upper(),
                    'Keywords': keywords_str
                })

        df = pd.DataFrame(rows, columns=columns)

        # Build the summary line by line so no leading/trailing blank lines
        # or source indentation leak into the displayed text.
        if successful_results:
            sentiment_counts = Counter(r['sentiment'] for r in successful_results)
            avg_confidence = np.mean([r['confidence'] for r in successful_results])
            languages = Counter(r['language'] for r in successful_results)

            summary_text = "\n".join([
                "**Batch Analysis Summary:**",
                f"- **Total Texts:** {len(texts)}",
                f"- **Successful:** {len(successful_results)}",
                f"- **Errors:** {error_count}",
                f"- **Average Confidence:** {avg_confidence:.3f}",
                f"- **Sentiments:** {dict(sentiment_counts)}",
                f"- **Languages Detected:** {dict(languages)}",
            ])
        else:
            summary_text = f"All {len(texts)} texts failed to analyze."

        return summary_text, df, summary_fig, confidence_fig
1276
 
1277
@handle_errors(default_return=(None, "No history available"))
def plot_history(self, theme: str = 'default'):
    """Build the history dashboard figure and a statistics summary.

    Args:
        theme: Theme name passed to ``ThemeContext``.

    Returns:
        A 2-tuple ``(figure, stats_text)``. When fewer than two analyses
        are stored, the figure is ``None`` and the text says how many
        entries exist.
    """
    history = self.history.get_all()
    if len(history) < 2:
        return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

    theme_ctx = ThemeContext(theme)

    with memory_cleanup():
        fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
        # get_stats() may return an empty/falsy value (get_history_status
        # checks for that), so fall back to an empty mapping.
        stats = self.history.get_stats() or {}

    # A key that is present but None would make .upper() fail; normalise it.
    top_language = stats.get('most_common_language') or 'N/A'

    # Joined line by line so the text has no leading/trailing blank lines.
    stats_text = "\n".join([
        "**History Statistics:**",
        f"- **Total Analyses:** {stats.get('total_analyses', 0)}",
        f"- **Positive:** {stats.get('positive_count', 0)}",
        f"- **Negative:** {stats.get('negative_count', 0)}",
        f"- **Neutral:** {stats.get('neutral_count', 0)}",
        f"- **Average Confidence:** {stats.get('avg_confidence', 0):.3f}",
        f"- **Languages:** {stats.get('languages_detected', 0)}",
        f"- **Most Common Language:** {str(top_language).upper()}",
    ])

    return fig, stats_text
1302
+
1303
# The fallback is a plain string (not a 1-tuple) because this handler is
# wired to a single output component.
@handle_errors(default_return="No data available")
def get_history_status(self):
    """Return a text summary of the statistics from ``self.history``.

    Returns:
        A multi-line status string, or "No analyses performed yet" when
        ``self.history.get_stats()`` returns nothing.
    """
    stats = self.history.get_stats()
    if not stats:
        return "No analyses performed yet"

    # Joined line by line so the text has no leading/trailing blank lines.
    return "\n".join([
        "**Current Status:**",
        f"- **Total Analyses:** {stats['total_analyses']}",
        "- **Recent Sentiment Distribution:**",
        f"  * Positive: {stats['positive_count']}",
        f"  * Negative: {stats['negative_count']}",
        f"  * Neutral: {stats['neutral_count']}",
        f"- **Average Confidence:** {stats['avg_confidence']:.3f}",
        f"- **Languages Detected:** {stats['languages_detected']}",
    ])
1320
 
1321
+ # Gradio Interface
1322
def create_interface():
    """Build the Gradio Blocks UI and wire its events to a SentimentApp.

    Creates three tabs (single analysis, batch analysis, history/analytics),
    connects each button to the corresponding ``SentimentApp`` handler and
    returns the ``gr.Blocks`` object without launching it.
    """
    # NOTE(review): the container nesting below (which rows sit inside which
    # column/tab) was reconstructed from statement order -- confirm against
    # the intended layout.
    app = SentimentApp()

    # Choice lists shared by several dropdowns; each dropdown gets its own copy.
    language_names = tuple(config.SUPPORTED_LANGUAGES.values())
    theme_names = tuple(config.THEMES.keys())

    with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
        gr.Markdown("# 🌍 Advanced Multilingual Sentiment Analyzer")
        gr.Markdown("AI-powered sentiment analysis with support for multiple languages, advanced visualizations, and explainable AI features")

        # ---- Tab 1: single text -------------------------------------------
        with gr.Tab("Single Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text for Analysis",
                        placeholder="Enter your text in any supported language...",
                        lines=5,
                    )

                    with gr.Row():
                        language_selector = gr.Dropdown(
                            choices=list(language_names),
                            value="Auto Detect",
                            label="Language",
                        )
                        theme_selector = gr.Dropdown(
                            choices=list(theme_names),
                            value="default",
                            label="Theme",
                        )

                    with gr.Row():
                        clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
                        remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    analyze_btn = gr.Button("Analyze", variant="primary", size="lg")

                    gr.Examples(
                        examples=app.examples,
                        inputs=text_input,
                        cache_examples=False,
                    )

                with gr.Column():
                    result_output = gr.Textbox(label="Analysis Results", lines=8)

            with gr.Row():
                gauge_plot = gr.Plot(label="Sentiment Gauge")
                probability_plot = gr.Plot(label="Probability Distribution")

            with gr.Row():
                keyword_plot = gr.Plot(label="Key Contributing Words")

        # ---- Tab 2: batch of texts ----------------------------------------
        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="Upload File (CSV/TXT)",
                        file_types=[".csv", ".txt"],
                    )
                    batch_input = gr.Textbox(
                        label="Batch Input (one text per line)",
                        placeholder="Enter multiple texts, one per line...",
                        lines=10,
                    )

                    with gr.Row():
                        batch_language = gr.Dropdown(
                            choices=list(language_names),
                            value="Auto Detect",
                            label="Language",
                        )
                        batch_theme = gr.Dropdown(
                            choices=list(theme_names),
                            value="default",
                            label="Theme",
                        )

                    with gr.Row():
                        batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
                        batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    with gr.Row():
                        load_file_btn = gr.Button("Load File")
                        analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")

                with gr.Column():
                    batch_summary = gr.Textbox(label="Batch Summary", lines=8)
                    batch_results_df = gr.Dataframe(
                        label="Detailed Results",
                        headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
                        datatype=["number", "str", "str", "str", "str", "str"],
                    )

            with gr.Row():
                batch_plot = gr.Plot(label="Batch Analysis Summary")
                confidence_dist_plot = gr.Plot(label="Confidence Distribution")

        # ---- Tab 3: history -----------------------------------------------
        with gr.Tab("History & Analytics"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        refresh_history_btn = gr.Button("Refresh History")
                        clear_history_btn = gr.Button("Clear History", variant="stop")
                        status_btn = gr.Button("Get Status")

                    history_theme = gr.Dropdown(
                        choices=list(theme_names),
                        value="default",
                        label="Dashboard Theme",
                    )

                    with gr.Row():
                        export_csv_btn = gr.Button("Export CSV")
                        export_json_btn = gr.Button("Export JSON")

                with gr.Column():
                    history_status = gr.Textbox(label="History Status", lines=8)

            history_dashboard = gr.Plot(label="History Analytics Dashboard")

            with gr.Row():
                csv_download = gr.File(label="CSV Download", visible=True)
                json_download = gr.File(label="JSON Download", visible=True)

        # ---- Event wiring (must happen inside the Blocks context) ----------
        single_inputs = [
            text_input, language_selector, theme_selector,
            clean_text_cb, remove_punct_cb, remove_nums_cb,
        ]
        single_outputs = [result_output, gauge_plot, probability_plot, keyword_plot]
        analyze_btn.click(app.analyze_single, inputs=single_inputs, outputs=single_outputs)

        load_file_btn.click(
            app.data_handler.process_file,
            inputs=file_upload,
            outputs=batch_input,
        )

        batch_inputs = [
            batch_input, batch_language, batch_theme,
            batch_clean_cb, batch_punct_cb, batch_nums_cb,
        ]
        batch_outputs = [batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
        analyze_batch_btn.click(app.analyze_batch, inputs=batch_inputs, outputs=batch_outputs)

        refresh_history_btn.click(
            app.plot_history,
            inputs=history_theme,
            outputs=[history_dashboard, history_status],
        )

        # Lambdas are kept inline: the handlers need the live `app` and take
        # no UI inputs.
        clear_history_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status,
        )

        status_btn.click(app.get_history_status, outputs=history_status)

        export_csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_download, history_status],
        )

        export_json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_download, history_status],
        )

    return demo
1495
 
1496
  # Application Entry Point
1497
if __name__ == "__main__":
    # Configure logging first so that failures while building or launching
    # the interface are reported with timestamps and logger names.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    try:
        demo = create_interface()
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True
        )
    except Exception:
        # logger.exception records the full traceback (logger.error with an
        # f-string only kept the message); re-raise so the process still
        # exits with a failure.
        logger.exception("Failed to launch application")
        raise