entropy25 commited on
Commit
5b09c58
·
verified ·
1 Parent(s): 79dfba2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +649 -1461
app.py CHANGED
@@ -1,1539 +1,727 @@
1
- import torch
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
- import plotly.graph_objects as go
5
- import plotly.express as px
6
- from plotly.subplots import make_subplots
7
  import numpy as np
8
- from wordcloud import WordCloud
9
- from collections import Counter, defaultdict, OrderedDict
10
- import re
11
  import json
12
- import csv
13
  import io
14
- import tempfile
15
  from datetime import datetime
16
- import logging
17
- from functools import lru_cache, wraps
18
- from dataclasses import dataclass
19
- from typing import List, Dict, Optional, Tuple, Any, Callable
20
- from contextlib import contextmanager
21
- import nltk
22
- from nltk.corpus import stopwords
23
- import langdetect
24
- import pandas as pd
25
- import gc
26
- import threading
27
- import asyncio
28
- from concurrent.futures import ThreadPoolExecutor
29
  import time
30
 
31
- # Advanced analysis imports
32
- import shap
33
- import lime
34
- from lime.lime_text import LimeTextExplainer
35
 
36
- # Configuration
37
- @dataclass
38
- class Config:
39
- MAX_HISTORY_SIZE: int = 1000
40
- BATCH_SIZE_LIMIT: int = 50
41
- MAX_TEXT_LENGTH: int = 512
42
- MIN_WORD_LENGTH: int = 2
43
- CACHE_SIZE: int = 128
44
- BATCH_PROCESSING_SIZE: int = 8
45
- MODEL_CACHE_SIZE: int = 2 # Maximum models to keep in memory
46
-
47
- # Supported languages and models
48
- SUPPORTED_LANGUAGES = {
49
- 'auto': 'Auto Detect',
50
- 'en': 'English',
51
- 'zh': 'Chinese',
52
- 'es': 'Spanish',
53
- 'fr': 'French',
54
- 'de': 'German',
55
- 'sv': 'Swedish'
56
- }
57
-
58
- MODELS = {
59
- 'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
60
- 'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
61
- 'zh': "uer/roberta-base-finetuned-dianping-chinese"
62
- }
63
-
64
- # Color themes for Plotly
65
- THEMES = {
66
- 'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
67
- 'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
68
- 'dark': {'pos': '#66BB6A', 'neg': '#EF5350', 'neu': '#FFA726'},
69
- 'rainbow': {'pos': '#9C27B0', 'neg': '#E91E63', 'neu': '#FF5722'}
70
- }
71
-
72
- config = Config()
73
-
74
- # Logging setup
75
- logging.basicConfig(level=logging.INFO)
76
- logger = logging.getLogger(__name__)
77
-
78
- # Initialize NLTK
79
- try:
80
- nltk.download('stopwords', quiet=True)
81
- nltk.download('punkt', quiet=True)
82
- STOP_WORDS = set(stopwords.words('english'))
83
- except:
84
- STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
85
-
86
- # Decorators and Context Managers
87
- def handle_errors(default_return=None):
88
- """Centralized error handling decorator"""
89
- def decorator(func: Callable) -> Callable:
90
- @wraps(func)
91
- def wrapper(*args, **kwargs):
92
- try:
93
- return func(*args, **kwargs)
94
- except Exception as e:
95
- logger.error(f"{func.__name__} failed: {e}")
96
- return default_return if default_return is not None else f"Error: {str(e)}"
97
- return wrapper
98
- return decorator
99
-
100
- @contextmanager
101
- def memory_cleanup():
102
- """Context manager for memory cleanup"""
103
- try:
104
- yield
105
- finally:
106
- gc.collect()
107
- if torch.cuda.is_available():
108
- torch.cuda.empty_cache()
109
-
110
- class ThemeContext:
111
- """Theme management context"""
112
- def __init__(self, theme: str = 'default'):
113
- self.theme = theme
114
- self.colors = config.THEMES.get(theme, config.THEMES['default'])
115
-
116
- class LRUModelCache:
117
- """LRU Cache for models with memory management"""
118
- def __init__(self, max_size: int = 2):
119
- self.max_size = max_size
120
- self.cache = OrderedDict()
121
- self.lock = threading.Lock()
122
-
123
- def get(self, key):
124
- with self.lock:
125
- if key in self.cache:
126
- # Move to end (most recently used)
127
- self.cache.move_to_end(key)
128
- return self.cache[key]
129
- return None
130
-
131
- def put(self, key, value):
132
- with self.lock:
133
- if key in self.cache:
134
- self.cache.move_to_end(key)
135
- else:
136
- if len(self.cache) >= self.max_size:
137
- # Remove least recently used
138
- oldest_key = next(iter(self.cache))
139
- old_model, old_tokenizer = self.cache.pop(oldest_key)
140
- # Force cleanup
141
- del old_model, old_tokenizer
142
- gc.collect()
143
- if torch.cuda.is_available():
144
- torch.cuda.empty_cache()
145
-
146
- self.cache[key] = value
147
-
148
- def clear(self):
149
- with self.lock:
150
- for model, tokenizer in self.cache.values():
151
- del model, tokenizer
152
- self.cache.clear()
153
- gc.collect()
154
- if torch.cuda.is_available():
155
- torch.cuda.empty_cache()
156
-
157
- # Enhanced Model Manager with Optimized Memory Management
158
- class ModelManager:
159
- """Optimized multi-language model manager with LRU cache and lazy loading"""
160
- _instance = None
161
-
162
- def __new__(cls):
163
- if cls._instance is None:
164
- cls._instance = super().__new__(cls)
165
- cls._instance._initialized = False
166
- return cls._instance
167
-
168
  def __init__(self):
169
- if not self._initialized:
170
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
171
- self.model_cache = LRUModelCache(config.MODEL_CACHE_SIZE)
172
- self.loading_lock = threading.Lock()
173
- self._initialized = True
174
- logger.info(f"ModelManager initialized on device: {self.device}")
175
-
176
- def _load_model(self, model_name: str, cache_key: str):
177
- """Load model with memory optimization"""
178
- try:
179
- logger.info(f"Loading model: {model_name}")
180
-
181
- # Load with memory optimization
182
- tokenizer = AutoTokenizer.from_pretrained(model_name)
183
- model = AutoModelForSequenceClassification.from_pretrained(
184
- model_name,
185
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
186
- device_map="auto" if torch.cuda.is_available() else None
187
  )
188
-
189
- if not torch.cuda.is_available():
190
- model.to(self.device)
191
-
192
- # Set to eval mode to save memory
193
- model.eval()
194
-
195
- # Cache the model
196
- self.model_cache.put(cache_key, (model, tokenizer))
197
- logger.info(f"Model {model_name} loaded and cached successfully")
198
-
199
- return model, tokenizer
200
-
201
- except Exception as e:
202
- logger.error(f"Failed to load model {model_name}: {e}")
203
- raise
204
-
205
- def get_model(self, language='en'):
206
- """Get model for specific language with lazy loading and caching"""
207
- # Determine cache key and model name
208
- if language == 'zh':
209
- cache_key = 'zh'
210
- model_name = config.MODELS['zh']
211
- else:
212
- cache_key = 'multilingual'
213
- model_name = config.MODELS['multilingual']
214
-
215
- # Try to get from cache first
216
- cached_model = self.model_cache.get(cache_key)
217
- if cached_model is not None:
218
- return cached_model
219
-
220
- # Load model if not in cache (with thread safety)
221
- with self.loading_lock:
222
- # Double-check pattern
223
- cached_model = self.model_cache.get(cache_key)
224
- if cached_model is not None:
225
- return cached_model
226
-
227
- return self._load_model(model_name, cache_key)
228
-
229
- @staticmethod
230
- def detect_language(text: str) -> str:
231
- """Detect text language"""
232
- try:
233
- detected = langdetect.detect(text)
234
- language_mapping = {
235
- 'zh-cn': 'zh',
236
- 'zh-tw': 'zh'
237
- }
238
- detected = language_mapping.get(detected, detected)
239
- return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
240
- except:
241
- return 'en'
242
-
243
- # Simplified Text Processing
244
- class TextProcessor:
245
- """Optimized text processing with multi-language support"""
246
-
247
- @staticmethod
248
- @lru_cache(maxsize=config.CACHE_SIZE)
249
- def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
250
- """Clean text with language awareness"""
251
- text = text.strip()
252
-
253
- # Don't clean Chinese text aggressively
254
- if re.search(r'[\u4e00-\u9fff]', text):
255
- return text
256
-
257
- text = text.lower()
258
-
259
- if remove_numbers:
260
- text = re.sub(r'\d+', '', text)
261
-
262
- if remove_punctuation:
263
- text = re.sub(r'[^\w\s]', '', text)
264
-
265
- words = text.split()
266
- cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) >= config.MIN_WORD_LENGTH]
267
- return ' '.join(cleaned_words)
268
-
269
- @staticmethod
270
- def parse_batch_input(text: str) -> List[str]:
271
- """Parse batch input from textarea"""
272
- lines = text.strip().split('\n')
273
- return [line.strip() for line in lines if line.strip()]
274
-
275
- # Enhanced History Manager
276
- class HistoryManager:
277
- """Enhanced history management with filtering"""
278
- def __init__(self):
279
- self._history = []
280
-
281
- def add(self, entry: Dict):
282
- """Add entry with timestamp"""
283
- entry['timestamp'] = datetime.now().isoformat()
284
- self._history.append(entry)
285
- if len(self._history) > config.MAX_HISTORY_SIZE:
286
- self._history = self._history[-config.MAX_HISTORY_SIZE:]
287
-
288
- def add_batch(self, entries: List[Dict]):
289
- """Add multiple entries"""
290
- for entry in entries:
291
- self.add(entry)
292
-
293
- def get_all(self) -> List[Dict]:
294
- return self._history.copy()
295
-
296
- def get_recent(self, n: int = 10) -> List[Dict]:
297
- return self._history[-n:] if self._history else []
298
-
299
- def filter_by(self, sentiment: str = None, language: str = None,
300
- min_confidence: float = None) -> List[Dict]:
301
- """Filter history by criteria"""
302
- filtered = self._history
303
-
304
- if sentiment:
305
- filtered = [h for h in filtered if h['sentiment'] == sentiment]
306
- if language:
307
- filtered = [h for h in filtered if h.get('language', 'en') == language]
308
- if min_confidence:
309
- filtered = [h for h in filtered if h['confidence'] >= min_confidence]
310
-
311
- return filtered
312
-
313
- def clear(self) -> int:
314
- count = len(self._history)
315
- self._history.clear()
316
- return count
317
-
318
- def size(self) -> int:
319
- return len(self._history)
320
-
321
- def get_stats(self) -> Dict:
322
- """Get comprehensive statistics"""
323
- if not self._history:
324
- return {}
325
 
326
- sentiments = [item['sentiment'] for item in self._history]
327
- confidences = [item['confidence'] for item in self._history]
328
- languages = [item.get('language', 'en') for item in self._history]
329
 
330
  return {
331
- 'total_analyses': len(self._history),
332
- 'positive_count': sentiments.count('Positive'),
333
- 'negative_count': sentiments.count('Negative'),
334
- 'neutral_count': sentiments.count('Neutral'),
335
- 'avg_confidence': np.mean(confidences),
336
- 'max_confidence': np.max(confidences),
337
- 'min_confidence': np.min(confidences),
338
- 'languages_detected': len(set(languages)),
339
- 'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
340
- }
341
-
342
- # Core Sentiment Analysis Engine with Performance Optimizations
343
- class SentimentEngine:
344
- """Optimized multi-language sentiment analysis engine"""
345
-
346
- def __init__(self):
347
- self.model_manager = ModelManager()
348
- self.executor = ThreadPoolExecutor(max_workers=4)
349
-
350
- @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0})
351
- def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
352
- """Optimized single text analysis"""
353
- if not text.strip():
354
- raise ValueError("Empty text provided")
355
-
356
- # Detect language
357
- if language == 'auto':
358
- detected_lang = self.model_manager.detect_language(text)
359
- else:
360
- detected_lang = language
361
-
362
- # Get appropriate model
363
- model, tokenizer = self.model_manager.get_model(detected_lang)
364
-
365
- # Preprocessing
366
- options = preprocessing_options or {}
367
- processed_text = text
368
- if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
369
- processed_text = TextProcessor.clean_text(
370
- text,
371
- options.get('remove_punctuation', True),
372
- options.get('remove_numbers', False)
373
- )
374
-
375
- # Tokenize and analyze with memory optimization
376
- inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
377
- truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
378
-
379
- # Use no_grad for inference to save memory
380
- with torch.no_grad():
381
- outputs = model(**inputs)
382
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
383
-
384
- # Clear GPU cache after inference
385
- if torch.cuda.is_available():
386
- torch.cuda.empty_cache()
387
-
388
- # Handle different model outputs
389
- if len(probs) == 3: # negative, neutral, positive
390
- sentiment_idx = np.argmax(probs)
391
- sentiment_labels = ['Negative', 'Neutral', 'Positive']
392
- sentiment = sentiment_labels[sentiment_idx]
393
- confidence = float(probs[sentiment_idx])
394
-
395
- result = {
396
- 'sentiment': sentiment,
397
- 'confidence': confidence,
398
- 'neg_prob': float(probs[0]),
399
- 'neu_prob': float(probs[1]),
400
- 'pos_prob': float(probs[2]),
401
- 'has_neutral': True
402
- }
403
- else: # negative, positive
404
- pred = np.argmax(probs)
405
- sentiment = "Positive" if pred == 1 else "Negative"
406
- confidence = float(probs[pred])
407
-
408
- result = {
409
- 'sentiment': sentiment,
410
- 'confidence': confidence,
411
- 'neg_prob': float(probs[0]),
412
- 'pos_prob': float(probs[1]),
413
- 'neu_prob': 0.0,
414
- 'has_neutral': False
415
- }
416
-
417
- # Add metadata
418
- result.update({
419
- 'language': detected_lang,
420
- 'word_count': len(text.split()),
421
- 'char_count': len(text)
422
- })
423
-
424
- return result
425
-
426
- def _analyze_text_batch(self, text: str, language: str, preprocessing_options: Dict, index: int) -> Dict:
427
- """Single text analysis for batch processing"""
428
- try:
429
- result = self.analyze_single(text, language, preprocessing_options)
430
- result['batch_index'] = index
431
- result['text'] = text[:100] + '...' if len(text) > 100 else text
432
- result['full_text'] = text
433
- return result
434
- except Exception as e:
435
- return {
436
- 'sentiment': 'Error',
437
- 'confidence': 0.0,
438
- 'error': str(e),
439
- 'batch_index': index,
440
- 'text': text[:100] + '...' if len(text) > 100 else text,
441
- 'full_text': text
442
  }
 
443
 
444
- @handle_errors(default_return=[])
445
- def analyze_batch(self, texts: List[str], language: str = 'auto',
446
- preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
447
- """Optimized parallel batch processing"""
448
- if len(texts) > config.BATCH_SIZE_LIMIT:
449
- texts = texts[:config.BATCH_SIZE_LIMIT]
450
-
451
- if not texts:
452
- return []
453
-
454
- # Pre-load model to avoid race conditions
455
- self.model_manager.get_model(language if language != 'auto' else 'en')
456
-
457
- # Use ThreadPoolExecutor for parallel processing
458
- with ThreadPoolExecutor(max_workers=min(4, len(texts))) as executor:
459
- futures = []
460
- for i, text in enumerate(texts):
461
- future = executor.submit(
462
- self._analyze_text_batch,
463
- text, language, preprocessing_options, i
464
- )
465
- futures.append(future)
466
-
467
- results = []
468
- for i, future in enumerate(futures):
469
- if progress_callback:
470
- progress_callback((i + 1) / len(futures))
471
-
472
- try:
473
- result = future.result(timeout=30) # 30 second timeout per text
474
- results.append(result)
475
- except Exception as e:
476
- results.append({
477
- 'sentiment': 'Error',
478
- 'confidence': 0.0,
479
- 'error': f"Timeout or error: {str(e)}",
480
- 'batch_index': i,
481
- 'text': texts[i][:100] + '...' if len(texts[i]) > 100 else texts[i],
482
- 'full_text': texts[i]
483
- })
484
 
485
- return results
486
-
487
- class AdvancedAnalysisEngine:
488
- """Advanced analysis using SHAP and LIME with FIXED implementation"""
489
-
490
- def __init__(self):
491
- self.model_manager = ModelManager()
492
-
493
- def create_prediction_function(self, model, tokenizer, device):
494
- """Create FIXED prediction function for SHAP/LIME"""
495
- def predict_proba(texts):
496
- # Ensure texts is a list
497
- if isinstance(texts, str):
498
- texts = [texts]
499
- elif isinstance(texts, np.ndarray):
500
- texts = texts.tolist()
501
-
502
- # Convert all elements to strings
503
- texts = [str(text) for text in texts]
504
-
505
- results = []
506
- batch_size = 16 # Process in smaller batches
507
-
508
- for i in range(0, len(texts), batch_size):
509
- batch_texts = texts[i:i + batch_size]
510
 
511
- try:
512
- with torch.no_grad():
513
- # Tokenize batch
514
- inputs = tokenizer(
515
- batch_texts,
516
- return_tensors="pt",
517
- padding=True,
518
- truncation=True,
519
- max_length=config.MAX_TEXT_LENGTH
520
- ).to(device)
521
-
522
- # Batch inference
523
- outputs = model(**inputs)
524
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
525
-
526
- results.extend(probs)
527
-
528
- except Exception as e:
529
- logger.error(f"Prediction batch failed: {e}")
530
- # Return neutral predictions for failed batch
531
- batch_size_actual = len(batch_texts)
532
- if hasattr(model.config, 'num_labels') and model.config.num_labels == 3:
533
- neutral_probs = np.array([[0.33, 0.34, 0.33]] * batch_size_actual)
534
- else:
535
- neutral_probs = np.array([[0.5, 0.5]] * batch_size_actual)
536
- results.extend(neutral_probs)
537
-
538
- return np.array(results)
539
-
540
- return predict_proba
541
-
542
- @handle_errors(default_return=("Analysis failed", None, None))
543
- def analyze_with_shap(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
544
- """FIXED SHAP analysis implementation"""
545
- if not text.strip():
546
- return "Please enter text for analysis", None, {}
547
-
548
- # Detect language and get model
549
- if language == 'auto':
550
- detected_lang = self.model_manager.detect_language(text)
551
- else:
552
- detected_lang = language
553
-
554
- model, tokenizer = self.model_manager.get_model(detected_lang)
555
-
556
- try:
557
- # Create FIXED prediction function
558
- predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
559
-
560
- # Test the prediction function first
561
- test_pred = predict_fn([text])
562
- if test_pred is None or len(test_pred) == 0:
563
- return "Prediction function test failed", None, {}
564
 
565
- # Use SHAP Text Explainer instead of generic Explainer
566
- explainer = shap.Explainer(predict_fn, masker=shap.maskers.Text(tokenizer))
567
 
568
- # Get SHAP values with proper text input
569
- shap_values = explainer([text], max_evals=num_samples)
570
-
571
- # Extract data safely
572
- if hasattr(shap_values, 'data') and hasattr(shap_values, 'values'):
573
- tokens = shap_values.data[0] if len(shap_values.data) > 0 else []
574
- values = shap_values.values[0] if len(shap_values.values) > 0 else []
575
  else:
576
- return "SHAP values extraction failed", None, {}
577
-
578
- if len(tokens) == 0 or len(values) == 0:
579
- return "No tokens or values extracted from SHAP", None, {}
580
-
581
- # Handle multi-dimensional values
582
- if len(values.shape) > 1:
583
- # Use positive class values (last column for 3-class, second for 2-class)
584
- pos_values = values[:, -1] if values.shape[1] >= 2 else values[:, 0]
585
- else:
586
- pos_values = values
587
-
588
- # Ensure we have matching lengths
589
- min_len = min(len(tokens), len(pos_values))
590
- tokens = tokens[:min_len]
591
- pos_values = pos_values[:min_len]
592
 
593
- # Create visualization
594
- fig = go.Figure()
595
-
596
- colors = ['red' if v < 0 else 'green' for v in pos_values]
597
-
598
- fig.add_trace(go.Bar(
599
- x=list(range(len(tokens))),
600
- y=pos_values,
601
- text=tokens,
602
- textposition='outside',
603
- marker_color=colors,
604
- name='SHAP Values',
605
- hovertemplate='<b>%{text}</b><br>SHAP Value: %{y:.4f}<extra></extra>'
606
- ))
607
-
608
- fig.update_layout(
609
- title=f"SHAP Analysis - Token Importance (Samples: {num_samples})",
610
- xaxis_title="Token Index",
611
- yaxis_title="SHAP Value",
612
- height=500,
613
- xaxis=dict(tickmode='array', tickvals=list(range(len(tokens))), ticktext=tokens)
614
- )
615
-
616
- # Create analysis summary
617
- analysis_data = {
618
- 'method': 'SHAP',
619
- 'language': detected_lang,
620
- 'total_tokens': len(tokens),
621
- 'samples_used': num_samples,
622
- 'positive_influence': sum(1 for v in pos_values if v > 0),
623
- 'negative_influence': sum(1 for v in pos_values if v < 0),
624
- 'most_important_tokens': [(str(tokens[i]), float(pos_values[i]))
625
- for i in np.argsort(np.abs(pos_values))[-5:]]
626
- }
627
-
628
- summary_text = f"""
629
- **SHAP Analysis Results:**
630
- - **Language:** {detected_lang.upper()}
631
- - **Total Tokens:** {analysis_data['total_tokens']}
632
- - **Samples Used:** {num_samples}
633
- - **Positive Influence Tokens:** {analysis_data['positive_influence']}
634
- - **Negative Influence Tokens:** {analysis_data['negative_influence']}
635
- - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
636
- - **Status:** SHAP analysis completed successfully
637
- """
638
-
639
- return summary_text, fig, analysis_data
640
-
641
- except Exception as e:
642
- logger.error(f"SHAP analysis failed: {e}")
643
- error_msg = f"""
644
- **SHAP Analysis Failed:**
645
- - **Error:** {str(e)}
646
- - **Language:** {detected_lang.upper()}
647
- - **Suggestion:** Try with a shorter text or reduce number of samples
648
-
649
- **Common fixes:**
650
- - Reduce sample size to 50-100
651
- - Use shorter input text (< 200 words)
652
- - Check if model supports the text language
653
- """
654
- return error_msg, None, {}
655
-
656
- @handle_errors(default_return=("Analysis failed", None, None))
657
- def analyze_with_lime(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
658
- """FIXED LIME analysis implementation - Bug Fix for mode parameter"""
659
- if not text.strip():
660
- return "Please enter text for analysis", None, {}
661
-
662
- # Detect language and get model
663
- if language == 'auto':
664
- detected_lang = self.model_manager.detect_language(text)
665
- else:
666
- detected_lang = language
667
-
668
- model, tokenizer = self.model_manager.get_model(detected_lang)
669
 
670
- try:
671
- # Create FIXED prediction function
672
- predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
673
-
674
- # Test the prediction function first
675
- test_pred = predict_fn([text])
676
- if test_pred is None or len(test_pred) == 0:
677
- return "Prediction function test failed", None, {}
678
-
679
- # Determine class names based on model output
680
- num_classes = test_pred.shape[1] if len(test_pred.shape) > 1 else 2
681
- if num_classes == 3:
682
- class_names = ['Negative', 'Neutral', 'Positive']
683
- else:
684
- class_names = ['Negative', 'Positive']
685
-
686
- # Initialize LIME explainer - FIXED: Remove 'mode' parameter
687
- explainer = LimeTextExplainer(class_names=class_names)
688
-
689
- # Get LIME explanation
690
- exp = explainer.explain_instance(
691
- text,
692
- predict_fn,
693
- num_features=min(20, len(text.split())), # Limit features
694
- num_samples=num_samples
695
- )
696
-
697
- # Extract feature importance
698
- lime_data = exp.as_list()
699
-
700
- if not lime_data:
701
- return "No LIME features extracted", None, {}
702
-
703
- # Create visualization
704
- words = [item[0] for item in lime_data]
705
- scores = [item[1] for item in lime_data]
706
-
707
- fig = go.Figure()
708
-
709
- colors = ['red' if s < 0 else 'green' for s in scores]
710
-
711
- fig.add_trace(go.Bar(
712
- y=words,
713
- x=scores,
714
- orientation='h',
715
- marker_color=colors,
716
- text=[f'{s:.3f}' for s in scores],
717
- textposition='auto',
718
- name='LIME Importance',
719
- hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>'
720
- ))
721
-
722
- fig.update_layout(
723
- title=f"LIME Analysis - Feature Importance (Samples: {num_samples})",
724
- xaxis_title="Importance Score",
725
- yaxis_title="Words/Phrases",
726
- height=500
727
- )
728
-
729
- # Create analysis summary
730
- analysis_data = {
731
- 'method': 'LIME',
732
- 'language': detected_lang,
733
- 'features_analyzed': len(lime_data),
734
- 'samples_used': num_samples,
735
- 'positive_features': sum(1 for _, score in lime_data if score > 0),
736
- 'negative_features': sum(1 for _, score in lime_data if score < 0),
737
- 'feature_importance': lime_data
738
- }
739
-
740
- summary_text = f"""
741
- **LIME Analysis Results:**
742
- - **Language:** {detected_lang.upper()}
743
- - **Features Analyzed:** {analysis_data['features_analyzed']}
744
- - **Classes:** {', '.join(class_names)}
745
- - **Samples Used:** {num_samples}
746
- - **Positive Features:** {analysis_data['positive_features']}
747
- - **Negative Features:** {analysis_data['negative_features']}
748
- - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
749
- - **Status:** LIME analysis completed successfully
750
- """
751
-
752
- return summary_text, fig, analysis_data
753
-
754
- except Exception as e:
755
- logger.error(f"LIME analysis failed: {e}")
756
- error_msg = f"""
757
- **LIME Analysis Failed:**
758
- - **Error:** {str(e)}
759
- - **Language:** {detected_lang.upper()}
760
- - **Suggestion:** Try with a shorter text or reduce number of samples
761
-
762
- **Bug Fix Applied:**
763
- - ✅ Removed 'mode' parameter from LimeTextExplainer initialization
764
- - ✅ This should resolve the "unexpected keyword argument 'mode'" error
765
-
766
- **Common fixes:**
767
- - Reduce sample size to 50-100
768
- - Use shorter input text (< 200 words)
769
- - Check if model supports the text language
770
- """
771
- return error_msg, None, {}
772
-
773
- # Optimized Plotly Visualization System
774
- class PlotlyVisualizer:
775
- """Enhanced Plotly visualizations"""
776
-
777
- @staticmethod
778
- @handle_errors(default_return=None)
779
- def create_sentiment_gauge(result: Dict, theme: ThemeContext) -> go.Figure:
780
- """Create animated sentiment gauge"""
781
- colors = theme.colors
782
 
783
- if result.get('has_neutral', False):
784
- # Three-way gauge
785
- fig = go.Figure(go.Indicator(
786
- mode="gauge+number+delta",
787
- value=result['pos_prob'] * 100,
788
- domain={'x': [0, 1], 'y': [0, 1]},
789
- title={'text': f"Sentiment: {result['sentiment']}"},
790
- delta={'reference': 50},
791
- gauge={
792
- 'axis': {'range': [None, 100]},
793
- 'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
794
- 'steps': [
795
- {'range': [0, 33], 'color': colors['neg']},
796
- {'range': [33, 67], 'color': colors['neu']},
797
- {'range': [67, 100], 'color': colors['pos']}
798
- ],
799
- 'threshold': {
800
- 'line': {'color': "red", 'width': 4},
801
- 'thickness': 0.75,
802
- 'value': 90
803
- }
804
- }
805
- ))
806
- else:
807
- # Two-way gauge
808
- fig = go.Figure(go.Indicator(
809
- mode="gauge+number",
810
- value=result['confidence'] * 100,
811
- domain={'x': [0, 1], 'y': [0, 1]},
812
- title={'text': f"Confidence: {result['sentiment']}"},
813
- gauge={
814
- 'axis': {'range': [None, 100]},
815
- 'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
816
- 'steps': [
817
- {'range': [0, 50], 'color': "lightgray"},
818
- {'range': [50, 100], 'color': "gray"}
819
- ]
820
- }
821
- ))
822
 
823
- fig.update_layout(height=400, font={'size': 16})
824
- return fig
 
 
 
 
825
 
826
- @staticmethod
827
- @handle_errors(default_return=None)
828
- def create_probability_bars(result: Dict, theme: ThemeContext) -> go.Figure:
829
- """Create probability bar chart"""
830
- colors = theme.colors
831
-
832
- if result.get('has_neutral', False):
833
- labels = ['Negative', 'Neutral', 'Positive']
834
- values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
835
- bar_colors = [colors['neg'], colors['neu'], colors['pos']]
836
- else:
837
- labels = ['Negative', 'Positive']
838
- values = [result['neg_prob'], result['pos_prob']]
839
- bar_colors = [colors['neg'], colors['pos']]
840
 
841
- fig = go.Figure(data=[
842
- go.Bar(x=labels, y=values, marker_color=bar_colors,
843
- text=[f'{v:.3f}' for v in values], textposition='outside')
844
- ])
845
 
846
- fig.update_layout(
847
- title="Sentiment Probabilities",
848
- yaxis_title="Probability",
849
- height=400,
850
- showlegend=False
851
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
852
 
853
- return fig
 
 
 
 
 
 
 
 
854
 
855
- @staticmethod
856
- @handle_errors(default_return=None)
857
- def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
858
- """Create batch analysis summary"""
859
- colors = theme.colors
860
-
861
- # Count sentiments
862
- sentiments = [r['sentiment'] for r in results if 'sentiment' in r and r['sentiment'] != 'Error']
863
- sentiment_counts = Counter(sentiments)
864
-
865
- # Create pie chart
866
- fig = go.Figure(data=[go.Pie(
867
- labels=list(sentiment_counts.keys()),
868
- values=list(sentiment_counts.values()),
869
- marker_colors=[colors.get(s.lower()[:3], '#999999') for s in sentiment_counts.keys()],
870
- textinfo='label+percent',
871
- hole=0.3
872
- )])
873
 
874
- fig.update_layout(
875
- title=f"Batch Analysis Summary ({len(results)} texts)",
876
- height=400
877
- )
878
-
879
- return fig
880
-
881
- @staticmethod
882
- @handle_errors(default_return=None)
883
- def create_confidence_distribution(results: List[Dict]) -> go.Figure:
884
- """Create confidence distribution plot"""
885
- confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886
 
887
- if not confidences:
888
- return go.Figure()
889
 
890
- fig = go.Figure(data=[go.Histogram(
891
- x=confidences,
892
- nbinsx=20,
893
- marker_color='skyblue',
894
- opacity=0.7
895
- )])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
896
 
897
  fig.update_layout(
898
- title="Confidence Distribution",
899
- xaxis_title="Confidence Score",
900
- yaxis_title="Frequency",
901
- height=400
902
- )
903
-
904
- return fig
905
-
906
@staticmethod
@handle_errors(default_return=None)
def create_history_dashboard(history: List[Dict], theme: ThemeContext) -> go.Figure:
    """Render a 2x2 dashboard summarising the analysis history.

    Panels: positive-probability timeline, confidence histogram, language
    pie chart and sentiment bar chart. With fewer than two history entries
    an empty figure is returned.
    """
    if len(history) < 2:
        return go.Figure()

    # Grid layout: two xy panels on top, a pie and a bar panel below.
    panel_titles = ['Sentiment Timeline', 'Confidence Distribution',
                    'Language Distribution', 'Sentiment Summary']
    panel_specs = [
        [{"secondary_y": False}, {"secondary_y": False}],
        [{"type": "pie"}, {"type": "bar"}],
    ]
    dashboard = make_subplots(rows=2, cols=2,
                              subplot_titles=panel_titles,
                              specs=panel_specs)

    # Pull the plotted series out of the history entries.
    positions = list(range(len(history)))
    positive_series = [entry.get('pos_prob', 0) for entry in history]
    confidence_series = [entry['confidence'] for entry in history]
    sentiment_series = [entry['sentiment'] for entry in history]
    language_series = [entry.get('language', 'en') for entry in history]

    # Sentiment label -> theme colour; unknown labels fall back to grey.
    palette = {
        'Positive': theme.colors['pos'],
        'Negative': theme.colors['neg'],
        'Neutral': theme.colors['neu'],
    }
    point_colors = [palette.get(label, '#999999') for label in sentiment_series]

    timeline = go.Scatter(
        x=positions,
        y=positive_series,
        mode='lines+markers',
        marker=dict(color=point_colors, size=8),
        name='Positive Probability',
    )
    dashboard.add_trace(timeline, row=1, col=1)

    confidence_hist = go.Histogram(x=confidence_series, nbinsx=10, name='Confidence')
    dashboard.add_trace(confidence_hist, row=1, col=2)

    language_tally = Counter(language_series)
    language_pie = go.Pie(
        labels=list(language_tally.keys()),
        values=list(language_tally.values()),
        name="Languages",
    )
    dashboard.add_trace(language_pie, row=2, col=1)

    sentiment_tally = Counter(sentiment_series)
    bar_colors = [palette.get(label, '#999999') for label in sentiment_tally.keys()]
    sentiment_bars = go.Bar(
        x=list(sentiment_tally.keys()),
        y=list(sentiment_tally.values()),
        marker_color=bar_colors,
    )
    dashboard.add_trace(sentiment_bars, row=2, col=2)

    dashboard.update_layout(height=800, showlegend=False)
    return dashboard
965
-
966
- # Universal Data Handler
967
class DataHandler:
    """Enhanced data operations: history export and uploaded-file parsing."""

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Write ``data`` to a temporary CSV or JSON file.

        Args:
            data: History/result entries (dicts) to export.
            format_type: Either 'csv' or 'json'.

        Returns:
            ``(path, message)``; ``path`` is None when nothing was exported.
        """
        if not data:
            return None, "No data to export"

        if format_type not in ('csv', 'json'):
            # Previously an unknown format produced an empty file together
            # with a success message.
            return None, f"Unsupported export format: {format_type}"

        # The csv module requires newline='' so embedded newlines survive and
        # no extra blank rows appear on Windows; JSON keeps the default.
        newline = '' if format_type == 'csv' else None

        # The context manager guarantees the handle is closed even when
        # serialisation raises; delete=False keeps the file for download.
        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                         suffix=f'.{format_type}',
                                         encoding='utf-8',
                                         newline=newline) as temp_file:
            if format_type == 'csv':
                writer = csv.writer(temp_file)
                writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
                                 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Word_Count'])
                for entry in data:
                    writer.writerow([
                        entry.get('timestamp', ''),
                        entry.get('text', ''),
                        entry.get('sentiment', ''),
                        f"{entry.get('confidence', 0):.4f}",
                        entry.get('language', 'en'),
                        f"{entry.get('pos_prob', 0):.4f}",
                        f"{entry.get('neg_prob', 0):.4f}",
                        f"{entry.get('neu_prob', 0):.4f}",
                        entry.get('word_count', 0)
                    ])
            else:
                json.dump(data, temp_file, indent=2, ensure_ascii=False)

        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Turn an uploaded file into newline-separated texts.

        For ``.csv`` files the first row is treated as a header and the first
        column of every following row is used; any other file is returned
        as-is.

        Args:
            file: Uploaded file object exposing ``read()`` (bytes) and ``name``.

        Returns:
            The extracted text, or "" when no file was given.
        """
        if not file:
            return ""

        content = file.read().decode('utf-8')

        if file.name.endswith('.csv'):
            reader = csv.reader(io.StringIO(content))
            try:
                next(reader)  # Skip header
                texts = []
                for row in reader:
                    if row and row[0].strip():
                        text = row[0].strip().strip('"')
                        if text:
                            texts.append(text)
                return '\n'.join(texts)
            except (StopIteration, csv.Error):
                # Empty or malformed CSV: fall back to treating each line
                # after the header as one text.
                texts = []
                for line in content.strip().split('\n')[1:]:
                    if line.strip():
                        text = line.strip().strip('"')
                        if text:
                            texts.append(text)
                return '\n'.join(texts)

        return content
 
 
 
 
 
 
1034
 
 
 
 
 
 
1035
 
1036
class SentimentApp:
    """Optimized multilingual sentiment analysis application"""

    def __init__(self):
        """Create the engines, history store and the example texts."""
        self.engine = SentimentEngine()
        self.advanced_engine = AdvancedAnalysisEngine()
        self.history = HistoryManager()
        self.data_handler = DataHandler()

        # Multi-language examples
        self.examples = [
            # Auto Detect
            ["The film had its moments, but overall it felt a bit too long and lacked emotional depth. Some scenes were visually impressive, yet they failed to connect emotionally. By the end, I found myself disengaged and unsatisfied."],

            # English
            ["I was completely blown away by the movie — the performances were raw and powerful, and the story stayed with me long after the credits rolled. Every scene felt purposeful, and the emotional arc was handled with incredible nuance. It's the kind of film that makes you reflect deeply on your own life."],

            # Chinese
            ["这部电影节奏拖沓,剧情老套,完全没有让我产生任何共鸣,是一次失望的观影体验。演员的表演也显得做作,缺乏真实感。看到最后甚至有点不耐烦,整体表现乏善可陈。"],

            # Spanish
            ["Una obra maestra del cine contemporáneo, con actuaciones sobresalientes, un guion bien escrito y una dirección impecable. Cada plano parecía cuidadosamente pensado, y la historia avanzaba con una intensidad emocional que mantenía al espectador cautivado. Definitivamente una película que vale la pena volver a ver."],

            # French
            ["Je m'attendais à beaucoup mieux. Le scénario était confus, les dialogues ennuyeux, et je me suis presque endormi au milieu du film. Même la mise en scène, habituellement un point fort, manquait cruellement d'inspiration cette fois-ci."],

            # German
            ["Der Film war ein emotionales Erlebnis mit großartigen Bildern, einem mitreißenden Soundtrack und einer Geschichte, die zum Nachdenken anregt. Besonders beeindruckend war die schauspielerische Leistung der Hauptdarsteller, die eine tiefe Menschlichkeit vermittelten. Es ist ein Film, der lange nachwirkt."],

            # Swedish
            ["Filmen var en besvikelse – tråkig handling, överdrivet skådespeleri och ett slut som inte gav något avslut alls. Den kändes forcerad och saknade en tydlig röd tråd. Jag gick från biografen med en känsla av tomhet och frustration."]
        ]

    @staticmethod
    def _language_code(language: str) -> str:
        """Map a dropdown display name to its language code ('auto' if unknown)."""
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        return language_map.get(language, 'auto')

    @staticmethod
    def _preprocessing_options(clean_text: bool, remove_punct: bool, remove_nums: bool) -> Dict:
        """Bundle the three checkbox values into the dict passed to the engine."""
        return {
            'clean_text': clean_text,
            'remove_punctuation': remove_punct,
            'remove_numbers': remove_nums
        }

    @handle_errors(default_return=("Please enter text", None, None))
    def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                       remove_punct: bool, remove_nums: bool):
        """Analyze one text and return ``(summary_text, gauge_fig, bars_fig)``.

        The result is also appended to the history. Blank input short-circuits
        with a prompt message and no figures.
        """
        if not text.strip():
            return "Please enter text", None, None

        language_code = self._language_code(language)
        preprocessing_options = self._preprocessing_options(clean_text, remove_punct, remove_nums)

        with memory_cleanup():
            result = self.engine.analyze_single(text, language_code, preprocessing_options)

            # Add to history (text is shortened for display, full text kept too)
            history_entry = {
                'text': text[:100] + '...' if len(text) > 100 else text,
                'full_text': text,
                'sentiment': result['sentiment'],
                'confidence': result['confidence'],
                'pos_prob': result.get('pos_prob', 0),
                'neg_prob': result.get('neg_prob', 0),
                'neu_prob': result.get('neu_prob', 0),
                'language': result['language'],
                'word_count': result['word_count'],
                'analysis_type': 'single'
            }
            self.history.add(history_entry)

            # Create visualizations
            theme_ctx = ThemeContext(theme)
            gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
            bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)

            # Create comprehensive result text
            info_text = f"""
**Analysis Results:**
- **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
- **Language:** {result['language'].upper()}
- **Statistics:** {result['word_count']} words, {result['char_count']} characters
- **Probabilities:** Positive: {result.get('pos_prob', 0):.3f}, Negative: {result.get('neg_prob', 0):.3f}, Neutral: {result.get('neu_prob', 0):.3f}
"""

            return info_text, gauge_fig, bars_fig

    @handle_errors(default_return=("Please enter texts", None, None, None))
    def analyze_batch(self, batch_text: str, language: str, theme: str,
                      clean_text: bool, remove_punct: bool, remove_nums: bool):
        """Analyze several texts and return ``(summary, dataframe, summary_fig, confidence_fig)``.

        Successful results are added to the history; failed ones appear in the
        returned table with sentiment 'Error'.
        """
        if not batch_text.strip():
            return "Please enter texts (one per line)", None, None, None

        # Parse batch input
        texts = TextProcessor.parse_batch_input(batch_text)

        if len(texts) > config.BATCH_SIZE_LIMIT:
            return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None

        if not texts:
            return "No valid texts found", None, None, None

        language_code = self._language_code(language)
        preprocessing_options = self._preprocessing_options(clean_text, remove_punct, remove_nums)

        with memory_cleanup():
            results = self.engine.analyze_batch(texts, language_code, preprocessing_options)

            # Add successful results to history
            batch_entries = [
                {
                    'text': result['text'],
                    'full_text': result['full_text'],
                    'sentiment': result['sentiment'],
                    'confidence': result['confidence'],
                    'pos_prob': result.get('pos_prob', 0),
                    'neg_prob': result.get('neg_prob', 0),
                    'neu_prob': result.get('neu_prob', 0),
                    'language': result['language'],
                    'word_count': result['word_count'],
                    'analysis_type': 'batch',
                    'batch_index': result['batch_index']
                }
                for result in results
                if 'error' not in result
            ]
            self.history.add_batch(batch_entries)

            # Create visualizations
            theme_ctx = ThemeContext(theme)
            summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
            confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)

            # Create results DataFrame (one row per input, errors included)
            df_data = []
            for result in results:
                if 'error' in result:
                    df_data.append({
                        'Index': result['batch_index'] + 1,
                        'Text': result['text'],
                        'Sentiment': 'Error',
                        'Confidence': 0.0,
                        'Language': 'Unknown',
                        'Error': result['error']
                    })
                else:
                    df_data.append({
                        'Index': result['batch_index'] + 1,
                        'Text': result['text'],
                        'Sentiment': result['sentiment'],
                        'Confidence': f"{result['confidence']:.3f}",
                        'Language': result['language'].upper(),
                        'Word_Count': result.get('word_count', 0)
                    })

            df = pd.DataFrame(df_data)

            # Create summary text
            successful_results = [r for r in results if 'error' not in r]
            error_count = len(results) - len(successful_results)

            if successful_results:
                sentiment_counts = Counter([r['sentiment'] for r in successful_results])
                avg_confidence = np.mean([r['confidence'] for r in successful_results])
                languages = Counter([r['language'] for r in successful_results])

                summary_text = f"""
**Batch Analysis Summary:**
- **Total Texts:** {len(texts)}
- **Successful:** {len(successful_results)}
- **Errors:** {error_count}
- **Average Confidence:** {avg_confidence:.3f}
- **Sentiments:** {dict(sentiment_counts)}
- **Languages Detected:** {dict(languages)}
"""
            else:
                summary_text = f"All {len(texts)} texts failed to analyze."

            return summary_text, df, summary_fig, confidence_fig

    # FIXED advanced analysis methods with sample size control
    @handle_errors(default_return=("Please enter text", None))
    def analyze_with_shap(self, text: str, language: str, num_samples: int = 100):
        """Delegate SHAP analysis of ``text`` to the advanced engine."""
        return self.advanced_engine.analyze_with_shap(
            text, self._language_code(language), num_samples)

    @handle_errors(default_return=("Please enter text", None))
    def analyze_with_lime(self, text: str, language: str, num_samples: int = 100):
        """Delegate LIME analysis of ``text`` to the advanced engine."""
        return self.advanced_engine.analyze_with_lime(
            text, self._language_code(language), num_samples)

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Return ``(dashboard_figure, stats_text)`` for the stored history.

        At least two analyses are required; otherwise ``(None, message)``.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        with memory_cleanup():
            fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
            stats = self.history.get_stats()

            stats_text = f"""
**History Statistics:**
- **Total Analyses:** {stats.get('total_analyses', 0)}
- **Positive:** {stats.get('positive_count', 0)}
- **Negative:** {stats.get('negative_count', 0)}
- **Neutral:** {stats.get('neutral_count', 0)}
- **Average Confidence:** {stats.get('avg_confidence', 0):.3f}
- **Languages:** {stats.get('languages_detected', 0)}
- **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}
"""

            return fig, stats_text

    # This handler feeds a single output component, so the fallback is a plain
    # string (as with DataHandler.process_file) rather than a 1-tuple.
    # NOTE(review): assumes handle_errors returns default_return unchanged on
    # failure — confirm against its definition.
    @handle_errors(default_return="No data available")
    def get_history_status(self):
        """Return a text summary of the history statistics."""
        stats = self.history.get_stats()
        if not stats:
            return "No analyses performed yet"

        return f"""
**Current Status:**
- **Total Analyses:** {stats['total_analyses']}
- **Recent Sentiment Distribution:**
* Positive: {stats['positive_count']}
* Negative: {stats['negative_count']}
* Neutral: {stats['neutral_count']}
- **Average Confidence:** {stats['avg_confidence']:.3f}
- **Languages Detected:** {stats['languages_detected']}
"""
1282
 
1283
- # Optimized Gradio Interface
1284
def create_interface():
    """Build the Gradio Blocks UI and wire every button to the app.

    Tabs: Single Analysis, Advanced Analysis (SHAP/LIME), Batch Analysis and
    History & Analytics. Returns the assembled ``gr.Blocks`` object.
    """
    app = SentimentApp()

    def language_names():
        # Fresh list per dropdown, as each widget originally built its own.
        return list(config.SUPPORTED_LANGUAGES.values())

    def theme_names():
        return list(config.THEMES.keys())

    with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
        gr.Markdown("# 🌍 Multilingual Sentiment Analyzer")
        gr.Markdown("AI-powered sentiment analysis with SHAP & LIME explainable AI features")

        # ---- Tab 1: single text ----
        with gr.Tab("Single Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text for Analysis",
                        placeholder="Enter your text in any supported language...",
                        lines=5,
                    )

                    with gr.Row():
                        language_selector = gr.Dropdown(
                            choices=language_names(), value="Auto Detect", label="Language")
                        theme_selector = gr.Dropdown(
                            choices=theme_names(), value="default", label="Theme")

                    with gr.Row():
                        clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
                        remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    analyze_btn = gr.Button("Analyze", variant="primary", size="lg")

                    gr.Examples(examples=app.examples, inputs=text_input, cache_examples=False)

                with gr.Column():
                    result_output = gr.Textbox(label="Analysis Results", lines=8)

            with gr.Row():
                gauge_plot = gr.Plot(label="Sentiment Gauge")
                probability_plot = gr.Plot(label="Probability Distribution")

        # ---- Tab 2: explainability ----
        with gr.Tab("Advanced Analysis"):
            gr.Markdown("## Explainable AI Analysis")
            gr.Markdown("**SHAP and LIME analysis with FIXED implementation** - now handles text input correctly!")

            with gr.Row():
                with gr.Column():
                    advanced_text_input = gr.Textbox(
                        label="Enter Text for Advanced Analysis",
                        placeholder="Enter text to analyze with SHAP and LIME...",
                        lines=6,
                        value="This movie is absolutely fantastic and amazing!",
                    )

                    with gr.Row():
                        advanced_language = gr.Dropdown(
                            choices=language_names(), value="Auto Detect", label="Language")

                        num_samples_slider = gr.Slider(
                            minimum=50,
                            maximum=300,
                            value=100,
                            step=25,
                            label="Number of Samples",
                            info="Lower = Faster, Higher = More Accurate",
                        )

                    with gr.Row():
                        shap_btn = gr.Button("SHAP Analysis", variant="primary")
                        lime_btn = gr.Button("LIME Analysis", variant="secondary")

                    gr.Markdown("""

**📊 Analysis Methods:**
- **SHAP**: Token-level importance scores using Text masker
- **LIME**: Feature importance through text perturbation

**⚡ Expected Performance:**
- 50 samples: ~10-20s | 100 samples: ~20-40s | 200+ samples: ~40-80s
""")

                with gr.Column():
                    advanced_results = gr.Textbox(label="Analysis Summary", lines=12)

            with gr.Row():
                advanced_plot = gr.Plot(label="Feature Importance Visualization")

        # ---- Tab 3: batch ----
        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="Upload File (CSV/TXT)", file_types=[".csv", ".txt"])
                    batch_input = gr.Textbox(
                        label="Batch Input (one text per line)",
                        placeholder="Enter multiple texts, one per line...",
                        lines=10,
                    )

                    with gr.Row():
                        batch_language = gr.Dropdown(
                            choices=language_names(), value="Auto Detect", label="Language")
                        batch_theme = gr.Dropdown(
                            choices=theme_names(), value="default", label="Theme")

                    with gr.Row():
                        batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
                        batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    with gr.Row():
                        load_file_btn = gr.Button("Load File")
                        analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")

                with gr.Column():
                    batch_summary = gr.Textbox(label="Batch Summary", lines=8)
                    batch_results_df = gr.Dataframe(
                        label="Detailed Results",
                        headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Word_Count"],
                        datatype=["number", "str", "str", "str", "str", "number"],
                    )

            with gr.Row():
                batch_plot = gr.Plot(label="Batch Analysis Summary")
                confidence_dist_plot = gr.Plot(label="Confidence Distribution")

        # ---- Tab 4: history ----
        with gr.Tab("History & Analytics"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        refresh_history_btn = gr.Button("Refresh History")
                        clear_history_btn = gr.Button("Clear History", variant="stop")
                        status_btn = gr.Button("Get Status")

                    history_theme = gr.Dropdown(
                        choices=theme_names(), value="default", label="Dashboard Theme")

                    with gr.Row():
                        export_csv_btn = gr.Button("Export CSV")
                        export_json_btn = gr.Button("Export JSON")

                with gr.Column():
                    history_status = gr.Textbox(label="History Status", lines=8)

            history_dashboard = gr.Plot(label="History Analytics Dashboard")

            with gr.Row():
                csv_download = gr.File(label="CSV Download", visible=True)
                json_download = gr.File(label="JSON Download", visible=True)

        # ---- Event wiring ----

        # Single analysis
        single_inputs = [text_input, language_selector, theme_selector,
                         clean_text_cb, remove_punct_cb, remove_nums_cb]
        analyze_btn.click(
            app.analyze_single,
            inputs=single_inputs,
            outputs=[result_output, gauge_plot, probability_plot],
        )

        # SHAP and LIME share the same inputs and write to the same outputs.
        shap_btn.click(
            app.analyze_with_shap,
            inputs=[advanced_text_input, advanced_language, num_samples_slider],
            outputs=[advanced_results, advanced_plot],
        )
        lime_btn.click(
            app.analyze_with_lime,
            inputs=[advanced_text_input, advanced_language, num_samples_slider],
            outputs=[advanced_results, advanced_plot],
        )

        # Batch analysis
        load_file_btn.click(
            app.data_handler.process_file,
            inputs=file_upload,
            outputs=batch_input,
        )
        batch_inputs = [batch_input, batch_language, batch_theme,
                        batch_clean_cb, batch_punct_cb, batch_nums_cb]
        analyze_batch_btn.click(
            app.analyze_batch,
            inputs=batch_inputs,
            outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot],
        )

        # History & analytics
        refresh_history_btn.click(
            app.plot_history,
            inputs=history_theme,
            outputs=[history_dashboard, history_status],
        )
        clear_history_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status,
        )
        status_btn.click(
            app.get_history_status,
            outputs=history_status,
        )
        export_csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_download, history_status],
        )
        export_json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_download, history_status],
        )

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1521
 
1522
- # Application Entry Point
1523
if __name__ == "__main__":
    # Configure root logging before the UI is built so start-up messages show.
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

    # Listen on all interfaces at port 7860 and request a public share link.
    launch_options = {
        'share': True,
        'server_name': "0.0.0.0",
        'server_port': 7860,
        'show_error': True,
    }

    try:
        demo = create_interface()
        demo.launch(**launch_options)
    except Exception as e:
        # Record the failure, then re-raise so the process exits with an error.
        logger.error(f"Failed to launch application: {e}")
        raise
 
 
1
  import gradio as gr
2
+ import pandas as pd
 
 
 
3
  import numpy as np
 
 
 
4
  import json
5
+ import re
6
  import io
 
7
  from datetime import datetime
8
+ from typing import List, Dict, Tuple
9
+ from transformers import pipeline, AutoTokenizer
10
+ import plotly.graph_objects as go
11
+ from plotly.subplots import make_subplots
12
+ import sqlite3
13
+ import hashlib
 
 
 
 
 
 
 
14
  import time
15
 
16
# Build both Hugging Face pipelines once, at import time, so that every
# request reuses the already-loaded models.
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
)
# NOTE(review): this checkpoint is loaded through the "ner" (token
# classification) task — confirm that it actually ships a token-classification
# head and that its labels carry positive/negative markers.
absa_analyzer = pipeline(
    task="ner",
    model="yangheng/deberta-v3-base-absa-v1.1",
    aggregation_strategy="simple",
)
 
19
 
20
+ class ReviewAnalyzer:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
def __init__(self):
    """Point the analyzer at its SQLite file and make sure the schema exists."""
    self.db_path = "reviews.db"
    self._init_db()
24
+
25
+ def _init_db(self):
26
+ conn = sqlite3.connect(self.db_path)
27
+ conn.execute('''
28
+ CREATE TABLE IF NOT EXISTS usage_log (
29
+ id INTEGER PRIMARY KEY,
30
+ user_id TEXT,
31
+ timestamp DATETIME,
32
+ analysis_type TEXT,
33
+ items_count INTEGER
 
 
 
 
 
 
34
  )
35
+ ''')
36
+ conn.close()
37
+
38
def preprocess_text(self, text: str) -> str:
    """Normalise a review: drop URLs and punctuation, trim, lower-case.

    Whitespace inside the text is left untouched, so a removed URL can
    leave a double space behind.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_urls)
    return without_punctuation.strip().lower()
44
+
45
def extract_aspect_keywords(self, reviews: List[str]) -> Dict:
    """Extract aspect-based sentiment keywords.

    Each review of at least 10 characters is run through the module-level
    ``absa_analyzer``. Spans whose label contains 'pos' or 'neg' are counted
    per word; spans with any other label are ignored.

    Args:
        reviews: Raw review texts.

    Returns:
        Dict with the ten most frequent positive and negative aspect words
        (``(word, count)`` pairs), a per-span detail list, and the number of
        distinct positive/negative aspect words.
    """
    from collections import Counter  # local import: not in the file's import block

    aspect_counts = {'positive': Counter(), 'negative': Counter()}
    detailed_aspects = []

    for review in reviews:
        if not review.strip() or len(review) < 10:
            continue

        # Only add an ellipsis when the review was actually shortened.
        snippet = review[:50] + '...' if len(review) > 50 else review

        try:
            for aspect in absa_analyzer(review):
                word = aspect['word'].lower()
                label = aspect['entity_group'].lower()
                # Coerce to a plain float so the result stays serialisable
                # whatever numeric type the pipeline returns.
                confidence = float(aspect['score'])

                # Map labels to sentiment
                if 'pos' in label:
                    sentiment = 'positive'
                elif 'neg' in label:
                    sentiment = 'negative'
                else:
                    continue

                aspect_counts[sentiment][word] += 1
                detailed_aspects.append({
                    'review': snippet,
                    'aspect': word,
                    'sentiment': sentiment,
                    'confidence': round(confidence, 3)
                })
        except Exception:
            # Best-effort: a review the model cannot handle is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return {
        'top_positive_aspects': aspect_counts['positive'].most_common(10),
        'top_negative_aspects': aspect_counts['negative'].most_common(10),
        'detailed_aspects': detailed_aspects,
        'summary': {
            'total_positive_aspects': len(aspect_counts['positive']),
            'total_negative_aspects': len(aspect_counts['negative'])
        }
    }
96
 
97
+ def analyze_sentiment(self, reviews: List[str]) -> Dict:
98
+ """Analyze sentiment of reviews with keyword extraction"""
99
+ results = []
100
+ sentiments = {'positive': 0, 'negative': 0, 'neutral': 0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ for review in reviews:
103
+ if not review.strip():
104
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ clean_review = self.preprocess_text(review)
107
+ result = sentiment_analyzer(clean_review)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ label = result['label'].lower()
110
+ score = result['score']
111
 
112
+ if 'pos' in label:
113
+ sentiment = 'positive'
114
+ elif 'neg' in label:
115
+ sentiment = 'negative'
 
 
 
116
  else:
117
+ sentiment = 'neutral'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ sentiments[sentiment] += 1
120
+ results.append({
121
+ 'text': review[:100] + '...' if len(review) > 100 else review,
122
+ 'sentiment': sentiment,
123
+ 'confidence': round(score, 3)
124
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ total = len(results)
127
+ sentiment_percentages = {k: round(v/total*100, 1) for k, v in sentiments.items()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ # Extract keywords
130
+ keywords = self.extract_aspect_keywords(reviews)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ return {
133
+ 'summary': sentiment_percentages,
134
+ 'details': results,
135
+ 'total_reviews': total,
136
+ 'keywords': keywords
137
+ }
138
 
139
def detect_fake_reviews(self, reviews: List[str], metadata: Dict = None) -> Dict:
    """Score each review for signs of being fake.

    Text heuristics add to a running score: very short text (+0.3), low
    word variety (+0.4), heavy '!', '?' or '.' usage (+0.2) and generic
    praise (+0.1). If ``metadata`` carries both 'timestamps' and
    'usernames', any metadata flags for the review add a further +0.3.
    A score above 0.5 marks the review as suspicious.

    Returns:
        Dict with ``summary`` counts, per-review ``details`` and the raw
        ``metadata_analysis`` flags (None when there are none).
    """
    has_metadata = bool(metadata) and 'timestamps' in metadata and 'usernames' in metadata
    if has_metadata:
        metadata_flags = self._analyze_metadata(metadata['timestamps'], metadata['usernames'])
    else:
        metadata_flags = []

    generic_phrases = ['amazing', 'perfect', 'best ever', 'highly recommend']
    assessments = []

    for position, review in enumerate(reviews):
        if not review.strip():
            continue

        lowered = review.lower()
        risk = 0
        reasons = []

        # Heuristic 1: very short reviews
        if len(review) < 20:
            risk += 0.3
            reasons.append("too_short")

        # Heuristic 2: share of distinct words
        tokens = lowered.split()
        variety = len(set(tokens)) / len(tokens) if tokens else 0
        if variety < 0.5:
            risk += 0.4
            reasons.append("repetitive")

        # Heuristic 3: density of sentence punctuation
        marks = re.findall(r'[!?.]', review)
        density = len(marks) / len(review) if review else 0
        if density > 0.1:
            risk += 0.2
            reasons.append("excessive_punctuation")

        # Heuristic 4: stock praise phrases
        for phrase in generic_phrases:
            if phrase in lowered:
                risk += 0.1
                reasons.append("generic_language")
                break

        # Metadata flags are matched by the review's position in ``reviews``.
        if position < len(metadata_flags) and metadata_flags[position]:
            risk += 0.3
            reasons.extend(metadata_flags[position])

        if len(review) > 100:
            shown = review[:100] + '...'
        else:
            shown = review

        assessments.append({
            'text': shown,
            'fake_probability': min(round(risk, 3), 1.0),
            'status': 'suspicious' if risk > 0.5 else 'authentic',
            'flags': reasons
        })

    flagged = len([a for a in assessments if a['fake_probability'] > 0.5])

    if assessments:
        authenticity = round((len(assessments) - flagged) / len(assessments) * 100, 1)
    else:
        authenticity = 0

    return {
        'summary': {
            'total_reviews': len(assessments),
            'suspicious_reviews': flagged,
            'authenticity_rate': authenticity
        },
        'details': assessments,
        'metadata_analysis': metadata_flags if metadata_flags else None
    }
200
 
201
+ def _analyze_metadata(self, timestamps: List[str], usernames: List[str]) -> List[List[str]]:
202
+ """Analyze metadata for suspicious patterns"""
203
+ flags_per_review = [[] for _ in range(len(timestamps))]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ # Time density analysis
206
+ if len(timestamps) >= 5:
207
+ times = []
208
+ for i, ts in enumerate(timestamps):
209
+ try:
210
+ dt = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S")
211
+ times.append((i, dt))
212
+ except:
213
+ continue
214
+
215
+ times.sort(key=lambda x: x[1])
216
+
217
+ # Check for clusters
218
+ for i in range(len(times) - 5):
219
+ if (times[i + 5][1] - times[i][1]).total_seconds() < 300: # 5 mins
220
+ for j in range(i, i + 6):
221
+ flags_per_review[times[j][0]].append("time_cluster")
222
+
223
+ # Username pattern analysis
224
+ for i, username in enumerate(usernames):
225
+ if re.match(r"user_\d{4,}", username):
226
+ flags_per_review[i].append("suspicious_username")
227
+ if len(username) < 4:
228
+ flags_per_review[i].append("short_username")
229
+
230
+ return flags_per_review
231
+
232
+ def assess_quality(self, reviews: List[str], custom_weights: Dict = None) -> Tuple[Dict, go.Figure]:
233
+ """Assess review quality with customizable weights and radar chart"""
234
+ default_weights = {
235
+ 'length': 0.25,
236
+ 'detail': 0.25,
237
+ 'structure': 0.25,
238
+ 'helpfulness': 0.25
239
+ }
240
 
241
+ weights = custom_weights if custom_weights else default_weights
242
+ quality_scores = []
243
 
244
+ for review in reviews:
245
+ if not review.strip():
246
+ continue
247
+
248
+ factors = {}
249
+
250
+ # Length factor
251
+ length_score = min(len(review) / 200, 1.0)
252
+ factors['length'] = round(length_score, 2)
253
+
254
+ # Detail factor
255
+ detail_words = ['because', 'however', 'although', 'specifically', 'particularly']
256
+ detail_score = min(sum(1 for word in detail_words if word in review.lower()) / 3, 1.0)
257
+ factors['detail'] = round(detail_score, 2)
258
+
259
+ # Structure factor
260
+ sentences = len(re.split(r'[.!?]', review))
261
+ structure_score = min(sentences / 5, 1.0)
262
+ factors['structure'] = round(structure_score, 2)
263
+
264
+ # Helpfulness factor
265
+ helpful_words = ['pros', 'cons', 'recommend', 'suggest', 'tip', 'advice']
266
+ helpful_score = min(sum(1 for word in helpful_words if word in review.lower()) / 2, 1.0)
267
+ factors['helpfulness'] = round(helpful_score, 2)
268
+
269
+ # Calculate weighted score
270
+ total_score = sum(factors[k] * weights[k] for k in factors.keys())
271
+
272
+ quality_scores.append({
273
+ 'text': review[:100] + '...' if len(review) > 100 else review,
274
+ 'quality_score': round(total_score, 3),
275
+ 'factors': factors,
276
+ 'grade': 'A' if total_score > 0.8 else 'B' if total_score > 0.6 else 'C' if total_score > 0.4 else 'D'
277
+ })
278
+
279
+ avg_quality = sum(item['quality_score'] for item in quality_scores) / len(quality_scores) if quality_scores else 0
280
+
281
+ # Create radar chart for average factors
282
+ avg_factors = {}
283
+ for factor in ['length', 'detail', 'structure', 'helpfulness']:
284
+ avg_factors[factor] = sum(item['factors'][factor] for item in quality_scores) / len(quality_scores) if quality_scores else 0
285
+
286
+ fig = go.Figure()
287
+ fig.add_trace(go.Scatterpolar(
288
+ r=list(avg_factors.values()),
289
+ theta=list(avg_factors.keys()),
290
+ fill='toself',
291
+ name='Quality Factors'
292
+ ))
293
 
294
  fig.update_layout(
295
+ polar=dict(
296
+ radialaxis=dict(
297
+ visible=True,
298
+ range=[0, 1]
299
+ )),
300
+ showlegend=True,
301
+ title="Average Quality Factors"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  )
303
 
304
+ return {
305
+ 'summary': {
306
+ 'average_quality': round(avg_quality, 3),
307
+ 'total_reviews': len(quality_scores),
308
+ 'high_quality_count': sum(1 for item in quality_scores if item['quality_score'] > 0.7),
309
+ 'weights_used': weights
310
+ },
311
+ 'details': quality_scores,
312
+ 'factor_averages': avg_factors
313
+ }, fig
314
+
315
+ def compare_competitors(self, product_a_reviews: List[str], product_b_reviews: List[str]) -> Tuple[Dict, go.Figure]:
316
+ """Compare sentiment between two products"""
317
+ analysis_a = self.analyze_sentiment(product_a_reviews)
318
+ analysis_b = self.analyze_sentiment(product_b_reviews)
 
 
 
 
 
 
 
 
319
 
320
+ fig = make_subplots(
321
+ rows=1, cols=2,
322
+ specs=[[{'type': 'pie'}, {'type': 'pie'}]],
323
+ subplot_titles=['Product A', 'Product B']
 
 
324
  )
325
 
326
+ fig.add_trace(go.Pie(
327
+ labels=list(analysis_a['summary'].keys()),
328
+ values=list(analysis_a['summary'].values()),
329
+ name="Product A"
330
+ ), row=1, col=1)
 
 
 
331
 
332
+ fig.add_trace(go.Pie(
333
+ labels=list(analysis_b['summary'].keys()),
334
+ values=list(analysis_b['summary'].values()),
335
+ name="Product B"
336
+ ), row=1, col=2)
 
 
 
 
 
 
 
 
337
 
338
+ fig.update_layout(title_text="Sentiment Comparison")
 
339
 
340
+ comparison = {
341
+ 'product_a': analysis_a,
342
+ 'product_b': analysis_b,
343
+ 'winner': 'Product A' if analysis_a['summary']['positive'] > analysis_b['summary']['positive'] else 'Product B'
344
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
+ return comparison, fig
 
347
 
348
+ def generate_report(self, analysis_data: Dict, report_type: str = "basic") -> str:
349
+ """Generate analysis report with export capability"""
350
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
+ if report_type == "sentiment":
353
+ keywords = analysis_data.get('keywords', {})
354
+ top_pos = keywords.get('top_positive_aspects', [])[:5]
355
+ top_neg = keywords.get('top_negative_aspects', [])[:5]
356
+
357
+ return f"""# Sentiment Analysis Report
358
+ Generated: {timestamp}
359
 
360
+ ## Summary
361
+ - Total Reviews: {analysis_data.get('total_reviews', 0)}
362
+ - Positive: {analysis_data.get('summary', {}).get('positive', 0)}%
363
+ - Negative: {analysis_data.get('summary', {}).get('negative', 0)}%
364
+ - Neutral: {analysis_data.get('summary', {}).get('neutral', 0)}%
365
 
366
+ ## Top Positive Aspects
367
+ {chr(10).join([f"- {aspect[0]} (mentioned {aspect[1]} times)" for aspect in top_pos])}
368
+
369
+ ## Top Negative Aspects
370
+ {chr(10).join([f"- {aspect[0]} (mentioned {aspect[1]} times)" for aspect in top_neg])}
 
 
 
 
 
 
 
 
371
 
372
+ ## Key Insights
373
+ - Overall sentiment: {'Positive' if analysis_data.get('summary', {}).get('positive', 0) > 50 else 'Mixed'}
374
+ - Main complaints: {', '.join([aspect[0] for aspect in top_neg[:3]])}
375
+ - Key strengths: {', '.join([aspect[0] for aspect in top_pos[:3]])}
376
 
377
+ ## Recommendations
378
+ - Address negative aspects: {', '.join([aspect[0] for aspect in top_neg[:2]])}
379
+ - Leverage positive aspects in marketing
380
+ - Monitor sentiment trends over time
381
+ """
382
+
383
+ elif report_type == "fake":
384
+ return f"""# Fake Review Detection Report
385
+ Generated: {timestamp}
386
 
387
+ ## Summary
388
+ - Total Reviews: {analysis_data.get('summary', {}).get('total_reviews', 0)}
389
+ - Suspicious Reviews: {analysis_data.get('summary', {}).get('suspicious_reviews', 0)}
390
+ - Authenticity Rate: {analysis_data.get('summary', {}).get('authenticity_rate', 0)}%
391
 
392
+ ## Risk Assessment
393
+ - Overall Risk: {'High' if analysis_data.get('summary', {}).get('authenticity_rate', 0) < 70 else 'Low'}
394
+ - Action Required: {'Yes' if analysis_data.get('summary', {}).get('suspicious_reviews', 0) > 0 else 'No'}
395
 
396
+ ## Common Fraud Indicators
397
+ - Short reviews with generic language
398
+ - Repetitive content patterns
399
+ - Suspicious timing clusters
400
+ - Unusual username patterns
401
+ """
402
+
403
+ return "Report generated successfully"
404
 
405
# Global analyzer instance
# Created once at import time; the interface callbacks in this module call its methods.
analyzer = ReviewAnalyzer()
407
+
408
def process_reviews_input(text: str) -> List[str]:
    """Split pasted text into individual reviews, one per line.

    Args:
        text: Raw textbox content. ``None`` and blank strings are treated
            as "no input".

    Returns:
        The stripped lines that are longer than 10 characters, in their
        original order. Shorter lines are discarded.
    """
    # Guard against None as well as blank input; None used to raise AttributeError.
    if not text or not text.strip():
        return []

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if len(line) > 10]
420
+
421
def process_csv_upload(file) -> Tuple[List[str], Dict]:
    """Read reviews and optional metadata from an uploaded CSV file.

    Columns are detected by case-insensitive substring match on the header:
    ``review``/``comment``/``text`` for the review text, ``time``/``date`` for
    timestamps and ``user``/``name`` for usernames. If several headers match
    the same role, the last one wins.

    Args:
        file: An object exposing the CSV path as ``.name``, a plain path
            string, or ``None``.

    Returns:
        ``(reviews, metadata)``. ``metadata`` may contain ``timestamps`` and
        ``usernames`` lists that are index-aligned with ``reviews`` (missing
        cells become empty strings). On failure it instead contains a single
        ``error`` message and ``reviews`` is empty.
    """
    if file is None:
        return [], {}

    try:
        # Accept either an upload object carrying the path in .name or a bare path
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)

        # Look for common column names
        review_col = None
        time_col = None
        user_col = None

        for col in df.columns:
            col_lower = str(col).lower()
            if 'review' in col_lower or 'comment' in col_lower or 'text' in col_lower:
                review_col = col
            elif 'time' in col_lower or 'date' in col_lower:
                time_col = col
            elif 'user' in col_lower or 'name' in col_lower:
                user_col = col

        if review_col is None:
            return [], {"error": "No review column found. Expected columns: 'review', 'comment', or 'text'"}

        # Keep only rows that have review text and read the metadata from those
        # same rows. Dropping NaNs per column independently used to shift
        # timestamps and usernames out of step with the reviews.
        rows = df[df[review_col].notna()]
        reviews = rows[review_col].astype(str).tolist()

        metadata = {}
        if time_col is not None:
            metadata['timestamps'] = ["" if pd.isna(value) else str(value) for value in rows[time_col]]
        if user_col is not None:
            metadata['usernames'] = ["" if pd.isna(value) else str(value) for value in rows[user_col]]

        return reviews, metadata

    except Exception as e:
        # Boundary for the UI: surface any read/parse failure as a message
        return [], {"error": f"Failed to process CSV: {str(e)}"}
458
+
459
def sentiment_analysis_interface(reviews_text: str, csv_file):
    """Gradio callback: run sentiment analysis and build a bar chart.

    Reviews come from the uploaded CSV when one is given, otherwise from the
    textbox. Returns ``(text, figure)`` where ``text`` is the JSON-encoded
    analysis or a user-facing message, and ``figure`` is ``None`` whenever no
    chart could be produced.
    """
    if csv_file is None:
        reviews = process_reviews_input(reviews_text)
    else:
        reviews, csv_info = process_csv_upload(csv_file)
        if 'error' in csv_info:
            return csv_info['error'], None

    if not reviews:
        return "Please enter reviews or upload a CSV file.", None

    try:
        result = analyzer.analyze_sentiment(reviews)

        # One bar per entry of the analysis summary
        summary = result['summary']
        bars = go.Bar(
            x=list(summary.keys()),
            y=list(summary.values()),
            marker_color=['green', 'red', 'gray'],
        )
        fig = go.Figure(data=[bars])
        fig.update_layout(title="Sentiment Distribution", yaxis_title="Percentage")

        return json.dumps(result, indent=2), fig
    except Exception as e:
        return f"Error: {str(e)}", None
486
+
487
def fake_detection_interface(reviews_text: str, csv_file):
    """Gradio callback: run fake-review detection.

    Reviews (and any timestamp/username metadata) come from the uploaded CSV
    when one is given, otherwise reviews come from the textbox with no
    metadata. Returns the JSON-encoded result or a user-facing message.
    """
    metadata = {}

    if csv_file is None:
        reviews = process_reviews_input(reviews_text)
    else:
        reviews, metadata = process_csv_upload(csv_file)
        if 'error' in metadata:
            return metadata['error']

    if not reviews:
        return "Please enter reviews or upload a CSV file."

    try:
        # An empty metadata dict is passed on as None
        detection = analyzer.detect_fake_reviews(reviews, metadata or None)
        return json.dumps(detection, indent=2)
    except Exception as e:
        return f"Error: {str(e)}"
 
 
507
 
508
def quality_assessment_interface(reviews_text: str, csv_file, length_weight: float, detail_weight: float, structure_weight: float, help_weight: float):
    """Gradio callback: assess review quality using the slider weights.

    Reviews come from the uploaded CSV when one is given, otherwise from the
    textbox. Returns ``(text, figure)`` where ``text`` is the JSON-encoded
    assessment or a user-facing message, and ``figure`` is the radar chart
    (``None`` when no assessment was produced).
    """
    if csv_file is None:
        reviews = process_reviews_input(reviews_text)
    else:
        reviews, csv_info = process_csv_upload(csv_file)
        if 'error' in csv_info:
            return csv_info['error'], None

    if not reviews:
        return "Please enter reviews or upload a CSV file.", None

    try:
        # Pair each factor name with its slider value
        factor_names = ('length', 'detail', 'structure', 'helpfulness')
        slider_values = (length_weight, detail_weight, structure_weight, help_weight)
        custom_weights = dict(zip(factor_names, slider_values))

        assessment, radar_fig = analyzer.assess_quality(reviews, custom_weights)
        return json.dumps(assessment, indent=2), radar_fig
    except Exception as e:
        return f"Error: {str(e)}", None
534
+
535
def competitor_comparison_interface(product_a_text: str, product_b_text: str):
    """Gradio callback: compare sentiment for two products' reviews.

    Returns ``(text, figure)`` where ``text`` is the JSON-encoded comparison
    or a user-facing message, and ``figure`` is ``None`` when no comparison
    was produced.
    """
    both_filled = product_a_text.strip() and product_b_text.strip()
    if not both_filled:
        return "Please enter reviews for both products.", None

    reviews_a = process_reviews_input(product_a_text)
    reviews_b = process_reviews_input(product_b_text)

    # Text was entered but every line was filtered out for at least one product
    if not (reviews_a and reviews_b):
        return "Please provide valid reviews for both products.", None

    try:
        comparison, chart = analyzer.compare_competitors(reviews_a, reviews_b)
        return json.dumps(comparison, indent=2), chart
    except Exception as e:
        return f"Error: {str(e)}", None
551
+
552
def generate_report_interface(analysis_result: str, report_type: str):
    """Gradio callback: turn pasted analysis JSON into a text report.

    Returns the generated report, or a user-facing message when the input is
    blank or cannot be processed.
    """
    if analysis_result.strip() == "":
        return "No analysis data available. Please run an analysis first."

    try:
        parsed = json.loads(analysis_result)
        kind = report_type.lower()
        return analyzer.generate_report(parsed, kind)
    except Exception as e:
        return f"Error generating report: {str(e)}"
563
+
564
# Create Gradio interface
# One tab per analysis; each tab wires a button to the matching *_interface callback.
with gr.Blocks(title="SmartReview Pro", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🛒 SmartReview Pro")
    gr.Markdown("Advanced review analysis platform with AI-powered insights")

    # --- Sentiment tab: text or CSV in, JSON text and bar chart out ---
    with gr.Tab("📊 Sentiment Analysis"):
        gr.Markdown("### Analyze customer sentiment and extract key aspects")
        with gr.Row():
            with gr.Column():
                sentiment_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews (one per line) or upload CSV...",
                    label="Reviews"
                )
                sentiment_csv = gr.File(
                    label="Upload CSV (columns: review/comment/text, optional: timestamp, username)",
                    file_types=[".csv"]
                )
                sentiment_btn = gr.Button("Analyze Sentiment", variant="primary")
            with gr.Column():
                sentiment_output = gr.Textbox(label="Analysis Results", lines=15)
                sentiment_chart = gr.Plot(label="Sentiment Distribution")

        sentiment_btn.click(
            sentiment_analysis_interface,
            inputs=[sentiment_input, sentiment_csv],
            outputs=[sentiment_output, sentiment_chart]
        )

    # --- Fake-review tab: text or CSV in, JSON text out (no chart) ---
    with gr.Tab("🔍 Fake Review Detection"):
        gr.Markdown("### Detect suspicious reviews using text analysis and metadata")
        with gr.Row():
            with gr.Column():
                fake_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews to analyze...",
                    label="Reviews"
                )
                fake_csv = gr.File(
                    label="Upload CSV (supports timestamp & username analysis)",
                    file_types=[".csv"]
                )
                fake_btn = gr.Button("Detect Fake Reviews", variant="primary")
            with gr.Column():
                fake_output = gr.Textbox(label="Detection Results", lines=15)

        fake_btn.click(
            fake_detection_interface,
            inputs=[fake_input, fake_csv],
            outputs=[fake_output]
        )

    # --- Quality tab: reviews plus four weight sliders in, JSON text and radar chart out ---
    with gr.Tab("⭐ Quality Assessment"):
        gr.Markdown("### Assess review quality with customizable weights")
        with gr.Row():
            with gr.Column():
                quality_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews to assess...",
                    label="Reviews"
                )
                quality_csv = gr.File(
                    label="Upload CSV",
                    file_types=[".csv"]
                )

                gr.Markdown("**Customize Quality Weights:**")
                # Each slider ranges 0..1 and starts at 0.25
                with gr.Row():
                    length_weight = gr.Slider(0, 1, 0.25, label="Length Weight")
                    detail_weight = gr.Slider(0, 1, 0.25, label="Detail Weight")
                with gr.Row():
                    structure_weight = gr.Slider(0, 1, 0.25, label="Structure Weight")
                    help_weight = gr.Slider(0, 1, 0.25, label="Helpfulness Weight")

                quality_btn = gr.Button("Assess Quality", variant="primary")
            with gr.Column():
                quality_output = gr.Textbox(label="Quality Assessment", lines=12)
                quality_radar = gr.Plot(label="Quality Factors Radar Chart")

        # Input order must match quality_assessment_interface's parameter order
        quality_btn.click(
            quality_assessment_interface,
            inputs=[quality_input, quality_csv, length_weight, detail_weight, structure_weight, help_weight],
            outputs=[quality_output, quality_radar]
        )

    # --- Comparison tab: two review textboxes in, JSON text and chart out ---
    with gr.Tab("🆚 Competitor Comparison"):
        gr.Markdown("### Compare sentiment between competing products")
        with gr.Row():
            with gr.Column():
                comp_product_a = gr.Textbox(
                    lines=8,
                    placeholder="Product A reviews...",
                    label="Product A Reviews"
                )
                comp_product_b = gr.Textbox(
                    lines=8,
                    placeholder="Product B reviews...",
                    label="Product B Reviews"
                )
                comp_btn = gr.Button("Compare Products", variant="primary")
            with gr.Column():
                comp_output = gr.Textbox(label="Comparison Results", lines=15)
                comp_chart = gr.Plot(label="Comparison Chart")

        comp_btn.click(
            competitor_comparison_interface,
            inputs=[comp_product_a, comp_product_b],
            outputs=[comp_output, comp_chart]
        )

    # --- Report tab: pasted analysis JSON plus a report type in, report text out ---
    with gr.Tab("📋 Report Generation"):
        gr.Markdown("### Generate professional analysis reports")
        with gr.Row():
            with gr.Column():
                report_data = gr.Textbox(
                    lines=10,
                    placeholder="Paste analysis results here...",
                    label="Analysis Data (JSON)"
                )
                report_type = gr.Dropdown(
                    choices=["sentiment", "fake", "quality"],
                    value="sentiment",
                    label="Report Type"
                )
                report_btn = gr.Button("Generate Report", variant="primary")
            with gr.Column():
                report_output = gr.Textbox(label="Generated Report", lines=15)

        report_btn.click(
            generate_report_interface,
            inputs=[report_data, report_type],
            outputs=[report_output]
        )

    # --- About tab: static Markdown only ---
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## SmartReview Pro Features

        ### 🆕 New Features:
        - **Aspect-Based Sentiment Analysis**: Extract specific aspects customers love/hate
        - **CSV Batch Processing**: Upload review files for bulk analysis
        - **Metadata Analysis**: Detect fake reviews using timestamps and usernames
        - **Customizable Quality Scoring**: Adjust quality factors to your needs
        - **Advanced Visualizations**: Radar charts and enhanced reporting

        ### Core Capabilities:
        - **Sentiment Analysis**: AI-powered emotion detection with keyword extraction
        - **Fake Review Detection**: Multi-layer authenticity verification
        - **Quality Assessment**: Comprehensive review helpfulness scoring
        - **Competitor Comparison**: Side-by-side sentiment analysis
        - **Professional Reports**: Detailed insights with actionable recommendations

        ### CSV Format:
        Required columns: `review` or `comment` or `text`
        Optional columns: `timestamp`, `username` (for enhanced fake detection)

        ### Pricing:
        - **Free**: 50 analyses/day, basic features
        - **Pro ($299/month)**: Unlimited analyses, CSV upload, custom reports
        - **Enterprise**: API access, custom models, priority support
        """)


# Start the app only when this file is run as a script, not when imported
if __name__ == "__main__":
    demo.launch()