entropy25 committed on
Commit
beb692a
·
verified ·
1 Parent(s): 5b09c58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1749 -659
app.py CHANGED
@@ -1,727 +1,1817 @@
 
1
  import gradio as gr
2
- import pandas as pd
 
 
 
3
  import numpy as np
4
- import json
 
5
  import re
 
 
6
  import io
 
7
  from datetime import datetime
8
- from typing import List, Dict, Tuple
9
- from transformers import pipeline, AutoTokenizer
10
- import plotly.graph_objects as go
11
- from plotly.subplots import make_subplots
12
- import sqlite3
13
- import hashlib
 
 
 
 
 
 
 
14
  import time
15
 
16
- # Initialize models
17
- sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
18
- absa_analyzer = pipeline("ner", model="yangheng/deberta-v3-base-absa-v1.1", aggregation_strategy="simple")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- class ReviewAnalyzer:
 
 
 
 
 
 
 
 
 
 
21
  def __init__(self):
22
- self.db_path = "reviews.db"
23
- self._init_db()
24
-
25
- def _init_db(self):
26
- conn = sqlite3.connect(self.db_path)
27
- conn.execute('''
28
- CREATE TABLE IF NOT EXISTS usage_log (
29
- id INTEGER PRIMARY KEY,
30
- user_id TEXT,
31
- timestamp DATETIME,
32
- analysis_type TEXT,
33
- items_count INTEGER
 
 
 
 
 
 
34
  )
35
- ''')
36
- conn.close()
37
-
38
- def preprocess_text(self, text: str) -> str:
39
- """Clean and preprocess review text"""
40
- text = re.sub(r'http\S+', '', text)
41
- text = re.sub(r'[^\w\s]', '', text)
42
- text = text.strip().lower()
43
- return text
44
-
45
- def extract_aspect_keywords(self, reviews: List[str]) -> Dict:
46
- """Extract aspect-based sentiment keywords"""
47
- all_aspects = {'positive': {}, 'negative': {}}
48
- detailed_aspects = []
49
-
50
- for review in reviews:
51
- if not review.strip() or len(review) < 10:
52
- continue
53
-
54
- try:
55
- aspects = absa_analyzer(review)
56
- for aspect in aspects:
57
- word = aspect['word'].lower()
58
- label = aspect['entity_group'].lower()
59
- confidence = aspect['score']
60
-
61
- # Map labels to sentiment
62
- if 'pos' in label or label == 'positive':
63
- sentiment = 'positive'
64
- elif 'neg' in label or label == 'negative':
65
- sentiment = 'negative'
66
- else:
67
- continue
68
-
69
- # Count aspects
70
- if word not in all_aspects[sentiment]:
71
- all_aspects[sentiment][word] = 0
72
- all_aspects[sentiment][word] += 1
73
-
74
- detailed_aspects.append({
75
- 'review': review[:50] + '...',
76
- 'aspect': word,
77
- 'sentiment': sentiment,
78
- 'confidence': round(confidence, 3)
79
- })
80
- except:
81
- continue
82
 
83
- # Get top aspects
84
- top_positive = sorted(all_aspects['positive'].items(), key=lambda x: x[1], reverse=True)[:10]
85
- top_negative = sorted(all_aspects['negative'].items(), key=lambda x: x[1], reverse=True)[:10]
 
86
 
87
- return {
88
- 'top_positive_aspects': top_positive,
89
- 'top_negative_aspects': top_negative,
90
- 'detailed_aspects': detailed_aspects,
91
- 'summary': {
92
- 'total_positive_aspects': len(all_aspects['positive']),
93
- 'total_negative_aspects': len(all_aspects['negative'])
 
 
 
 
 
 
 
 
 
 
94
  }
95
- }
 
 
 
 
 
 
 
96
 
97
- def analyze_sentiment(self, reviews: List[str]) -> Dict:
98
- """Analyze sentiment of reviews with keyword extraction"""
99
- results = []
100
- sentiments = {'positive': 0, 'negative': 0, 'neutral': 0}
 
101
 
102
- for review in reviews:
103
- if not review.strip():
104
- continue
105
-
106
- clean_review = self.preprocess_text(review)
107
- result = sentiment_analyzer(clean_review)[0]
108
-
109
- label = result['label'].lower()
110
- score = result['score']
111
-
112
- if 'pos' in label:
113
- sentiment = 'positive'
114
- elif 'neg' in label:
115
- sentiment = 'negative'
116
- else:
117
- sentiment = 'neutral'
118
-
119
- sentiments[sentiment] += 1
120
- results.append({
121
- 'text': review[:100] + '...' if len(review) > 100 else review,
122
- 'sentiment': sentiment,
123
- 'confidence': round(score, 3)
124
- })
125
 
126
- total = len(results)
127
- sentiment_percentages = {k: round(v/total*100, 1) for k, v in sentiments.items()}
128
 
129
- # Extract keywords
130
- keywords = self.extract_aspect_keywords(reviews)
131
 
132
- return {
133
- 'summary': sentiment_percentages,
134
- 'details': results,
135
- 'total_reviews': total,
136
- 'keywords': keywords
137
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- def detect_fake_reviews(self, reviews: List[str], metadata: Dict = None) -> Dict:
140
- """Detect potentially fake reviews with optional metadata"""
141
- fake_scores = []
 
 
 
 
142
 
143
- # Process metadata if provided
144
- metadata_flags = []
145
- if metadata and 'timestamps' in metadata and 'usernames' in metadata:
146
- metadata_flags = self._analyze_metadata(metadata['timestamps'], metadata['usernames'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- for i, review in enumerate(reviews):
149
- if not review.strip():
150
- continue
151
-
152
- score = 0
153
- flags = []
154
-
155
- # Text-based checks
156
- if len(review) < 20:
157
- score += 0.3
158
- flags.append("too_short")
159
-
160
- words = review.lower().split()
161
- unique_ratio = len(set(words)) / len(words) if words else 0
162
- if unique_ratio < 0.5:
163
- score += 0.4
164
- flags.append("repetitive")
165
-
166
- punct_ratio = len(re.findall(r'[!?.]', review)) / len(review) if review else 0
167
- if punct_ratio > 0.1:
168
- score += 0.2
169
- flags.append("excessive_punctuation")
170
-
171
- generic_phrases = ['amazing', 'perfect', 'best ever', 'highly recommend']
172
- if any(phrase in review.lower() for phrase in generic_phrases):
173
- score += 0.1
174
- flags.append("generic_language")
175
-
176
- # Add metadata flags if available
177
- if i < len(metadata_flags):
178
- if metadata_flags[i]:
179
- score += 0.3
180
- flags.extend(metadata_flags[i])
181
-
182
- fake_scores.append({
183
- 'text': review[:100] + '...' if len(review) > 100 else review,
184
- 'fake_probability': min(round(score, 3), 1.0),
185
- 'status': 'suspicious' if score > 0.5 else 'authentic',
186
- 'flags': flags
187
- })
188
-
189
- suspicious_count = sum(1 for item in fake_scores if item['fake_probability'] > 0.5)
190
 
191
  return {
192
- 'summary': {
193
- 'total_reviews': len(fake_scores),
194
- 'suspicious_reviews': suspicious_count,
195
- 'authenticity_rate': round((len(fake_scores) - suspicious_count) / len(fake_scores) * 100, 1) if fake_scores else 0
196
- },
197
- 'details': fake_scores,
198
- 'metadata_analysis': metadata_flags if metadata_flags else None
 
 
199
  }
 
 
 
 
 
 
 
 
200
 
201
- def _analyze_metadata(self, timestamps: List[str], usernames: List[str]) -> List[List[str]]:
202
- """Analyze metadata for suspicious patterns"""
203
- flags_per_review = [[] for _ in range(len(timestamps))]
 
 
204
 
205
- # Time density analysis
206
- if len(timestamps) >= 5:
207
- times = []
208
- for i, ts in enumerate(timestamps):
209
- try:
210
- dt = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S")
211
- times.append((i, dt))
212
- except:
213
- continue
214
-
215
- times.sort(key=lambda x: x[1])
216
-
217
- # Check for clusters
218
- for i in range(len(times) - 5):
219
- if (times[i + 5][1] - times[i][1]).total_seconds() < 300: # 5 mins
220
- for j in range(i, i + 6):
221
- flags_per_review[times[j][0]].append("time_cluster")
222
-
223
- # Username pattern analysis
224
- for i, username in enumerate(usernames):
225
- if re.match(r"user_\d{4,}", username):
226
- flags_per_review[i].append("suspicious_username")
227
- if len(username) < 4:
228
- flags_per_review[i].append("short_username")
229
-
230
- return flags_per_review
231
-
232
- def assess_quality(self, reviews: List[str], custom_weights: Dict = None) -> Tuple[Dict, go.Figure]:
233
- """Assess review quality with customizable weights and radar chart"""
234
- default_weights = {
235
- 'length': 0.25,
236
- 'detail': 0.25,
237
- 'structure': 0.25,
238
- 'helpfulness': 0.25
239
- }
240
 
241
- weights = custom_weights if custom_weights else default_weights
242
- quality_scores = []
243
 
244
- for review in reviews:
245
- if not review.strip():
246
- continue
247
-
248
- factors = {}
249
-
250
- # Length factor
251
- length_score = min(len(review) / 200, 1.0)
252
- factors['length'] = round(length_score, 2)
253
-
254
- # Detail factor
255
- detail_words = ['because', 'however', 'although', 'specifically', 'particularly']
256
- detail_score = min(sum(1 for word in detail_words if word in review.lower()) / 3, 1.0)
257
- factors['detail'] = round(detail_score, 2)
258
-
259
- # Structure factor
260
- sentences = len(re.split(r'[.!?]', review))
261
- structure_score = min(sentences / 5, 1.0)
262
- factors['structure'] = round(structure_score, 2)
263
-
264
- # Helpfulness factor
265
- helpful_words = ['pros', 'cons', 'recommend', 'suggest', 'tip', 'advice']
266
- helpful_score = min(sum(1 for word in helpful_words if word in review.lower()) / 2, 1.0)
267
- factors['helpfulness'] = round(helpful_score, 2)
268
-
269
- # Calculate weighted score
270
- total_score = sum(factors[k] * weights[k] for k in factors.keys())
271
-
272
- quality_scores.append({
273
- 'text': review[:100] + '...' if len(review) > 100 else review,
274
- 'quality_score': round(total_score, 3),
275
- 'factors': factors,
276
- 'grade': 'A' if total_score > 0.8 else 'B' if total_score > 0.6 else 'C' if total_score > 0.4 else 'D'
277
- })
278
-
279
- avg_quality = sum(item['quality_score'] for item in quality_scores) / len(quality_scores) if quality_scores else 0
280
-
281
- # Create radar chart for average factors
282
- avg_factors = {}
283
- for factor in ['length', 'detail', 'structure', 'helpfulness']:
284
- avg_factors[factor] = sum(item['factors'][factor] for item in quality_scores) / len(quality_scores) if quality_scores else 0
285
-
286
- fig = go.Figure()
287
- fig.add_trace(go.Scatterpolar(
288
- r=list(avg_factors.values()),
289
- theta=list(avg_factors.keys()),
290
- fill='toself',
291
- name='Quality Factors'
292
- ))
293
 
294
- fig.update_layout(
295
- polar=dict(
296
- radialaxis=dict(
297
- visible=True,
298
- range=[0, 1]
299
- )),
300
- showlegend=True,
301
- title="Average Quality Factors"
302
- )
303
 
304
- return {
305
- 'summary': {
306
- 'average_quality': round(avg_quality, 3),
307
- 'total_reviews': len(quality_scores),
308
- 'high_quality_count': sum(1 for item in quality_scores if item['quality_score'] > 0.7),
309
- 'weights_used': weights
310
- },
311
- 'details': quality_scores,
312
- 'factor_averages': avg_factors
313
- }, fig
314
-
315
- def compare_competitors(self, product_a_reviews: List[str], product_b_reviews: List[str]) -> Tuple[Dict, go.Figure]:
316
- """Compare sentiment between two products"""
317
- analysis_a = self.analyze_sentiment(product_a_reviews)
318
- analysis_b = self.analyze_sentiment(product_b_reviews)
319
 
320
- fig = make_subplots(
321
- rows=1, cols=2,
322
- specs=[[{'type': 'pie'}, {'type': 'pie'}]],
323
- subplot_titles=['Product A', 'Product B']
324
- )
325
 
326
- fig.add_trace(go.Pie(
327
- labels=list(analysis_a['summary'].keys()),
328
- values=list(analysis_a['summary'].values()),
329
- name="Product A"
330
- ), row=1, col=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
- fig.add_trace(go.Pie(
333
- labels=list(analysis_b['summary'].keys()),
334
- values=list(analysis_b['summary'].values()),
335
- name="Product B"
336
- ), row=1, col=2)
 
337
 
338
- fig.update_layout(title_text="Sentiment Comparison")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
- comparison = {
341
- 'product_a': analysis_a,
342
- 'product_b': analysis_b,
343
- 'winner': 'Product A' if analysis_a['summary']['positive'] > analysis_b['summary']['positive'] else 'Product B'
344
- }
345
 
346
- return comparison, fig
347
-
348
- def generate_report(self, analysis_data: Dict, report_type: str = "basic") -> str:
349
- """Generate analysis report with export capability"""
350
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
351
 
352
- if report_type == "sentiment":
353
- keywords = analysis_data.get('keywords', {})
354
- top_pos = keywords.get('top_positive_aspects', [])[:5]
355
- top_neg = keywords.get('top_negative_aspects', [])[:5]
 
 
 
 
 
356
 
357
- return f"""# Sentiment Analysis Report
358
- Generated: {timestamp}
359
-
360
- ## Summary
361
- - Total Reviews: {analysis_data.get('total_reviews', 0)}
362
- - Positive: {analysis_data.get('summary', {}).get('positive', 0)}%
363
- - Negative: {analysis_data.get('summary', {}).get('negative', 0)}%
364
- - Neutral: {analysis_data.get('summary', {}).get('neutral', 0)}%
365
-
366
- ## Top Positive Aspects
367
- {chr(10).join([f"- {aspect[0]} (mentioned {aspect[1]} times)" for aspect in top_pos])}
368
-
369
- ## Top Negative Aspects
370
- {chr(10).join([f"- {aspect[0]} (mentioned {aspect[1]} times)" for aspect in top_neg])}
371
-
372
- ## Key Insights
373
- - Overall sentiment: {'Positive' if analysis_data.get('summary', {}).get('positive', 0) > 50 else 'Mixed'}
374
- - Main complaints: {', '.join([aspect[0] for aspect in top_neg[:3]])}
375
- - Key strengths: {', '.join([aspect[0] for aspect in top_pos[:3]])}
376
-
377
- ## Recommendations
378
- - Address negative aspects: {', '.join([aspect[0] for aspect in top_neg[:2]])}
379
- - Leverage positive aspects in marketing
380
- - Monitor sentiment trends over time
381
- """
382
-
383
- elif report_type == "fake":
384
- return f"""# Fake Review Detection Report
385
- Generated: {timestamp}
386
-
387
- ## Summary
388
- - Total Reviews: {analysis_data.get('summary', {}).get('total_reviews', 0)}
389
- - Suspicious Reviews: {analysis_data.get('summary', {}).get('suspicious_reviews', 0)}
390
- - Authenticity Rate: {analysis_data.get('summary', {}).get('authenticity_rate', 0)}%
391
-
392
- ## Risk Assessment
393
- - Overall Risk: {'High' if analysis_data.get('summary', {}).get('authenticity_rate', 0) < 70 else 'Low'}
394
- - Action Required: {'Yes' if analysis_data.get('summary', {}).get('suspicious_reviews', 0) > 0 else 'No'}
395
-
396
- ## Common Fraud Indicators
397
- - Short reviews with generic language
398
- - Repetitive content patterns
399
- - Suspicious timing clusters
400
- - Unusual username patterns
401
- """
402
 
403
- return "Report generated successfully"
404
-
405
- # Global analyzer instance
406
- analyzer = ReviewAnalyzer()
407
 
408
def process_reviews_input(text: str) -> List[str]:
    """Split raw textarea input into review strings.

    Keeps one review per non-empty line; lines of 10 characters or fewer
    are discarded as too short to analyze.
    """
    if not text.strip():
        return []
    stripped = (line.strip() for line in text.split('\n'))
    return [line for line in stripped if line and len(line) > 10]
420
-
421
def process_csv_upload(file) -> Tuple[List[str], Dict]:
    """Load reviews (and optional timestamp/username metadata) from a CSV upload.

    Column detection is case-insensitive: a column containing
    'review'/'comment'/'text' holds the reviews; 'time'/'date' and
    'user'/'name' columns, when present, supply metadata.

    Args:
        file: Gradio file object exposing `.name` (a path), or None.

    Returns:
        (reviews, metadata) — metadata carries 'timestamps'/'usernames'
        lists aligned index-for-index with `reviews`, or an 'error' key.
    """
    if file is None:
        return [], {}

    try:
        df = pd.read_csv(file.name)

        review_col = None
        time_col = None
        user_col = None
        for col in df.columns:
            col_lower = col.lower()
            if 'review' in col_lower or 'comment' in col_lower or 'text' in col_lower:
                review_col = col
            elif 'time' in col_lower or 'date' in col_lower:
                time_col = col
            elif 'user' in col_lower or 'name' in col_lower:
                user_col = col

        if review_col is None:
            return [], {"error": "No review column found. Expected columns: 'review', 'comment', or 'text'"}

        # Drop rows with a missing review FIRST, then take the metadata from
        # the same filtered frame. The original dropna()'d each column
        # independently, which misaligned metadata rows with reviews whenever
        # any column contained NaNs.
        rows = df.dropna(subset=[review_col])
        reviews = rows[review_col].astype(str).tolist()

        metadata = {}
        if time_col:
            metadata['timestamps'] = rows[time_col].fillna('').astype(str).tolist()
        if user_col:
            metadata['usernames'] = rows[user_col].fillna('').astype(str).tolist()

        return reviews, metadata

    except Exception as e:
        return [], {"error": f"Failed to process CSV: {str(e)}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
def sentiment_analysis_interface(reviews_text: str, csv_file):
    """Gradio handler: sentiment analysis on pasted text or an uploaded CSV.

    Returns (json_result_or_message, bar_chart_or_None).
    """
    # CSV upload takes precedence over the textarea.
    if csv_file is not None:
        reviews, csv_meta = process_csv_upload(csv_file)
        if 'error' in csv_meta:
            return csv_meta['error'], None
    else:
        reviews = process_reviews_input(reviews_text)

    if not reviews:
        return "Please enter reviews or upload a CSV file.", None

    try:
        result = analyzer.analyze_sentiment(reviews)
        summary = result['summary']
        fig = go.Figure(data=[
            go.Bar(x=list(summary.keys()),
                   y=list(summary.values()),
                   marker_color=['green', 'red', 'gray'])
        ])
        fig.update_layout(title="Sentiment Distribution", yaxis_title="Percentage")
        return json.dumps(result, indent=2), fig
    except Exception as e:
        return f"Error: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
def fake_detection_interface(reviews_text: str, csv_file):
    """Gradio handler: fake-review detection on pasted text or an uploaded CSV."""
    metadata = {}
    if csv_file is not None:
        reviews, metadata = process_csv_upload(csv_file)
        if 'error' in metadata:
            return metadata['error']
    else:
        reviews = process_reviews_input(reviews_text)

    if not reviews:
        return "Please enter reviews or upload a CSV file."

    try:
        # Pass metadata only when the CSV actually supplied some.
        result = analyzer.detect_fake_reviews(reviews, metadata or None)
        return json.dumps(result, indent=2)
    except Exception as e:
        return f"Error: {str(e)}"
507
 
508
def quality_assessment_interface(reviews_text: str, csv_file, length_weight: float, detail_weight: float, structure_weight: float, help_weight: float):
    """Gradio handler: quality assessment with user-tuned factor weights."""
    if csv_file is not None:
        reviews, csv_meta = process_csv_upload(csv_file)
        if 'error' in csv_meta:
            return csv_meta['error'], None
    else:
        reviews = process_reviews_input(reviews_text)

    if not reviews:
        return "Please enter reviews or upload a CSV file.", None

    # Slider values map straight onto the analyzer's factor weights.
    weights = {
        'length': length_weight,
        'detail': detail_weight,
        'structure': structure_weight,
        'helpfulness': help_weight
    }
    try:
        result, radar_fig = analyzer.assess_quality(reviews, weights)
        return json.dumps(result, indent=2), radar_fig
    except Exception as e:
        return f"Error: {str(e)}", None
 
 
 
534
 
535
def competitor_comparison_interface(product_a_text: str, product_b_text: str):
    """Gradio handler: compare sentiment between two products' reviews."""
    if not product_a_text.strip() or not product_b_text.strip():
        return "Please enter reviews for both products.", None

    reviews_a = process_reviews_input(product_a_text)
    reviews_b = process_reviews_input(product_b_text)
    if not (reviews_a and reviews_b):
        return "Please provide valid reviews for both products.", None

    try:
        comparison, fig = analyzer.compare_competitors(reviews_a, reviews_b)
        return json.dumps(comparison, indent=2), fig
    except Exception as e:
        return f"Error: {str(e)}", None
551
 
552
def generate_report_interface(analysis_result: str, report_type: str):
    """Gradio handler: render a markdown report from pasted JSON results."""
    if not analysis_result.strip():
        return "No analysis data available. Please run an analysis first."

    try:
        parsed = json.loads(analysis_result)
        return analyzer.generate_report(parsed, report_type.lower())
    except Exception as e:
        return f"Error generating report: {str(e)}"
563
-
564
# Build the Gradio UI: one tab per analysis feature plus an About page.
with gr.Blocks(title="SmartReview Pro", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🛒 SmartReview Pro")
    gr.Markdown("Advanced review analysis platform with AI-powered insights")

    with gr.Tab("📊 Sentiment Analysis"):
        gr.Markdown("### Analyze customer sentiment and extract key aspects")
        with gr.Row():
            with gr.Column():
                sentiment_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews (one per line) or upload CSV...",
                    label="Reviews"
                )
                sentiment_csv = gr.File(
                    label="Upload CSV (columns: review/comment/text, optional: timestamp, username)",
                    file_types=[".csv"]
                )
                sentiment_btn = gr.Button("Analyze Sentiment", variant="primary")
            with gr.Column():
                sentiment_output = gr.Textbox(label="Analysis Results", lines=15)
                sentiment_chart = gr.Plot(label="Sentiment Distribution")

        sentiment_btn.click(
            sentiment_analysis_interface,
            inputs=[sentiment_input, sentiment_csv],
            outputs=[sentiment_output, sentiment_chart]
        )

    with gr.Tab("🔍 Fake Review Detection"):
        gr.Markdown("### Detect suspicious reviews using text analysis and metadata")
        with gr.Row():
            with gr.Column():
                fake_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews to analyze...",
                    label="Reviews"
                )
                fake_csv = gr.File(
                    label="Upload CSV (supports timestamp & username analysis)",
                    file_types=[".csv"]
                )
                fake_btn = gr.Button("Detect Fake Reviews", variant="primary")
            with gr.Column():
                fake_output = gr.Textbox(label="Detection Results", lines=15)

        fake_btn.click(
            fake_detection_interface,
            inputs=[fake_input, fake_csv],
            outputs=[fake_output]
        )

    with gr.Tab("⭐ Quality Assessment"):
        gr.Markdown("### Assess review quality with customizable weights")
        with gr.Row():
            with gr.Column():
                quality_input = gr.Textbox(
                    lines=8,
                    placeholder="Enter reviews to assess...",
                    label="Reviews"
                )
                quality_csv = gr.File(
                    label="Upload CSV",
                    file_types=[".csv"]
                )

                gr.Markdown("**Customize Quality Weights:**")
                with gr.Row():
                    length_weight = gr.Slider(0, 1, 0.25, label="Length Weight")
                    detail_weight = gr.Slider(0, 1, 0.25, label="Detail Weight")
                with gr.Row():
                    structure_weight = gr.Slider(0, 1, 0.25, label="Structure Weight")
                    help_weight = gr.Slider(0, 1, 0.25, label="Helpfulness Weight")

                quality_btn = gr.Button("Assess Quality", variant="primary")
            with gr.Column():
                quality_output = gr.Textbox(label="Quality Assessment", lines=12)
                quality_radar = gr.Plot(label="Quality Factors Radar Chart")

        quality_btn.click(
            quality_assessment_interface,
            inputs=[quality_input, quality_csv, length_weight, detail_weight, structure_weight, help_weight],
            outputs=[quality_output, quality_radar]
        )

    with gr.Tab("🆚 Competitor Comparison"):
        gr.Markdown("### Compare sentiment between competing products")
        with gr.Row():
            with gr.Column():
                comp_product_a = gr.Textbox(
                    lines=8,
                    placeholder="Product A reviews...",
                    label="Product A Reviews"
                )
                comp_product_b = gr.Textbox(
                    lines=8,
                    placeholder="Product B reviews...",
                    label="Product B Reviews"
                )
                comp_btn = gr.Button("Compare Products", variant="primary")
            with gr.Column():
                comp_output = gr.Textbox(label="Comparison Results", lines=15)
                comp_chart = gr.Plot(label="Comparison Chart")

        comp_btn.click(
            competitor_comparison_interface,
            inputs=[comp_product_a, comp_product_b],
            outputs=[comp_output, comp_chart]
        )

    with gr.Tab("📋 Report Generation"):
        gr.Markdown("### Generate professional analysis reports")
        with gr.Row():
            with gr.Column():
                report_data = gr.Textbox(
                    lines=10,
                    placeholder="Paste analysis results here...",
                    label="Analysis Data (JSON)"
                )
                report_type = gr.Dropdown(
                    choices=["sentiment", "fake", "quality"],
                    value="sentiment",
                    label="Report Type"
                )
                report_btn = gr.Button("Generate Report", variant="primary")
            with gr.Column():
                report_output = gr.Textbox(label="Generated Report", lines=15)

        report_btn.click(
            generate_report_interface,
            inputs=[report_data, report_type],
            outputs=[report_output]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## SmartReview Pro Features

        ### 🆕 New Features:
        - **Aspect-Based Sentiment Analysis**: Extract specific aspects customers love/hate
        - **CSV Batch Processing**: Upload review files for bulk analysis
        - **Metadata Analysis**: Detect fake reviews using timestamps and usernames
        - **Customizable Quality Scoring**: Adjust quality factors to your needs
        - **Advanced Visualizations**: Radar charts and enhanced reporting

        ### Core Capabilities:
        - **Sentiment Analysis**: AI-powered emotion detection with keyword extraction
        - **Fake Review Detection**: Multi-layer authenticity verification
        - **Quality Assessment**: Comprehensive review helpfulness scoring
        - **Competitor Comparison**: Side-by-side sentiment analysis
        - **Professional Reports**: Detailed insights with actionable recommendations

        ### CSV Format:
        Required columns: `review` or `comment` or `text`
        Optional columns: `timestamp`, `username` (for enhanced fake detection)

        ### Pricing:
        - **Free**: 50 analyses/day, basic features
        - **Pro ($299/month)**: Unlimited analyses, CSV upload, custom reports
        - **Enterprise**: API access, custom models, priority support
        """)
- """)
725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
if __name__ == "__main__":
    # Launch the Gradio app only when run as a script, not on import.
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ import plotly.graph_objects as go
5
+ import plotly.express as px
6
+ from plotly.subplots import make_subplots
7
  import numpy as np
8
+ from wordcloud import WordCloud
9
+ from collections import Counter, defaultdict, OrderedDict
10
  import re
11
+ import json
12
+ import csv
13
  import io
14
+ import tempfile
15
  from datetime import datetime
16
+ import logging
17
+ from functools import lru_cache, wraps
18
+ from dataclasses import dataclass
19
+ from typing import List, Dict, Optional, Tuple, Any, Callable
20
+ from contextlib import contextmanager
21
+ import nltk
22
+ from nltk.corpus import stopwords
23
+ import langdetect
24
+ import pandas as pd
25
+ import gc
26
+ import threading
27
+ import asyncio
28
+ from concurrent.futures import ThreadPoolExecutor
29
  import time
30
 
31
+ # Advanced analysis imports
32
+ import shap
33
+ import lime
34
+ from lime.lime_text import LimeTextExplainer
35
+
36
+ # Configuration
37
@dataclass
class Config:
    """Application-wide tunables and static lookup tables."""

    # Runtime limits (dataclass fields).
    MAX_HISTORY_SIZE: int = 1000
    BATCH_SIZE_LIMIT: int = 50
    MAX_TEXT_LENGTH: int = 512
    MIN_WORD_LENGTH: int = 2
    CACHE_SIZE: int = 128
    BATCH_PROCESSING_SIZE: int = 8
    MODEL_CACHE_SIZE: int = 2  # Maximum models to keep in memory

    # Supported languages and models. Unannotated, so these stay plain
    # class attributes rather than dataclass fields.
    SUPPORTED_LANGUAGES = {
        'auto': 'Auto Detect',
        'en': 'English',
        'zh': 'Chinese',
        'es': 'Spanish',
        'fr': 'French',
        'de': 'German',
        'sv': 'Swedish'
    }

    MODELS = {
        'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
        'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
        'zh': "uer/roberta-base-finetuned-dianping-chinese"
    }

    # Color themes for Plotly
    THEMES = {
        'default': {'pos': '#4CAF50', 'neg': '#F44336', 'neu': '#FF9800'},
        'ocean': {'pos': '#0077BE', 'neg': '#FF6B35', 'neu': '#00BCD4'},
        'dark': {'pos': '#66BB6A', 'neg': '#EF5350', 'neu': '#FFA726'},
        'rainbow': {'pos': '#9C27B0', 'neg': '#E91E63', 'neu': '#FF5722'}
    }
71
+
72
+ config = Config()
73
+
74
+ # Logging setup
75
+ logging.basicConfig(level=logging.INFO)
76
+ logger = logging.getLogger(__name__)
77
+
78
# Initialize NLTK resources; fall back to a minimal stop-word list when the
# corpora cannot be downloaded or loaded (e.g. offline environments).
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    STOP_WORDS = set(stopwords.words('english'))
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
    STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
85
+
86
+ # Decorators and Context Managers
87
def handle_errors(default_return=None):
    """Centralized error-handling decorator.

    Any exception raised by the wrapped function is logged and converted
    into `default_return`, or an "Error: ..." string when no default is given.
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                logger.error(f"{func.__name__} failed: {exc}")
                if default_return is not None:
                    return default_return
                return f"Error: {str(exc)}"
        return wrapper
    return decorator
99
+
100
@contextmanager
def memory_cleanup():
    """Yield control, then force GC (and empty the CUDA cache) afterwards.

    Cleanup runs in `finally`, so it happens even if the wrapped block raises.
    """
    try:
        yield
    finally:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
109
+
110
class ThemeContext:
    """Resolve a theme name into its Plotly color mapping."""

    def __init__(self, theme: str = 'default'):
        self.theme = theme
        # Unknown theme names silently fall back to the default palette.
        self.colors = config.THEMES.get(theme, config.THEMES['default'])
115
+
116
class LRUModelCache:
    """Thread-safe LRU cache for (model, tokenizer) pairs.

    Evicted entries are deleted eagerly and GPU memory is reclaimed so that
    at most `max_size` models stay resident.
    """

    def __init__(self, max_size: int = 2):
        self.max_size = max_size
        self.cache = OrderedDict()
        self.lock = threading.Lock()

    def get(self, key):
        """Return the cached value for `key` (marking it most recent), or None."""
        with self.lock:
            if key not in self.cache:
                return None
            self.cache.move_to_end(key)
            return self.cache[key]

    def put(self, key, value):
        """Insert/refresh `key`; evict the least-recently-used entry when full."""
        with self.lock:
            if key in self.cache:
                self.cache.move_to_end(key)
            elif len(self.cache) >= self.max_size:
                # popitem(last=False) removes the least recently used entry.
                _oldest_key, (old_model, old_tokenizer) = self.cache.popitem(last=False)
                del old_model, old_tokenizer
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            self.cache[key] = value

    def clear(self):
        """Drop every entry and reclaim memory."""
        with self.lock:
            for model, tokenizer in self.cache.values():
                del model, tokenizer
            self.cache.clear()
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
156
 
157
# Enhanced Model Manager with Optimized Memory Management
class ModelManager:
    """Optimized multi-language model manager with LRU cache and lazy loading.

    Implemented as a process-wide singleton: every ``ModelManager()`` call
    returns the same instance, so all engines share one model cache.
    """
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # Guard so repeated ModelManager() calls don't re-run initialization.
        if not self._initialized:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model_cache = LRUModelCache(config.MODEL_CACHE_SIZE)
            self.loading_lock = threading.Lock()
            self._initialized = True
            logger.info(f"ModelManager initialized on device: {self.device}")

    def _load_model(self, model_name: str, cache_key: str):
        """Load *model_name* from HuggingFace and cache it under *cache_key*.

        Returns:
            (model, tokenizer) tuple.
        Raises:
            Exception: propagated from the transformers loading calls after logging.
        """
        try:
            logger.info(f"Loading model: {model_name}")

            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                # fp16 halves GPU memory; CPU stays fp32 for compatibility
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )

            if not torch.cuda.is_available():
                model.to(self.device)

            # Inference-only usage: eval mode disables dropout and saves memory
            model.eval()

            self.model_cache.put(cache_key, (model, tokenizer))
            logger.info(f"Model {model_name} loaded and cached successfully")

            return model, tokenizer

        except Exception as e:
            logger.error(f"Failed to load model {model_name}: {e}")
            raise

    def get_model(self, language='en'):
        """Return a (model, tokenizer) pair for *language*, loading lazily.

        Chinese gets a dedicated model; every other language shares the
        multilingual model (so they share one cache slot).
        """
        if language == 'zh':
            cache_key = 'zh'
            model_name = config.MODELS['zh']
        else:
            cache_key = 'multilingual'
            model_name = config.MODELS['multilingual']

        # Fast path: already cached
        cached_model = self.model_cache.get(cache_key)
        if cached_model is not None:
            return cached_model

        # Load under a lock so concurrent callers don't load the model twice
        with self.loading_lock:
            # Double-check pattern: another thread may have finished loading
            cached_model = self.model_cache.get(cache_key)
            if cached_model is not None:
                return cached_model

            return self._load_model(model_name, cache_key)

    @staticmethod
    def detect_language(text: str) -> str:
        """Detect the text language, normalized to a supported code.

        Falls back to 'en' for unsupported or undetectable input.
        Fix: narrowed the bare ``except:`` so SystemExit/KeyboardInterrupt
        are no longer swallowed during detection.
        """
        try:
            detected = langdetect.detect(text)
        except Exception:  # langdetect raises LangDetectException on failure
            return 'en'
        # Collapse regional Chinese variants onto the single 'zh' model key
        detected = {'zh-cn': 'zh', 'zh-tw': 'zh'}.get(detected, detected)
        return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
242
+
243
# Simplified Text Processing
class TextProcessor:
    """Optimized text processing with multi-language support."""

    @staticmethod
    @lru_cache(maxsize=config.CACHE_SIZE)
    def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
        """Clean text with language awareness; results are memoized."""
        text = text.strip()

        # Chinese text is returned untouched: aggressive cleaning would destroy it
        if re.search(r'[\u4e00-\u9fff]', text):
            return text

        text = text.lower()

        if remove_numbers:
            text = re.sub(r'\d+', '', text)
        if remove_punctuation:
            text = re.sub(r'[^\w\s]', '', text)

        # Drop stop words and words below the configured minimum length
        kept = [
            word for word in text.split()
            if word not in STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
        ]
        return ' '.join(kept)

    @staticmethod
    def parse_batch_input(text: str) -> List[str]:
        """Split textarea input into non-empty, stripped lines."""
        return [line.strip() for line in text.strip().split('\n') if line.strip()]
274
+
275
# Enhanced History Manager
class HistoryManager:
    """Enhanced history management with filtering.

    Keeps an in-memory list of analysis result dicts, capped at
    config.MAX_HISTORY_SIZE (oldest entries are dropped first).
    """

    def __init__(self):
        self._history = []

    def add(self, entry: Dict):
        """Add an entry, stamping it with the current ISO timestamp."""
        entry['timestamp'] = datetime.now().isoformat()
        self._history.append(entry)
        # Trim to the configured cap, keeping the most recent entries
        if len(self._history) > config.MAX_HISTORY_SIZE:
            self._history = self._history[-config.MAX_HISTORY_SIZE:]

    def add_batch(self, entries: List[Dict]):
        """Add multiple entries."""
        for entry in entries:
            self.add(entry)

    def get_all(self) -> List[Dict]:
        """Return a shallow copy of the full history."""
        return self._history.copy()

    def get_recent(self, n: int = 10) -> List[Dict]:
        """Return the n most recent entries (oldest first)."""
        return self._history[-n:] if self._history else []

    def filter_by(self, sentiment: str = None, language: str = None,
                  min_confidence: float = None) -> List[Dict]:
        """Filter history by criteria; None means "don't filter on this field".

        Fix: min_confidence is compared against None instead of truthiness,
        so an explicit min_confidence=0.0 is honored rather than silently
        ignored.
        """
        filtered = self._history

        if sentiment:
            filtered = [h for h in filtered if h['sentiment'] == sentiment]
        if language:
            filtered = [h for h in filtered if h.get('language', 'en') == language]
        if min_confidence is not None:
            filtered = [h for h in filtered if h['confidence'] >= min_confidence]

        return filtered

    def clear(self) -> int:
        """Remove all entries, returning how many were deleted."""
        count = len(self._history)
        self._history.clear()
        return count

    def size(self) -> int:
        """Number of stored entries."""
        return len(self._history)

    def get_stats(self) -> Dict:
        """Get comprehensive statistics; empty dict when there is no history."""
        if not self._history:
            return {}

        sentiments = [item['sentiment'] for item in self._history]
        confidences = [item['confidence'] for item in self._history]
        languages = [item.get('language', 'en') for item in self._history]

        return {
            'total_analyses': len(self._history),
            'positive_count': sentiments.count('Positive'),
            'negative_count': sentiments.count('Negative'),
            'neutral_count': sentiments.count('Neutral'),
            'avg_confidence': np.mean(confidences),
            'max_confidence': np.max(confidences),
            'min_confidence': np.min(confidences),
            'languages_detected': len(set(languages)),
            'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
        }
341
+
342
+ # Core Sentiment Analysis Engine with Performance Optimizations
343
+ class SentimentEngine:
344
+ """Optimized multi-language sentiment analysis engine"""
345
+
346
+ def __init__(self):
347
+ self.model_manager = ModelManager()
348
+ self.executor = ThreadPoolExecutor(max_workers=4)
349
 
350
+ @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0})
351
+ def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
352
+ """Optimized single text analysis"""
353
+ if not text.strip():
354
+ raise ValueError("Empty text provided")
355
 
356
+ # Detect language
357
+ if language == 'auto':
358
+ detected_lang = self.model_manager.detect_language(text)
359
+ else:
360
+ detected_lang = language
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
+ # Get appropriate model
363
+ model, tokenizer = self.model_manager.get_model(detected_lang)
364
 
365
+ # Preprocessing
366
+ options = preprocessing_options or {}
367
+ processed_text = text
368
+ if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
369
+ processed_text = TextProcessor.clean_text(
370
+ text,
371
+ options.get('remove_punctuation', True),
372
+ options.get('remove_numbers', False)
373
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
+ # Tokenize and analyze with memory optimization
376
+ inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
377
+ truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
 
 
 
 
 
 
378
 
379
+ # Use no_grad for inference to save memory
380
+ with torch.no_grad():
381
+ outputs = model(**inputs)
382
+ probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
 
 
 
 
 
 
 
 
 
 
 
383
 
384
+ # Clear GPU cache after inference
385
+ if torch.cuda.is_available():
386
+ torch.cuda.empty_cache()
 
 
387
 
388
+ # Handle different model outputs
389
+ if len(probs) == 3: # negative, neutral, positive
390
+ sentiment_idx = np.argmax(probs)
391
+ sentiment_labels = ['Negative', 'Neutral', 'Positive']
392
+ sentiment = sentiment_labels[sentiment_idx]
393
+ confidence = float(probs[sentiment_idx])
394
+
395
+ result = {
396
+ 'sentiment': sentiment,
397
+ 'confidence': confidence,
398
+ 'neg_prob': float(probs[0]),
399
+ 'neu_prob': float(probs[1]),
400
+ 'pos_prob': float(probs[2]),
401
+ 'has_neutral': True
402
+ }
403
+ else: # negative, positive
404
+ pred = np.argmax(probs)
405
+ sentiment = "Positive" if pred == 1 else "Negative"
406
+ confidence = float(probs[pred])
407
+
408
+ result = {
409
+ 'sentiment': sentiment,
410
+ 'confidence': confidence,
411
+ 'neg_prob': float(probs[0]),
412
+ 'pos_prob': float(probs[1]),
413
+ 'neu_prob': 0.0,
414
+ 'has_neutral': False
415
+ }
416
 
417
+ # Add metadata
418
+ result.update({
419
+ 'language': detected_lang,
420
+ 'word_count': len(text.split()),
421
+ 'char_count': len(text)
422
+ })
423
 
424
+ return result
425
+
426
+ def _analyze_text_batch(self, text: str, language: str, preprocessing_options: Dict, index: int) -> Dict:
427
+ """Single text analysis for batch processing"""
428
+ try:
429
+ result = self.analyze_single(text, language, preprocessing_options)
430
+ result['batch_index'] = index
431
+ result['text'] = text[:100] + '...' if len(text) > 100 else text
432
+ result['full_text'] = text
433
+ return result
434
+ except Exception as e:
435
+ return {
436
+ 'sentiment': 'Error',
437
+ 'confidence': 0.0,
438
+ 'error': str(e),
439
+ 'batch_index': index,
440
+ 'text': text[:100] + '...' if len(text) > 100 else text,
441
+ 'full_text': text
442
+ }
443
+
444
+
445
+ @handle_errors(default_return=[])
446
+ def analyze_batch(self, texts: List[str], language: str = 'auto',
447
+ preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
448
+ """Optimized parallel batch processing"""
449
+ if len(texts) > config.BATCH_SIZE_LIMIT:
450
+ texts = texts[:config.BATCH_SIZE_LIMIT]
451
 
452
+ if not texts:
453
+ return []
 
 
 
454
 
455
+ # Pre-load model to avoid race conditions
456
+ self.model_manager.get_model(language if language != 'auto' else 'en')
 
 
 
457
 
458
+ # Use ThreadPoolExecutor for parallel processing
459
+ with ThreadPoolExecutor(max_workers=min(4, len(texts))) as executor:
460
+ futures = []
461
+ for i, text in enumerate(texts):
462
+ future = executor.submit(
463
+ self._analyze_text_batch,
464
+ text, language, preprocessing_options, i
465
+ )
466
+ futures.append(future)
467
 
468
+ results = []
469
+ for i, future in enumerate(futures):
470
+ if progress_callback:
471
+ progress_callback((i + 1) / len(futures))
472
+
473
+ try:
474
+ result = future.result(timeout=30) # 30 second timeout per text
475
+ results.append(result)
476
+ except Exception as e:
477
+ results.append({
478
+ 'sentiment': 'Error',
479
+ 'confidence': 0.0,
480
+ 'error': f"Timeout or error: {str(e)}",
481
+ 'batch_index': i,
482
+ 'text': texts[i][:100] + '...' if len(texts[i]) > 100 else texts[i],
483
+ 'full_text': texts[i]
484
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
+ return results
 
 
 
487
 
488
class AdvancedAnalysisEngine:
    """Advanced analysis using SHAP and LIME with FIXED implementation"""

    def __init__(self):
        self.model_manager = ModelManager()

    def create_prediction_function(self, model, tokenizer, device):
        """Build a ``predict_proba(texts) -> np.ndarray`` closure for SHAP/LIME."""
        def predict_proba(texts):
            # Normalize whatever the explainer hands us into a list of strings
            if isinstance(texts, str):
                texts = [texts]
            elif isinstance(texts, np.ndarray):
                texts = texts.tolist()
            texts = [str(text) for text in texts]

            results = []
            batch_size = 16  # small batches keep per-call memory bounded

            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]

                try:
                    with torch.no_grad():
                        inputs = tokenizer(
                            batch_texts,
                            return_tensors="pt",
                            padding=True,
                            truncation=True,
                            max_length=config.MAX_TEXT_LENGTH
                        ).to(device)

                        outputs = model(**inputs)
                        probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
                        results.extend(probs)

                except Exception as e:
                    logger.error(f"Prediction batch failed: {e}")
                    # Neutral fallback keeps the explainer's sampling loop alive
                    n_failed = len(batch_texts)
                    if hasattr(model.config, 'num_labels') and model.config.num_labels == 3:
                        results.extend(np.array([[0.33, 0.34, 0.33]] * n_failed))
                    else:
                        results.extend(np.array([[0.5, 0.5]] * n_failed))

            return np.array(results)

        return predict_proba

    @handle_errors(default_return=("Analysis failed", None, None))
    def analyze_with_shap(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
        """FIXED SHAP analysis implementation"""
        if not text.strip():
            return "Please enter text for analysis", None, {}

        # Resolve the language and fetch the matching model
        detected_lang = self.model_manager.detect_language(text) if language == 'auto' else language
        model, tokenizer = self.model_manager.get_model(detected_lang)

        try:
            predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)

            # Sanity-check the prediction function before handing it to SHAP
            test_pred = predict_fn([text])
            if test_pred is None or len(test_pred) == 0:
                return "Prediction function test failed", None, {}

            # Text masker makes SHAP perturb tokens rather than raw characters
            explainer = shap.Explainer(predict_fn, masker=shap.maskers.Text(tokenizer))
            shap_values = explainer([text], max_evals=num_samples)

            # Extract tokens/values defensively -- API shape varies by version
            if hasattr(shap_values, 'data') and hasattr(shap_values, 'values'):
                tokens = shap_values.data[0] if len(shap_values.data) > 0 else []
                values = shap_values.values[0] if len(shap_values.values) > 0 else []
            else:
                return "SHAP values extraction failed", None, {}

            if len(tokens) == 0 or len(values) == 0:
                return "No tokens or values extracted from SHAP", None, {}

            # Multi-class output: keep the positive-class column
            if len(values.shape) > 1:
                pos_values = values[:, -1] if values.shape[1] >= 2 else values[:, 0]
            else:
                pos_values = values

            # Align lengths defensively before plotting
            min_len = min(len(tokens), len(pos_values))
            tokens = tokens[:min_len]
            pos_values = pos_values[:min_len]

            fig = go.Figure()
            bar_colors = ['red' if v < 0 else 'green' for v in pos_values]

            fig.add_trace(go.Bar(
                x=list(range(len(tokens))),
                y=pos_values,
                text=tokens,
                textposition='outside',
                marker_color=bar_colors,
                name='SHAP Values',
                hovertemplate='<b>%{text}</b><br>SHAP Value: %{y:.4f}<extra></extra>'
            ))

            fig.update_layout(
                title=f"SHAP Analysis - Token Importance (Samples: {num_samples})",
                xaxis_title="Token Index",
                yaxis_title="SHAP Value",
                height=500,
                xaxis=dict(tickmode='array', tickvals=list(range(len(tokens))), ticktext=tokens)
            )

            analysis_data = {
                'method': 'SHAP',
                'language': detected_lang,
                'total_tokens': len(tokens),
                'samples_used': num_samples,
                'positive_influence': sum(1 for v in pos_values if v > 0),
                'negative_influence': sum(1 for v in pos_values if v < 0),
                'most_important_tokens': [(str(tokens[i]), float(pos_values[i]))
                                          for i in np.argsort(np.abs(pos_values))[-5:]]
            }

            summary_text = f"""
            **SHAP Analysis Results:**
            - **Language:** {detected_lang.upper()}
            - **Total Tokens:** {analysis_data['total_tokens']}
            - **Samples Used:** {num_samples}
            - **Positive Influence Tokens:** {analysis_data['positive_influence']}
            - **Negative Influence Tokens:** {analysis_data['negative_influence']}
            - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
            - **Status:** SHAP analysis completed successfully
            """

            return summary_text, fig, analysis_data

        except Exception as e:
            logger.error(f"SHAP analysis failed: {e}")
            error_msg = f"""
            **SHAP Analysis Failed:**
            - **Error:** {str(e)}
            - **Language:** {detected_lang.upper()}
            - **Suggestion:** Try with a shorter text or reduce number of samples

            **Common fixes:**
            - Reduce sample size to 50-100
            - Use shorter input text (< 200 words)
            - Check if model supports the text language
            """
            return error_msg, None, {}

    @handle_errors(default_return=("Analysis failed", None, None))
    def analyze_with_lime(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
        """FIXED LIME analysis implementation - Bug Fix for mode parameter"""
        if not text.strip():
            return "Please enter text for analysis", None, {}

        # Resolve the language and fetch the matching model
        detected_lang = self.model_manager.detect_language(text) if language == 'auto' else language
        model, tokenizer = self.model_manager.get_model(detected_lang)

        try:
            predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)

            # Sanity-check the prediction function before handing it to LIME
            test_pred = predict_fn([text])
            if test_pred is None or len(test_pred) == 0:
                return "Prediction function test failed", None, {}

            # Derive class names from the model's actual output width
            num_classes = test_pred.shape[1] if len(test_pred.shape) > 1 else 2
            if num_classes == 3:
                class_names = ['Negative', 'Neutral', 'Positive']
            else:
                class_names = ['Negative', 'Positive']

            # FIXED: LimeTextExplainer takes no 'mode' parameter
            explainer = LimeTextExplainer(class_names=class_names)

            exp = explainer.explain_instance(
                text,
                predict_fn,
                num_features=min(20, len(text.split())),  # cap feature count
                num_samples=num_samples
            )

            lime_data = exp.as_list()

            if not lime_data:
                return "No LIME features extracted", None, {}

            words = [item[0] for item in lime_data]
            scores = [item[1] for item in lime_data]

            fig = go.Figure()
            bar_colors = ['red' if s < 0 else 'green' for s in scores]

            fig.add_trace(go.Bar(
                y=words,
                x=scores,
                orientation='h',
                marker_color=bar_colors,
                text=[f'{s:.3f}' for s in scores],
                textposition='auto',
                name='LIME Importance',
                hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>'
            ))

            fig.update_layout(
                title=f"LIME Analysis - Feature Importance (Samples: {num_samples})",
                xaxis_title="Importance Score",
                yaxis_title="Words/Phrases",
                height=500
            )

            analysis_data = {
                'method': 'LIME',
                'language': detected_lang,
                'features_analyzed': len(lime_data),
                'samples_used': num_samples,
                'positive_features': sum(1 for _, score in lime_data if score > 0),
                'negative_features': sum(1 for _, score in lime_data if score < 0),
                'feature_importance': lime_data
            }

            summary_text = f"""
            **LIME Analysis Results:**
            - **Language:** {detected_lang.upper()}
            - **Features Analyzed:** {analysis_data['features_analyzed']}
            - **Classes:** {', '.join(class_names)}
            - **Samples Used:** {num_samples}
            - **Positive Features:** {analysis_data['positive_features']}
            - **Negative Features:** {analysis_data['negative_features']}
            - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
            - **Status:** LIME analysis completed successfully
            """

            return summary_text, fig, analysis_data

        except Exception as e:
            logger.error(f"LIME analysis failed: {e}")
            error_msg = f"""
            **LIME Analysis Failed:**
            - **Error:** {str(e)}
            - **Language:** {detected_lang.upper()}
            - **Suggestion:** Try with a shorter text or reduce number of samples

            **Bug Fix Applied:**
            - Removed 'mode' parameter from LimeTextExplainer initialization
            - This should resolve the "unexpected keyword argument 'mode'" error

            **Common fixes:**
            - Reduce sample size to 50-100
            - Use shorter input text (< 200 words)
            - Check if model supports the text language
            """
            return error_msg, None, {}
773
+
774
# Optimized Plotly Visualization System
class PlotlyVisualizer:
    """Enhanced Plotly visualizations"""

    @staticmethod
    @handle_errors(default_return=None)
    def create_sentiment_gauge(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create animated sentiment gauge"""
        colors = theme.colors

        if result.get('has_neutral', False):
            # Three-way gauge driven by the positive-class probability
            fig = go.Figure(go.Indicator(
                mode="gauge+number+delta",
                value=result['pos_prob'] * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': f"Sentiment: {result['sentiment']}"},
                delta={'reference': 50},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
                    'steps': [
                        {'range': [0, 33], 'color': colors['neg']},
                        {'range': [33, 67], 'color': colors['neu']},
                        {'range': [67, 100], 'color': colors['pos']}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 90
                    }
                }
            ))
        else:
            # Two-way gauge driven by model confidence
            fig = go.Figure(go.Indicator(
                mode="gauge+number",
                value=result['confidence'] * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': f"Confidence: {result['sentiment']}"},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': colors['pos'] if result['sentiment'] == 'Positive' else colors['neg']},
                    'steps': [
                        {'range': [0, 50], 'color': "lightgray"},
                        {'range': [50, 100], 'color': "gray"}
                    ]
                }
            ))

        fig.update_layout(height=400, font={'size': 16})
        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_probability_bars(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create probability bar chart"""
        colors = theme.colors

        if result.get('has_neutral', False):
            labels = ['Negative', 'Neutral', 'Positive']
            values = [result['neg_prob'], result['neu_prob'], result['pos_prob']]
            bar_colors = [colors['neg'], colors['neu'], colors['pos']]
        else:
            labels = ['Negative', 'Positive']
            values = [result['neg_prob'], result['pos_prob']]
            bar_colors = [colors['neg'], colors['pos']]

        fig = go.Figure(data=[
            go.Bar(x=labels, y=values, marker_color=bar_colors,
                   text=[f'{v:.3f}' for v in values], textposition='outside')
        ])

        fig.update_layout(
            title="Sentiment Probabilities",
            yaxis_title="Probability",
            height=400,
            showlegend=False
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
        """Create batch analysis summary"""
        colors = theme.colors

        # Ignore failed analyses when counting sentiments
        sentiments = [r['sentiment'] for r in results if 'sentiment' in r and r['sentiment'] != 'Error']
        sentiment_counts = Counter(sentiments)

        # Donut chart; theme colors are keyed on 'pos'/'neg'/'neu' prefixes
        fig = go.Figure(data=[go.Pie(
            labels=list(sentiment_counts.keys()),
            values=list(sentiment_counts.values()),
            marker_colors=[colors.get(s.lower()[:3], '#999999') for s in sentiment_counts.keys()],
            textinfo='label+percent',
            hole=0.3
        )])

        fig.update_layout(
            title=f"Batch Analysis Summary ({len(results)} texts)",
            height=400
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_confidence_distribution(results: List[Dict]) -> go.Figure:
        """Create confidence distribution plot"""
        confidences = [r['confidence'] for r in results if 'confidence' in r and r['sentiment'] != 'Error']

        if not confidences:
            return go.Figure()

        fig = go.Figure(data=[go.Histogram(
            x=confidences,
            nbinsx=20,
            marker_color='skyblue',
            opacity=0.7
        )])

        fig.update_layout(
            title="Confidence Distribution",
            xaxis_title="Confidence Score",
            yaxis_title="Frequency",
            height=400
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_history_dashboard(history: List[Dict], theme: ThemeContext) -> go.Figure:
        """Create comprehensive history dashboard"""
        # A dashboard over fewer than two points is not meaningful
        if len(history) < 2:
            return go.Figure()

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Sentiment Timeline', 'Confidence Distribution',
                            'Language Distribution', 'Sentiment Summary'],
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"type": "pie"}, {"type": "bar"}]]
        )

        indices = list(range(len(history)))
        pos_probs = [item.get('pos_prob', 0) for item in history]
        confidences = [item['confidence'] for item in history]
        sentiments = [item['sentiment'] for item in history]
        languages = [item.get('language', 'en') for item in history]

        colors_map = {'Positive': theme.colors['pos'], 'Negative': theme.colors['neg'], 'Neutral': theme.colors['neu']}
        colors = [colors_map.get(s, '#999999') for s in sentiments]

        # Sentiment timeline (top-left)
        fig.add_trace(
            go.Scatter(x=indices, y=pos_probs, mode='lines+markers',
                       marker=dict(color=colors, size=8),
                       name='Positive Probability'),
            row=1, col=1
        )

        # Confidence distribution (top-right)
        fig.add_trace(
            go.Histogram(x=confidences, nbinsx=10, name='Confidence'),
            row=1, col=2
        )

        # Language distribution (bottom-left)
        lang_counts = Counter(languages)
        fig.add_trace(
            go.Pie(labels=list(lang_counts.keys()), values=list(lang_counts.values()),
                   name="Languages"),
            row=2, col=1
        )

        # Sentiment summary (bottom-right)
        sent_counts = Counter(sentiments)
        sent_colors = [colors_map.get(k, '#999999') for k in sent_counts.keys()]
        fig.add_trace(
            go.Bar(x=list(sent_counts.keys()), y=list(sent_counts.values()),
                   marker_color=sent_colors),
            row=2, col=2
        )

        fig.update_layout(height=800, showlegend=False)
        return fig
1080
+
1081
# Universal Data Handler
class DataHandler:
    """Enhanced data operations"""

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Export history entries to a temp file in CSV or JSON format.

        Returns:
            (path, status message) on success; (None, message) when there is
            nothing to export.
        """
        if not data:
            return None, "No data to export"

        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                suffix=f'.{format_type}', encoding='utf-8')
        try:
            if format_type == 'csv':
                writer = csv.writer(temp_file)
                writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
                                 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Word_Count'])
                for entry in data:
                    writer.writerow([
                        entry.get('timestamp', ''),
                        entry.get('text', ''),
                        entry.get('sentiment', ''),
                        f"{entry.get('confidence', 0):.4f}",
                        entry.get('language', 'en'),
                        f"{entry.get('pos_prob', 0):.4f}",
                        f"{entry.get('neg_prob', 0):.4f}",
                        f"{entry.get('neu_prob', 0):.4f}",
                        entry.get('word_count', 0)
                    ])
            elif format_type == 'json':
                json.dump(data, temp_file, indent=2, ensure_ascii=False)
        finally:
            # Fix: close even when a write raises, so the handle is not leaked
            # (handle_errors would otherwise swallow the exception with the
            # file still open).
            temp_file.close()

        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Extract newline-joined texts from an uploaded file.

        CSV files contribute the first column of each data row (header
        skipped); any other file type is returned verbatim.
        """
        if not file:
            return ""

        content = file.read().decode('utf-8')

        if file.name.endswith('.csv'):
            try:
                reader = csv.reader(io.StringIO(content))
                next(reader, None)  # skip header row; None avoids StopIteration
                texts = []
                for row in reader:
                    if row and row[0].strip():
                        text = row[0].strip().strip('"')
                        if text:
                            texts.append(text)
                return '\n'.join(texts)
            except csv.Error:
                # Fix: was a bare `except:`; malformed CSV falls back to
                # naive line-by-line parsing (still skipping the header).
                texts = []
                for line in content.strip().split('\n')[1:]:
                    text = line.strip().strip('"')
                    if text:
                        texts.append(text)
                return '\n'.join(texts)

        return content
1149
+
1150
+
1151
+
1152
+
1153
+
1154
+
1155
+
1156
+
1157
+
1158
+
1159
+
1160
+
1161
+
1162
class SentimentApp:
    """Optimized multilingual sentiment analysis application.

    Controller layer that wires the sentiment engines, the in-memory history
    store and the Plotly visualizers to the Gradio UI callbacks. All heavy
    lifting is delegated to collaborators defined elsewhere in this module.
    """

    def __init__(self):
        self.engine = SentimentEngine()                  # core sentiment model wrapper
        self.advanced_engine = AdvancedAnalysisEngine()  # SHAP / LIME explainability
        self.history = HistoryManager()                  # analysis history store
        self.data_handler = DataHandler()                # file import/export helpers

        # Multi-language examples surfaced via gr.Examples in the UI.
        # Each entry is a single-element list (one input component).
        self.examples = [
            # Auto Detect
            ["The film had its moments, but overall it felt a bit too long and lacked emotional depth. Some scenes were visually impressive, yet they failed to connect emotionally. By the end, I found myself disengaged and unsatisfied."],
            # English
            ["I was completely blown away by the movie — the performances were raw and powerful, and the story stayed with me long after the credits rolled. Every scene felt purposeful, and the emotional arc was handled with incredible nuance. It's the kind of film that makes you reflect deeply on your own life."],
            # Chinese
            ["这部电影节奏拖沓,剧情老套,完全没有让我产生任何共鸣,是一次失望的观影体验。演员的表演也显得做作,缺乏真实感。看到最后甚至有点不耐烦,整体表现乏善可陈。"],
            # Spanish
            ["Una obra maestra del cine contemporáneo, con actuaciones sobresalientes, un guion bien escrito y una dirección impecable. Cada plano parecía cuidadosamente pensado, y la historia avanzaba con una intensidad emocional que mantenía al espectador cautivado. Definitivamente una película que vale la pena volver a ver."],
            # French
            ["Je m'attendais à beaucoup mieux. Le scénario était confus, les dialogues ennuyeux, et je me suis presque endormi au milieu du film. Même la mise en scène, habituellement un point fort, manquait cruellement d'inspiration cette fois-ci."],
            # German
            ["Der Film war ein emotionales Erlebnis mit großartigen Bildern, einem mitreißenden Soundtrack und einer Geschichte, die zum Nachdenken anregt. Besonders beeindruckend war die schauspielerische Leistung der Hauptdarsteller, die eine tiefe Menschlichkeit vermittelten. Es ist ein Film, der lange nachwirkt."],
            # Swedish
            ["Filmen var en besvikelse – tråkig handling, överdrivet skådespeleri och ett slut som inte gav något avslut alls. Den kändes forcerad och saknade en tydlig röd tråd. Jag gick från biografen med en känsla av tomhet och frustration."]
        ]

    @handle_errors(default_return=("Please enter text", None, None))
    def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                       remove_punct: bool, remove_nums: bool):
        """Analyze a single text and return (info markdown, gauge fig, bars fig).

        `language` and `theme` are UI display names; they are mapped back to
        internal codes/keys before being passed to the engine/visualizers.
        """
        if not text.strip():
            return "Please enter text", None, None

        # Map display names (e.g. "Auto Detect") back to language codes.
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        language_code = language_map.get(language, 'auto')

        preprocessing_options = {
            'clean_text': clean_text,
            'remove_punctuation': remove_punct,
            'remove_numbers': remove_nums
        }

        with memory_cleanup():
            result = self.engine.analyze_single(text, language_code, preprocessing_options)

        # Add to history (text truncated for display; full text kept separately).
        history_entry = {
            'text': text[:100] + '...' if len(text) > 100 else text,
            'full_text': text,
            'sentiment': result['sentiment'],
            'confidence': result['confidence'],
            'pos_prob': result.get('pos_prob', 0),
            'neg_prob': result.get('neg_prob', 0),
            'neu_prob': result.get('neu_prob', 0),
            'language': result['language'],
            'word_count': result['word_count'],
            'analysis_type': 'single'
        }
        self.history.add(history_entry)

        # Create visualizations with the selected theme.
        theme_ctx = ThemeContext(theme)
        gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
        bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)

        # Markdown summary shown in the results textbox.
        info_text = f"""
**Analysis Results:**
- **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
- **Language:** {result['language'].upper()}
- **Statistics:** {result['word_count']} words, {result['char_count']} characters
- **Probabilities:** Positive: {result.get('pos_prob', 0):.3f}, Negative: {result.get('neg_prob', 0):.3f}, Neutral: {result.get('neu_prob', 0):.3f}
"""

        return info_text, gauge_fig, bars_fig

    @handle_errors(default_return=("Please enter texts", None, None, None))
    def analyze_batch(self, batch_text: str, language: str, theme: str,
                      clean_text: bool, remove_punct: bool, remove_nums: bool):
        """Analyze many texts (one per line).

        Returns (summary markdown, results DataFrame, summary fig, confidence fig).
        """
        if not batch_text.strip():
            return "Please enter texts (one per line)", None, None, None

        # Parse batch input into individual texts.
        texts = TextProcessor.parse_batch_input(batch_text)

        # Enforce the configured batch size cap before doing any work.
        if len(texts) > config.BATCH_SIZE_LIMIT:
            return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None

        if not texts:
            return "No valid texts found", None, None, None

        # Map display names back to language codes.
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        language_code = language_map.get(language, 'auto')

        preprocessing_options = {
            'clean_text': clean_text,
            'remove_punctuation': remove_punct,
            'remove_numbers': remove_nums
        }

        with memory_cleanup():
            results = self.engine.analyze_batch(texts, language_code, preprocessing_options)

        # Add successful results to history; per-item errors are skipped.
        batch_entries = []
        for result in results:
            if 'error' not in result:
                entry = {
                    'text': result['text'],
                    'full_text': result['full_text'],
                    'sentiment': result['sentiment'],
                    'confidence': result['confidence'],
                    'pos_prob': result.get('pos_prob', 0),
                    'neg_prob': result.get('neg_prob', 0),
                    'neu_prob': result.get('neu_prob', 0),
                    'language': result['language'],
                    'word_count': result['word_count'],
                    'analysis_type': 'batch',
                    'batch_index': result['batch_index']
                }
                batch_entries.append(entry)

        self.history.add_batch(batch_entries)

        # Create visualizations.
        theme_ctx = ThemeContext(theme)
        summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
        confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)

        # Build the per-row results table, including error rows.
        df_data = []
        for result in results:
            if 'error' in result:
                df_data.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': 'Error',
                    'Confidence': 0.0,
                    'Language': 'Unknown',
                    'Error': result['error']
                })
            else:
                df_data.append({
                    'Index': result['batch_index'] + 1,
                    'Text': result['text'],
                    'Sentiment': result['sentiment'],
                    'Confidence': f"{result['confidence']:.3f}",
                    'Language': result['language'].upper(),
                    'Word_Count': result.get('word_count', 0)
                })

        df = pd.DataFrame(df_data)

        # Aggregate statistics over the successful analyses.
        successful_results = [r for r in results if 'error' not in r]
        error_count = len(results) - len(successful_results)

        if successful_results:
            sentiment_counts = Counter([r['sentiment'] for r in successful_results])
            avg_confidence = np.mean([r['confidence'] for r in successful_results])
            languages = Counter([r['language'] for r in successful_results])

            summary_text = f"""
**Batch Analysis Summary:**
- **Total Texts:** {len(texts)}
- **Successful:** {len(successful_results)}
- **Errors:** {error_count}
- **Average Confidence:** {avg_confidence:.3f}
- **Sentiments:** {dict(sentiment_counts)}
- **Languages Detected:** {dict(languages)}
"""
        else:
            summary_text = f"All {len(texts)} texts failed to analyze."

        return summary_text, df, summary_fig, confidence_fig

    # Advanced analysis methods with sample size control.
    @handle_errors(default_return=("Please enter text", None))
    def analyze_with_shap(self, text: str, language: str, num_samples: int = 100):
        """Run SHAP analysis with a configurable number of samples.

        Delegates to AdvancedAnalysisEngine after mapping the display language.
        """
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        language_code = language_map.get(language, 'auto')

        return self.advanced_engine.analyze_with_shap(text, language_code, num_samples)

    @handle_errors(default_return=("Please enter text", None))
    def analyze_with_lime(self, text: str, language: str, num_samples: int = 100):
        """Run LIME analysis with a configurable number of samples."""
        language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
        language_code = language_map.get(language, 'auto')

        return self.advanced_engine.analyze_with_lime(text, language_code, num_samples)

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Build the history dashboard figure plus a stats summary string.

        Requires at least two past analyses to show trends.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        with memory_cleanup():
            fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
            stats = self.history.get_stats()

        stats_text = f"""
**History Statistics:**
- **Total Analyses:** {stats.get('total_analyses', 0)}
- **Positive:** {stats.get('positive_count', 0)}
- **Negative:** {stats.get('negative_count', 0)}
- **Neutral:** {stats.get('neutral_count', 0)}
- **Average Confidence:** {stats.get('avg_confidence', 0):.3f}
- **Languages:** {stats.get('languages_detected', 0)}
- **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}
"""

        return fig, stats_text

    # NOTE(review): default_return is a 1-tuple but this method returns a plain
    # str — confirm handle_errors unwraps single-element tuples, otherwise the
    # error path returns a different type than the success path.
    @handle_errors(default_return=("No data available",))
    def get_history_status(self):
        """Return a markdown snapshot of the current history statistics."""
        stats = self.history.get_stats()
        if not stats:
            return "No analyses performed yet"

        return f"""
**Current Status:**
- **Total Analyses:** {stats['total_analyses']}
- **Recent Sentiment Distribution:**
  * Positive: {stats['positive_count']}
  * Negative: {stats['negative_count']}
  * Neutral: {stats['neutral_count']}
- **Average Confidence:** {stats['avg_confidence']:.3f}
- **Languages Detected:** {stats['languages_detected']}
"""
1456
+
1457
# Optimized Gradio Interface
def create_interface():
    """Create the full Gradio Blocks interface and wire all event handlers.

    Returns the (unlaunched) gr.Blocks demo. All analysis logic lives in
    SentimentApp; this function only declares components and bindings.
    """
    app = SentimentApp()

    with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
        gr.Markdown("# 🌍 Multilingual Sentiment Analyzer")
        gr.Markdown("AI-powered sentiment analysis with SHAP & LIME explainable AI features")

        # --- Tab 1: single-text analysis -------------------------------
        with gr.Tab("Single Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text for Analysis",
                        placeholder="Enter your text in any supported language...",
                        lines=5
                    )

                    with gr.Row():
                        language_selector = gr.Dropdown(
                            choices=list(config.SUPPORTED_LANGUAGES.values()),
                            value="Auto Detect",
                            label="Language"
                        )
                        theme_selector = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    # Optional preprocessing toggles, all off by default.
                    with gr.Row():
                        clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
                        remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    analyze_btn = gr.Button("Analyze", variant="primary", size="lg")

                    gr.Examples(
                        examples=app.examples,
                        inputs=text_input,
                        cache_examples=False
                    )

                with gr.Column():
                    result_output = gr.Textbox(label="Analysis Results", lines=8)

                    with gr.Row():
                        gauge_plot = gr.Plot(label="Sentiment Gauge")
                        probability_plot = gr.Plot(label="Probability Distribution")

        # --- Tab 2: explainable-AI (SHAP / LIME) analysis ---------------
        with gr.Tab("Advanced Analysis"):
            gr.Markdown("## Explainable AI Analysis")
            gr.Markdown("**SHAP and LIME analysis with FIXED implementation** - now handles text input correctly!")

            with gr.Row():
                with gr.Column():
                    advanced_text_input = gr.Textbox(
                        label="Enter Text for Advanced Analysis",
                        placeholder="Enter text to analyze with SHAP and LIME...",
                        lines=6,
                        value="This movie is absolutely fantastic and amazing!"
                    )

                    with gr.Row():
                        advanced_language = gr.Dropdown(
                            choices=list(config.SUPPORTED_LANGUAGES.values()),
                            value="Auto Detect",
                            label="Language"
                        )

                        # Trade-off control: fewer samples = faster, more = accurate.
                        num_samples_slider = gr.Slider(
                            minimum=50,
                            maximum=300,
                            value=100,
                            step=25,
                            label="Number of Samples",
                            info="Lower = Faster, Higher = More Accurate"
                        )

                    with gr.Row():
                        shap_btn = gr.Button("SHAP Analysis", variant="primary")
                        lime_btn = gr.Button("LIME Analysis", variant="secondary")

                    gr.Markdown("""

**📊 Analysis Methods:**
- **SHAP**: Token-level importance scores using Text masker
- **LIME**: Feature importance through text perturbation

**⚡ Expected Performance:**
- 50 samples: ~10-20s | 100 samples: ~20-40s | 200+ samples: ~40-80s
""")

                with gr.Column():
                    advanced_results = gr.Textbox(label="Analysis Summary", lines=12)

                    with gr.Row():
                        advanced_plot = gr.Plot(label="Feature Importance Visualization")

        # --- Tab 3: batch analysis (manual input or file upload) --------
        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="Upload File (CSV/TXT)",
                        file_types=[".csv", ".txt"]
                    )
                    batch_input = gr.Textbox(
                        label="Batch Input (one text per line)",
                        placeholder="Enter multiple texts, one per line...",
                        lines=10
                    )

                    with gr.Row():
                        batch_language = gr.Dropdown(
                            choices=list(config.SUPPORTED_LANGUAGES.values()),
                            value="Auto Detect",
                            label="Language"
                        )
                        batch_theme = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    with gr.Row():
                        batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
                        batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
                        batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)

                    with gr.Row():
                        load_file_btn = gr.Button("Load File")
                        analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")

                with gr.Column():
                    batch_summary = gr.Textbox(label="Batch Summary", lines=8)
                    batch_results_df = gr.Dataframe(
                        label="Detailed Results",
                        headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Word_Count"],
                        datatype=["number", "str", "str", "str", "str", "number"]
                    )

            with gr.Row():
                batch_plot = gr.Plot(label="Batch Analysis Summary")
                confidence_dist_plot = gr.Plot(label="Confidence Distribution")

        # --- Tab 4: history dashboard and export ------------------------
        with gr.Tab("History & Analytics"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        refresh_history_btn = gr.Button("Refresh History")
                        clear_history_btn = gr.Button("Clear History", variant="stop")
                        status_btn = gr.Button("Get Status")

                    history_theme = gr.Dropdown(
                        choices=list(config.THEMES.keys()),
                        value="default",
                        label="Dashboard Theme"
                    )

                    with gr.Row():
                        export_csv_btn = gr.Button("Export CSV")
                        export_json_btn = gr.Button("Export JSON")

                with gr.Column():
                    history_status = gr.Textbox(label="History Status", lines=8)

            history_dashboard = gr.Plot(label="History Analytics Dashboard")

            with gr.Row():
                csv_download = gr.File(label="CSV Download", visible=True)
                json_download = gr.File(label="JSON Download", visible=True)

        # ----------------------------------------------------------------
        # Event Handlers
        # ----------------------------------------------------------------

        # Single Analysis
        analyze_btn.click(
            app.analyze_single,
            inputs=[text_input, language_selector, theme_selector,
                    clean_text_cb, remove_punct_cb, remove_nums_cb],
            outputs=[result_output, gauge_plot, probability_plot]
        )

        # Advanced analysis with sample-size control
        shap_btn.click(
            app.analyze_with_shap,
            inputs=[advanced_text_input, advanced_language, num_samples_slider],
            outputs=[advanced_results, advanced_plot]
        )

        lime_btn.click(
            app.analyze_with_lime,
            inputs=[advanced_text_input, advanced_language, num_samples_slider],
            outputs=[advanced_results, advanced_plot]
        )

        # Batch Analysis: load file contents into the textbox, then analyze.
        load_file_btn.click(
            app.data_handler.process_file,
            inputs=file_upload,
            outputs=batch_input
        )

        analyze_batch_btn.click(
            app.analyze_batch,
            inputs=[batch_input, batch_language, batch_theme,
                    batch_clean_cb, batch_punct_cb, batch_nums_cb],
            outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
        )

        # History & Analytics
        refresh_history_btn.click(
            app.plot_history,
            inputs=history_theme,
            outputs=[history_dashboard, history_status]
        )

        clear_history_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status
        )

        status_btn.click(
            app.get_history_status,
            outputs=history_status
        )

        # Export handlers return (file_path, status_message).
        export_csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_download, history_status]
        )

        export_json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_download, history_status]
        )

    return demo
1800
# Application Entry Point
if __name__ == "__main__":
    # Root-logger configuration for the whole app when run as a script.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    try:
        demo = create_interface()
        # share=True publishes a temporary public Gradio link; the server
        # binds to all interfaces on the standard Gradio port.
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True
        )
    except Exception as e:
        # Log and re-raise so the process exits non-zero on startup failure.
        logger.error(f"Failed to launch application: {e}")
        raise