entropy25 committed on
Commit
061ab6f
·
verified ·
1 Parent(s): 75cb992

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +255 -401
app.py CHANGED
@@ -24,6 +24,11 @@ import langdetect
24
  import pandas as pd
25
  import gc
26
 
 
 
 
 
 
27
  # Configuration
28
  @dataclass
29
  class Config:
@@ -276,77 +281,16 @@ class HistoryManager:
276
  'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
277
  }
278
 
279
- # Core Sentiment Analysis Engine
280
  class SentimentEngine:
281
  """Multi-language sentiment analysis engine"""
282
 
283
  def __init__(self):
284
  self.model_manager = ModelManager()
285
 
286
- def extract_attention_keywords(self, text: str, language: str = 'auto', top_k: int = 10) -> List[Tuple[str, float]]:
287
- """Extract keywords using attention weights"""
288
- try:
289
- if language == 'auto':
290
- language = self.model_manager.detect_language(text)
291
-
292
- model, tokenizer = self.model_manager.get_model(language)
293
-
294
- inputs = tokenizer(
295
- text, return_tensors="pt", padding=True,
296
- truncation=True, max_length=config.MAX_TEXT_LENGTH
297
- ).to(self.model_manager.device)
298
-
299
- with torch.no_grad():
300
- outputs = model(**inputs, output_attentions=True)
301
-
302
- if hasattr(outputs, 'attentions') and outputs.attentions:
303
- # Use attention weights
304
- attention = outputs.attentions[-1]
305
- avg_attention = attention.mean(dim=1)[0, 0, :]
306
-
307
- tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
308
- attention_scores = avg_attention.cpu().numpy()
309
-
310
- # Process tokens and scores
311
- word_scores = {}
312
- current_word = ""
313
- current_score = 0.0
314
-
315
- for token, score in zip(tokens, attention_scores):
316
- if token in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>']:
317
- continue
318
-
319
- if token.startswith('##') or token.startswith('▁'):
320
- current_word += token.replace('##', '').replace('▁', '')
321
- current_score = max(current_score, score)
322
- else:
323
- if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
324
- word_scores[current_word.lower()] = current_score
325
- current_word = token
326
- current_score = score
327
-
328
- if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
329
- word_scores[current_word.lower()] = current_score
330
-
331
- # Filter and sort
332
- filtered_words = {
333
- word: score for word, score in word_scores.items()
334
- if word not in STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
335
- }
336
-
337
- sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
338
- return sorted_words[:top_k]
339
-
340
- except Exception as e:
341
- logger.error(f"Attention keyword extraction failed: {e}")
342
-
343
- # Fallback to simple keyword extraction
344
- keywords = TextProcessor.extract_keywords(text, top_k)
345
- return [(word, 0.1) for word in keywords]
346
-
347
  @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
348
  def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
349
- """Analyze single text with enhanced features"""
350
  if not text.strip():
351
  raise ValueError("Empty text provided")
352
 
@@ -406,13 +350,14 @@ class SentimentEngine:
406
  'has_neutral': False
407
  }
408
 
409
- # Extract keywords
410
- keywords = self.extract_attention_keywords(text, detected_lang)
 
411
 
412
  # Add metadata
413
  result.update({
414
  'language': detected_lang,
415
- 'keywords': keywords,
416
  'word_count': len(text.split()),
417
  'char_count': len(text)
418
  })
@@ -454,7 +399,188 @@ class SentimentEngine:
454
 
455
  return results
456
 
457
- # Advanced Plotly Visualization System
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  class PlotlyVisualizer:
459
  """Enhanced Plotly visualizations"""
460
 
@@ -539,7 +665,7 @@ class PlotlyVisualizer:
539
  @staticmethod
540
  @handle_errors(default_return=None)
541
  def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
542
- """Create keyword importance chart"""
543
  if not keywords:
544
  fig = go.Figure()
545
  fig.add_annotation(text="No keywords extracted",
@@ -565,7 +691,7 @@ class PlotlyVisualizer:
565
 
566
  fig.update_layout(
567
  title=f"Top Keywords ({sentiment})",
568
- xaxis_title="Attention Weight",
569
  yaxis_title="Keywords",
570
  height=400,
571
  showlegend=False
@@ -761,6 +887,7 @@ class SentimentApp:
761
 
762
  def __init__(self):
763
  self.engine = SentimentEngine()
 
764
  self.history = HistoryManager()
765
  self.data_handler = DataHandler()
766
 
@@ -776,7 +903,7 @@ class SentimentApp:
776
  @handle_errors(default_return=("Please enter text", None, None, None))
777
  def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
778
  remove_punct: bool, remove_nums: bool):
779
- """Single text analysis with enhanced visualizations"""
780
  if not text.strip():
781
  return "Please enter text", None, None, None
782
 
@@ -932,6 +1059,23 @@ class SentimentApp:
932
 
933
  return summary_text, df, summary_fig, confidence_fig
934
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
935
  @handle_errors(default_return=(None, "No history available"))
936
  def plot_history(self, theme: str = 'default'):
937
  """Plot comprehensive history analysis"""
@@ -976,9 +1120,9 @@ class SentimentApp:
976
  - **Languages Detected:** {stats['languages_detected']}
977
  """
978
 
979
- # Gradio Interface
980
  def create_interface():
981
- """Create comprehensive Gradio interface"""
982
  app = SentimentApp()
983
 
984
  with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
@@ -1027,349 +1171,42 @@ def create_interface():
1027
  probability_plot = gr.Plot(label="Probability Distribution")
1028
 
1029
  with gr.Row():
1030
- keyword_plot = gr.Plot(label="Key Contributing Words")
1031
 
1032
- with gr.Tab("Batch Analysis"):
1033
- with gr.Row():
1034
- with gr.Column():
1035
- file_upload = gr.File(
1036
- label="Upload File (CSV/TXT)",
1037
- file_types=[".csv", ".txt"]
1038
- )
1039
- batch_input = gr.Textbox(
1040
- label="Batch Input (one text per line)",
1041
- placeholder="Enter multiple texts, one per line...",
1042
- lines=10
1043
- )
1044
-
1045
- with gr.Row():
1046
- batch_language = gr.Dropdown(
1047
- choices=list(config.SUPPORTED_LANGUAGES.values()),
1048
- value="Auto Detect",
1049
- label="Language"
1050
- )
1051
- batch_theme = gr.Dropdown(
1052
- choices=list(config.THEMES.keys()),
1053
- value="default",
1054
- label="Theme"
1055
- )
1056
-
1057
- with gr.Row():
1058
- batch_clean_cb = gr.Checkbox(label="Clean Text", value=False)
1059
- batch_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1060
- batch_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1061
-
1062
- with gr.Row():
1063
- load_file_btn = gr.Button("Load File")
1064
- analyze_batch_btn = gr.Button("Analyze Batch", variant="primary")
1065
-
1066
- with gr.Column():
1067
- batch_summary = gr.Textbox(label="Batch Summary", lines=8)
1068
- batch_results_df = gr.Dataframe(
1069
- label="Detailed Results",
1070
- headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
1071
- datatype=["number", "str", "str", "str", "str", "str"]
1072
- )
1073
 
1074
- with gr.Row():
1075
- batch_plot = gr.Plot(label="Batch Analysis Summary")
1076
- confidence_dist_plot = gr.Plot(label="Confidence Distribution")
1077
-
1078
- with gr.Tab("History & Analytics"):
1079
  with gr.Row():
1080
  with gr.Column():
1081
- with gr.Row():
1082
- refresh_history_btn = gr.Button("Refresh History")
1083
- clear_history_btn = gr.Button("Clear History", variant="stop")
1084
- status_btn = gr.Button("Get Status")
1085
-
1086
- history_theme = gr.Dropdown(
1087
- choices=list(config.THEMES.keys()),
1088
- value="default",
1089
- label="Dashboard Theme"
1090
  )
1091
 
1092
- with gr.Row():
1093
- export_csv_btn = gr.Button("Export CSV")
1094
- export_json_btn = gr.Button("Export JSON")
1095
-
1096
- with gr.Column():
1097
- history_status = gr.Textbox(label="History Status", lines=8)
1098
-
1099
- history_dashboard = gr.Plot(label="History Analytics Dashboard")
1100
-
1101
- with gr.Row():
1102
- csv_download = gr.File(label="CSV Download", visible=True)
1103
- json_download = gr.File(label="JSON Download", visible=True)
1104
-
1105
- # Event Handlers
1106
- analyze_btn.click(
1107
- app.analyze_single,
1108
- inputs=[text_input, language_selector, theme_selector,
1109
- clean_text_cb, remove_punct_cb, remove_nums_cb],
1110
- outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1111
- )
1112
-
1113
- load_file_btn.click(
1114
- app.data_handler.process_file,
1115
- inputs=file_upload,
1116
- outputs=batch_input
1117
- )
1118
-
1119
- analyze_batch_btn.click(
1120
- app.analyze_batch,
1121
- inputs=[batch_input, batch_language, batch_theme,
1122
- batch_clean_cb, batch_punct_cb, batch_nums_cb],
1123
- outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
1124
- )
1125
-
1126
- refresh_history_btn.click(
1127
- app.plot_history,
1128
- inputs=history_theme,
1129
- outputs=[history_dashboard, history_status]
1130
- )
1131
-
1132
- clear_history_btn.click(
1133
- lambda: f"Cleared {app.history.clear()} entries",
1134
- outputs=history_status
1135
- )
1136
-
1137
- status_btn.click(
1138
- app.get_history_status,
1139
- outputs=history_status
1140
- )
1141
-
1142
- export_csv_btn.click(
1143
- lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
1144
- outputs=[csv_download, history_status]
1145
- )
1146
-
1147
- export_json_btn.click(
1148
- lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
1149
- outputs=[json_download, history_status]
1150
- )
1151
-
1152
- return demo
1153
-
1154
- # Application Entry Point
1155
- if __name__ == "__main__":
1156
- logging.basicConfig(
1157
- level=logging.INFO,
1158
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
1159
- )
1160
-
1161
- try:
1162
- demo = create_interface()
1163
- demo.launch(
1164
- share=True,
1165
- server_name="0.0.0.0",
1166
- server_port=7860,
1167
- show_error=True
1168
- )
1169
- except Exception as e:
1170
- logger.error(f"Failed to launch application: {e}")
1171
- raise
1172
-
1173
- @handle_errors(default_return=("Please enter texts", None, None, None))
1174
- def analyze_batch(self, batch_text: str, language: str, theme: str,
1175
- clean_text: bool, remove_punct: bool, remove_nums: bool):
1176
- """Enhanced batch analysis"""
1177
- if not batch_text.strip():
1178
- return "Please enter texts (one per line)", None, None, None
1179
-
1180
- # Parse batch input
1181
- texts = TextProcessor.parse_batch_input(batch_text)
1182
-
1183
- if len(texts) > config.BATCH_SIZE_LIMIT:
1184
- return f"Too many texts. Maximum {config.BATCH_SIZE_LIMIT} allowed.", None, None, None
1185
-
1186
- if not texts:
1187
- return "No valid texts found", None, None, None
1188
-
1189
- # Map display names to language codes
1190
- language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
1191
- language_code = language_map.get(language, 'auto')
1192
-
1193
- preprocessing_options = {
1194
- 'clean_text': clean_text,
1195
- 'remove_punctuation': remove_punct,
1196
- 'remove_numbers': remove_nums
1197
- }
1198
-
1199
- with memory_cleanup():
1200
- results = self.engine.analyze_batch(texts, language_code, preprocessing_options)
1201
-
1202
- # Add to history
1203
- batch_entries = []
1204
- for result in results:
1205
- if 'error' not in result:
1206
- entry = {
1207
- 'text': result['text'],
1208
- 'full_text': result['full_text'],
1209
- 'sentiment': result['sentiment'],
1210
- 'confidence': result['confidence'],
1211
- 'pos_prob': result.get('pos_prob', 0),
1212
- 'neg_prob': result.get('neg_prob', 0),
1213
- 'neu_prob': result.get('neu_prob', 0),
1214
- 'language': result['language'],
1215
- 'keywords': result['keywords'],
1216
- 'word_count': result['word_count'],
1217
- 'analysis_type': 'batch',
1218
- 'batch_index': result['batch_index']
1219
- }
1220
- batch_entries.append(entry)
1221
-
1222
- self.history.add_batch(batch_entries)
1223
-
1224
- # Create visualizations
1225
- theme_ctx = ThemeContext(theme)
1226
- summary_fig = PlotlyVisualizer.create_batch_summary(results, theme_ctx)
1227
- confidence_fig = PlotlyVisualizer.create_confidence_distribution(results)
1228
-
1229
- # Create results DataFrame
1230
- df_data = []
1231
- for result in results:
1232
- if 'error' in result:
1233
- df_data.append({
1234
- 'Index': result['batch_index'] + 1,
1235
- 'Text': result['text'],
1236
- 'Sentiment': 'Error',
1237
- 'Confidence': 0.0,
1238
- 'Language': 'Unknown',
1239
- 'Error': result['error']
1240
- })
1241
- else:
1242
- keywords_str = ', '.join([word for word, _ in result['keywords'][:3]])
1243
- df_data.append({
1244
- 'Index': result['batch_index'] + 1,
1245
- 'Text': result['text'],
1246
- 'Sentiment': result['sentiment'],
1247
- 'Confidence': f"{result['confidence']:.3f}",
1248
- 'Language': result['language'].upper(),
1249
- 'Keywords': keywords_str
1250
- })
1251
-
1252
- df = pd.DataFrame(df_data)
1253
-
1254
- # Create summary text
1255
- successful_results = [r for r in results if 'error' not in r]
1256
- error_count = len(results) - len(successful_results)
1257
-
1258
- if successful_results:
1259
- sentiment_counts = Counter([r['sentiment'] for r in successful_results])
1260
- avg_confidence = np.mean([r['confidence'] for r in successful_results])
1261
- languages = Counter([r['language'] for r in successful_results])
1262
-
1263
- summary_text = f"""
1264
- **Batch Analysis Summary:**
1265
- - **Total Texts:** {len(texts)}
1266
- - **Successful:** {len(successful_results)}
1267
- - **Errors:** {error_count}
1268
- - **Average Confidence:** {avg_confidence:.3f}
1269
- - **Sentiments:** {dict(sentiment_counts)}
1270
- - **Languages Detected:** {dict(languages)}
1271
- """
1272
- else:
1273
- summary_text = f"All {len(texts)} texts failed to analyze."
1274
-
1275
- return summary_text, df, summary_fig, confidence_fig
1276
-
1277
- @handle_errors(default_return=(None, "No history available"))
1278
- def plot_history(self, theme: str = 'default'):
1279
- """Plot comprehensive history analysis"""
1280
- history = self.history.get_all()
1281
- if len(history) < 2:
1282
- return None, f"Need at least 2 analyses for trends. Current: {len(history)}"
1283
-
1284
- theme_ctx = ThemeContext(theme)
1285
-
1286
- with memory_cleanup():
1287
- fig = PlotlyVisualizer.create_history_dashboard(history, theme_ctx)
1288
- stats = self.history.get_stats()
1289
-
1290
- stats_text = f"""
1291
- **History Statistics:**
1292
- - **Total Analyses:** {stats.get('total_analyses', 0)}
1293
- - **Positive:** {stats.get('positive_count', 0)}
1294
- - **Negative:** {stats.get('negative_count', 0)}
1295
- - **Neutral:** {stats.get('neutral_count', 0)}
1296
- - **Average Confidence:** {stats.get('avg_confidence', 0):.3f}
1297
- - **Languages:** {stats.get('languages_detected', 0)}
1298
- - **Most Common Language:** {stats.get('most_common_language', 'N/A').upper()}
1299
- """
1300
-
1301
- return fig, stats_text
1302
-
1303
- @handle_errors(default_return=("No data available",))
1304
- def get_history_status(self):
1305
- """Get current history status"""
1306
- stats = self.history.get_stats()
1307
- if not stats:
1308
- return "No analyses performed yet"
1309
-
1310
- return f"""
1311
- **Current Status:**
1312
- - **Total Analyses:** {stats['total_analyses']}
1313
- - **Recent Sentiment Distribution:**
1314
- * Positive: {stats['positive_count']}
1315
- * Negative: {stats['negative_count']}
1316
- * Neutral: {stats['neutral_count']}
1317
- - **Average Confidence:** {stats['avg_confidence']:.3f}
1318
- - **Languages Detected:** {stats['languages_detected']}
1319
- """
1320
-
1321
- # Gradio Interface
1322
- def create_interface():
1323
- """Create comprehensive Gradio interface"""
1324
- app = SentimentApp()
1325
-
1326
- with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
1327
- gr.Markdown("# 🌍 Advanced Multilingual Sentiment Analyzer")
1328
- gr.Markdown("AI-powered sentiment analysis with support for multiple languages, advanced visualizations, and explainable AI features")
1329
-
1330
- with gr.Tab("Single Analysis"):
1331
- with gr.Row():
1332
- with gr.Column():
1333
- text_input = gr.Textbox(
1334
- label="Enter Text for Analysis",
1335
- placeholder="Enter your text in any supported language...",
1336
- lines=5
1337
  )
1338
 
1339
  with gr.Row():
1340
- language_selector = gr.Dropdown(
1341
- choices=list(config.SUPPORTED_LANGUAGES.values()),
1342
- value="Auto Detect",
1343
- label="Language"
1344
- )
1345
- theme_selector = gr.Dropdown(
1346
- choices=list(config.THEMES.keys()),
1347
- value="default",
1348
- label="Theme"
1349
- )
1350
-
1351
- with gr.Row():
1352
- clean_text_cb = gr.Checkbox(label="Clean Text", value=False)
1353
- remove_punct_cb = gr.Checkbox(label="Remove Punctuation", value=False)
1354
- remove_nums_cb = gr.Checkbox(label="Remove Numbers", value=False)
1355
-
1356
- analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
1357
 
1358
- gr.Examples(
1359
- examples=app.examples,
1360
- inputs=text_input,
1361
- cache_examples=False
1362
- )
1363
 
1364
  with gr.Column():
1365
- result_output = gr.Textbox(label="Analysis Results", lines=8)
1366
 
1367
  with gr.Row():
1368
- gauge_plot = gr.Plot(label="Sentiment Gauge")
1369
- probability_plot = gr.Plot(label="Probability Distribution")
1370
-
1371
- with gr.Row():
1372
- keyword_plot = gr.Plot(label="Key Contributing Words")
1373
 
1374
  with gr.Tab("Batch Analysis"):
1375
  with gr.Row():
@@ -1445,6 +1282,8 @@ def create_interface():
1445
  json_download = gr.File(label="JSON Download", visible=True)
1446
 
1447
  # Event Handlers
 
 
1448
  analyze_btn.click(
1449
  app.analyze_single,
1450
  inputs=[text_input, language_selector, theme_selector,
@@ -1452,6 +1291,20 @@ def create_interface():
1452
  outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1453
  )
1454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1455
  load_file_btn.click(
1456
  app.data_handler.process_file,
1457
  inputs=file_upload,
@@ -1465,6 +1318,7 @@ def create_interface():
1465
  outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
1466
  )
1467
 
 
1468
  refresh_history_btn.click(
1469
  app.plot_history,
1470
  inputs=history_theme,
 
24
  import pandas as pd
25
  import gc
26
 
27
+ # Advanced analysis imports
28
+ import shap
29
+ import lime
30
+ from lime.lime_text import LimeTextExplainer
31
+
32
  # Configuration
33
  @dataclass
34
  class Config:
 
281
  'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
282
  }
283
 
284
+ # Core Sentiment Analysis Engine (Modified - removed attention analysis)
285
  class SentimentEngine:
286
  """Multi-language sentiment analysis engine"""
287
 
288
  def __init__(self):
289
  self.model_manager = ModelManager()
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
292
  def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
293
+ """Analyze single text with basic features"""
294
  if not text.strip():
295
  raise ValueError("Empty text provided")
296
 
 
350
  'has_neutral': False
351
  }
352
 
353
+ # Extract basic keywords
354
+ keywords = TextProcessor.extract_keywords(text, 10)
355
+ keyword_tuples = [(word, 0.1) for word in keywords] # Simple keyword extraction
356
 
357
  # Add metadata
358
  result.update({
359
  'language': detected_lang,
360
+ 'keywords': keyword_tuples,
361
  'word_count': len(text.split()),
362
  'char_count': len(text)
363
  })
 
399
 
400
  return results
401
 
402
+ # Advanced Analysis Engine (NEW)
403
+ class AdvancedAnalysisEngine:
404
+ """Advanced analysis using SHAP and LIME"""
405
+
406
+ def __init__(self):
407
+ self.model_manager = ModelManager()
408
+
409
+ def create_prediction_function(self, model, tokenizer, device):
410
+ """Create prediction function for LIME/SHAP"""
411
+ def predict_proba(texts):
412
+ results = []
413
+ for text in texts:
414
+ inputs = tokenizer(text, return_tensors="pt", padding=True,
415
+ truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
416
+ with torch.no_grad():
417
+ outputs = model(**inputs)
418
+ probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
419
+ results.append(probs)
420
+ return np.array(results)
421
+ return predict_proba
422
+
423
+ @handle_errors(default_return=("Analysis failed", None, None))
424
+ def analyze_with_shap(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
425
+ """Perform SHAP analysis"""
426
+ if not text.strip():
427
+ return "Please enter text for analysis", None, {}
428
+
429
+ # Detect language and get model
430
+ if language == 'auto':
431
+ detected_lang = self.model_manager.detect_language(text)
432
+ else:
433
+ detected_lang = language
434
+
435
+ model, tokenizer = self.model_manager.get_model(detected_lang)
436
+
437
+ # Create prediction function
438
+ predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
439
+
440
+ try:
441
+ # Initialize SHAP explainer
442
+ explainer = shap.Explainer(predict_fn, tokenizer)
443
+
444
+ # Get SHAP values
445
+ shap_values = explainer([text])
446
+
447
+ # Extract token importance
448
+ tokens = shap_values.data[0]
449
+ values = shap_values.values[0]
450
+
451
+ # Create visualization data
452
+ if len(values.shape) > 1:
453
+ # Multi-class case
454
+ pos_values = values[:, -1] if values.shape[1] == 3 else values[:, 1]
455
+ else:
456
+ pos_values = values
457
+
458
+ # Create SHAP plot
459
+ fig = go.Figure()
460
+
461
+ colors = ['red' if v < 0 else 'green' for v in pos_values]
462
+
463
+ fig.add_trace(go.Bar(
464
+ x=list(range(len(tokens))),
465
+ y=pos_values,
466
+ text=tokens,
467
+ textposition='outside',
468
+ marker_color=colors,
469
+ name='SHAP Values'
470
+ ))
471
+
472
+ fig.update_layout(
473
+ title="SHAP Analysis - Token Importance",
474
+ xaxis_title="Token Index",
475
+ yaxis_title="SHAP Value",
476
+ height=500,
477
+ xaxis=dict(tickmode='array', tickvals=list(range(len(tokens))), ticktext=tokens)
478
+ )
479
+
480
+ # Create analysis summary
481
+ analysis_data = {
482
+ 'method': 'SHAP',
483
+ 'language': detected_lang,
484
+ 'total_tokens': len(tokens),
485
+ 'positive_influence': sum(1 for v in pos_values if v > 0),
486
+ 'negative_influence': sum(1 for v in pos_values if v < 0),
487
+ 'most_important_tokens': [(tokens[i], float(pos_values[i]))
488
+ for i in np.argsort(np.abs(pos_values))[-5:]]
489
+ }
490
+
491
+ summary_text = f"""
492
+ **SHAP Analysis Results:**
493
+ - **Language:** {detected_lang.upper()}
494
+ - **Total Tokens:** {analysis_data['total_tokens']}
495
+ - **Positive Influence Tokens:** {analysis_data['positive_influence']}
496
+ - **Negative Influence Tokens:** {analysis_data['negative_influence']}
497
+ - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
498
+ """
499
+
500
+ return summary_text, fig, analysis_data
501
+
502
+ except Exception as e:
503
+ logger.error(f"SHAP analysis failed: {e}")
504
+ return f"SHAP analysis failed: {str(e)}", None, {}
505
+
506
+ @handle_errors(default_return=("Analysis failed", None, None))
507
+ def analyze_with_lime(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
508
+ """Perform LIME analysis"""
509
+ if not text.strip():
510
+ return "Please enter text for analysis", None, {}
511
+
512
+ # Detect language and get model
513
+ if language == 'auto':
514
+ detected_lang = self.model_manager.detect_language(text)
515
+ else:
516
+ detected_lang = language
517
+
518
+ model, tokenizer = self.model_manager.get_model(detected_lang)
519
+
520
+ # Create prediction function
521
+ predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
522
+
523
+ try:
524
+ # Initialize LIME explainer
525
+ explainer = LimeTextExplainer(class_names=['Negative', 'Neutral', 'Positive'])
526
+
527
+ # Get LIME explanation
528
+ exp = explainer.explain_instance(text, predict_fn, num_features=20)
529
+
530
+ # Extract feature importance
531
+ lime_data = exp.as_list()
532
+
533
+ # Create visualization
534
+ words = [item[0] for item in lime_data]
535
+ scores = [item[1] for item in lime_data]
536
+
537
+ fig = go.Figure()
538
+
539
+ colors = ['red' if s < 0 else 'green' for s in scores]
540
+
541
+ fig.add_trace(go.Bar(
542
+ y=words,
543
+ x=scores,
544
+ orientation='h',
545
+ marker_color=colors,
546
+ text=[f'{s:.3f}' for s in scores],
547
+ textposition='auto',
548
+ name='LIME Importance'
549
+ ))
550
+
551
+ fig.update_layout(
552
+ title="LIME Analysis - Feature Importance",
553
+ xaxis_title="Importance Score",
554
+ yaxis_title="Words/Phrases",
555
+ height=500
556
+ )
557
+
558
+ # Create analysis summary
559
+ analysis_data = {
560
+ 'method': 'LIME',
561
+ 'language': detected_lang,
562
+ 'features_analyzed': len(lime_data),
563
+ 'positive_features': sum(1 for _, score in lime_data if score > 0),
564
+ 'negative_features': sum(1 for _, score in lime_data if score < 0),
565
+ 'feature_importance': lime_data
566
+ }
567
+
568
+ summary_text = f"""
569
+ **LIME Analysis Results:**
570
+ - **Language:** {detected_lang.upper()}
571
+ - **Features Analyzed:** {analysis_data['features_analyzed']}
572
+ - **Positive Features:** {analysis_data['positive_features']}
573
+ - **Negative Features:** {analysis_data['negative_features']}
574
+ - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
575
+ """
576
+
577
+ return summary_text, fig, analysis_data
578
+
579
+ except Exception as e:
580
+ logger.error(f"LIME analysis failed: {e}")
581
+ return f"LIME analysis failed: {str(e)}", None, {}
582
+
583
+ # Advanced Plotly Visualization System (Updated - removed attention visualization)
584
  class PlotlyVisualizer:
585
  """Enhanced Plotly visualizations"""
586
 
 
665
  @staticmethod
666
  @handle_errors(default_return=None)
667
  def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
668
+ """Create basic keyword chart"""
669
  if not keywords:
670
  fig = go.Figure()
671
  fig.add_annotation(text="No keywords extracted",
 
691
 
692
  fig.update_layout(
693
  title=f"Top Keywords ({sentiment})",
694
+ xaxis_title="Frequency Score",
695
  yaxis_title="Keywords",
696
  height=400,
697
  showlegend=False
 
887
 
888
  def __init__(self):
889
  self.engine = SentimentEngine()
890
+ self.advanced_engine = AdvancedAnalysisEngine() # NEW
891
  self.history = HistoryManager()
892
  self.data_handler = DataHandler()
893
 
 
903
  @handle_errors(default_return=("Please enter text", None, None, None))
904
  def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
905
  remove_punct: bool, remove_nums: bool):
906
+ """Single text analysis with basic visualizations (removed attention analysis)"""
907
  if not text.strip():
908
  return "Please enter text", None, None, None
909
 
 
1059
 
1060
  return summary_text, df, summary_fig, confidence_fig
1061
 
1062
+ # NEW: Advanced analysis methods
1063
+ @handle_errors(default_return=("Please enter text", None))
1064
+ def analyze_with_shap(self, text: str, language: str):
1065
+ """Perform SHAP analysis"""
1066
+ language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
1067
+ language_code = language_map.get(language, 'auto')
1068
+
1069
+ return self.advanced_engine.analyze_with_shap(text, language_code)
1070
+
1071
+ @handle_errors(default_return=("Please enter text", None))
1072
+ def analyze_with_lime(self, text: str, language: str):
1073
+ """Perform LIME analysis"""
1074
+ language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
1075
+ language_code = language_map.get(language, 'auto')
1076
+
1077
+ return self.advanced_engine.analyze_with_lime(text, language_code)
1078
+
1079
  @handle_errors(default_return=(None, "No history available"))
1080
  def plot_history(self, theme: str = 'default'):
1081
  """Plot comprehensive history analysis"""
 
1120
  - **Languages Detected:** {stats['languages_detected']}
1121
  """
1122
 
1123
+ # Gradio Interface (Updated with Advanced Analysis tab)
1124
  def create_interface():
1125
+ """Create comprehensive Gradio interface with Advanced Analysis tab"""
1126
  app = SentimentApp()
1127
 
1128
  with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
 
1171
  probability_plot = gr.Plot(label="Probability Distribution")
1172
 
1173
  with gr.Row():
1174
+ keyword_plot = gr.Plot(label="Basic Keywords")
1175
 
1176
+ # NEW: Advanced Analysis Tab
1177
+ with gr.Tab("Advanced Analysis"):
1178
+ gr.Markdown("## 🔬 Explainable AI Analysis")
1179
+ gr.Markdown("Use SHAP and LIME to understand which words and phrases most influence the sentiment prediction.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1180
 
 
 
 
 
 
1181
  with gr.Row():
1182
  with gr.Column():
1183
+ advanced_text_input = gr.Textbox(
1184
+ label="Enter Text for Advanced Analysis",
1185
+ placeholder="Enter text to analyze with SHAP and LIME...",
1186
+ lines=6
 
 
 
 
 
1187
  )
1188
 
1189
+ advanced_language = gr.Dropdown(
1190
+ choices=list(config.SUPPORTED_LANGUAGES.values()),
1191
+ value="Auto Detect",
1192
+ label="Language"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1193
  )
1194
 
1195
  with gr.Row():
1196
+ shap_btn = gr.Button("SHAP Analysis", variant="primary")
1197
+ lime_btn = gr.Button("LIME Analysis", variant="secondary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1198
 
1199
+ gr.Markdown("""
1200
+ **Analysis Methods:**
1201
+ - **SHAP**: Shows token-level importance scores
1202
+ - **LIME**: Explains predictions by perturbing input features
1203
+ """)
1204
 
1205
  with gr.Column():
1206
+ advanced_results = gr.Textbox(label="Analysis Summary", lines=10)
1207
 
1208
  with gr.Row():
1209
+ advanced_plot = gr.Plot(label="Feature Importance Visualization")
 
 
 
 
1210
 
1211
  with gr.Tab("Batch Analysis"):
1212
  with gr.Row():
 
1282
  json_download = gr.File(label="JSON Download", visible=True)
1283
 
1284
  # Event Handlers
1285
+
1286
+ # Single Analysis
1287
  analyze_btn.click(
1288
  app.analyze_single,
1289
  inputs=[text_input, language_selector, theme_selector,
 
1291
  outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1292
  )
1293
 
1294
+ # Advanced Analysis (NEW)
1295
+ shap_btn.click(
1296
+ app.analyze_with_shap,
1297
+ inputs=[advanced_text_input, advanced_language],
1298
+ outputs=[advanced_results, advanced_plot]
1299
+ )
1300
+
1301
+ lime_btn.click(
1302
+ app.analyze_with_lime,
1303
+ inputs=[advanced_text_input, advanced_language],
1304
+ outputs=[advanced_results, advanced_plot]
1305
+ )
1306
+
1307
+ # Batch Analysis
1308
  load_file_btn.click(
1309
  app.data_handler.process_file,
1310
  inputs=file_upload,
 
1318
  outputs=[batch_summary, batch_results_df, batch_plot, confidence_dist_plot]
1319
  )
1320
 
1321
+ # History & Analytics
1322
  refresh_history_btn.click(
1323
  app.plot_history,
1324
  inputs=history_theme,