Spaces:

entropy25
/

multilingual-sentiment-analyzer

Running

App Files Files Community

entropy25 commited on 27 days ago

Commit

743d0ec

verified ·

1 Parent(s): f0fc9bb

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -55

app.py CHANGED Viewed

@@ -484,30 +484,49 @@ class SentimentEngine:
         return results
-# Advanced Analysis Engine
 class AdvancedAnalysisEngine:
-    """Advanced analysis using SHAP and LIME"""
     def __init__(self):
         self.model_manager = ModelManager()
-    def create_prediction_function(self, model, tokenizer, device):
-        """Create prediction function for LIME/SHAP"""
         def predict_proba(texts):
             results = []
-            with torch.no_grad():
-                for text in texts:
-                    inputs = tokenizer(text, return_tensors="pt", padding=True,
-                                     truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
                     outputs = model(**inputs)
-                    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
-                    results.append(probs)
             return np.array(results)
         return predict_proba
     @handle_errors(default_return=("Analysis failed", None, None))
-    def analyze_with_shap(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
-        """Perform SHAP analysis"""
         if not text.strip():
             return "Please enter text for analysis", None, {}
@@ -519,12 +538,14 @@ class AdvancedAnalysisEngine:
         model, tokenizer = self.model_manager.get_model(detected_lang)
-        # Create prediction function
-        predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
         try:
-            # Initialize SHAP explainer
-            explainer = shap.Explainer(predict_fn, tokenizer)
             # Get SHAP values
             shap_values = explainer([text])
@@ -551,11 +572,12 @@ class AdvancedAnalysisEngine:
                 text=tokens,
                 textposition='outside',
                 marker_color=colors,
-                name='SHAP Values'
             ))
             fig.update_layout(
-                title="SHAP Analysis - Token Importance",
                 xaxis_title="Token Index",
                 yaxis_title="SHAP Value",
                 height=500,
@@ -567,6 +589,7 @@ class AdvancedAnalysisEngine:
                 'method': 'SHAP',
                 'language': detected_lang,
                 'total_tokens': len(tokens),
                 'positive_influence': sum(1 for v in pos_values if v > 0),
                 'negative_influence': sum(1 for v in pos_values if v < 0),
                 'most_important_tokens': [(tokens[i], float(pos_values[i]))
@@ -577,9 +600,11 @@ class AdvancedAnalysisEngine:
 **SHAP Analysis Results:**
 - **Language:** {detected_lang.upper()}
 - **Total Tokens:** {analysis_data['total_tokens']}
 - **Positive Influence Tokens:** {analysis_data['positive_influence']}
 - **Negative Influence Tokens:** {analysis_data['negative_influence']}
 - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
             """
             return summary_text, fig, analysis_data
@@ -589,8 +614,8 @@ class AdvancedAnalysisEngine:
             return f"SHAP analysis failed: {str(e)}", None, {}
     @handle_errors(default_return=("Analysis failed", None, None))
-    def analyze_with_lime(self, text: str, language: str = 'auto') -> Tuple[str, go.Figure, Dict]:
-        """Perform LIME analysis"""
         if not text.strip():
             return "Please enter text for analysis", None, {}
@@ -602,15 +627,25 @@ class AdvancedAnalysisEngine:
         model, tokenizer = self.model_manager.get_model(detected_lang)
-        # Create prediction function
-        predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
         try:
-            # Initialize LIME explainer
-            explainer = LimeTextExplainer(class_names=['Negative', 'Neutral', 'Positive'])
-            # Get LIME explanation
-            exp = explainer.explain_instance(text, predict_fn, num_features=20)
             # Extract feature importance
             lime_data = exp.as_list()
@@ -630,11 +665,12 @@ class AdvancedAnalysisEngine:
                 marker_color=colors,
                 text=[f'{s:.3f}' for s in scores],
                 textposition='auto',
-                name='LIME Importance'
             ))
             fig.update_layout(
-                title="LIME Analysis - Feature Importance",
                 xaxis_title="Importance Score",
                 yaxis_title="Words/Phrases",
                 height=500
@@ -645,6 +681,7 @@ class AdvancedAnalysisEngine:
                 'method': 'LIME',
                 'language': detected_lang,
                 'features_analyzed': len(lime_data),
                 'positive_features': sum(1 for _, score in lime_data if score > 0),
                 'negative_features': sum(1 for _, score in lime_data if score < 0),
                 'feature_importance': lime_data
@@ -654,9 +691,11 @@ class AdvancedAnalysisEngine:
 **LIME Analysis Results:**
 - **Language:** {detected_lang.upper()}
 - **Features Analyzed:** {analysis_data['features_analyzed']}
 - **Positive Features:** {analysis_data['positive_features']}
 - **Negative Features:** {analysis_data['negative_features']}
 - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
             """
             return summary_text, fig, analysis_data
@@ -795,7 +834,7 @@ class PlotlyVisualizer:
             yaxis_title="Frequency",
             height=400
         )
         return fig
     @staticmethod
@@ -949,7 +988,7 @@ class SentimentApp:
     @handle_errors(default_return=("Please enter text", None, None))
     def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                        remove_punct: bool, remove_nums: bool):
-        """Optimized single text analysis without keyword extraction"""
         if not text.strip():
             return "Please enter text", None, None
@@ -966,7 +1005,7 @@ class SentimentApp:
         with memory_cleanup():
             result = self.engine.analyze_single(text, language_code, preprocessing_options)
-            # Add to history (without keywords)
             history_entry = {
                 'text': text[:100] + '...' if len(text) > 100 else text,
                 'full_text': text,
@@ -981,7 +1020,7 @@ class SentimentApp:
             }
             self.history.add(history_entry)
-            # Create visualizations (only gauge and probability bars)
             theme_ctx = ThemeContext(theme)
             gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
             bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
@@ -1099,22 +1138,22 @@ class SentimentApp:
             return summary_text, df, summary_fig, confidence_fig
-    # Advanced analysis methods
     @handle_errors(default_return=("Please enter text", None))
-    def analyze_with_shap(self, text: str, language: str):
-        """Perform SHAP analysis"""
         language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
-        return self.advanced_engine.analyze_with_shap(text, language_code)
     @handle_errors(default_return=("Please enter text", None))
-    def analyze_with_lime(self, text: str, language: str):
-        """Perform LIME analysis"""
         language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
-        return self.advanced_engine.analyze_with_lime(text, language_code)
     @handle_errors(default_return=(None, "No history available"))
     def plot_history(self, theme: str = 'default'):
@@ -1210,10 +1249,10 @@ def create_interface():
                 gauge_plot = gr.Plot(label="Sentiment Gauge")
                 probability_plot = gr.Plot(label="Probability Distribution")
-        # Advanced Analysis Tab
         with gr.Tab("Advanced Analysis"):
-            gr.Markdown("## 🔬 Explainable AI Analysis")
-            gr.Markdown("Use SHAP and LIME to understand which words and phrases most influence the sentiment prediction.")
             with gr.Row():
                 with gr.Column():
@@ -1223,24 +1262,45 @@ def create_interface():
                         lines=6
                     )
-                    advanced_language = gr.Dropdown(
-                        choices=list(config.SUPPORTED_LANGUAGES.values()),
-                        value="Auto Detect",
-                        label="Language"
-                    )
                     with gr.Row():
                         shap_btn = gr.Button("SHAP Analysis", variant="primary")
                         lime_btn = gr.Button("LIME Analysis", variant="secondary")
                     gr.Markdown("""
                     **Analysis Methods:**
-                    - **SHAP**: Shows token-level importance scores
-                    - **LIME**: Explains predictions by perturbing input features
                     """)
                 with gr.Column():
-                    advanced_results = gr.Textbox(label="Analysis Summary", lines=10)
             with gr.Row():
                 advanced_plot = gr.Plot(label="Feature Importance Visualization")
@@ -1318,9 +1378,9 @@ def create_interface():
                 csv_download = gr.File(label="CSV Download", visible=True)
                 json_download = gr.File(label="JSON Download", visible=True)
-        # Event Handlers - Updated for optimized single analysis
-        # Single Analysis (removed keyword_plot output)
         analyze_btn.click(
             app.analyze_single,
             inputs=[text_input, language_selector, theme_selector,
@@ -1328,16 +1388,16 @@ def create_interface():
             outputs=[result_output, gauge_plot, probability_plot]
         )
-        # Advanced Analysis
         shap_btn.click(
             app.analyze_with_shap,
-            inputs=[advanced_text_input, advanced_language],
             outputs=[advanced_results, advanced_plot]
         )
         lime_btn.click(
             app.analyze_with_lime,
-            inputs=[advanced_text_input, advanced_language],
             outputs=[advanced_results, advanced_plot]
         )

         return results
+# Optimized Advanced Analysis Engine
 class AdvancedAnalysisEngine:
+    """Advanced analysis using SHAP and LIME with performance optimizations"""
     def __init__(self):
         self.model_manager = ModelManager()
+        self.batch_size = 32  # Batch size for processing multiple samples
+    def create_batch_prediction_function(self, model, tokenizer, device, batch_size=32):
+        """Create optimized batch prediction function for LIME/SHAP"""
         def predict_proba(texts):
+            if not isinstance(texts, list):
+                texts = [texts]
             results = []
+            # Process in batches for efficiency
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i:i + batch_size]
+                with torch.no_grad():
+                    # Tokenize batch
+                    inputs = tokenizer(
+                        batch_texts,
+                        return_tensors="pt",
+                        padding=True,
+                        truncation=True,
+                        max_length=config.MAX_TEXT_LENGTH
+                    ).to(device)
+                    # Batch inference
                     outputs = model(**inputs)
+                    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
+                    results.extend(probs)
             return np.array(results)
         return predict_proba
     @handle_errors(default_return=("Analysis failed", None, None))
+    def analyze_with_shap(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
+        """Perform optimized SHAP analysis with configurable samples"""
         if not text.strip():
             return "Please enter text for analysis", None, {}
         model, tokenizer = self.model_manager.get_model(detected_lang)
+        # Create optimized prediction function
+        predict_fn = self.create_batch_prediction_function(
+            model, tokenizer, self.model_manager.device, self.batch_size
+        )
         try:
+            # Initialize SHAP explainer with reduced samples
+            explainer = shap.Explainer(predict_fn, tokenizer, max_evals=num_samples)
             # Get SHAP values
             shap_values = explainer([text])
                 text=tokens,
                 textposition='outside',
                 marker_color=colors,
+                name='SHAP Values',
+                hovertemplate='<b>%{text}</b><br>SHAP Value: %{y:.4f}<extra></extra>'
             ))
             fig.update_layout(
+                title=f"SHAP Analysis - Token Importance (Samples: {num_samples})",
                 xaxis_title="Token Index",
                 yaxis_title="SHAP Value",
                 height=500,
                 'method': 'SHAP',
                 'language': detected_lang,
                 'total_tokens': len(tokens),
+                'samples_used': num_samples,
                 'positive_influence': sum(1 for v in pos_values if v > 0),
                 'negative_influence': sum(1 for v in pos_values if v < 0),
                 'most_important_tokens': [(tokens[i], float(pos_values[i]))
 **SHAP Analysis Results:**
 - **Language:** {detected_lang.upper()}
 - **Total Tokens:** {analysis_data['total_tokens']}
+- **Samples Used:** {num_samples}
 - **Positive Influence Tokens:** {analysis_data['positive_influence']}
 - **Negative Influence Tokens:** {analysis_data['negative_influence']}
 - **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
+- **Processing:** Optimized with batch processing (32 samples/batch)
             """
             return summary_text, fig, analysis_data
             return f"SHAP analysis failed: {str(e)}", None, {}
     @handle_errors(default_return=("Analysis failed", None, None))
+    def analyze_with_lime(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
+        """Perform optimized LIME analysis with configurable samples"""
         if not text.strip():
             return "Please enter text for analysis", None, {}
         model, tokenizer = self.model_manager.get_model(detected_lang)
+        # Create optimized prediction function
+        predict_fn = self.create_batch_prediction_function(
+            model, tokenizer, self.model_manager.device, self.batch_size
+        )
         try:
+            # Initialize LIME explainer with reduced samples
+            explainer = LimeTextExplainer(
+                class_names=['Negative', 'Neutral', 'Positive'],
+                mode='classification'
+            )
+            # Get LIME explanation with configurable samples
+            exp = explainer.explain_instance(
+                text,
+                predict_fn,
+                num_features=20,
+                num_samples=num_samples  # Configurable sample size
+            )
             # Extract feature importance
             lime_data = exp.as_list()
                 marker_color=colors,
                 text=[f'{s:.3f}' for s in scores],
                 textposition='auto',
+                name='LIME Importance',
+                hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>'
             ))
             fig.update_layout(
+                title=f"LIME Analysis - Feature Importance (Samples: {num_samples})",
                 xaxis_title="Importance Score",
                 yaxis_title="Words/Phrases",
                 height=500
                 'method': 'LIME',
                 'language': detected_lang,
                 'features_analyzed': len(lime_data),
+                'samples_used': num_samples,
                 'positive_features': sum(1 for _, score in lime_data if score > 0),
                 'negative_features': sum(1 for _, score in lime_data if score < 0),
                 'feature_importance': lime_data
 **LIME Analysis Results:**
 - **Language:** {detected_lang.upper()}
 - **Features Analyzed:** {analysis_data['features_analyzed']}
+- **Samples Used:** {num_samples}
 - **Positive Features:** {analysis_data['positive_features']}
 - **Negative Features:** {analysis_data['negative_features']}
 - **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
+- **Processing:** Optimized with batch processing (32 samples/batch)
             """
             return summary_text, fig, analysis_data
             yaxis_title="Frequency",
             height=400
         )
         return fig
     @staticmethod
     @handle_errors(default_return=("Please enter text", None, None))
     def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
                        remove_punct: bool, remove_nums: bool):
+        """Optimized single text analysis"""
         if not text.strip():
             return "Please enter text", None, None
         with memory_cleanup():
             result = self.engine.analyze_single(text, language_code, preprocessing_options)
+            # Add to history
             history_entry = {
                 'text': text[:100] + '...' if len(text) > 100 else text,
                 'full_text': text,
             }
             self.history.add(history_entry)
+            # Create visualizations
             theme_ctx = ThemeContext(theme)
             gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
             bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
             return summary_text, df, summary_fig, confidence_fig
+    # Optimized advanced analysis methods with sample size control
     @handle_errors(default_return=("Please enter text", None))
+    def analyze_with_shap(self, text: str, language: str, num_samples: int = 100):
+        """Perform optimized SHAP analysis with configurable samples"""
         language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
+        return self.advanced_engine.analyze_with_shap(text, language_code, num_samples)
     @handle_errors(default_return=("Please enter text", None))
+    def analyze_with_lime(self, text: str, language: str, num_samples: int = 100):
+        """Perform optimized LIME analysis with configurable samples"""
         language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
         language_code = language_map.get(language, 'auto')
+        return self.advanced_engine.analyze_with_lime(text, language_code, num_samples)
     @handle_errors(default_return=(None, "No history available"))
     def plot_history(self, theme: str = 'default'):
                 gauge_plot = gr.Plot(label="Sentiment Gauge")
                 probability_plot = gr.Plot(label="Probability Distribution")
+        # Optimized Advanced Analysis Tab
         with gr.Tab("Advanced Analysis"):
+            gr.Markdown("## 🔬 Explainable AI Analysis (Optimized)")
+            gr.Markdown("Use SHAP and LIME to understand which words influence sentiment prediction. **Optimized with batch processing and configurable sample sizes.**")
             with gr.Row():
                 with gr.Column():
                         lines=6
                     )
+                    with gr.Row():
+                        advanced_language = gr.Dropdown(
+                            choices=list(config.SUPPORTED_LANGUAGES.values()),
+                            value="Auto Detect",
+                            label="Language"
+                        )
+                        num_samples_slider = gr.Slider(
+                            minimum=50,
+                            maximum=500,
+                            value=100,
+                            step=50,
+                            label="Number of Samples",
+                            info="Lower = Faster, Higher = More Accurate"
+                        )
                     with gr.Row():
                         shap_btn = gr.Button("SHAP Analysis", variant="primary")
                         lime_btn = gr.Button("LIME Analysis", variant="secondary")
                     gr.Markdown("""
+                    **Optimizations Applied:**
+                    - ✅ **Batch Processing**: Multiple samples processed together (32 samples/batch)
+                    - ✅ **Configurable Samples**: Adjust speed vs accuracy trade-off
+                    - ✅ **Memory Optimization**: Efficient GPU memory management
+                    - 📊 **Performance**: ~5-10x faster than standard implementation
                     **Analysis Methods:**
+                    - **SHAP**: Token-level importance scores
+                    - **LIME**: Feature importance through perturbation
+                    **Expected Times:**
+                    - 50 samples: ~10-20 seconds
+                    - 100 samples: ~20-40 seconds
+                    - 200+ samples: ~40-80 seconds
                     """)
                 with gr.Column():
+                    advanced_results = gr.Textbox(label="Analysis Summary", lines=12)
             with gr.Row():
                 advanced_plot = gr.Plot(label="Feature Importance Visualization")
                 csv_download = gr.File(label="CSV Download", visible=True)
                 json_download = gr.File(label="JSON Download", visible=True)
+        # Event Handlers
+        # Single Analysis
         analyze_btn.click(
             app.analyze_single,
             inputs=[text_input, language_selector, theme_selector,
             outputs=[result_output, gauge_plot, probability_plot]
         )
+        # Advanced Analysis with sample size control
         shap_btn.click(
             app.analyze_with_shap,
+            inputs=[advanced_text_input, advanced_language, num_samples_slider],
             outputs=[advanced_results, advanced_plot]
         )
         lime_btn.click(
             app.analyze_with_lime,
+            inputs=[advanced_text_input, advanced_language, num_samples_slider],
             outputs=[advanced_results, advanced_plot]
         )