entropy25 committed on
Commit
f0fc9bb
·
verified ·
1 Parent(s): 219103c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -146
app.py CHANGED
@@ -6,7 +6,7 @@ import plotly.express as px
6
  from plotly.subplots import make_subplots
7
  import numpy as np
8
  from wordcloud import WordCloud
9
- from collections import Counter, defaultdict
10
  import re
11
  import json
12
  import csv
@@ -23,6 +23,10 @@ from nltk.corpus import stopwords
23
  import langdetect
24
  import pandas as pd
25
  import gc
 
 
 
 
26
 
27
  # Advanced analysis imports
28
  import shap
@@ -38,6 +42,7 @@ class Config:
38
  MIN_WORD_LENGTH: int = 2
39
  CACHE_SIZE: int = 128
40
  BATCH_PROCESSING_SIZE: int = 8
 
41
 
42
  # Supported languages and models
43
  SUPPORTED_LANGUAGES = {
@@ -99,6 +104,8 @@ def memory_cleanup():
99
  yield
100
  finally:
101
  gc.collect()
 
 
102
 
103
  class ThemeContext:
104
  """Theme management context"""
@@ -106,9 +113,50 @@ class ThemeContext:
106
  self.theme = theme
107
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
108
 
109
- # Enhanced Model Manager with Multi-language Support
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  class ModelManager:
111
- """Multi-language model manager with lazy loading"""
112
  _instance = None
113
 
114
  def __new__(cls):
@@ -119,38 +167,64 @@ class ModelManager:
119
 
120
  def __init__(self):
121
  if not self._initialized:
122
- self.models = {}
123
- self.tokenizers = {}
124
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
125
- self._load_default_models()
 
126
  self._initialized = True
 
127
 
128
- def _load_default_models(self):
129
- """Load default models"""
130
  try:
131
- # Load multilingual model as default
132
- model_name = config.MODELS['multilingual']
133
- self.tokenizers['default'] = AutoTokenizer.from_pretrained(model_name)
134
- self.models['default'] = AutoModelForSequenceClassification.from_pretrained(model_name)
135
- self.models['default'].to(self.device)
136
- logger.info(f"Default model loaded: {model_name}")
137
 
138
- # Load Chinese model
139
- zh_model_name = config.MODELS['zh']
140
- self.tokenizers['zh'] = AutoTokenizer.from_pretrained(zh_model_name)
141
- self.models['zh'] = AutoModelForSequenceClassification.from_pretrained(zh_model_name)
142
- self.models['zh'].to(self.device)
143
- logger.info(f"Chinese model loaded: {zh_model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  except Exception as e:
146
- logger.error(f"Failed to load models: {e}")
147
  raise
148
 
149
  def get_model(self, language='en'):
150
- """Get model for specific language"""
 
151
  if language == 'zh':
152
- return self.models['zh'], self.tokenizers['zh']
153
- return self.models['default'], self.tokenizers['default']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  @staticmethod
156
  def detect_language(text: str) -> str:
@@ -192,22 +266,6 @@ class TextProcessor:
192
  cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) >= config.MIN_WORD_LENGTH]
193
  return ' '.join(cleaned_words)
194
 
195
- @staticmethod
196
- def extract_keywords(text: str, top_k: int = 5) -> List[str]:
197
- """Extract keywords with language support"""
198
- if re.search(r'[\u4e00-\u9fff]', text):
199
- # Chinese text processing
200
- words = re.findall(r'[\u4e00-\u9fff]+', text)
201
- all_chars = ''.join(words)
202
- char_freq = Counter(all_chars)
203
- return [char for char, _ in char_freq.most_common(top_k)]
204
- else:
205
- # Other languages
206
- cleaned = TextProcessor.clean_text(text)
207
- words = cleaned.split()
208
- word_freq = Counter(words)
209
- return [word for word, _ in word_freq.most_common(top_k)]
210
-
211
  @staticmethod
212
  def parse_batch_input(text: str) -> List[str]:
213
  """Parse batch input from textarea"""
@@ -281,16 +339,17 @@ class HistoryManager:
281
  'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
282
  }
283
 
284
- # Core Sentiment Analysis Engine (Modified - removed attention analysis)
285
  class SentimentEngine:
286
- """Multi-language sentiment analysis engine"""
287
 
288
  def __init__(self):
289
  self.model_manager = ModelManager()
 
290
 
291
- @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'keywords': []})
292
  def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
293
- """Analyze single text with basic features"""
294
  if not text.strip():
295
  raise ValueError("Empty text provided")
296
 
@@ -313,14 +372,19 @@ class SentimentEngine:
313
  options.get('remove_numbers', False)
314
  )
315
 
316
- # Tokenize and analyze
317
  inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
318
  truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
319
 
 
320
  with torch.no_grad():
321
  outputs = model(**inputs)
322
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
323
 
 
 
 
 
324
  # Handle different model outputs
325
  if len(probs) == 3: # negative, neutral, positive
326
  sentiment_idx = np.argmax(probs)
@@ -350,56 +414,77 @@ class SentimentEngine:
350
  'has_neutral': False
351
  }
352
 
353
- # Extract basic keywords
354
- keywords = TextProcessor.extract_keywords(text, 10)
355
- keyword_tuples = [(word, 0.1) for word in keywords] # Simple keyword extraction
356
-
357
  # Add metadata
358
  result.update({
359
  'language': detected_lang,
360
- 'keywords': keyword_tuples,
361
  'word_count': len(text.split()),
362
  'char_count': len(text)
363
  })
364
 
365
  return result
366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  @handle_errors(default_return=[])
368
  def analyze_batch(self, texts: List[str], language: str = 'auto',
369
  preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
370
- """Optimized batch processing"""
371
  if len(texts) > config.BATCH_SIZE_LIMIT:
372
  texts = texts[:config.BATCH_SIZE_LIMIT]
373
 
374
- results = []
375
- batch_size = config.BATCH_PROCESSING_SIZE
376
-
377
- for i in range(0, len(texts), batch_size):
378
- batch = texts[i:i+batch_size]
379
-
380
- if progress_callback:
381
- progress_callback((i + len(batch)) / len(texts))
 
 
 
 
 
 
 
382
 
383
- for text in batch:
 
 
 
 
384
  try:
385
- result = self.analyze_single(text, language, preprocessing_options)
386
- result['batch_index'] = len(results)
387
- result['text'] = text[:100] + '...' if len(text) > 100 else text
388
- result['full_text'] = text
389
  results.append(result)
390
  except Exception as e:
391
  results.append({
392
  'sentiment': 'Error',
393
  'confidence': 0.0,
394
- 'error': str(e),
395
- 'batch_index': len(results),
396
- 'text': text[:100] + '...' if len(text) > 100 else text,
397
- 'full_text': text
398
  })
399
 
400
  return results
401
 
402
- # Advanced Analysis Engine (NEW)
403
  class AdvancedAnalysisEngine:
404
  """Advanced analysis using SHAP and LIME"""
405
 
@@ -410,13 +495,13 @@ class AdvancedAnalysisEngine:
410
  """Create prediction function for LIME/SHAP"""
411
  def predict_proba(texts):
412
  results = []
413
- for text in texts:
414
- inputs = tokenizer(text, return_tensors="pt", padding=True,
415
- truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
416
- with torch.no_grad():
417
  outputs = model(**inputs)
418
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
419
- results.append(probs)
420
  return np.array(results)
421
  return predict_proba
422
 
@@ -580,7 +665,7 @@ class AdvancedAnalysisEngine:
580
  logger.error(f"LIME analysis failed: {e}")
581
  return f"LIME analysis failed: {str(e)}", None, {}
582
 
583
- # Advanced Plotly Visualization System (Updated - removed attention visualization)
584
  class PlotlyVisualizer:
585
  """Enhanced Plotly visualizations"""
586
 
@@ -662,43 +747,6 @@ class PlotlyVisualizer:
662
 
663
  return fig
664
 
665
- @staticmethod
666
- @handle_errors(default_return=None)
667
- def create_keyword_chart(keywords: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> go.Figure:
668
- """Create basic keyword chart"""
669
- if not keywords:
670
- fig = go.Figure()
671
- fig.add_annotation(text="No keywords extracted",
672
- xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
673
- fig.update_layout(height=400, title="Keywords")
674
- return fig
675
-
676
- words = [word for word, score in keywords]
677
- scores = [score for word, score in keywords]
678
-
679
- color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
680
-
681
- fig = go.Figure(data=[
682
- go.Bar(
683
- y=words,
684
- x=scores,
685
- orientation='h',
686
- marker_color=color,
687
- text=[f'{score:.3f}' for score in scores],
688
- textposition='auto'
689
- )
690
- ])
691
-
692
- fig.update_layout(
693
- title=f"Top Keywords ({sentiment})",
694
- xaxis_title="Frequency Score",
695
- yaxis_title="Keywords",
696
- height=400,
697
- showlegend=False
698
- )
699
-
700
- return fig
701
-
702
  @staticmethod
703
  @handle_errors(default_return=None)
704
  def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
@@ -827,9 +875,8 @@ class DataHandler:
827
  if format_type == 'csv':
828
  writer = csv.writer(temp_file)
829
  writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
830
- 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Keywords', 'Word_Count'])
831
  for entry in data:
832
- keywords_str = "|".join([f"{word}:{score:.3f}" for word, score in entry.get('keywords', [])])
833
  writer.writerow([
834
  entry.get('timestamp', ''),
835
  entry.get('text', ''),
@@ -839,7 +886,6 @@ class DataHandler:
839
  f"{entry.get('pos_prob', 0):.4f}",
840
  f"{entry.get('neg_prob', 0):.4f}",
841
  f"{entry.get('neu_prob', 0):.4f}",
842
- keywords_str,
843
  entry.get('word_count', 0)
844
  ])
845
  elif format_type == 'json':
@@ -881,13 +927,13 @@ class DataHandler:
881
 
882
  return content
883
 
884
- # Main Application Class
885
  class SentimentApp:
886
- """Main multilingual sentiment analysis application"""
887
 
888
  def __init__(self):
889
  self.engine = SentimentEngine()
890
- self.advanced_engine = AdvancedAnalysisEngine() # NEW
891
  self.history = HistoryManager()
892
  self.data_handler = DataHandler()
893
 
@@ -900,12 +946,12 @@ class SentimentApp:
900
  ["Ce film était magnifique, j'ai adoré la réalisation."], # French
901
  ]
902
 
903
- @handle_errors(default_return=("Please enter text", None, None, None))
904
  def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
905
  remove_punct: bool, remove_nums: bool):
906
- """Single text analysis with basic visualizations (removed attention analysis)"""
907
  if not text.strip():
908
- return "Please enter text", None, None, None
909
 
910
  # Map display names to language codes
911
  language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
@@ -920,7 +966,7 @@ class SentimentApp:
920
  with memory_cleanup():
921
  result = self.engine.analyze_single(text, language_code, preprocessing_options)
922
 
923
- # Add to history
924
  history_entry = {
925
  'text': text[:100] + '...' if len(text) > 100 else text,
926
  'full_text': text,
@@ -930,35 +976,31 @@ class SentimentApp:
930
  'neg_prob': result.get('neg_prob', 0),
931
  'neu_prob': result.get('neu_prob', 0),
932
  'language': result['language'],
933
- 'keywords': result['keywords'],
934
  'word_count': result['word_count'],
935
  'analysis_type': 'single'
936
  }
937
  self.history.add(history_entry)
938
 
939
- # Create visualizations
940
  theme_ctx = ThemeContext(theme)
941
  gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
942
  bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
943
- keyword_fig = PlotlyVisualizer.create_keyword_chart(result['keywords'], result['sentiment'], theme_ctx)
944
 
945
  # Create comprehensive result text
946
- keywords_str = ", ".join([f"{word}({score:.3f})" for word, score in result['keywords'][:5]])
947
-
948
  info_text = f"""
949
  **Analysis Results:**
950
  - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
951
  - **Language:** {result['language'].upper()}
952
- - **Keywords:** {keywords_str}
953
  - **Statistics:** {result['word_count']} words, {result['char_count']} characters
 
954
  """
955
 
956
- return info_text, gauge_fig, bars_fig, keyword_fig
957
 
958
  @handle_errors(default_return=("Please enter texts", None, None, None))
959
  def analyze_batch(self, batch_text: str, language: str, theme: str,
960
  clean_text: bool, remove_punct: bool, remove_nums: bool):
961
- """Enhanced batch analysis"""
962
  if not batch_text.strip():
963
  return "Please enter texts (one per line)", None, None, None
964
 
@@ -997,7 +1039,6 @@ class SentimentApp:
997
  'neg_prob': result.get('neg_prob', 0),
998
  'neu_prob': result.get('neu_prob', 0),
999
  'language': result['language'],
1000
- 'keywords': result['keywords'],
1001
  'word_count': result['word_count'],
1002
  'analysis_type': 'batch',
1003
  'batch_index': result['batch_index']
@@ -1024,14 +1065,13 @@ class SentimentApp:
1024
  'Error': result['error']
1025
  })
1026
  else:
1027
- keywords_str = ', '.join([word for word, _ in result['keywords'][:3]])
1028
  df_data.append({
1029
  'Index': result['batch_index'] + 1,
1030
  'Text': result['text'],
1031
  'Sentiment': result['sentiment'],
1032
  'Confidence': f"{result['confidence']:.3f}",
1033
  'Language': result['language'].upper(),
1034
- 'Keywords': keywords_str
1035
  })
1036
 
1037
  df = pd.DataFrame(df_data)
@@ -1059,7 +1099,7 @@ class SentimentApp:
1059
 
1060
  return summary_text, df, summary_fig, confidence_fig
1061
 
1062
- # NEW: Advanced analysis methods
1063
  @handle_errors(default_return=("Please enter text", None))
1064
  def analyze_with_shap(self, text: str, language: str):
1065
  """Perform SHAP analysis"""
@@ -1120,9 +1160,9 @@ class SentimentApp:
1120
  - **Languages Detected:** {stats['languages_detected']}
1121
  """
1122
 
1123
- # Gradio Interface (Updated with Advanced Analysis tab)
1124
  def create_interface():
1125
- """Create comprehensive Gradio interface with Advanced Analysis tab"""
1126
  app = SentimentApp()
1127
 
1128
  with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
@@ -1169,11 +1209,8 @@ def create_interface():
1169
  with gr.Row():
1170
  gauge_plot = gr.Plot(label="Sentiment Gauge")
1171
  probability_plot = gr.Plot(label="Probability Distribution")
1172
-
1173
- with gr.Row():
1174
- keyword_plot = gr.Plot(label="Basic Keywords")
1175
 
1176
- # NEW: Advanced Analysis Tab
1177
  with gr.Tab("Advanced Analysis"):
1178
  gr.Markdown("## 🔬 Explainable AI Analysis")
1179
  gr.Markdown("Use SHAP and LIME to understand which words and phrases most influence the sentiment prediction.")
@@ -1246,8 +1283,8 @@ def create_interface():
1246
  batch_summary = gr.Textbox(label="Batch Summary", lines=8)
1247
  batch_results_df = gr.Dataframe(
1248
  label="Detailed Results",
1249
- headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Keywords"],
1250
- datatype=["number", "str", "str", "str", "str", "str"]
1251
  )
1252
 
1253
  with gr.Row():
@@ -1281,17 +1318,17 @@ def create_interface():
1281
  csv_download = gr.File(label="CSV Download", visible=True)
1282
  json_download = gr.File(label="JSON Download", visible=True)
1283
 
1284
- # Event Handlers
1285
 
1286
- # Single Analysis
1287
  analyze_btn.click(
1288
  app.analyze_single,
1289
  inputs=[text_input, language_selector, theme_selector,
1290
  clean_text_cb, remove_punct_cb, remove_nums_cb],
1291
- outputs=[result_output, gauge_plot, probability_plot, keyword_plot]
1292
  )
1293
 
1294
- # Advanced Analysis (NEW)
1295
  shap_btn.click(
1296
  app.analyze_with_shap,
1297
  inputs=[advanced_text_input, advanced_language],
 
6
  from plotly.subplots import make_subplots
7
  import numpy as np
8
  from wordcloud import WordCloud
9
+ from collections import Counter, defaultdict, OrderedDict
10
  import re
11
  import json
12
  import csv
 
23
  import langdetect
24
  import pandas as pd
25
  import gc
26
+ import threading
27
+ import asyncio
28
+ from concurrent.futures import ThreadPoolExecutor
29
+ import time
30
 
31
  # Advanced analysis imports
32
  import shap
 
42
  MIN_WORD_LENGTH: int = 2
43
  CACHE_SIZE: int = 128
44
  BATCH_PROCESSING_SIZE: int = 8
45
+ MODEL_CACHE_SIZE: int = 2 # Maximum models to keep in memory
46
 
47
  # Supported languages and models
48
  SUPPORTED_LANGUAGES = {
 
104
  yield
105
  finally:
106
  gc.collect()
107
+ if torch.cuda.is_available():
108
+ torch.cuda.empty_cache()
109
 
110
  class ThemeContext:
111
  """Theme management context"""
 
113
  self.theme = theme
114
  self.colors = config.THEMES.get(theme, config.THEMES['default'])
115
 
116
class LRUModelCache:
    """Least-recently-used cache for loaded models.

    Entries are kept in an ``OrderedDict`` ordered from least to most recently
    used. Every operation runs under ``self.lock``. ``ModelManager`` stores
    ``(model, tokenizer)`` tuples here, but the cache itself does not care
    what shape the values have.
    """

    def __init__(self, max_size: int = 2):
        """Create an empty cache.

        Args:
            max_size: Maximum number of entries to keep. Values below 1 are
                treated as 1 so that a freshly loaded model can always be
                stored (a zero-capacity cache would otherwise have nothing
                to evict on the first ``put``).
        """
        self.max_size = max(1, max_size)
        self.cache = OrderedDict()
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value cached under *key*, or ``None`` if it is absent.

        A hit marks the entry as most recently used.
        """
        with self.lock:
            if key in self.cache:
                # Move to end (most recently used)
                self.cache.move_to_end(key)
                return self.cache[key]
            return None

    def put(self, key, value):
        """Store *value* under *key*, evicting old entries if the cache is full.

        Re-inserting an existing key replaces its value and marks it as most
        recently used without evicting anything.
        """
        with self.lock:
            if key in self.cache:
                self.cache.move_to_end(key)
            else:
                evicted = False
                # A loop (rather than a single pop) also copes with max_size
                # having been lowered after entries were inserted.
                while len(self.cache) >= self.max_size:
                    # popitem(last=False) drops the least recently used entry.
                    self.cache.popitem(last=False)
                    evicted = True
                if evicted:
                    self._release_memory()

            self.cache[key] = value

    def clear(self):
        """Drop every cached entry and reclaim memory."""
        with self.lock:
            self.cache.clear()
            self._release_memory()

    @staticmethod
    def _release_memory():
        """Run the garbage collector and, when CUDA is present, empty its cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
+ torch.cuda.empty_cache()
156
+
157
+ # Enhanced Model Manager with Optimized Memory Management
158
  class ModelManager:
159
+ """Optimized multi-language model manager with LRU cache and lazy loading"""
160
  _instance = None
161
 
162
  def __new__(cls):
 
167
 
168
  def __init__(self):
169
  if not self._initialized:
 
 
170
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
171
+ self.model_cache = LRUModelCache(config.MODEL_CACHE_SIZE)
172
+ self.loading_lock = threading.Lock()
173
  self._initialized = True
174
+ logger.info(f"ModelManager initialized on device: {self.device}")
175
 
176
def _load_model(self, model_name: str, cache_key: str):
    """Load a tokenizer/model pair, cache it under *cache_key* and return it.

    With CUDA available the weights are requested in float16 with
    ``device_map="auto"``; otherwise they are loaded in float32 and moved to
    ``self.device`` explicitly.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        cache_key: Key under which the pair is stored in ``self.model_cache``.

    Returns:
        The ``(model, tokenizer)`` tuple that was cached.

    Raises:
        Exception: Any loading failure is logged and re-raised unchanged.
    """
    try:
        logger.info(f"Loading model: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name)

        on_gpu = torch.cuda.is_available()
        load_kwargs = {
            'torch_dtype': torch.float16 if on_gpu else torch.float32,
            'device_map': "auto" if on_gpu else None,
        }
        model = AutoModelForSequenceClassification.from_pretrained(model_name, **load_kwargs)

        # With device_map="auto" placement is already handled by the loader.
        if not on_gpu:
            model.to(self.device)

        # Inference only: switch the module to evaluation mode.
        model.eval()

        loaded_pair = (model, tokenizer)
        self.model_cache.put(cache_key, loaded_pair)
        logger.info(f"Model {model_name} loaded and cached successfully")
        return loaded_pair

    except Exception as e:
        logger.error(f"Failed to load model {model_name}: {e}")
        raise
204
 
205
  def get_model(self, language='en'):
206
+ """Get model for specific language with lazy loading and caching"""
207
+ # Determine cache key and model name
208
  if language == 'zh':
209
+ cache_key = 'zh'
210
+ model_name = config.MODELS['zh']
211
+ else:
212
+ cache_key = 'multilingual'
213
+ model_name = config.MODELS['multilingual']
214
+
215
+ # Try to get from cache first
216
+ cached_model = self.model_cache.get(cache_key)
217
+ if cached_model is not None:
218
+ return cached_model
219
+
220
+ # Load model if not in cache (with thread safety)
221
+ with self.loading_lock:
222
+ # Double-check pattern
223
+ cached_model = self.model_cache.get(cache_key)
224
+ if cached_model is not None:
225
+ return cached_model
226
+
227
+ return self._load_model(model_name, cache_key)
228
 
229
  @staticmethod
230
  def detect_language(text: str) -> str:
 
266
  cleaned_words = [w for w in words if w not in STOP_WORDS and len(w) >= config.MIN_WORD_LENGTH]
267
  return ' '.join(cleaned_words)
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  @staticmethod
270
  def parse_batch_input(text: str) -> List[str]:
271
  """Parse batch input from textarea"""
 
339
  'most_common_language': Counter(languages).most_common(1)[0][0] if languages else 'en'
340
  }
341
 
342
+ # Core Sentiment Analysis Engine with Performance Optimizations
343
  class SentimentEngine:
344
+ """Optimized multi-language sentiment analysis engine"""
345
 
346
  def __init__(self):
347
  self.model_manager = ModelManager()
348
+ self.executor = ThreadPoolExecutor(max_workers=4)
349
 
350
+ @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0})
351
  def analyze_single(self, text: str, language: str = 'auto', preprocessing_options: Dict = None) -> Dict:
352
+ """Optimized single text analysis"""
353
  if not text.strip():
354
  raise ValueError("Empty text provided")
355
 
 
372
  options.get('remove_numbers', False)
373
  )
374
 
375
+ # Tokenize and analyze with memory optimization
376
  inputs = tokenizer(processed_text, return_tensors="pt", padding=True,
377
  truncation=True, max_length=config.MAX_TEXT_LENGTH).to(self.model_manager.device)
378
 
379
+ # Use no_grad for inference to save memory
380
  with torch.no_grad():
381
  outputs = model(**inputs)
382
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
383
 
384
+ # Clear GPU cache after inference
385
+ if torch.cuda.is_available():
386
+ torch.cuda.empty_cache()
387
+
388
  # Handle different model outputs
389
  if len(probs) == 3: # negative, neutral, positive
390
  sentiment_idx = np.argmax(probs)
 
414
  'has_neutral': False
415
  }
416
 
 
 
 
 
417
  # Add metadata
418
  result.update({
419
  'language': detected_lang,
 
420
  'word_count': len(text.split()),
421
  'char_count': len(text)
422
  })
423
 
424
  return result
425
 
426
def _analyze_text_batch(self, text: str, language: str, preprocessing_options: Dict, index: int) -> Dict:
    """Analyze one text for ``analyze_batch`` and tag the result with batch metadata.

    Args:
        text: The text to analyze.
        language: Language code forwarded to ``analyze_single``.
        preprocessing_options: Options forwarded to ``analyze_single``.
        index: Position of *text* within the batch.

    Returns:
        A new dict holding the ``analyze_single`` result plus ``batch_index``,
        ``text`` (truncated to 100 characters with a trailing ``...``) and
        ``full_text``. If the analysis raises, an ``'Error'`` record with the
        same metadata and the exception message is returned instead.
    """
    preview = text[:100] + '...' if len(text) > 100 else text
    try:
        # Copy before annotating. analyze_single is decorated with
        # handle_errors(default_return={...}); presumably that one dict object
        # is handed back on every failure (TODO confirm against handle_errors).
        # Writing batch metadata into it from several worker threads would make
        # all failed rows alias, and overwrite, each other.
        result = dict(self.analyze_single(text, language, preprocessing_options))
    except Exception as e:
        return {
            'sentiment': 'Error',
            'confidence': 0.0,
            'error': str(e),
            'batch_index': index,
            'text': preview,
            'full_text': text
        }

    result['batch_index'] = index
    result['text'] = preview
    result['full_text'] = text
    return result
443
+
444
  @handle_errors(default_return=[])
445
  def analyze_batch(self, texts: List[str], language: str = 'auto',
446
  preprocessing_options: Dict = None, progress_callback=None) -> List[Dict]:
447
+ """Optimized parallel batch processing"""
448
  if len(texts) > config.BATCH_SIZE_LIMIT:
449
  texts = texts[:config.BATCH_SIZE_LIMIT]
450
 
451
+ if not texts:
452
+ return []
453
+
454
+ # Pre-load model to avoid race conditions
455
+ self.model_manager.get_model(language if language != 'auto' else 'en')
456
+
457
+ # Use ThreadPoolExecutor for parallel processing
458
+ with ThreadPoolExecutor(max_workers=min(4, len(texts))) as executor:
459
+ futures = []
460
+ for i, text in enumerate(texts):
461
+ future = executor.submit(
462
+ self._analyze_text_batch,
463
+ text, language, preprocessing_options, i
464
+ )
465
+ futures.append(future)
466
 
467
+ results = []
468
+ for i, future in enumerate(futures):
469
+ if progress_callback:
470
+ progress_callback((i + 1) / len(futures))
471
+
472
  try:
473
+ result = future.result(timeout=30) # 30 second timeout per text
 
 
 
474
  results.append(result)
475
  except Exception as e:
476
  results.append({
477
  'sentiment': 'Error',
478
  'confidence': 0.0,
479
+ 'error': f"Timeout or error: {str(e)}",
480
+ 'batch_index': i,
481
+ 'text': texts[i][:100] + '...' if len(texts[i]) > 100 else texts[i],
482
+ 'full_text': texts[i]
483
  })
484
 
485
  return results
486
 
487
+ # Advanced Analysis Engine
488
  class AdvancedAnalysisEngine:
489
  """Advanced analysis using SHAP and LIME"""
490
 
 
495
  """Create prediction function for LIME/SHAP"""
496
  def predict_proba(texts):
497
  results = []
498
+ with torch.no_grad():
499
+ for text in texts:
500
+ inputs = tokenizer(text, return_tensors="pt", padding=True,
501
+ truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
502
  outputs = model(**inputs)
503
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
504
+ results.append(probs)
505
  return np.array(results)
506
  return predict_proba
507
 
 
665
  logger.error(f"LIME analysis failed: {e}")
666
  return f"LIME analysis failed: {str(e)}", None, {}
667
 
668
+ # Optimized Plotly Visualization System
669
  class PlotlyVisualizer:
670
  """Enhanced Plotly visualizations"""
671
 
 
747
 
748
  return fig
749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  @staticmethod
751
  @handle_errors(default_return=None)
752
  def create_batch_summary(results: List[Dict], theme: ThemeContext) -> go.Figure:
 
875
  if format_type == 'csv':
876
  writer = csv.writer(temp_file)
877
  writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
878
+ 'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Word_Count'])
879
  for entry in data:
 
880
  writer.writerow([
881
  entry.get('timestamp', ''),
882
  entry.get('text', ''),
 
886
  f"{entry.get('pos_prob', 0):.4f}",
887
  f"{entry.get('neg_prob', 0):.4f}",
888
  f"{entry.get('neu_prob', 0):.4f}",
 
889
  entry.get('word_count', 0)
890
  ])
891
  elif format_type == 'json':
 
927
 
928
  return content
929
 
930
+ # Main Application Class - Optimized
931
  class SentimentApp:
932
+ """Optimized multilingual sentiment analysis application"""
933
 
934
  def __init__(self):
935
  self.engine = SentimentEngine()
936
+ self.advanced_engine = AdvancedAnalysisEngine()
937
  self.history = HistoryManager()
938
  self.data_handler = DataHandler()
939
 
 
946
  ["Ce film était magnifique, j'ai adoré la réalisation."], # French
947
  ]
948
 
949
+ @handle_errors(default_return=("Please enter text", None, None))
950
  def analyze_single(self, text: str, language: str, theme: str, clean_text: bool,
951
  remove_punct: bool, remove_nums: bool):
952
+ """Optimized single text analysis without keyword extraction"""
953
  if not text.strip():
954
+ return "Please enter text", None, None
955
 
956
  # Map display names to language codes
957
  language_map = {v: k for k, v in config.SUPPORTED_LANGUAGES.items()}
 
966
  with memory_cleanup():
967
  result = self.engine.analyze_single(text, language_code, preprocessing_options)
968
 
969
+ # Add to history (without keywords)
970
  history_entry = {
971
  'text': text[:100] + '...' if len(text) > 100 else text,
972
  'full_text': text,
 
976
  'neg_prob': result.get('neg_prob', 0),
977
  'neu_prob': result.get('neu_prob', 0),
978
  'language': result['language'],
 
979
  'word_count': result['word_count'],
980
  'analysis_type': 'single'
981
  }
982
  self.history.add(history_entry)
983
 
984
+ # Create visualizations (only gauge and probability bars)
985
  theme_ctx = ThemeContext(theme)
986
  gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme_ctx)
987
  bars_fig = PlotlyVisualizer.create_probability_bars(result, theme_ctx)
 
988
 
989
  # Create comprehensive result text
 
 
990
  info_text = f"""
991
  **Analysis Results:**
992
  - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
993
  - **Language:** {result['language'].upper()}
 
994
  - **Statistics:** {result['word_count']} words, {result['char_count']} characters
995
+ - **Probabilities:** Positive: {result.get('pos_prob', 0):.3f}, Negative: {result.get('neg_prob', 0):.3f}, Neutral: {result.get('neu_prob', 0):.3f}
996
  """
997
 
998
+ return info_text, gauge_fig, bars_fig
999
 
1000
  @handle_errors(default_return=("Please enter texts", None, None, None))
1001
  def analyze_batch(self, batch_text: str, language: str, theme: str,
1002
  clean_text: bool, remove_punct: bool, remove_nums: bool):
1003
+ """Enhanced batch analysis with parallel processing"""
1004
  if not batch_text.strip():
1005
  return "Please enter texts (one per line)", None, None, None
1006
 
 
1039
  'neg_prob': result.get('neg_prob', 0),
1040
  'neu_prob': result.get('neu_prob', 0),
1041
  'language': result['language'],
 
1042
  'word_count': result['word_count'],
1043
  'analysis_type': 'batch',
1044
  'batch_index': result['batch_index']
 
1065
  'Error': result['error']
1066
  })
1067
  else:
 
1068
  df_data.append({
1069
  'Index': result['batch_index'] + 1,
1070
  'Text': result['text'],
1071
  'Sentiment': result['sentiment'],
1072
  'Confidence': f"{result['confidence']:.3f}",
1073
  'Language': result['language'].upper(),
1074
+ 'Word_Count': result.get('word_count', 0)
1075
  })
1076
 
1077
  df = pd.DataFrame(df_data)
 
1099
 
1100
  return summary_text, df, summary_fig, confidence_fig
1101
 
1102
+ # Advanced analysis methods
1103
  @handle_errors(default_return=("Please enter text", None))
1104
  def analyze_with_shap(self, text: str, language: str):
1105
  """Perform SHAP analysis"""
 
1160
  - **Languages Detected:** {stats['languages_detected']}
1161
  """
1162
 
1163
+ # Optimized Gradio Interface
1164
  def create_interface():
1165
+ """Create comprehensive Gradio interface with optimizations"""
1166
  app = SentimentApp()
1167
 
1168
  with gr.Blocks(theme=gr.themes.Soft(), title="Multilingual Sentiment Analyzer") as demo:
 
1209
  with gr.Row():
1210
  gauge_plot = gr.Plot(label="Sentiment Gauge")
1211
  probability_plot = gr.Plot(label="Probability Distribution")
 
 
 
1212
 
1213
+ # Advanced Analysis Tab
1214
  with gr.Tab("Advanced Analysis"):
1215
  gr.Markdown("## 🔬 Explainable AI Analysis")
1216
  gr.Markdown("Use SHAP and LIME to understand which words and phrases most influence the sentiment prediction.")
 
1283
  batch_summary = gr.Textbox(label="Batch Summary", lines=8)
1284
  batch_results_df = gr.Dataframe(
1285
  label="Detailed Results",
1286
+ headers=["Index", "Text", "Sentiment", "Confidence", "Language", "Word_Count"],
1287
+ datatype=["number", "str", "str", "str", "str", "number"]
1288
  )
1289
 
1290
  with gr.Row():
 
1318
  csv_download = gr.File(label="CSV Download", visible=True)
1319
  json_download = gr.File(label="JSON Download", visible=True)
1320
 
1321
+ # Event Handlers - Updated for optimized single analysis
1322
 
1323
+ # Single Analysis (removed keyword_plot output)
1324
  analyze_btn.click(
1325
  app.analyze_single,
1326
  inputs=[text_input, language_selector, theme_selector,
1327
  clean_text_cb, remove_punct_cb, remove_nums_cb],
1328
+ outputs=[result_output, gauge_plot, probability_plot]
1329
  )
1330
 
1331
+ # Advanced Analysis
1332
  shap_btn.click(
1333
  app.analyze_with_shap,
1334
  inputs=[advanced_text_input, advanced_language],