|
import base64
import copy
import csv
import gc
import io
import json
import logging
import os
import re
import tempfile
from collections import Counter, defaultdict
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime
from functools import lru_cache, wraps
from typing import Any, Callable, Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import torch
from plotly.subplots import make_subplots
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from wordcloud import WordCloud
|
|
|
|
|
@dataclass
class Config:
    """Static application settings: limits, figure sizes, themes, models, stop words.

    Only the annotated attributes are dataclass fields. ``THEMES``, ``MODELS``
    and ``STOP_WORDS`` carry no annotation, so they are plain class attributes
    shared by every instance (which is why mutable dict values are legal here).
    """

    # Maximum number of entries kept in the analysis history.
    MAX_HISTORY_SIZE: int = 1000
    # Maximum number of texts processed by one batch request.
    BATCH_SIZE_LIMIT: int = 50
    # Tokenizer truncation length used when running a model.
    MAX_TEXT_LENGTH: int = 512
    # Words shorter than this are dropped from keyword lists.
    MIN_WORD_LENGTH: int = 2
    # Size of the lru_cache used for text cleaning.
    CACHE_SIZE: int = 128
    BATCH_PROCESSING_SIZE: int = 8

    # Default pixel sizes for single-analysis figures and word clouds.
    FIGURE_WIDTH: int = 800
    FIGURE_HEIGHT: int = 500
    WORDCLOUD_SIZE: Tuple[int, int] = (800, 400)

    # Positive / negative colours per selectable theme.
    THEMES = {
        'default': {'pos': '#4ecdc4', 'neg': '#ff6b6b'},
        'ocean': {'pos': '#0077be', 'neg': '#ff6b35'},
        'forest': {'pos': '#228b22', 'neg': '#dc143c'},
        'sunset': {'pos': '#ff8c00', 'neg': '#8b0000'},
    }

    # Hugging Face checkpoints per model key. 'labels' lists the sentiment
    # classes each checkpoint is expected to distinguish.
    MODELS = {
        'multilingual': {
            'name': 'cardiffnlp/twitter-xlm-roberta-base-sentiment',
            'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE'],
        },
        'english': {
            'name': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
            'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE'],
        },
        'chinese': {
            # The previously configured "chinanews" checkpoint is a news-topic
            # classifier; the "dianping" checkpoint of the same family is the
            # binary review-sentiment model.
            'name': 'uer/roberta-base-finetuned-dianping-chinese',
            'labels': ['NEGATIVE', 'POSITIVE'],
        },
        'spanish': {
            'name': 'finiteautomata/beto-sentiment-analysis',
            'labels': ['NEGATIVE', 'NEUTRAL', 'POSITIVE'],
        },
        'french': {
            # NOTE(review): this checkpoint appears to ship TensorFlow weights
            # only - confirm it loads under the PyTorch-only setup used here.
            'name': 'tblard/tf-allocine',
            'labels': ['NEGATIVE', 'POSITIVE'],
        },
    }

    # Stop words removed from keyword lists, keyed by two-letter language code.
    STOP_WORDS = {
        'en': {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should'},
        'zh': {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看'},
        'es': {'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del', 'los', 'las'},
        'fr': {'le', 'la', 'les', 'de', 'un', 'une', 'du', 'des', 'et', 'à', 'ce', 'il', 'que', 'qui', 'ne', 'se', 'pas', 'tout', 'être', 'avoir', 'sur', 'avec', 'par'},
    }
|
|
|
# Module-wide singletons shared by every component below: the logger for this
# module and the one settings object all classes read their limits from.
logger = logging.getLogger(__name__)
config = Config()
|
|
|
|
|
# Sentinel meaning "no default supplied", so that an explicit
# ``default_return=None`` can be told apart from the argument being omitted.
_NO_DEFAULT = object()


def handle_errors(default_return=_NO_DEFAULT):
    """Build a decorator that logs any exception and returns a fallback value.

    Args:
        default_return: Value returned when the wrapped callable raises. When
            omitted, the string ``"Error: <message>"`` is returned instead.
            An explicit ``None`` is honoured and returned as ``None``.

    Returns:
        A decorator. The decorated callable never propagates ``Exception``
        subclasses; it returns the fallback value instead.
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Resolve the logger by the wrapped function's module so the
                # decorator does not depend on a module-level ``logger`` name.
                log = logging.getLogger(getattr(func, '__module__', None) or __name__)
                log.error(f"{func.__name__} failed: {e}", exc_info=True)
                if default_return is _NO_DEFAULT:
                    return f"Error: {str(e)}"
                # Hand out a fresh copy: callers may mutate the result, and a
                # shared dict/list default would leak state between failures.
                return copy.deepcopy(default_return)
        return wrapper
    return decorator
|
|
|
class ThemeContext:
    """Holds a theme name together with its resolved colour palette."""

    def __init__(self, theme: str = 'default'):
        # Unknown theme names silently resolve to the 'default' palette.
        if theme in config.THEMES:
            palette = config.THEMES[theme]
        else:
            palette = config.THEMES['default']
        self.theme = theme
        self.colors = palette
|
|
|
|
|
class ModelManager:
    """Process-wide singleton that lazily loads and caches models per model key.

    Caches are class-level dicts, so they are shared by every reference to the
    singleton. When a model other than ``'multilingual'`` fails to load, the
    multilingual model is used instead and cached under the requested key, so
    the failing load is not retried on every call.
    """

    _instance = None
    _models = {}
    _tokenizers = {}
    _pipelines = {}
    _device = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @property
    def device(self):
        """Torch device used for attention extraction: CUDA if available, else CPU."""
        if self._device is None:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return self._device

    def get_pipeline(self, model_key: str = 'multilingual'):
        """Return the cached sentiment pipeline for ``model_key``, loading it on first use.

        Raises:
            Exception: Whatever the load raised, but only when the
                ``'multilingual'`` model itself cannot be loaded; any other
                key falls back to the multilingual pipeline.
        """
        cached = self._pipelines.get(model_key)
        if cached is not None:
            return cached

        try:
            model_config = config.MODELS[model_key]
            loaded = pipeline(
                "sentiment-analysis",
                model=model_config['name'],
                tokenizer=model_config['name'],
                device=0 if torch.cuda.is_available() else -1,
                top_k=None,  # return scores for every label, not just the best one
            )
            logger.info(f"Model {model_key} loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model {model_key}: {e}")
            if model_key == 'multilingual':
                raise
            loaded = self.get_pipeline('multilingual')

        # Cache under the requested key, including the fallback case.
        self._pipelines[model_key] = loaded
        return loaded

    def get_model_and_tokenizer(self, model_key: str = 'multilingual'):
        """Return ``(model, tokenizer)`` for attention extraction, loading on first use.

        Raises:
            Exception: Whatever the load raised, but only when the
                ``'multilingual'`` model itself cannot be loaded; any other
                key falls back to the multilingual model and tokenizer.
        """
        if model_key in self._models and model_key in self._tokenizers:
            return self._models[model_key], self._tokenizers[model_key]

        try:
            model_config = config.MODELS[model_key]
            tokenizer = AutoTokenizer.from_pretrained(model_config['name'])
            model = AutoModelForSequenceClassification.from_pretrained(model_config['name'])
            model.to(self.device)
            logger.info(f"Model and tokenizer {model_key} loaded for attention extraction")
        except Exception as e:
            logger.error(f"Failed to load model/tokenizer {model_key}: {e}")
            if model_key == 'multilingual':
                raise
            model, tokenizer = self.get_model_and_tokenizer('multilingual')

        # Store both only after both loaded, so the two caches stay in step.
        self._models[model_key] = model
        self._tokenizers[model_key] = tokenizer
        return model, tokenizer
|
|
|
|
|
class LanguageDetector:
    """Heuristic language detection from character patterns and function words."""

    _CJK = re.compile(r'[\u4e00-\u9fff]')
    # Accented characters that point to only one of the two languages.
    _SPANISH_ONLY = re.compile(r'[ñáíóú¿¡]')
    _FRENCH_ONLY = re.compile(r'[àâäçèêëïîôùûÿœæ]')
    # 'é' and 'ü' occur in both, so they cannot decide on their own.
    _SHARED = re.compile(r'[éü]')
    _WORD = re.compile(r'[^\W\d_]+')
    # Function words used as a tie-breaker; words valid in both languages
    # ('la', 'de', 'que', ...) are deliberately left out.
    _SPANISH_WORDS = frozenset({
        'el', 'los', 'las', 'una', 'pero', 'muy', 'por', 'con', 'para', 'del',
        'al', 'lo', 'su', 'como', 'no', 'este', 'esta',
    })
    _FRENCH_WORDS = frozenset({
        'les', 'des', 'une', 'du', 'et', 'est', 'pas', 'dans', 'avec', 'pour',
        'qui', 'ce', 'cette', 'je', 'nous', 'vous', 'mais', 'au', 'aux',
    })

    @staticmethod
    def detect_language(text: str) -> str:
        """Return the model key best matching ``text``.

        Returns ``'chinese'`` when a CJK ideograph is present. Otherwise, when
        the text contains Spanish/French accented characters, returns
        ``'spanish'`` or ``'french'`` according to language-specific
        characters plus function-word hits (ties go to Spanish). Text without
        any such character returns ``'multilingual'``.
        """
        cls = LanguageDetector
        if cls._CJK.search(text):
            return 'chinese'

        lowered = text.lower()
        spanish_score = len(cls._SPANISH_ONLY.findall(lowered))
        french_score = len(cls._FRENCH_ONLY.findall(lowered))

        # Without any accent evidence stay on the multilingual model; function
        # words alone ("no", "est", ...) are too weak a signal.
        if not (spanish_score or french_score or cls._SHARED.search(lowered)):
            return 'multilingual'

        for word in cls._WORD.findall(lowered):
            if word in cls._SPANISH_WORDS:
                spanish_score += 1
            elif word in cls._FRENCH_WORDS:
                french_score += 1

        return 'french' if french_score > spanish_score else 'spanish'
|
|
|
|
|
class TextProcessor:
    """Text normalisation helpers with per-language stop-word filtering."""

    @staticmethod
    @lru_cache(maxsize=config.CACHE_SIZE)
    def clean_text(text: str, language: str = 'en') -> Tuple[str, ...]:
        """Lower-case ``text`` and return its words minus stop words, as a tuple.

        ``language`` selects the stop-word set; unknown codes use the English
        set. Results are memoised per ``(text, language)`` pair.
        """
        excluded = config.STOP_WORDS.get(language, config.STOP_WORDS['en'])
        shortest = config.MIN_WORD_LENGTH
        kept = []
        for candidate in re.findall(r'\b\w{2,}\b', text.lower()):
            if candidate in excluded:
                continue
            if len(candidate) < shortest:
                continue
            kept.append(candidate)
        return tuple(kept)
|
|
|
class HistoryManager:
    """In-memory log of analysis results, capped at ``config.MAX_HISTORY_SIZE``."""

    def __init__(self):
        self._history = []

    def add(self, entry: Dict):
        """Append a timestamped copy of ``entry``, keeping only the newest entries."""
        stamped = dict(entry)
        stamped['timestamp'] = datetime.now().isoformat()
        self._history.append(stamped)

        limit = config.MAX_HISTORY_SIZE
        if len(self._history) > limit:
            self._history = self._history[-limit:]

    def get_all(self) -> List[Dict]:
        """Return a shallow copy of the stored entries, oldest first."""
        return list(self._history)

    def clear(self) -> int:
        """Remove every entry and return how many were removed."""
        removed = self.size()
        del self._history[:]
        return removed

    def size(self) -> int:
        """Return the number of stored entries."""
        return len(self._history)
|
|
|
|
|
class SentimentEngine:
    """Multi-language sentiment analysis with attention-based keyword extraction."""

    # Tokens that never belong to a word.
    _SPECIAL_TOKENS = frozenset({'[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>'})
    # Prefixes that mark the START of a word (SentencePiece '▁', byte-level
    # BPE 'Ġ'). WordPiece works the other way round: '##' marks a continuation.
    _WORD_START_MARKERS = ('▁', 'Ġ')
    # Model key -> stop-word language code; anything else uses English.
    _STOP_WORD_CODES = {'chinese': 'zh', 'spanish': 'es', 'french': 'fr'}
    # Upper-cased label prefix -> canonical sentiment name. Prefix matching
    # covers spellings such as 'positive', 'POS' and 'POSITIVE'.
    _LABEL_PREFIXES = (('POS', 'Positive'), ('NEG', 'Negative'), ('NEU', 'Neutral'))

    def __init__(self):
        self.model_manager = ModelManager()
        self.language_detector = LanguageDetector()

    @staticmethod
    def _normalize_label(label) -> Optional[str]:
        """Map a raw model label to 'Positive'/'Negative'/'Neutral', or None if unknown."""
        upper = str(label).strip().upper()
        for prefix, sentiment in SentimentEngine._LABEL_PREFIXES:
            if upper.startswith(prefix):
                return sentiment
        return None

    @staticmethod
    def _merge_subword_scores(tokens, scores) -> Dict[str, float]:
        """Merge sub-word tokens into lower-cased words, keeping each word's max score.

        Supports both tokenizer conventions: word-start markers ('▁'/'Ġ') and
        WordPiece continuation markers ('##'). Scores are converted to plain
        ``float`` so results stay JSON-serialisable.
        """
        markers = SentimentEngine._WORD_START_MARKERS
        uses_start_markers = any(token.startswith(markers) for token in tokens)

        merged = []  # list of [word, score] pairs in reading order
        for token, score in zip(tokens, scores):
            if token in SentimentEngine._SPECIAL_TOKENS:
                continue
            score = float(score)
            if uses_start_markers:
                starts_word = token.startswith(markers)
                piece = token[1:] if starts_word else token
            else:
                starts_word = not token.startswith('##')
                piece = token if starts_word else token[2:]

            if starts_word or not merged:
                merged.append([piece, score])
            else:
                merged[-1][0] += piece
                merged[-1][1] = max(merged[-1][1], score)

        word_scores = {}
        for word, score in merged:
            # Punctuation glued on by marker-based tokenizers is trimmed off.
            word = re.sub(r'^\W+|\W+$', '', word).lower()
            if len(word) >= config.MIN_WORD_LENGTH:
                word_scores[word] = max(score, word_scores.get(word, 0.0))
        return word_scores

    def extract_key_words(self, text: str, model_key: str = 'multilingual', top_k: int = 10) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` ``(word, attention_weight)`` pairs, highest weight first.

        The weight is the last layer's attention from the first token to each
        token, averaged over dimension 1 (presumably the heads - confirm
        against the Transformers output docs). Stop words for the model's
        language are removed. Any failure is logged and yields ``[]``.
        """
        try:
            model, tokenizer = self.model_manager.get_model_and_tokenizer(model_key)

            inputs = tokenizer(
                text, return_tensors="pt", padding=True,
                truncation=True, max_length=config.MAX_TEXT_LENGTH
            ).to(self.model_manager.device)

            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True)

            # Last layer -> mean over dim 1 -> first sequence, first token's row.
            cls_attention = outputs.attentions[-1].mean(dim=1)[0, 0, :]
            tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
            word_scores = self._merge_subword_scores(tokens, cls_attention.cpu().numpy())

            lang_code = self._STOP_WORD_CODES.get(model_key, 'en')
            stop_words = config.STOP_WORDS.get(lang_code, config.STOP_WORDS['en'])

            ranked = sorted(
                ((word, score) for word, score in word_scores.items() if word not in stop_words),
                key=lambda item: item[1],
                reverse=True,
            )
            return ranked[:top_k]

        except Exception as e:
            logger.error(f"Key word extraction failed: {e}")
            return []

    @handle_errors(default_return={
        'sentiment': 'Unknown', 'confidence': 0.0,
        'pos_prob': 0.0, 'neg_prob': 0.0, 'neutral_prob': 0.0,
        'key_words': [], 'language': 'unknown',
    })
    def analyze_single(self, text: str, model_key: str = None) -> Dict:
        """Classify one text and extract its key words.

        Args:
            text: Text to analyse; blank text is treated as an error.
            model_key: Key into ``config.MODELS``. ``None`` selects a model
                through language detection.

        Returns:
            Dict with ``sentiment``, ``confidence``, ``pos_prob``,
            ``neg_prob``, ``neutral_prob``, ``key_words`` and ``language``
            (the model key used). On any failure the decorator returns the
            same keys with ``sentiment='Unknown'``.
        """
        if not text or not text.strip():
            raise ValueError("Empty text")

        if model_key is None:
            detected_lang = self.language_detector.detect_language(text)
            model_key = detected_lang if detected_lang in config.MODELS else 'multilingual'

        classifier = self.model_manager.get_pipeline(model_key)
        # Truncate at the tokenizer so long inputs do not exceed the model's
        # maximum sequence length.
        results = classifier(text, truncation=True, max_length=config.MAX_TEXT_LENGTH)

        # Depending on the library version the scores arrive flat or nested.
        if results and isinstance(results[0], list):
            results = results[0]

        scores = {'Positive': 0.0, 'Negative': 0.0, 'Neutral': 0.0}
        recognised = False
        for item in results:
            sentiment_name = self._normalize_label(item['label'])
            if sentiment_name is not None:
                scores[sentiment_name] = float(item['score'])
                recognised = True
        if not recognised:
            logger.warning(f"No recognisable sentiment labels from model {model_key}")

        pos_score = scores['Positive']
        neg_score = scores['Negative']
        neutral_score = scores['Neutral']

        if pos_score > neg_score and pos_score > neutral_score:
            sentiment, confidence = 'Positive', pos_score
        elif neg_score > pos_score and neg_score > neutral_score:
            sentiment, confidence = 'Negative', neg_score
        else:
            sentiment, confidence = 'Neutral', neutral_score

        key_words = self.extract_key_words(text, model_key)

        return {
            'sentiment': sentiment,
            'confidence': float(confidence),
            'pos_prob': float(pos_score),
            'neg_prob': float(neg_score),
            'neutral_prob': float(neutral_score),
            'key_words': key_words,
            'language': model_key
        }

    @handle_errors(default_return=[])
    def analyze_batch(self, texts: List[str], model_key: str = None, progress_callback=None) -> List[Dict]:
        """Analyse up to ``config.BATCH_SIZE_LIMIT`` texts one after another.

        Args:
            texts: Texts to analyse; anything beyond the limit is ignored.
            model_key: Passed to :meth:`analyze_single` for every text.
            progress_callback: Optional callable receiving the completed
                fraction (0-1] before each text is processed.

        Returns:
            One result dict per text, extended with ``text`` (at most 50
            characters plus an ellipsis) and ``full_text``.
        """
        texts = texts[:config.BATCH_SIZE_LIMIT]
        total = len(texts)
        results = []

        for position, text in enumerate(texts, start=1):
            if progress_callback:
                progress_callback(position / total)

            # Copy before annotating: a failed analysis may hand back a
            # fallback dict that must not be shared between entries.
            result = dict(self.analyze_single(text, model_key))
            result['text'] = text[:50] + '...' if len(text) > 50 else text
            result['full_text'] = text
            results.append(result)

        return results
|
|
|
|
|
class PlotFactory:
    """Factory for creating Plotly visualizations.

    Every public method is wrapped by ``handle_errors(default_return=None)``.
    """

    # Colour for neutral sentiment; themes only define 'pos' and 'neg'.
    NEUTRAL_COLOR = '#FFA500'

    @staticmethod
    def _sentiment_color(sentiment: str, theme: ThemeContext) -> str:
        """Return the theme colour for a sentiment name (neutral colour otherwise)."""
        if sentiment == 'Positive':
            return theme.colors['pos']
        if sentiment == 'Negative':
            return theme.colors['neg']
        return PlotFactory.NEUTRAL_COLOR

    @staticmethod
    @handle_errors(default_return=None)
    def create_sentiment_bars(result: Dict, theme: ThemeContext) -> go.Figure:
        """Create a bar chart of the class probabilities found in ``result``.

        Classes whose probability is missing or not above zero are left out.
        """
        bar_specs = (
            ('neg_prob', "Negative", theme.colors['neg']),
            ('neutral_prob', "Neutral", PlotFactory.NEUTRAL_COLOR),
            ('pos_prob', "Positive", theme.colors['pos']),
        )

        labels = []
        values = []
        colors = []
        for key, label, color in bar_specs:
            if key in result and result[key] > 0:
                labels.append(label)
                values.append(result[key])
                colors.append(color)

        fig = go.Figure(data=[
            go.Bar(
                x=labels,
                y=values,
                marker_color=colors,
                text=[f'{v:.3f}' for v in values],
                textposition='auto',
            )
        ])

        fig.update_layout(
            title="Sentiment Probabilities",
            xaxis_title="Sentiment",
            yaxis_title="Probability",
            yaxis=dict(range=[0, 1]),
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT,
            showlegend=False
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_confidence_gauge(confidence: float, sentiment: str, theme: ThemeContext) -> go.Figure:
        """Create a 0-1 gauge for ``confidence``, coloured by ``sentiment``."""
        color = PlotFactory._sentiment_color(sentiment, theme)

        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=confidence,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': f"{sentiment} Confidence"},
            delta={'reference': 0.5},
            gauge={
                'axis': {'range': [None, 1]},
                'bar': {'color': color},
                'steps': [
                    {'range': [0, 0.5], 'color': "lightgray"},
                    {'range': [0.5, 1], 'color': "gray"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 0.9
                }
            }
        ))

        fig.update_layout(
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[go.Figure]:
        """Create a horizontal bar chart of key words; ``None`` when there are none."""
        if not key_words:
            return None

        words = [word for word, _ in key_words]
        scores = [score for _, score in key_words]
        color = PlotFactory._sentiment_color(sentiment, theme)

        fig = go.Figure(go.Bar(
            x=scores,
            y=words,
            orientation='h',
            marker_color=color,
            text=[f'{score:.3f}' for score in scores],
            textposition='auto',
        ))

        fig.update_layout(
            title=f'Top Contributing Words ({sentiment})',
            xaxis_title='Attention Weight',
            yaxis_title='Words',
            width=config.FIGURE_WIDTH,
            height=config.FIGURE_HEIGHT,
            yaxis={'categoryorder': 'total ascending'}
        )

        return fig

    @staticmethod
    @handle_errors(default_return=None)
    def create_wordcloud_plot(text: str, sentiment: str, theme: ThemeContext) -> Optional[go.Figure]:
        """Create a word-cloud image figure.

        Returns ``None`` for texts with fewer than three whitespace-separated
        words, and when word-cloud generation fails (the failure is logged).
        """
        if len(text.split()) < 3:
            return None

        try:
            colormap = 'Greens' if sentiment == 'Positive' else 'Reds' if sentiment == 'Negative' else 'Blues'
            wc = WordCloud(
                width=config.WORDCLOUD_SIZE[0],
                height=config.WORDCLOUD_SIZE[1],
                background_color='white',
                colormap=colormap,
                max_words=30
            ).generate(text)

            fig = go.Figure()
            fig.add_trace(go.Image(z=wc.to_array()))
            fig.update_layout(
                title=f'{sentiment} Word Cloud',
                xaxis={'visible': False},
                yaxis={'visible': False},
                width=config.FIGURE_WIDTH,
                height=config.FIGURE_HEIGHT,
                margin=dict(l=0, r=0, t=30, b=0)
            )
            return fig

        except Exception as e:
            logger.error(f"Word cloud generation failed: {e}")
            return None

    @staticmethod
    @handle_errors(default_return=None)
    def create_batch_analysis(results: List[Dict], theme: ThemeContext) -> go.Figure:
        """Create a 2x2 overview of a batch.

        Panels: sentiment pie, confidence histogram, positive probability per
        text, and number of texts per language/model key.
        """
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Sentiment Distribution', 'Confidence Distribution',
                            'Sentiment Progression', 'Language Distribution'],
            specs=[[{"type": "pie"}, {"type": "histogram"}],
                   [{"type": "scatter"}, {"type": "bar"}]]
        )

        # Sentiment shares.
        sent_counts = Counter(r['sentiment'] for r in results)
        colors_pie = [PlotFactory._sentiment_color(s, theme) for s in sent_counts]
        fig.add_trace(
            go.Pie(labels=list(sent_counts.keys()), values=list(sent_counts.values()),
                   marker_colors=colors_pie, name="Sentiment"),
            row=1, col=1
        )

        # Confidence histogram.
        confs = [r['confidence'] for r in results]
        fig.add_trace(
            go.Histogram(x=confs, nbinsx=8, marker_color='skyblue', name="Confidence"),
            row=1, col=2
        )

        # Positive probability in input order, coloured by predicted sentiment.
        pos_probs = [r.get('pos_prob', 0) for r in results]
        indices = list(range(len(results)))
        colors_scatter = [PlotFactory._sentiment_color(r['sentiment'], theme) for r in results]
        fig.add_trace(
            go.Scatter(x=indices, y=pos_probs, mode='markers',
                       marker=dict(color=colors_scatter, size=8),
                       name="Sentiment Progression"),
            row=2, col=1
        )
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", row=2, col=1)

        # Texts per language; this panel backs the fourth subplot title.
        lang_counts = Counter(r.get('language', 'unknown') for r in results)
        fig.add_trace(
            go.Bar(x=list(lang_counts.keys()), y=list(lang_counts.values()),
                   marker_color='skyblue', name="Language"),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            width=1000,
            showlegend=False,
            title_text="Batch Analysis Results"
        )

        return fig
|
|
|
|
|
class DataHandler:
    """Handles export of history data and import of uploaded text/CSV files."""

    _EXPORT_FORMATS = ('csv', 'json')

    @staticmethod
    def _json_default(value):
        """Convert values ``json`` cannot encode (e.g. numpy scalars) to plain types."""
        item = getattr(value, 'item', None)
        if callable(item):
            return item()
        return str(value)

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Write ``data`` to a temporary CSV or JSON file.

        Args:
            data: History entries to export.
            format_type: ``'csv'`` or ``'json'``.

        Returns:
            ``(path, message)``; ``path`` is ``None`` when there is nothing to
            export, the format is unsupported, or writing failed.
        """
        if not data:
            return None, "No data to export"
        if format_type not in DataHandler._EXPORT_FORMATS:
            return None, f"Unsupported export format: {format_type}"

        # newline='' is what the csv module expects of its output file.
        temp_file = tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix=f'.{format_type}',
            encoding='utf-8', newline=''
        )
        try:
            with temp_file:
                if format_type == 'csv':
                    writer = csv.writer(temp_file)
                    writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Neutral_Prob', 'Language', 'Key_Words'])
                    for entry in data:
                        writer.writerow([
                            entry.get('timestamp', ''),
                            entry.get('text', ''),
                            entry.get('sentiment', ''),
                            f"{entry.get('confidence', 0):.4f}",
                            f"{entry.get('pos_prob', 0):.4f}",
                            f"{entry.get('neg_prob', 0):.4f}",
                            f"{entry.get('neutral_prob', 0):.4f}",
                            entry.get('language', ''),
                            "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
                        ])
                else:
                    json.dump(data, temp_file, indent=2, ensure_ascii=False,
                              default=DataHandler._json_default)
        except Exception:
            # The file was created with delete=False, so remove the partial
            # output before letting the decorator report the failure.
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
            raise

        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    def _read_upload(file) -> str:
        """Return the text of an upload given as a path or as a file-like object."""
        path = file if isinstance(file, (str, os.PathLike)) else getattr(file, 'name', None)
        if path and os.path.isfile(path):
            # utf-8-sig also accepts plain UTF-8 and drops a leading BOM.
            with open(path, encoding='utf-8-sig') as handle:
                return handle.read()
        raw = file.read()
        return raw.decode('utf-8-sig') if isinstance(raw, bytes) else raw

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Turn an uploaded file into newline-separated review texts.

        For ``.csv`` uploads the first row is skipped as a header and the
        first column of every following row is used; other files are returned
        unchanged. A falsy ``file`` yields ``""``.
        """
        if not file:
            return ""

        content = DataHandler._read_upload(file)
        name = file if isinstance(file, (str, os.PathLike)) else getattr(file, 'name', '')
        if not str(name).endswith('.csv'):
            return content

        texts = []
        try:
            reader = csv.reader(io.StringIO(content))
            next(reader, None)  # skip the header row, tolerating an empty file
            for row in reader:
                if row and row[0].strip():
                    text = row[0].strip().strip('"')
                    if text:
                        texts.append(text)
        except csv.Error:
            # Malformed CSV: fall back to one text per line after the header.
            texts = []
            for line in content.strip().split('\n')[1:]:
                text = line.strip().strip('"')
                if text:
                    texts.append(text)
        return '\n'.join(texts)
|
|
|
|
|
class SentimentApp:
    """Main application orchestrator: wires engine, history and plots together."""

    # Colour for neutral history points; themes only define 'pos' and 'neg'.
    _NEUTRAL_COLOR = '#FFA500'

    def __init__(self):
        self.engine = SentimentEngine()
        self.history = HistoryManager()
        self.data_handler = DataHandler()

        # Example inputs offered in the UI, one per supported language.
        self.examples = [
            ["While the film's visual effects were undeniably impressive, the story lacked emotional weight, and the pacing felt inconsistent throughout."],
            ["这部电影的视觉效果令人印象深刻,但故事缺乏情感深度,节奏感也不够连贯。"],
            ["Aunque los efectos visuales de la película fueron innegablemente impresionantes, la historia carecía de peso emocional."],
            ["Bien que les effets visuels du film soient indéniablement impressionnants, l'histoire manquait de poids émotionnel."],
            ["An extraordinary achievement in filmmaking — the direction was masterful, the script was sharp, and every performance added depth and realism."]
        ]

    @handle_errors(default_return=("Analysis failed; see logs for details", None, None, None, None))
    def analyze_single(self, text: str, model_key: str = 'multilingual', theme: str = 'default'):
        """Analyse one text, record it in the history and build its plots.

        Args:
            text: Text to analyse.
            model_key: Model key passed to the engine (``None`` = auto-detect).
            theme: Name of the colour theme for the plots.

        Returns:
            ``(summary_text, probability_plot, gauge_plot, wordcloud_plot,
            keyword_plot)``. For blank input or a failed analysis only the
            message is set and all plots are ``None``.
        """
        if not text or not text.strip():
            return "Please enter text", None, None, None, None

        result = self.engine.analyze_single(text, model_key)

        # The engine reports failure through a placeholder result; do not
        # record or plot it.
        if not isinstance(result, dict) or result.get('sentiment', 'Unknown') == 'Unknown':
            return "Analysis failed; see logs for details", None, None, None, None

        self.history.add({
            'text': text[:100],
            'full_text': text,
            **result
        })

        theme_ctx = ThemeContext(theme)
        sentiment = result['sentiment']
        confidence = result.get('confidence', 0.0)
        key_words = result.get('key_words', [])

        prob_plot = PlotFactory.create_sentiment_bars(result, theme_ctx)
        gauge_plot = PlotFactory.create_confidence_gauge(confidence, sentiment, theme_ctx)
        cloud_plot = PlotFactory.create_wordcloud_plot(text, sentiment, theme_ctx)
        keyword_plot = PlotFactory.create_keyword_chart(key_words, sentiment, theme_ctx)

        key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in key_words[:5]])
        result_text = (f"Sentiment: {sentiment} (Confidence: {confidence:.3f})\n"
                       f"Language: {result.get('language', 'unknown')}\n"
                       f"Key Words: {key_words_str}")

        return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot

    @handle_errors(default_return=None)
    def analyze_batch(self, reviews: str, model_key: str = 'multilingual', progress=None, theme: str = 'default'):
        """Analyse newline-separated reviews and return the batch overview figure.

        Args:
            reviews: Reviews, one per line; blank lines are ignored.
            model_key: Model key passed to the engine (``None`` = auto-detect).
            progress: Optional progress callback forwarded to the engine.
            theme: Name of the colour theme for the figure.

        Returns:
            The batch figure, or ``None`` when fewer than two reviews were
            given or no review could be analysed.
        """
        if not reviews or not reviews.strip():
            return None

        texts = [r.strip() for r in reviews.split('\n') if r.strip()]
        if len(texts) < 2:
            return None

        results = self.engine.analyze_batch(texts, model_key, progress)
        # Drop placeholder results of failed analyses.
        results = [r for r in results if r.get('sentiment', 'Unknown') != 'Unknown']
        if not results:
            return None

        for result in results:
            self.history.add(result)

        return PlotFactory.create_batch_analysis(results, ThemeContext(theme))

    @handle_errors(default_return=(None, "No history available"))
    def plot_history(self, theme: str = 'default'):
        """Plot positive probability and confidence over the recorded history.

        Returns:
            ``(figure, status_message)``; the figure is ``None`` while fewer
            than two analyses are recorded.
        """
        history = self.history.get_all()
        if len(history) < 2:
            return None, f"Need at least 2 analyses for trends. Current: {len(history)}"

        theme_ctx = ThemeContext(theme)

        fig = make_subplots(
            rows=2, cols=1,
            subplot_titles=['Sentiment History', 'Confidence Over Time'],
            vertical_spacing=0.12
        )

        indices = list(range(len(history)))
        pos_probs = [item.get('pos_prob', 0) for item in history]
        confs = [item.get('confidence', 0) for item in history]

        # Colour each point by its recorded sentiment so neutral results are
        # not drawn as negative.
        palette = {'Positive': theme_ctx.colors['pos'], 'Negative': theme_ctx.colors['neg']}
        colors = [palette.get(item.get('sentiment'), self._NEUTRAL_COLOR) for item in history]

        fig.add_trace(
            go.Scatter(
                x=indices,
                y=pos_probs,
                mode='markers+lines',
                marker=dict(color=colors, size=8),
                line=dict(color='gray', width=2),
                name='Sentiment Trend'
            ),
            row=1, col=1
        )
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", row=1, col=1)

        fig.add_trace(
            go.Bar(
                x=indices,
                y=confs,
                marker_color='lightblue',
                marker_line_color='navy',
                marker_line_width=1,
                name='Confidence'
            ),
            row=2, col=1
        )

        fig.update_layout(
            height=800,
            width=1000,
            showlegend=False,
            title_text="Analysis History"
        )

        fig.update_xaxes(title_text="Analysis Number", row=2, col=1)
        fig.update_yaxes(title_text="Positive Probability", row=1, col=1)
        fig.update_yaxes(title_text="Confidence", row=2, col=1)

        return fig, f"History: {len(history)} analyses"
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI and wire its events to a new ``SentimentApp``.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    app = SentimentApp()

    # "Auto-detect" needs its own dropdown value; it is translated to
    # ``model_key=None`` below, which makes the engine run language detection.
    auto_detect = 'auto'
    model_choices = [
        ('Auto-detect', auto_detect),
        ('Multilingual', 'multilingual'),
        ('English', 'english'),
        ('Chinese 中文', 'chinese'),
        ('Spanish Español', 'spanish'),
        ('French Français', 'french')
    ]

    def resolve_model(selection):
        """Translate a dropdown value into the model key expected by the app."""
        return None if selection == auto_detect else selection

    with gr.Blocks(theme=gr.themes.Soft(), title="Multi-language Sentiment Analyzer") as demo:
        gr.Markdown("# 🌍 AI Multi-language Sentiment Analyzer")
        gr.Markdown("Advanced sentiment analysis supporting multiple languages with Plotly visualizations and key word extraction")

        with gr.Tab("Single Analysis"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Review Text (Multiple Languages Supported)",
                        placeholder="Enter your review in any supported language...",
                        lines=5
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("Analyze", variant="primary")
                        model_selector = gr.Dropdown(
                            choices=model_choices,
                            value="multilingual",
                            label="Language Model"
                        )
                        theme_selector = gr.Dropdown(
                            choices=list(config.THEMES.keys()),
                            value="default",
                            label="Theme"
                        )

                    gr.Examples(
                        examples=app.examples,
                        inputs=text_input,
                        label="Multi-language Examples"
                    )

                with gr.Column():
                    result_output = gr.Textbox(label="Analysis Result", lines=4)

            with gr.Row():
                prob_plot = gr.Plot(label="Sentiment Probabilities")
                gauge_plot = gr.Plot(label="Confidence Gauge")

            with gr.Row():
                wordcloud_plot = gr.Plot(label="Word Cloud")
                keyword_plot = gr.Plot(label="Key Contributing Words")

        with gr.Tab("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(label="Upload File", file_types=[".csv", ".txt"])
                    batch_input = gr.Textbox(
                        label="Reviews (one per line, mixed languages supported)",
                        lines=8,
                        placeholder="Enter multiple reviews, one per line...\nSupports mixed languages in the same batch!"
                    )

                with gr.Column():
                    load_btn = gr.Button("Load File")
                    with gr.Row():
                        batch_btn = gr.Button("Analyze Batch", variant="primary")
                        batch_model_selector = gr.Dropdown(
                            choices=model_choices,
                            value="multilingual",
                            label="Batch Model"
                        )

            batch_plot = gr.Plot(label="Batch Analysis Results")

        with gr.Tab("History & Export"):
            with gr.Row():
                refresh_btn = gr.Button("Refresh History")
                clear_btn = gr.Button("Clear History", variant="stop")
                status_btn = gr.Button("Show Status")

            with gr.Row():
                csv_btn = gr.Button("Export CSV")
                json_btn = gr.Button("Export JSON")

            history_status = gr.Textbox(label="Status Information")
            history_plot = gr.Plot(label="History Trends")
            csv_file = gr.File(label="CSV Download", visible=True)
            json_file = gr.File(label="JSON Download", visible=True)

        with gr.Tab("Model Information"):
            gr.Markdown("""
            ## Supported Languages and Models

            | Language | Model | Description |
            |----------|-------|-------------|
            | **Multilingual** | XLM-RoBERTa | Supports 100+ languages automatically |
            | **English** | RoBERTa-base | Optimized for English text |
            | **Chinese 中文** | RoBERTa-Chinese | Specialized for Chinese language |
            | **Spanish Español** | BETO | Fine-tuned for Spanish sentiment |
            | **French Français** | tf-allocine | Trained on French movie reviews |

            ### Features:
            - **Automatic Language Detection**: The system can automatically detect the input language
            - **Attention-based Keywords**: Extract words that contribute most to sentiment prediction
            - **Interactive Visualizations**: Plotly-powered charts and graphs
            - **Batch Processing**: Analyze multiple texts at once
            - **Export Capabilities**: Save results in CSV or JSON format
            - **Multi-language Support**: Mix different languages in batch analysis
            """)

        # Event wiring.
        analyze_btn.click(
            lambda text, model, theme: app.analyze_single(text, resolve_model(model), theme),
            inputs=[text_input, model_selector, theme_selector],
            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
        )

        load_btn.click(
            app.data_handler.process_file,
            inputs=file_upload,
            outputs=batch_input
        )

        batch_btn.click(
            lambda reviews, model: app.analyze_batch(reviews, resolve_model(model)),
            inputs=[batch_input, batch_model_selector],
            outputs=batch_plot
        )

        refresh_btn.click(
            lambda theme: app.plot_history(theme),
            inputs=theme_selector,
            outputs=[history_plot, history_status]
        )

        clear_btn.click(
            lambda: f"Cleared {app.history.clear()} entries",
            outputs=history_status
        )

        status_btn.click(
            lambda: f"History: {app.history.size()} entries | Available Models: {', '.join(config.MODELS.keys())}",
            outputs=history_status
        )

        csv_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'csv'),
            outputs=[csv_file, history_status]
        )

        json_btn.click(
            lambda: app.data_handler.export_data(app.history.get_all(), 'json'),
            outputs=[json_file, history_status]
        )

    return demo
|
|
|
|
|
# Script entry point: configure logging, build the UI and serve it.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Launch settings passed to Gradio unchanged.
    launch_options = dict(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )

    demo = create_interface()
    demo.launch(**launch_options)