Update app.py
Browse files
app.py
CHANGED
@@ -96,9 +96,16 @@ class ModelManager:
|
|
96 |
|
97 |
@staticmethod
def detect_language(text: str) -> str:
    """Detect the language of ``text`` and return a supported language code.

    Args:
        text: Text whose language should be detected.

    Returns:
        The code reported by ``langdetect.detect`` when it is listed in
        ``config.SUPPORTED_LANGUAGES``; otherwise ``'en'``. ``'en'`` is also
        returned when detection raises.
    """
    try:
        detected = langdetect.detect(text)
        return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
    except Exception:
        # Detection is best-effort: any failure falls back to English.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return 'en'
|
@@ -162,10 +169,18 @@ class TextProcessor:
|
|
162 |
@staticmethod
|
163 |
def extract_keywords(text: str, top_k: int = 5) -> List[str]:
|
164 |
"""Extract key words from text"""
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
class SentimentAnalyzer:
|
171 |
"""Enhanced sentiment analysis"""
|
@@ -185,10 +200,10 @@ class SentimentAnalyzer:
|
|
185 |
# Get appropriate model
|
186 |
model, tokenizer = model_manager.get_model(detected_lang)
|
187 |
|
188 |
-
# Preprocessing options
|
189 |
options = preprocessing_options or {}
|
190 |
processed_text = text
|
191 |
-
if options.get('clean_text', False):
|
192 |
processed_text = TextProcessor.clean_text(
|
193 |
text,
|
194 |
options.get('remove_punctuation', True),
|
@@ -388,7 +403,7 @@ def analyze_single_text(text: str, language: str, theme: str, clean_text: bool,
|
|
388 |
"""Enhanced single text analysis"""
|
389 |
try:
|
390 |
if not text.strip():
|
391 |
-
return "Please enter text", None, None
|
392 |
|
393 |
# Map display names back to language codes
|
394 |
language_map = {
|
@@ -437,11 +452,11 @@ def analyze_single_text(text: str, language: str, theme: str, clean_text: bool,
|
|
437 |
- **Stats:** {result['word_count']} words, {result['char_count']} characters
|
438 |
"""
|
439 |
|
440 |
-
return info_text, gauge_fig, bars_fig
|
441 |
|
442 |
except Exception as e:
|
443 |
logger.error(f"Analysis failed: {e}")
|
444 |
-
return f"Error: {str(e)}", None, None
|
445 |
|
446 |
def get_history_stats():
|
447 |
"""Get history statistics"""
|
@@ -555,8 +570,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Sentiment Analyzer") as d
|
|
555 |
with gr.Row():
|
556 |
gauge_plot = gr.Plot(label="Sentiment Gauge")
|
557 |
bars_plot = gr.Plot(label="Probability Distribution")
|
558 |
-
|
559 |
-
status_output = gr.Textbox(label="Status", interactive=False)
|
560 |
|
561 |
with gr.Tab("📊 History & Analytics"):
|
562 |
with gr.Row():
|
@@ -576,7 +589,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Sentiment Analyzer") as d
|
|
576 |
analyze_btn.click(
|
577 |
analyze_single_text,
|
578 |
inputs=[text_input, language_select, theme_select, clean_text, remove_punct, remove_nums],
|
579 |
-
outputs=[result_info, gauge_plot, bars_plot
|
580 |
)
|
581 |
|
582 |
stats_btn.click(
|
|
|
96 |
|
97 |
@staticmethod
def detect_language(text: str) -> str:
    """Detect the language of ``text`` and return a supported language code.

    Args:
        text: Text whose language should be detected.

    Returns:
        The detected code (after mapping regional Chinese codes to ``'zh'``)
        when it is listed in ``config.SUPPORTED_LANGUAGES``; otherwise
        ``'en'``. ``'en'`` is also returned when detection raises.
    """
    try:
        # Use langdetect for all languages
        detected = langdetect.detect(text)
        # Map some common langdetect codes to our supported languages
        language_mapping = {
            'zh-cn': 'zh',
            'zh-tw': 'zh',
        }
        detected = language_mapping.get(detected, detected)
        return detected if detected in config.SUPPORTED_LANGUAGES else 'en'
    except Exception:
        # Detection is best-effort: any failure falls back to English.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return 'en'
|
|
|
169 |
@staticmethod
def extract_keywords(text: str, top_k: int = 5) -> List[str]:
    """Return the ``top_k`` most frequent tokens found in ``text``.

    Text containing at least one character in the range U+4E00 to U+9FFF is
    ranked per character, counting only characters in that range. Any other
    text is passed through ``TextProcessor.clean_text`` and ranked per
    whitespace-separated word.

    Args:
        text: Input text to analyse.
        top_k: Maximum number of tokens to return.

    Returns:
        Tokens ordered from most to least frequent.
    """
    # Guard clause: text without Chinese characters uses word-based extraction
    if not re.search(r'[\u4e00-\u9fff]', text):
        tokens = TextProcessor.clean_text(text).split()
        ranked = Counter(tokens).most_common(top_k)
        return [token for token, _ in ranked]

    # Chinese text: join every matched run and count individual characters
    han_runs = re.findall(r'[\u4e00-\u9fff]+', text)
    ranked = Counter(''.join(han_runs)).most_common(top_k)
    return [char for char, _ in ranked]
|
184 |
|
185 |
class SentimentAnalyzer:
|
186 |
"""Enhanced sentiment analysis"""
|
|
|
200 |
# Get appropriate model
|
201 |
model, tokenizer = model_manager.get_model(detected_lang)
|
202 |
|
203 |
+
# Preprocessing options - don't clean Chinese text
|
204 |
options = preprocessing_options or {}
|
205 |
processed_text = text
|
206 |
+
if options.get('clean_text', False) and not re.search(r'[\u4e00-\u9fff]', text):
|
207 |
processed_text = TextProcessor.clean_text(
|
208 |
text,
|
209 |
options.get('remove_punctuation', True),
|
|
|
403 |
"""Enhanced single text analysis"""
|
404 |
try:
|
405 |
if not text.strip():
|
406 |
+
return "Please enter text", None, None
|
407 |
|
408 |
# Map display names back to language codes
|
409 |
language_map = {
|
|
|
452 |
- **Stats:** {result['word_count']} words, {result['char_count']} characters
|
453 |
"""
|
454 |
|
455 |
+
return info_text, gauge_fig, bars_fig
|
456 |
|
457 |
except Exception as e:
|
458 |
logger.error(f"Analysis failed: {e}")
|
459 |
+
return f"Error: {str(e)}", None, None
|
460 |
|
461 |
def get_history_stats():
|
462 |
"""Get history statistics"""
|
|
|
570 |
with gr.Row():
|
571 |
gauge_plot = gr.Plot(label="Sentiment Gauge")
|
572 |
bars_plot = gr.Plot(label="Probability Distribution")
|
|
|
|
|
573 |
|
574 |
with gr.Tab("📊 History & Analytics"):
|
575 |
with gr.Row():
|
|
|
589 |
analyze_btn.click(
|
590 |
analyze_single_text,
|
591 |
inputs=[text_input, language_select, theme_select, clean_text, remove_punct, remove_nums],
|
592 |
+
outputs=[result_info, gauge_plot, bars_plot]
|
593 |
)
|
594 |
|
595 |
stats_btn.click(
|