Spaces:
Sleeping
Sleeping
isolate prompts
Browse files- README.md +5 -19
- app.py +20 -46
- classifiers.py +6 -22
- prompts.py +63 -0
README.md
CHANGED
|
@@ -60,33 +60,19 @@ brainbox4/
|
|
| 60 |
```
|
| 61 |
|
| 62 |
## 🔧 Optimisations de Performance
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
- Exploitation d'`asyncio` pour effectuer des appels API simultanés.
|
| 66 |
-
- Gestion par lots de 20 textes par requête pour optimiser le débit.
|
| 67 |
-
|
| 68 |
-
### Sélection Intelligente du Modèle
|
| 69 |
-
- **GPT-3.5** : Utilisé par défaut pour moins de 100 textes.
|
| 70 |
-
- **GPT-3.5-16k** : Adapté pour des volumes de 100 à 500 textes.
|
| 71 |
-
- **GPT-4** : Préféré pour plus de 500 textes.
|
| 72 |
-
- Intégration future de modèles hébergés localement pour une flexibilité accrue.
|
| 73 |
|
| 74 |
## 🎨 Optimisations de l'Interface Utilisateur
|
| 75 |
-
|
| 76 |
-
### Suggestions Automatiques
|
| 77 |
-
- Propositions automatiques de catégories et de colonnes basées sur un échantillon de textes.
|
| 78 |
-
|
| 79 |
-
### Évaluation et Reclassification
|
| 80 |
- Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
|
| 81 |
-
-
|
| 82 |
-
|
| 83 |
|
| 84 |
## ✨ Fonctionnalités Principales
|
| 85 |
-
|
| 86 |
1. **Classification Rapide**
|
| 87 |
- Traitement parallèle des textes
|
| 88 |
- Support des fichiers Excel/CSV
|
| 89 |
-
- Scores de confiance
|
| 90 |
|
| 91 |
2. **Interface Simple**
|
| 92 |
- Upload de fichiers
|
|
|
|
| 60 |
```
|
| 61 |
|
| 62 |
## 🔧 Optimisations de Performance
|
| 63 |
+
- Parallélisation des requêtes API par lots de 10 maximum pour accélérer la classification.
|
| 64 |
+
- Suggestion automatique du modèle.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
## 🎨 Optimisations de l'Interface Utilisateur
|
| 67 |
+
- Suggestions automatiques de catégories et de colonnes basées sur un échantillon de textes.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
- Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
|
| 69 |
+
- Suggestion de reclassification des textes selon les recommandations du rapport.
|
|
|
|
| 70 |
|
| 71 |
## ✨ Fonctionnalités Principales
|
|
|
|
| 72 |
1. **Classification Rapide**
|
| 73 |
- Traitement parallèle des textes
|
| 74 |
- Support des fichiers Excel/CSV
|
| 75 |
+
- Scores de confiance et justification
|
| 76 |
|
| 77 |
2. **Interface Simple**
|
| 78 |
- Upload de fichiers
|
app.py
CHANGED
|
@@ -12,11 +12,16 @@ import time
|
|
| 12 |
import torch
|
| 13 |
import traceback
|
| 14 |
import logging
|
| 15 |
-
import asyncio
|
| 16 |
|
| 17 |
# Import local modules
|
| 18 |
from classifiers import TFIDFClassifier, LLMClassifier
|
| 19 |
from utils import load_data, export_data, visualize_results, validate_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Configure logging
|
| 22 |
logging.basicConfig(level=logging.INFO,
|
|
@@ -269,12 +274,8 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 269 |
|
| 270 |
process_button = gr.Button("Process and Classify", visible=False)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
results_df = gr.Dataframe(interactive=True, visible=False)
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
| 278 |
# Create containers for visualization and validation report
|
| 279 |
with gr.Row(visible=False) as results_row:
|
| 280 |
with gr.Column():
|
|
@@ -286,7 +287,6 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 286 |
validation_output = gr.Textbox(label="Validation Report", interactive=False)
|
| 287 |
improve_button = gr.Button("Improve Classification with Report", visible=False)
|
| 288 |
|
| 289 |
-
|
| 290 |
# Function to load file and suggest categories
|
| 291 |
def load_file_and_suggest_categories(file):
|
| 292 |
if not file:
|
|
@@ -319,13 +319,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 319 |
|
| 320 |
# Use LLM to suggest categories
|
| 321 |
if client:
|
| 322 |
-
prompt =
|
| 323 |
-
Based on these example texts, suggest 5 appropriate categories for classification:
|
| 324 |
-
|
| 325 |
-
{sample_texts[:5]}
|
| 326 |
-
|
| 327 |
-
Return your answer as a comma-separated list of category names only.
|
| 328 |
-
"""
|
| 329 |
try:
|
| 330 |
response = client.chat.completions.create(
|
| 331 |
model="gpt-3.5-turbo",
|
|
@@ -396,15 +390,10 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 396 |
sample_texts.extend(df[col].head(5).tolist())
|
| 397 |
|
| 398 |
if client:
|
| 399 |
-
prompt =
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
Example texts:
|
| 404 |
-
{sample_texts[:5]}
|
| 405 |
-
|
| 406 |
-
Return only the suggested category name, nothing else.
|
| 407 |
-
"""
|
| 408 |
try:
|
| 409 |
response = client.chat.completions.create(
|
| 410 |
model="gpt-3.5-turbo",
|
|
@@ -438,20 +427,10 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 438 |
try:
|
| 439 |
# Extract insights from validation report
|
| 440 |
if client:
|
| 441 |
-
prompt =
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
Return your answer in JSON format with these fields:
|
| 447 |
-
- suggested_categories: list of improved category names (must be different from current categories: {categories})
|
| 448 |
-
- confidence_threshold: a number between 0 and 100 for minimum confidence
|
| 449 |
-
- focus_areas: list of specific aspects to focus on during classification
|
| 450 |
-
- analysis: a brief analysis of what needs improvement
|
| 451 |
-
- new_categories_needed: boolean indicating if new categories should be added
|
| 452 |
-
|
| 453 |
-
JSON response:
|
| 454 |
-
"""
|
| 455 |
try:
|
| 456 |
response = client.chat.completions.create(
|
| 457 |
model="gpt-4",
|
|
@@ -475,16 +454,11 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 475 |
temp_df = load_data(file.name)
|
| 476 |
sample_texts.extend(temp_df[col].head(5).tolist())
|
| 477 |
|
| 478 |
-
category_prompt =
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
Example texts:
|
| 484 |
-
{sample_texts[:5]}
|
| 485 |
-
|
| 486 |
-
Return your answer as a comma-separated list of new category names only.
|
| 487 |
-
"""
|
| 488 |
|
| 489 |
category_response = client.chat.completions.create(
|
| 490 |
model="gpt-4",
|
|
|
|
| 12 |
import torch
|
| 13 |
import traceback
|
| 14 |
import logging
|
|
|
|
| 15 |
|
| 16 |
# Import local modules
|
| 17 |
from classifiers import TFIDFClassifier, LLMClassifier
|
| 18 |
from utils import load_data, export_data, visualize_results, validate_results
|
| 19 |
+
from prompts import (
|
| 20 |
+
CATEGORY_SUGGESTION_PROMPT,
|
| 21 |
+
ADDITIONAL_CATEGORY_PROMPT,
|
| 22 |
+
VALIDATION_ANALYSIS_PROMPT,
|
| 23 |
+
CATEGORY_IMPROVEMENT_PROMPT
|
| 24 |
+
)
|
| 25 |
|
| 26 |
# Configure logging
|
| 27 |
logging.basicConfig(level=logging.INFO,
|
|
|
|
| 274 |
|
| 275 |
process_button = gr.Button("Process and Classify", visible=False)
|
| 276 |
|
|
|
|
|
|
|
| 277 |
results_df = gr.Dataframe(interactive=True, visible=False)
|
| 278 |
|
|
|
|
|
|
|
| 279 |
# Create containers for visualization and validation report
|
| 280 |
with gr.Row(visible=False) as results_row:
|
| 281 |
with gr.Column():
|
|
|
|
| 287 |
validation_output = gr.Textbox(label="Validation Report", interactive=False)
|
| 288 |
improve_button = gr.Button("Improve Classification with Report", visible=False)
|
| 289 |
|
|
|
|
| 290 |
# Function to load file and suggest categories
|
| 291 |
def load_file_and_suggest_categories(file):
|
| 292 |
if not file:
|
|
|
|
| 319 |
|
| 320 |
# Use LLM to suggest categories
|
| 321 |
if client:
|
| 322 |
+
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts[:5]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
try:
|
| 324 |
response = client.chat.completions.create(
|
| 325 |
model="gpt-3.5-turbo",
|
|
|
|
| 390 |
sample_texts.extend(df[col].head(5).tolist())
|
| 391 |
|
| 392 |
if client:
|
| 393 |
+
prompt = ADDITIONAL_CATEGORY_PROMPT.format(
|
| 394 |
+
existing_categories=", ".join(current_categories),
|
| 395 |
+
sample_texts="\n---\n".join(sample_texts[:5])
|
| 396 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
try:
|
| 398 |
response = client.chat.completions.create(
|
| 399 |
model="gpt-3.5-turbo",
|
|
|
|
| 427 |
try:
|
| 428 |
# Extract insights from validation report
|
| 429 |
if client:
|
| 430 |
+
prompt = VALIDATION_ANALYSIS_PROMPT.format(
|
| 431 |
+
validation_report=validation_report,
|
| 432 |
+
current_categories=categories
|
| 433 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
try:
|
| 435 |
response = client.chat.completions.create(
|
| 436 |
model="gpt-4",
|
|
|
|
| 454 |
temp_df = load_data(file.name)
|
| 455 |
sample_texts.extend(temp_df[col].head(5).tolist())
|
| 456 |
|
| 457 |
+
category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
|
| 458 |
+
current_categories=", ".join(current_categories),
|
| 459 |
+
analysis=improvements.get('analysis', ''),
|
| 460 |
+
sample_texts="\n---\n".join(sample_texts[:5])
|
| 461 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
category_response = client.chat.completions.create(
|
| 464 |
model="gpt-4",
|
classifiers.py
CHANGED
|
@@ -7,6 +7,7 @@ import random
|
|
| 7 |
import json
|
| 8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
from typing import List, Dict, Any, Optional
|
|
|
|
| 10 |
|
| 11 |
class BaseClassifier:
|
| 12 |
"""Base class for text classifiers"""
|
|
@@ -183,14 +184,7 @@ class LLMClassifier(BaseClassifier):
|
|
| 183 |
else:
|
| 184 |
sample_texts = texts
|
| 185 |
|
| 186 |
-
prompt = ""
|
| 187 |
-
I have a collection of texts that I need to classify into categories. Here are some examples:
|
| 188 |
-
|
| 189 |
-
{}
|
| 190 |
-
|
| 191 |
-
Based on these examples, suggest up 2 to 5 appropriate categories for classification.
|
| 192 |
-
Return your answer as a comma-separated list of category names only.
|
| 193 |
-
""".format("\n---\n".join(sample_texts))
|
| 194 |
|
| 195 |
try:
|
| 196 |
response = self.client.chat.completions.create(
|
|
@@ -212,20 +206,10 @@ class LLMClassifier(BaseClassifier):
|
|
| 212 |
|
| 213 |
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
| 214 |
"""Use LLM to classify a single text"""
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
Text: {text}
|
| 221 |
-
|
| 222 |
-
Return your answer in JSON format with these fields:
|
| 223 |
-
- category: the chosen category from the list
|
| 224 |
-
- confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
|
| 225 |
-
- explanation: a brief explanation of why this category was chosen (1-2 sentences)
|
| 226 |
-
|
| 227 |
-
JSON response:
|
| 228 |
-
"""
|
| 229 |
|
| 230 |
try:
|
| 231 |
response = self.client.chat.completions.create(
|
|
|
|
| 7 |
import json
|
| 8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
from typing import List, Dict, Any, Optional
|
| 10 |
+
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
| 11 |
|
| 12 |
class BaseClassifier:
|
| 13 |
"""Base class for text classifiers"""
|
|
|
|
| 184 |
else:
|
| 185 |
sample_texts = texts
|
| 186 |
|
| 187 |
+
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
try:
|
| 190 |
response = self.client.chat.completions.create(
|
|
|
|
| 206 |
|
| 207 |
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
| 208 |
"""Use LLM to classify a single text"""
|
| 209 |
+
prompt = TEXT_CLASSIFICATION_PROMPT.format(
|
| 210 |
+
categories=", ".join(categories),
|
| 211 |
+
text=text
|
| 212 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
try:
|
| 215 |
response = self.client.chat.completions.create(
|
prompts.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prompts used in the text classification system"""
|
| 2 |
+
|
| 3 |
+
# Category suggestion prompt.
# Placeholders: one positional field ({}) receiving the sample texts, already
# joined into a single string by the caller.
CATEGORY_SUGGESTION_PROMPT = """
Based on these example texts, suggest 5 appropriate categories for classification:

{}

Return your answer as a comma-separated list of category names only.
"""

# Text classification prompt.
# Placeholders: {categories} (comma-separated category names) and {text}
# (the single text to classify).
TEXT_CLASSIFICATION_PROMPT = """
Classify the following text into one of these categories: {categories}

Text: {text}

Return your answer in JSON format with these fields:
- category: the chosen category from the list
- confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
- explanation: a brief explanation of why this category was chosen (1-2 sentences)

JSON response:
"""

# Additional category suggestion prompt.
# Placeholders: {existing_categories} and {sample_texts}. The sample-text field
# is named (not an anonymous "{}") because callers pass it as the keyword
# argument sample_texts=...; an anonymous field would make str.format raise
# IndexError when no positional argument is supplied.
ADDITIONAL_CATEGORY_PROMPT = """
Based on these example texts and the existing categories ({existing_categories}),
suggest one additional appropriate category for classification.

Example texts:
{sample_texts}

Return only the suggested category name, nothing else.
"""

# Validation report analysis prompt.
# Placeholders: {validation_report} and {current_categories}.
VALIDATION_ANALYSIS_PROMPT = """
Based on this validation report, analyze the current classification and suggest improvements:

{validation_report}

Return your answer in JSON format with these fields:
- suggested_categories: list of improved category names (must be different from current categories: {current_categories})
- confidence_threshold: a number between 0 and 100 for minimum confidence
- focus_areas: list of specific aspects to focus on during classification
- analysis: a brief analysis of what needs improvement
- new_categories_needed: boolean indicating if new categories should be added

JSON response:
"""

# Category improvement prompt.
# Placeholders: {current_categories}, {analysis} and {sample_texts}. As with
# ADDITIONAL_CATEGORY_PROMPT, the sample texts are a named field so that the
# keyword-only .format(...) call used by the application resolves every field.
CATEGORY_IMPROVEMENT_PROMPT = """
Based on these example texts and the current categories ({current_categories}),
suggest new categories that would improve the classification. The validation report indicates:
{analysis}

Example texts:
{sample_texts}

Return your answer as a comma-separated list of new category names only.
"""
|