Spaces:

simondh
/

classifieur

Sleeping

App Files Files Community

classifieur / process.py

simondh

add types

535a3a5 about 1 month ago

raw

history blame

6.61 kB

	import logging
	import time
	import traceback
	import asyncio
	from sklearn.feature_extraction.text import TfidfVectorizer
	from typing import Optional, List, Dict, Any, Tuple, Union
	import pandas as pd
	from pathlib import Path

	from classifiers import TFIDFClassifier, LLMClassifier
	from utils import load_data, validate_results
	from client import get_client


	def update_api_key(api_key: str) -> Tuple[bool, str]:
	    """Update the OpenAI API key.

	    Forwards ``api_key`` to ``client.initialize_client`` and returns that
	    call's result unchanged (annotated as a ``(bool, str)`` tuple).
	    """
	    # Resolved at call time rather than at module import, as before.
	    import client as client_module

	    outcome = client_module.initialize_client(api_key)
	    return outcome


	def _combine_text_columns(df: pd.DataFrame, text_columns: List[str]) -> List[str]:
	    """Join the selected columns of every row into one space-separated string."""
	    # Zipping the column Series walks the rows without DataFrame.iterrows(),
	    # which builds a Series per row and does not preserve dtypes across rows.
	    column_values = [df[col] for col in text_columns]
	    return [" ".join(str(value) for value in row) for row in zip(*column_values)]


	def _parse_categories(categories: Optional[str]) -> List[str]:
	    """Split a comma-separated category string, dropping blank entries."""
	    if not categories:
	        return []
	    stripped = (part.strip() for part in categories.split(","))
	    # "a, ,b" or a trailing comma must not produce an empty category name.
	    return [name for name in stripped if name]


	def _auto_classifier_type(num_texts: int) -> str:
	    """Pick a classifier name from the number of texts to classify."""
	    if num_texts <= 500:
	        return "gpt4"
	    if num_texts <= 1000:
	        return "gpt35"
	    if num_texts <= 5000:
	        return "hybrid"
	    return "tfidf"


	async def process_file_async(
	    file: Union[str, Path],
	    text_columns: List[str],
	    categories: Optional[str],
	    classifier_type: str,
	    show_explanations: bool
	) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
	    """Classify the rows of a data file and return the annotated table.

	    Args:
	        file: Path to the input file as ``str`` or ``Path``. Any other object
	            is read through its ``.name`` attribute (presumably an uploaded
	            file wrapper whose ``.name`` is its path -- confirm with callers).
	        text_columns: Columns whose values are joined with spaces to form the
	            text that gets classified.
	        categories: Optional comma-separated list of category names.
	        classifier_type: ``"tfidf"``, ``"gpt35"``, ``"gpt4"`` or ``"auto"``
	            (chosen from the row count); any other value runs the hybrid
	            TF-IDF + LLM path.
	        show_explanations: When true, add an ``Explanation`` column.

	    Returns:
	        ``(result_df, validation_report)`` on success, or ``(None, message)``
	        when the input is invalid, the API client is missing, or any
	        exception is raised while processing (the message then includes the
	        traceback).
	    """
	    # Fail fast: nothing below can work without columns, so skip the file I/O.
	    if not text_columns:
	        return None, "Please select at least one text column"

	    missing_client_message: str = (
	        "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
	    )

	    try:
	        # Load data from file. A Path is passed as its full path; Path.name
	        # would keep only the final component and lose the directory.
	        if isinstance(file, (str, Path)):
	            df: pd.DataFrame = load_data(str(file))
	        else:
	            df = load_data(file.name)

	        # Check that all selected columns exist
	        missing_columns: List[str] = [col for col in text_columns if col not in df.columns]
	        if missing_columns:
	            # Column labels are not guaranteed to be strings, so convert
	            # them before joining.
	            available = ", ".join(map(str, df.columns))
	            return (
	                None,
	                f"Columns not found in the file: {', '.join(missing_columns)}. Available columns: {available}",
	            )

	        texts: List[str] = _combine_text_columns(df, text_columns)
	        category_list: List[str] = _parse_categories(categories)

	        # If no specific model is chosen, select one from the data size
	        if classifier_type == "auto":
	            classifier_type = _auto_classifier_type(len(texts))

	        # Get the client instance
	        client = get_client()

	        results: List[Dict[str, Any]]
	        if classifier_type == "tfidf":
	            results = TFIDFClassifier().classify(texts, category_list)
	        elif classifier_type in ("gpt35", "gpt4"):
	            if client is None:
	                return None, missing_client_message
	            model: str = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
	            llm_only = LLMClassifier(client=client, model=model)
	            results = await llm_only.classify_async(texts, category_list)
	        else:  # hybrid
	            if client is None:
	                return None, missing_client_message

	            # First pass with TF-IDF
	            tfidf_results: List[Dict[str, Any]] = TFIDFClassifier().classify(texts, category_list)

	            # Second pass with LLM for low confidence results
	            llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
	            results = []
	            low_confidence_texts: List[str] = []
	            low_confidence_indices: List[int] = []
	            for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
	                # Keep the TF-IDF answer in every slot, so a slot the LLM
	                # does not fill still holds a usable result instead of None.
	                results.append(tfidf_result)
	                if tfidf_result["confidence"] < 70:  # If confidence is below 70%
	                    low_confidence_texts.append(text)
	                    low_confidence_indices.append(i)

	            if low_confidence_texts:
	                llm_results: List[Dict[str, Any]] = await llm_classifier.classify_async(
	                    low_confidence_texts, category_list
	                )
	                for idx, llm_result in zip(low_confidence_indices, llm_results):
	                    results[idx] = llm_result

	        # Create results dataframe
	        result_df: pd.DataFrame = df.copy()
	        result_df["Category"] = [r["category"] for r in results]
	        result_df["Confidence"] = [r["confidence"] for r in results]

	        if show_explanations:
	            result_df["Explanation"] = [r["explanation"] for r in results]

	        # Validate results (the client may be None on the TF-IDF path)
	        validation_report: Optional[str] = validate_results(result_df, text_columns, client)

	        return result_df, validation_report

	    except Exception as e:
	        # Top-level boundary: report the failure to the caller as text
	        # instead of raising.
	        error_traceback: str = traceback.format_exc()
	        return None, f"Error: {str(e)}\n{error_traceback}"


	def process_file(
	    file: Union[str, Path],
	    text_columns: List[str],
	    categories: Optional[str],
	    classifier_type: str,
	    show_explanations: bool
	) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
	    """Blocking entry point around ``process_file_async``.

	    Takes the same arguments as ``process_file_async``, runs that coroutine
	    to completion and returns its ``(result_df, report)`` tuple unchanged.
	    """
	    pending = process_file_async(
	        file=file,
	        text_columns=text_columns,
	        categories=categories,
	        classifier_type=classifier_type,
	        show_explanations=show_explanations,
	    )
	    # asyncio.run drives the coroutine on an event loop it creates itself.
	    outcome = asyncio.run(pending)
	    return outcome


	def export_results(df: pd.DataFrame, format_type: str) -> Optional[str]:
	    """Export results to a file and return the file path for download.

	    Args:
	        df: Table to export. ``None`` is accepted and produces no file.
	        format_type: ``"excel"`` writes an ``.xlsx`` file; any other value
	            writes a ``.csv`` file.

	    Returns:
	        Path of the written file inside the ``temp_exports`` directory
	        (relative to the current working directory), or ``None`` when ``df``
	        is ``None``.
	    """
	    if df is None:
	        return None

	    import os
	    import uuid

	    # Create the export directory if it doesn't exist
	    temp_dir: str = "temp_exports"
	    os.makedirs(temp_dir, exist_ok=True)

	    # The timestamp only has one-second resolution, so a random suffix is
	    # added to keep two exports made within the same second from
	    # overwriting each other.
	    timestamp: str = time.strftime("%Y%m%d-%H%M%S")
	    filename: str = f"classification_results_{timestamp}_{uuid.uuid4().hex[:8]}"

	    if format_type == "excel":
	        file_path: str = os.path.join(temp_dir, f"{filename}.xlsx")
	        df.to_excel(file_path, index=False)
	    else:
	        file_path = os.path.join(temp_dir, f"{filename}.csv")
	        df.to_csv(file_path, index=False)

	    return file_path