#!/usr/bin/env python3
"""
Standalone script to train the email classifier model.
"""
import pandas as pd | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import train_test_split, cross_val_score | |
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix | |
import joblib | |
import re | |
import os | |
from datetime import datetime | |
def preprocess_text(text: str) -> str:
    """Normalize raw email text before it is vectorized.

    The text is lower-cased, every character that is not a word character,
    whitespace, or one of ``, . - ! ?`` is replaced by a space, and runs of
    whitespace are then collapsed into a single space.

    Args:
        text: Raw email text.

    Returns:
        The cleaned text, without leading/trailing whitespace and without
        repeated whitespace.
    """
    text = text.lower()
    # Replace unwanted symbols first: each replacement inserts a space, so
    # whitespace has to be collapsed afterwards or "a @ b" would come out as
    # "a   b" instead of "a b".
    text = re.sub(r'[^\w\s,.\-!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def load_data(file_path: str):
    """Load the CSV dataset and return preprocessed messages with labels.

    The file must contain a ``message`` column (email text) and a ``label``
    column. Rows where either value is missing are dropped, because a missing
    message cannot be preprocessed and a missing label cannot be trained on.

    Args:
        file_path: Path to the CSV dataset.

    Returns:
        A ``(messages, labels)`` tuple of pandas Series, or ``(None, None)``
        when the file does not exist, a required column is absent, or no
        usable rows remain.
    """
    print(f"Loading dataset from {file_path}...")
    if not os.path.exists(file_path):
        print(f"Error: Dataset file {file_path} not found!")
        return None, None

    df = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")

    # Report a schema problem the same way as a missing file instead of
    # letting a KeyError escape from the column lookups below.
    required_columns = ('message', 'label')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Dataset is missing required column(s): {missing_columns}")
        return None, None

    # Empty CSV cells are read as NaN (a float), which has no .lower() and
    # would crash preprocess_text; drop such rows up front.
    incomplete = df[list(required_columns)].isna().any(axis=1)
    if incomplete.any():
        print(f"Warning: dropping {int(incomplete.sum())} rows with a missing message or label")
        df = df.loc[~incomplete].copy()

    if df.empty:
        print("Error: Dataset contains no usable rows!")
        return None, None

    # Basic data info
    print("\nLabel distribution:")
    print(df['label'].value_counts())

    # Preprocess messages; astype(str) guards against cells that pandas
    # parsed as numbers rather than text.
    df['processed_message'] = df['message'].astype(str).apply(preprocess_text)
    return df['processed_message'], df['label']
def train_model(X, y):
    """Fit and evaluate the TF-IDF + Multinomial Naive Bayes pipeline.

    A stratified 20% hold-out split is taken, the pipeline is fitted on the
    remaining data, 5-fold cross-validation accuracy is reported on the
    training portion, and test-set metrics plus the highest-scoring features
    per class are printed.

    Args:
        X: Preprocessed message texts.
        y: Labels aligned with ``X``. The printed reports name the two
            classes 'No Attachment' and 'Has Attachment', in that order.

    Returns:
        A ``(pipeline, test_accuracy)`` tuple: the fitted pipeline and its
        accuracy on the hold-out set.
    """
    # Display names used by both the classification report and the
    # per-class feature listing.
    class_names = ['No Attachment', 'Has Attachment']

    print("\nSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Assemble the two pipeline stages
    print("\nCreating model pipeline...")
    vectorizer = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True,
        min_df=1,
        max_df=0.95,
    )
    classifier = MultinomialNB(alpha=1.0)
    pipeline = Pipeline([('tfidf', vectorizer), ('classifier', classifier)])

    # Fit on the training portion
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # 5-fold cross-validation on the training portion
    print("Performing cross-validation...")
    cv_scores = cross_val_score(
        pipeline, X_train, y_train, cv=5, scoring='accuracy'
    )
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Hold-out evaluation
    print("\nEvaluating on test set...")
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy: {test_accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Per-class listing of the features with the highest log-probability
    print("\nAnalyzing most important features...")
    steps = pipeline.named_steps
    feature_names = steps['tfidf'].get_feature_names_out()
    feature_scores = steps['classifier'].feature_log_prob_

    for class_idx, class_name in enumerate(class_names):
        # argsort is ascending: take the last 20 and flip them so the
        # highest-scoring feature is printed first.
        best_first = np.argsort(feature_scores[class_idx])[-20:][::-1]
        print(f"\nTop 20 features for {class_name}:")
        print(", ".join([feature_names[i] for i in best_first]))

    return pipeline, test_accuracy
def save_model(pipeline, accuracy, output_path='email_classifier_model.pkl'):
    """Write the trained pipeline and its metadata to disk with joblib.

    Args:
        pipeline: Fitted pipeline containing a ``'tfidf'`` step.
        accuracy: Accuracy value stored alongside the model.
        output_path: Destination file for the serialized bundle.
    """
    print(f"\nSaving model to {output_path}...")

    # Gather the metadata values first, then bundle them with the pipeline.
    vocabulary_size = len(pipeline.named_steps['tfidf'].vocabulary_)
    trained_at = datetime.now().isoformat()
    model_info = {
        'pipeline': pipeline,
        'accuracy': accuracy,
        'feature_count': vocabulary_size,
        'training_date': trained_at,
        'model_type': 'Multinomial Naive Bayes',
        'preprocessing': 'TF-IDF with 1-2 grams',
    }
    joblib.dump(model_info, output_path)

    summary_lines = (
        f"Model saved successfully!",
        f"Model info:",
        f" - Accuracy: {accuracy:.4f}",
        f" - Features: {model_info['feature_count']}",
        f" - Training date: {model_info['training_date']}",
    )
    for line in summary_lines:
        print(line)
def test_model_predictions(pipeline):
    """Print predictions for a fixed set of sample emails as a smoke test.

    Each sample is preprocessed, classified, and printed together with the
    predicted label, the highest class probability, and both class
    probabilities.

    Args:
        pipeline: Fitted classifier pipeline exposing ``predict`` and
            ``predict_proba``.
    """
    separator = "="*50
    print("\n" + separator)
    print("TESTING MODEL WITH SAMPLE PREDICTIONS")
    print(separator)

    test_messages = (
        "Hello, please find attached the document you requested.",
        "Good morning, I'm sharing the report as discussed.",
        "Hi team, attached is the presentation for tomorrow's meeting.",
        "Dear all, kindly review the attached files.",
        "Hello, how are you doing today?",
        "I will send you the information later.",
        "Please let me know if you need any clarification.",
        "The meeting is scheduled for 3 PM tomorrow.",
    )

    for msg in test_messages:
        # The pipeline expects a sequence of documents, so wrap the single
        # cleaned message in a list.
        batch = [preprocess_text(msg)]
        prediction = pipeline.predict(batch)[0]
        probabilities = pipeline.predict_proba(batch)[0]
        confidence = max(probabilities)

        if prediction == 1:
            label = "Has Attachment"
        else:
            label = "No Attachment"

        print(f"\nMessage: '{msg}'")
        print(f"Prediction: {label} (confidence: {confidence:.3f})")
        print(f"Probabilities: No={probabilities[0]:.3f}, Yes={probabilities[1]:.3f}")
def main():
    """Run the full training workflow: load, train, save, smoke-test.

    Returns:
        Process exit status: ``0`` when training completed, ``1`` when the
        dataset could not be loaded.
    """
    print("="*60)
    print("EMAIL ATTACHMENT CLASSIFIER TRAINING")
    print("="*60)

    dataset_path = 'Synthetic_Email_Dataset.csv'
    # Defined once so the file that is written and the file named in the
    # final message cannot drift apart.
    model_path = 'email_classifier_model.pkl'

    # Load data
    X, y = load_data(dataset_path)
    if X is None:
        print("Failed to load dataset. Exiting...")
        # Non-zero status lets shells and CI detect the failed run.
        return 1

    # Train model
    pipeline, accuracy = train_model(X, y)

    # Save model
    save_model(pipeline, accuracy, model_path)

    # Test predictions
    test_model_predictions(pipeline)

    print("\n" + "="*60)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    print(f"Final model accuracy: {accuracy:.4f}")
    print(f"Model saved as '{model_path}'")
    print("You can now deploy the API using 'python app.py'")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())