# NOTE: scraped page header kept as a comment - Spaces: Sleeping; File size: 6,242 Bytes; revision 1291f7a.
# (The viewer's line-number gutter, 1-186, was removed.)
#!/usr/bin/env python3
"""
Standalone script to train the email classifier model
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import re
import os
from datetime import datetime
def preprocess_text(text: str) -> str:
    """Normalize raw email text before vectorization.

    The text is lower-cased, every character that is not a word character,
    whitespace, or one of ``, . - ! ?`` is replaced by a space, runs of
    whitespace are collapsed to a single space, and the result is stripped.

    Args:
        text: Raw message text. ``None`` and float NaN (e.g. an empty cell
            read from a CSV by pandas) are treated as an empty message; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned, single-spaced, lower-case text.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Replace disallowed characters first: each one becomes a space, so the
    # whitespace collapse has to run afterwards or it leaves multi-space gaps.
    text = re.sub(r'[^\w\s,.\-!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def load_data(file_path: str):
    """Load the email dataset from CSV and return cleaned messages and labels.

    Args:
        file_path: Path to a CSV file that has a ``message`` column (email
            text) and a ``label`` column.

    Returns:
        A ``(messages, labels)`` pair of pandas Series, where ``messages``
        holds the ``message`` column after ``preprocess_text``. Returns
        ``(None, None)``, after printing an error, when the file does not
        exist or a required column is missing.
    """
    print(f"Loading dataset from {file_path}...")
    if not os.path.exists(file_path):
        print(f"Error: Dataset file {file_path} not found!")
        return None, None

    df = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")

    # Report a malformed dataset the same way as a missing file instead of
    # letting the column lookups below raise KeyError.
    missing = [col for col in ('message', 'label') if col not in df.columns]
    if missing:
        print(f"Error: Dataset is missing required column(s): {missing}")
        return None, None

    # Empty cells are read as NaN; such rows can be neither preprocessed as
    # text nor used as training targets, so drop them up front.
    rows_before = len(df)
    df = df.dropna(subset=['message', 'label']).copy()
    dropped = rows_before - len(df)
    if dropped:
        print(f"Dropped {dropped} rows with a missing message or label")

    # Basic data info
    print("\nLabel distribution:")
    print(df['label'].value_counts())

    # Preprocess messages; astype(str) guards against cells that pandas
    # parsed as numbers rather than text.
    df['processed_message'] = df['message'].astype(str).apply(preprocess_text)
    return df['processed_message'], df['label']
def train_model(X, y):
    """Fit and evaluate a TF-IDF + Multinomial Naive Bayes pipeline.

    Holds out a stratified 20% of the data as a test set, fits the pipeline
    on the remainder, and prints: 5-fold cross-validation accuracy on the
    training split, test-set accuracy, a classification report, the
    confusion matrix, and the 20 highest log-probability features per class.

    Args:
        X: Message texts to train on.
        y: Labels aligned with ``X``. The report names the classes
            'No Attachment' and 'Has Attachment' by position.

    Returns:
        Tuple ``(pipeline, test_accuracy)``: the fitted pipeline and its
        accuracy on the held-out test set.
    """
    class_names = ['No Attachment', 'Has Attachment']

    print("\nSplitting data...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set: {len(train_texts)} samples")
    print(f"Test set: {len(test_texts)} samples")

    # Assemble the two pipeline stages separately, then chain them.
    print("\nCreating model pipeline...")
    vectorizer = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True,
        min_df=1,
        max_df=0.95,
    )
    classifier = MultinomialNB(alpha=1.0)
    pipeline = Pipeline([('tfidf', vectorizer), ('classifier', classifier)])

    print("Training model...")
    pipeline.fit(train_texts, train_labels)

    print("Performing cross-validation...")
    cv_scores = cross_val_score(
        pipeline, train_texts, train_labels, cv=5, scoring='accuracy'
    )
    cv_mean = cv_scores.mean()
    cv_spread = cv_scores.std() * 2
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

    print("\nEvaluating on test set...")
    predicted = pipeline.predict(test_texts)
    test_accuracy = accuracy_score(test_labels, predicted)
    print(f"Test accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(test_labels, predicted, target_names=class_names))
    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, predicted))

    print("\nAnalyzing most important features...")
    vocabulary = pipeline.named_steps['tfidf'].get_feature_names_out()
    log_probs = pipeline.named_steps['classifier'].feature_log_prob_

    for row, class_name in enumerate(class_names):
        # argsort is ascending, so take the last 20 and flip them to list
        # the highest log-probability feature first.
        best_first = np.argsort(log_probs[row])[-20:][::-1]
        ranked_features = [vocabulary[i] for i in best_first]
        print(f"\nTop 20 features for {class_name}:")
        print(", ".join(ranked_features))

    return pipeline, test_accuracy
def save_model(pipeline, accuracy, output_path='email_classifier_model.pkl'):
    """Write the trained pipeline and its metadata to disk with joblib.

    The saved object is a dict holding the pipeline, its accuracy, the size
    of the TF-IDF vocabulary, the current timestamp (ISO format) and two
    descriptive strings. A short summary is printed afterwards.

    Args:
        pipeline: Fitted pipeline with a 'tfidf' step exposing ``vocabulary_``.
        accuracy: Accuracy value to store and report.
        output_path: Destination file for the dumped dict.
    """
    print(f"\nSaving model to {output_path}...")

    vocabulary_size = len(pipeline.named_steps['tfidf'].vocabulary_)
    trained_at = datetime.now().isoformat()

    # Bundle the estimator together with its metadata.
    model_info = dict(
        pipeline=pipeline,
        accuracy=accuracy,
        feature_count=vocabulary_size,
        training_date=trained_at,
        model_type='Multinomial Naive Bayes',
        preprocessing='TF-IDF with 1-2 grams',
    )
    joblib.dump(model_info, output_path)

    summary_lines = (
        "Model saved successfully!",
        "Model info:",
        f" - Accuracy: {accuracy:.4f}",
        f" - Features: {model_info['feature_count']}",
        f" - Training date: {model_info['training_date']}",
    )
    for line in summary_lines:
        print(line)
def test_model_predictions(pipeline):
    """Print the pipeline's predictions for a fixed set of sample emails.

    Each sample is cleaned with ``preprocess_text`` and then passed to
    ``pipeline.predict`` and ``pipeline.predict_proba``. The predicted label,
    the highest class probability and both class probabilities are printed.

    Args:
        pipeline: Fitted model exposing ``predict`` and ``predict_proba``.
            A prediction equal to 1 is reported as "Has Attachment".
    """
    banner = "=" * 50
    print("\n" + banner)
    print("TESTING MODEL WITH SAMPLE PREDICTIONS")
    print(banner)

    sample_emails = (
        "Hello, please find attached the document you requested.",
        "Good morning, I'm sharing the report as discussed.",
        "Hi team, attached is the presentation for tomorrow's meeting.",
        "Dear all, kindly review the attached files.",
        "Hello, how are you doing today?",
        "I will send you the information later.",
        "Please let me know if you need any clarification.",
        "The meeting is scheduled for 3 PM tomorrow.",
    )

    for email in sample_emails:
        # The pipeline expects a batch, so wrap the single cleaned message.
        batch = [preprocess_text(email)]
        predicted = pipeline.predict(batch)[0]
        class_probs = pipeline.predict_proba(batch)[0]
        top_prob = max(class_probs)

        if predicted == 1:
            verdict = "Has Attachment"
        else:
            verdict = "No Attachment"

        print(f"\nMessage: '{email}'")
        print(f"Prediction: {verdict} (confidence: {top_prob:.3f})")
        print(f"Probabilities: No={class_probs[0]:.3f}, Yes={class_probs[1]:.3f}")
def main():
    """Run the full workflow: load data, train, save, and spot-check.

    Reads 'Synthetic_Email_Dataset.csv' from the working directory. If the
    dataset cannot be loaded, a message is printed and the function returns
    without training.
    """
    rule = "=" * 60
    print(rule)
    print("EMAIL ATTACHMENT CLASSIFIER TRAINING")
    print(rule)

    # Step 1: load the dataset; bail out early when it is unavailable.
    messages, labels = load_data('Synthetic_Email_Dataset.csv')
    if messages is None:
        print("Failed to load dataset. Exiting...")
        return

    # Step 2: fit and evaluate the model.
    pipeline, accuracy = train_model(messages, labels)

    # Step 3: persist it to the default output path.
    save_model(pipeline, accuracy)

    # Step 4: print predictions for a few hand-written examples.
    test_model_predictions(pipeline)

    print("\n" + rule)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(rule)
    print(f"Final model accuracy: {accuracy:.4f}")
    print("Model saved as 'email_classifier_model.pkl'")
    print("You can now deploy the API using 'python app.py'")
# Run the training workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()