File size: 6,242 Bytes
1291f7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python3
"""
Standalone script to train the email classifier model
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import re
import os
from datetime import datetime

def preprocess_text(text: str) -> str:
    """Normalize raw email text before it is vectorized.

    The text is lowercased, every character that is not a word character,
    whitespace, or one of ``, . - ! ?`` is replaced by a space, and finally
    all whitespace runs are collapsed to a single space and the ends trimmed.

    Args:
        text: Raw message text. Missing values (``None`` or a float NaN, as
            pandas produces for empty CSV cells) yield ``""``; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s,.\-!?]', ' ', text)
    # Collapse whitespace last: the substitution above inserts spaces, so
    # doing it first would leave runs of spaces in the result.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_data(file_path: str):
    """Load the labelled email dataset and preprocess its messages.

    The CSV must provide a ``message`` column (raw text) and a ``label``
    column. Basic statistics are printed, rows lacking a message or a label
    are dropped, and each remaining message is run through
    ``preprocess_text``.

    Args:
        file_path: Path to the CSV dataset.

    Returns:
        A ``(processed_messages, labels)`` tuple of pandas Series, or
        ``(None, None)`` when the file does not exist, a required column is
        absent, or no usable rows remain.
    """
    print(f"Loading dataset from {file_path}...")

    if not os.path.exists(file_path):
        print(f"Error: Dataset file {file_path} not found!")
        return None, None

    df = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")

    # Fail through the same (None, None) path as a missing file instead of
    # raising KeyError further down.
    missing_columns = [col for col in ('message', 'label') if col not in df.columns]
    if missing_columns:
        print(f"Error: Dataset is missing required column(s): {missing_columns}")
        return None, None

    # Empty CSV cells are read as NaN; such rows can neither be preprocessed
    # nor used for stratified splitting, so discard them up front.
    incomplete = df['message'].isna() | df['label'].isna()
    if incomplete.any():
        print(f"Dropping {int(incomplete.sum())} rows with missing message or label")
        df = df.loc[~incomplete].copy()

    if df.empty:
        print("Error: Dataset contains no usable rows!")
        return None, None

    # Basic data info
    print("\nLabel distribution:")
    print(df['label'].value_counts())

    # Preprocess messages (cast first so purely numeric cells are handled too)
    df['processed_message'] = df['message'].astype(str).apply(preprocess_text)

    return df['processed_message'], df['label']

def train_model(X, y):
    """Train and evaluate the TF-IDF + Multinomial Naive Bayes pipeline.

    The data is split 80/20 (stratified on ``y``, fixed random seed). The
    pipeline is fitted on the training portion, 5-fold cross-validation
    accuracy is reported on that same portion, and the fitted pipeline is
    then evaluated on the held-out test portion. Finally the 20 features
    with the highest log-probability are printed for every class.

    Args:
        X: Preprocessed message strings.
        y: Labels aligned with ``X``. Labels ``0`` and ``1`` are displayed as
            "No Attachment" and "Has Attachment"; any other label is shown
            via ``str(label)``.

    Returns:
        A ``(pipeline, test_accuracy)`` tuple: the fitted pipeline and its
        accuracy on the held-out test set.
    """
    print("\nSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Create pipeline
    print("\nCreating model pipeline...")
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 2),
            stop_words='english',
            lowercase=True,
            min_df=1,
            max_df=0.95
        )),
        ('classifier', MultinomialNB(alpha=1.0))
    ])

    # Train model
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # Cross-validation
    print("Performing cross-validation...")
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Derive display names from the classes the model actually learned, so
    # the report rows and the per-class feature lists stay aligned with the
    # classifier's own class order instead of relying on a hard-coded list.
    classifier = pipeline.named_steps['classifier']
    class_labels = list(classifier.classes_)
    known_names = {0: 'No Attachment', 1: 'Has Attachment'}
    class_names = [known_names.get(label, str(label)) for label in class_labels]

    # Test set evaluation
    print("\nEvaluating on test set...")
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Test accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_names
    ))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_labels))

    # Feature analysis
    print("\nAnalyzing most important features...")
    feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()
    feature_scores = classifier.feature_log_prob_

    # Top features for each class; rows of feature_log_prob_ follow classes_.
    for class_idx, class_name in enumerate(class_names):
        # argsort is ascending, so take the tail and flip it: best first.
        top_features_idx = np.argsort(feature_scores[class_idx])[-20:][::-1]
        top_features = [feature_names[i] for i in top_features_idx]
        print(f"\nTop 20 features for {class_name}:")
        print(", ".join(top_features))

    return pipeline, test_accuracy

def save_model(pipeline, accuracy, output_path='email_classifier_model.pkl'):
    """Persist the fitted pipeline together with descriptive metadata.

    The pipeline, its test accuracy, the vocabulary size, a timestamp and two
    descriptive strings are bundled into one dict and written to
    ``output_path`` with joblib. A short summary is printed afterwards.
    """
    print(f"\nSaving model to {output_path}...")

    vocabulary_size = len(pipeline.named_steps['tfidf'].vocabulary_)
    trained_at = datetime.now().isoformat()

    # Everything a consumer needs travels in a single artifact.
    model_info = dict(
        pipeline=pipeline,
        accuracy=accuracy,
        feature_count=vocabulary_size,
        training_date=trained_at,
        model_type='Multinomial Naive Bayes',
        preprocessing='TF-IDF with 1-2 grams',
    )
    joblib.dump(model_info, output_path)

    print("Model saved successfully!")
    print("Model info:")
    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - Features: {vocabulary_size}")
    print(f"  - Training date: {trained_at}")

def test_model_predictions(pipeline):
    """Run the classifier on a few hand-written emails and print the results.

    For every sample the predicted label, the highest class probability and
    the probabilities at positions 0 ("No") and 1 ("Yes") are printed.
    """
    divider = "=" * 50
    print(f"\n{divider}")
    print("TESTING MODEL WITH SAMPLE PREDICTIONS")
    print(divider)

    sample_emails = (
        "Hello, please find attached the document you requested.",
        "Good morning, I'm sharing the report as discussed.",
        "Hi team, attached is the presentation for tomorrow's meeting.",
        "Dear all, kindly review the attached files.",
        "Hello, how are you doing today?",
        "I will send you the information later.",
        "Please let me know if you need any clarification.",
        "The meeting is scheduled for 3 PM tomorrow.",
    )

    for raw_email in sample_emails:
        cleaned = preprocess_text(raw_email)
        predicted = pipeline.predict([cleaned])[0]
        class_probs = pipeline.predict_proba([cleaned])[0]
        top_prob = max(class_probs)

        if predicted == 1:
            verdict = "Has Attachment"
        else:
            verdict = "No Attachment"

        print(f"\nMessage: '{raw_email}'")
        print(f"Prediction: {verdict} (confidence: {top_prob:.3f})")
        print(f"Probabilities: No={class_probs[0]:.3f}, Yes={class_probs[1]:.3f}")

def main():
    """Run the whole workflow: load data, train, save, then smoke-test."""
    banner = "=" * 60
    print(banner)
    print("EMAIL ATTACHMENT CLASSIFIER TRAINING")
    print(banner)

    # Stop early when the dataset could not be loaded.
    messages, labels = load_data('Synthetic_Email_Dataset.csv')
    if messages is None:
        print("Failed to load dataset. Exiting...")
        return

    model, test_accuracy = train_model(messages, labels)
    save_model(model, test_accuracy)
    test_model_predictions(model)

    print(f"\n{banner}")
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(banner)
    print(f"Final model accuracy: {test_accuracy:.4f}")
    print("Model saved as 'email_classifier_model.pkl'")
    print("You can now deploy the API using 'python app.py'")

# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()