# Email Classification API: classifies support emails into categories and
# masks personal information found in the email body.

from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import re

# Initialize the FastAPI app. Interactive documentation is exposed at the
# explicitly configured /docs and /redoc URLs.
app = FastAPI(
    title="Email Classification API",
    version="1.0.0",
    description="Classifies support emails into categories and masks personal information.",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Load the pre-trained model once, when this module is imported.
# The path is relative — presumably resolved against the process working
# directory; verify against the deployment layout.
# NOTE(review): joblib.load unpickles the file, so "model.joblib" must come
# from a trusted source.
model = joblib.load("model.joblib")

# Input schema: the JSON request body accepted by the /classify endpoint.
class EmailInput(BaseModel):
    # Raw email text that will be PII-masked and then classified.
    input_email_body: str

# PII patterns, tried in the order listed. Text claimed by an earlier pattern
# is never re-examined by a later one, so the order defines the priority.
# A naive full-name pattern is deliberately omitted to prevent false positives
# like 'Dear Sir'.
# NOTE(review): "phone_number" precedes the Aadhaar/card patterns and accepts
# 9-16 characters of digits, whitespace and hyphens, so 12-digit Aadhaar-style
# and 16-digit card-style numbers are claimed (wholly or partly) as
# phone_number first — confirm this priority is intended.
# NOTE(review): the "cvv" prefix in cvv_no is optional, so any standalone
# 3-4 digit number is masked as cvv_no — confirm.
_PII_PATTERNS = (
    ("email", re.compile(r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b")),
    ("phone_number", re.compile(r"\+?\d[\d\s\-]{7,14}\d")),
    ("dob", re.compile(r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b")),
    ("aadhar_num", re.compile(r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?!\d)")),
    ("credit_debit_no", re.compile(r"\b(?:\d[ -]*?){13,19}\b")),
    ("cvv_no", re.compile(r"(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b")),
    ("expiry_no", re.compile(r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b")),
)


def mask_and_store_all_pii(text):
    """Replace every detected PII span in *text* with a numbered placeholder.

    Patterns are applied in the order of ``_PII_PATTERNS``. Each pattern only
    scans the parts of the input that no earlier pattern has claimed, so a
    span is masked at most once.

    Args:
        text: The text to mask; it is converted with ``str()`` first.

    Returns:
        A 3-tuple ``(masked_text, pii_map, entity_list)``:

        * ``masked_text`` - the input with each matched span replaced by a
          placeholder of the form ``[<label>_<NNN>]``.
        * ``pii_map`` - dict mapping each placeholder to the text it replaced.
        * ``entity_list`` - one dict per match, in detection order (pattern
          priority, then left to right), with keys ``"position"``
          (``[start, end]`` offsets into the *original* text, end exclusive),
          ``"classification"`` (the pattern label) and ``"entity"`` (the
          matched text).
    """
    text = str(text)
    pii_map = {}
    entity_list = []
    # Spans already masked, as (start, end, placeholder) in original-text
    # coordinates, kept sorted by start. They never overlap.
    claimed = []

    for label, regex in _PII_PATTERNS:
        found = []
        cursor = 0
        # Walk the gaps between claimed spans; the sentinel closes the last
        # gap, which runs to the end of the text.
        for span_start, span_end, _ in claimed + [(len(text), len(text), None)]:
            # Matching on the sliced gap (rather than the whole string) keeps
            # a match from touching text that is already masked.
            for match in regex.finditer(text[cursor:span_start]):
                original = match.group()
                # Shift gap-relative offsets back to original-text offsets.
                start = cursor + match.start()
                end = cursor + match.end()
                placeholder = f"[{label}_{len(pii_map):03d}]"
                pii_map[placeholder] = original
                entity_list.append({
                    "position": [start, end],
                    "classification": label,
                    "entity": original
                })
                found.append((start, end, placeholder))
            cursor = span_end
        claimed = sorted(claimed + found)

    # Assemble the masked text from the untouched gaps and the placeholders,
    # replacing exactly the matched spans.
    pieces = []
    cursor = 0
    for start, end, placeholder in claimed:
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), pii_map, entity_list

# Undo masking: put the original values back in place of their placeholders.
def restore_pii(masked_text, pii_map):
    """Return *masked_text* with every placeholder in *pii_map* expanded.

    Args:
        masked_text: Text containing placeholders.
        pii_map: Dict mapping each placeholder to the text it stands for.

    Returns:
        The text after replacing all occurrences of each placeholder, applied
        in the map's iteration order. With an empty map, *masked_text* is
        returned unchanged.
    """
    result = masked_text
    for token in pii_map:
        result = result.replace(token, pii_map[token])
    return result

# Classification endpoint: masks PII in the submitted email body, classifies
# the masked text with the loaded model, and returns the raw body, the
# detected entities, the masked body and the predicted category.
@app.post("/classify")
def classify_email(data: EmailInput):
    email_body = data.input_email_body

    # The placeholder-to-original map is not part of the response, so it is
    # discarded here.
    masked_email, _, masked_entities = mask_and_store_all_pii(email_body)

    # The model is given a one-element batch; its only prediction is the
    # category for this email.
    category = model.predict([masked_email])[0]

    response = {
        "input_email_body": email_body,
        "list_of_masked_entities": masked_entities,
        "masked_email": masked_email,
        "category_of_the_email": category,
    }
    return response

# Health check endpoint: GET / answers with a fixed status message.
@app.get("/")
def root():
    status = {"message": "Email Classification API is running."}
    return status