File size: 1,798 Bytes
bf70aa2
 
 
 
 
cea309f
 
 
 
 
 
 
bf70aa2
 
 
cea309f
bf70aa2
 
 
 
cea309f
bf70aa2
 
 
 
cea309f
bf70aa2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cea309f
bf70aa2
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import re

# Initialize the FastAPI app with default docs enabled
# Application object; the title, version and description below are the
# metadata handed to FastAPI for this service.
app = FastAPI(
    title="Email Classification API",
    version="1.0.0",
    description="Classifies emails and masks PII/PCI information."
)

# Load the model once, at import time, so every request reuses the same object
# instead of re-reading "model.joblib" (resolved relative to the working
# directory) on each call.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — confirm that model.joblib always comes from a trusted source.
model = joblib.load("model.joblib")

# Define the root endpoint
@app.get("/")
def root():
    # Fixed status payload for GET "/"; it takes no input and reads no state.
    status = {"message": "Email Classification API is running."}
    return status

# Define input data schema
class EmailInput(BaseModel):
    # Request body accepted by the /classify endpoint.

    # Subject line; optional, defaults to an empty string when omitted.
    subject: str = ""
    # Email body text; required (no default).
    email: str

# Function to mask and store PII
def mask_and_store_all_pii(text):
    """Replace PII/PCI-looking substrings with placeholders.

    Each pattern is applied in turn to the progressively masked text. Every
    match is replaced *at its own position* by ``[<label>_<n>]``, where ``n``
    counts the matches of that label from left to right, starting at 0.

    Args:
        text: Input to scan; it is converted with ``str()`` first.

    Returns:
        A ``(masked_text, pii_map)`` tuple. ``pii_map`` maps every placeholder
        present in ``masked_text`` to the exact substring it replaced, so the
        original text can be rebuilt by substituting the placeholders back.
    """
    text = str(text)
    pii_map = {}

    # Order matters: earlier patterns claim their text first. The card pattern
    # must run before the Aadhaar one, otherwise the first 12 digits of a
    # space/dash-grouped 16-digit card are masked as an Aadhaar number and
    # the last 4 digits are left in clear text.
    patterns = {
        "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
        "phone_number": r"\b\d{10}\b",
        "dob": r"\b\d{2}[/-]\d{2}[/-]\d{4}\b",
        "credit_debit_no": r"\b(?:\d[ -]*?){13,16}\b",
        "aadhar_num": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}\b",
        "cvv_no": r"\b\d{3}\b",
        "expiry_no": r"\b(0[1-9]|1[0-2])\/\d{2,4}\b",
        "full_name": r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b"
    }

    for label, pattern in patterns.items():
        pieces = []
        cursor = 0
        for index, match in enumerate(re.finditer(pattern, text)):
            placeholder = f"[{label}_{index}]"
            # group(0) is the whole match; re.findall would return only the
            # capture group for patterns that have one (e.g. just the month
            # of an expiry date).
            pii_map[placeholder] = match.group(0)
            # Splice by span instead of str.replace so identical digits
            # elsewhere in the text (or inside longer tokens) stay untouched.
            pieces.append(text[cursor:match.start()])
            pieces.append(placeholder)
            cursor = match.end()
        pieces.append(text[cursor:])
        text = "".join(pieces)

    return text, pii_map

# Endpoint to classify email
@app.post("/classify")
def classify_email(data: EmailInput):
    # Subject and body are joined with a single space and handled as one text.
    combined = " ".join((data.subject, data.email))

    # Mask first: the model is only ever given the placeholder version.
    sanitized, replacements = mask_and_store_all_pii(combined)

    # predict() is called with a one-item list, so the first element of its
    # result is the prediction for this email.
    labels = model.predict([sanitized])

    response = {
        "predicted_category": labels[0],
        "masked_text": sanitized,
        "pii_map": replacements,
    }
    return response