Upload 10 files
- EDA.py +69 -0
- anomaly_detector_rf_model.pkl +3 -0
- app.py +58 -0
- export_model.py +9 -0
- feature_engineered_transactions.csv +0 -0
- feature_engineering.py +33 -0
- feature_order.pkl +3 -0
- modeling.py +88 -0
- synthetic dataset.py +66 -0
- synthetic_elderly_transactions.csv +0 -0
EDA.py
ADDED
@@ -0,0 +1,69 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set an emoji-capable font (optional; only needed for the emoji in titles)
plt.rcParams['font.family'] = 'Segoe UI Emoji'

# Load data
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])
df['date'] = df['timestamp'].dt.date

# Set style
sns.set_style("whitegrid")

# Create dashboard layout with extra space
fig = plt.figure(figsize=(20, 24), constrained_layout=True)
gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1.2])  # bottom plot gets more room

# --- Plot 1: Normal vs Suspicious ---
ax1 = fig.add_subplot(gs[0, 0])
sns.countplot(data=df, x='is_anomalous', hue='is_anomalous', palette='Set2', legend=False, ax=ax1)
ax1.set_title("Normal vs Suspicious Transactions", fontsize=16, pad=15)
ax1.set_xticks([0, 1])
ax1.set_xticklabels(['Normal', 'Suspicious'], fontsize=12)
ax1.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax1.set_ylabel("Count", fontsize=13, labelpad=10)

# --- Plot 2: Transaction Amount Distribution ---
ax2 = fig.add_subplot(gs[0, 1])
sns.histplot(data=df, x='amount', hue='is_anomalous', kde=True, bins=50, palette='coolwarm', ax=ax2)
ax2.set_title("Transaction Amount Distribution", fontsize=16, pad=15)
ax2.set_xlabel("Amount", fontsize=13, labelpad=10)
ax2.set_ylabel("Count", fontsize=13, labelpad=10)
ax2.tick_params(axis='x', rotation=30)

# --- Plot 3: Transaction Types ---
ax3 = fig.add_subplot(gs[1, 0])
sns.countplot(data=df, x='transaction_type', hue='is_anomalous', palette='muted', ax=ax3)
ax3.set_title("Transaction Types by Normal/Suspicious", fontsize=16, pad=15)
ax3.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax3.set_ylabel("Count", fontsize=13, labelpad=10)
ax3.tick_params(axis='x', rotation=30)

# --- Plot 4: Top Merchants ---
ax4 = fig.add_subplot(gs[1, 1])
sns.countplot(
    data=df,
    y='merchant',
    order=df['merchant'].value_counts().index,
    hue='merchant',   # pair palette with hue to avoid the deprecation warning in newer seaborn
    legend=False,
    palette='viridis',
    ax=ax4
)
ax4.set_title("Most Frequent Merchants", fontsize=16, pad=15)
ax4.set_xlabel("Count", fontsize=13, labelpad=10)
ax4.set_ylabel("Merchant", fontsize=13, labelpad=10)
ax4.tick_params(axis='y', labelsize=11)

# --- Plot 5: Time Series ---
ax5 = fig.add_subplot(gs[2, :])  # full width
transactions_per_day = df.groupby('date').size()
ax5.plot(transactions_per_day.index, transactions_per_day.values, color='tab:blue', linewidth=2)
ax5.set_title("🗓️ Transactions Over Time", fontsize=18, pad=20)
ax5.set_xlabel("Date", fontsize=13, labelpad=10)
ax5.set_ylabel("Number of Transactions", fontsize=13, labelpad=10)
ax5.tick_params(axis='x', rotation=45)

# Show the dashboard
plt.show()
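If the dashboard should be kept as an artifact rather than only shown interactively, the figure object is already in hand; a one-line addition (the filename is an assumption, not in the script) suffices:

# Optional: persist the dashboard to disk (hypothetical filename).
fig.savefig('eda_dashboard.png', dpi=150, bbox_inches='tight')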
anomaly_detector_rf_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35e52f8138004330847e1a71fd3bb99216d843b201296d746cba8d2b95bda791
size 89129
app.py
ADDED
@@ -0,0 +1,58 @@
import gradio as gr
import pandas as pd
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# Load trained model & feature order
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')


# Function to make predictions
def detect_anomalies(file):
    # Read uploaded CSV
    df = pd.read_csv(file)

    # Ensure correct feature order (extra columns such as labels are dropped)
    df = df[feature_order]

    # Get predictions (0 = Normal, 1 = Anomalous)
    df['Prediction'] = model.predict(df)

    # Count anomalies
    anomaly_count = int(df['Prediction'].sum())
    total = len(df)

    # Visualization
    plt.figure(figsize=(5, 3))
    sns.countplot(x=df['Prediction'], hue=df['Prediction'], palette=['green', 'red'], legend=False)
    plt.xticks([0, 1], ["Normal", "Anomalous"])
    plt.title("Anomaly Distribution")
    plt.xlabel("Transaction Type")
    plt.ylabel("Count")
    plt.tight_layout()

    # Save the plot, then close the figure so figures don't accumulate across requests
    plot_path = "anomaly_plot.png"
    plt.savefig(plot_path)
    plt.close()

    # Return table, summary, and plot
    return df.head(), f"Detected {anomaly_count} anomalies out of {total} transactions.", plot_path


# Gradio interface
interface = gr.Interface(
    fn=detect_anomalies,
    inputs=gr.File(label="Upload Transaction CSV"),
    outputs=[
        gr.Dataframe(label="Predictions"),
        gr.Text(label="Summary"),
        gr.Image(label="Anomaly Chart")
    ],
    title="Financial Anomaly Detector",
    description="Upload a CSV file with transactions, and the model will detect suspicious activities."
)

# Launch app
interface.launch()
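As a quick local check before deploying (a minimal sketch, not part of the app): detect_anomalies accepts a plain file path, and since it selects the feature_order columns itself, the feature-engineered CSV can be passed directly even though it still contains the is_anomalous label column.

# Minimal local smoke test (assumes the pickled artifacts and the CSV
# produced by feature_engineering.py sit in the working directory).
preview, summary, plot_path = detect_anomalies('feature_engineered_transactions.csv')
print(summary)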
export_model.py
ADDED
@@ -0,0 +1,9 @@
import joblib

# Load model and feature list
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')

print("✅ Model and features loaded successfully!")
print("🔹 Feature columns used:")
print(feature_order)
feature_engineered_transactions.csv
ADDED
The diff for this file is too large to render.
feature_engineering.py
ADDED
@@ -0,0 +1,33 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])

# --- A. TEMPORAL FEATURES ---
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# --- B. BEHAVIORAL FEATURES ---
df['merchant_avg_amount'] = df.groupby('merchant')['amount'].transform('mean')
df['amount_zscore'] = (df['amount'] - df['amount'].mean()) / df['amount'].std()
df['log_amount'] = np.log1p(df['amount'])

# --- C. ENCODING ---
df = pd.get_dummies(df, columns=['transaction_type'], prefix='type')

le = LabelEncoder()
df['merchant_encoded'] = le.fit_transform(df['merchant'])

# --- D. DROP UNUSED COLUMNS ---
df = df.drop(columns=['timestamp', 'merchant', 'transaction_id', 'customer_id', 'age'])

# Print feature columns
print("✅ Final features:")
print(df.columns)

# Save to CSV
df.to_csv('feature_engineered_transactions.csv', index=False)
print("✅ Feature engineered dataset saved as 'feature_engineered_transactions.csv'")
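One caveat worth flagging: the fitted LabelEncoder, the per-merchant means, and the global amount mean/std live only in this script's memory, so raw transactions cannot be scored by app.py unless the same transforms are reapplied with the same fitted state. A sketch of how that state could be persisted alongside the model (the dict keys and filename are hypothetical, not part of the repo):

import joblib

# Hypothetical addition: capture the fitted transform state from the raw
# columns (before they are dropped) so inference can rebuild features exactly.
raw = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])
transform_state = {
    'merchant_means': raw.groupby('merchant')['amount'].mean().to_dict(),
    'amount_mean': raw['amount'].mean(),
    'amount_std': raw['amount'].std(),
    'merchant_classes': list(le.classes_),  # from the LabelEncoder fitted above
}
joblib.dump(transform_state, 'transform_state.pkl')  # hypothetical filename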
feature_order.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a6861a92cdf5a61cd90851209d85a71c053e35cbb549aec6dd1e1a370bc173e
size 178
modeling.py
ADDED
@@ -0,0 +1,88 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------------
# Load the feature-engineered dataset
# -------------------------------
df = pd.read_csv('feature_engineered_transactions.csv')

# -------------------------------
# Split into features and labels
# -------------------------------
X = df.drop(columns=['is_anomalous'])
y = df['is_anomalous']

# -------------------------------
# Train-test split (stratified to preserve the ~5% anomaly rate)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# Train Random Forest classifier
# -------------------------------
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# -------------------------------
# Make predictions
# -------------------------------
y_pred = clf.predict(X_test)

# -------------------------------
# Evaluation report
# -------------------------------
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

# -------------------------------
# One page with subplots
# -------------------------------
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
plt.suptitle("Anomaly Detection Results", fontsize=16, fontweight='bold')

# --- A. Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Normal", "Suspicious"],
    yticklabels=["Normal", "Suspicious"],
    ax=axes[0]
)
axes[0].set_title("Confusion Matrix")
axes[0].set_xlabel("Predicted")
axes[0].set_ylabel("Actual")

# --- B. Feature Importance ---
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
sns.barplot(
    x=importances.values[:10],
    y=importances.index[:10],
    color='skyblue',
    ax=axes[1]
)
axes[1].set_title("Top 10 Feature Importances")
axes[1].set_xlabel("Importance")
axes[1].set_ylabel("Feature")

# --- Layout ---
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave room for the suptitle
plt.show()

# -------------------------------
# Save model and feature order for the app
# -------------------------------
joblib.dump(clf, 'anomaly_detector_rf_model.pkl')
joblib.dump(list(X.columns), 'feature_order.pkl')

print("✅ Model and feature list saved!")
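Because only about 5% of rows are anomalous, accuracy alone can look deceptively high here. A hedged addition (not part of the original script) that reports a threshold-free, imbalance-aware metric alongside the classification report:

# Optional extra metric: average precision (PR-AUC) is more informative
# than accuracy when positives are rare (~5% in this dataset).
from sklearn.metrics import average_precision_score

proba = clf.predict_proba(X_test)[:, 1]  # probability of the anomalous class
print(f"Average precision (PR-AUC): {average_precision_score(y_test, proba):.4f}")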
synthetic dataset.py
ADDED
@@ -0,0 +1,66 @@
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
np.random.seed(42)
random.seed(42)  # random.choice is used below, so seed this module too
Faker.seed(42)   # makes the fake timestamps/UUIDs reproducible as well

# Parameters
num_customers = 50
num_transactions = 3000
suspicious_ratio = 0.05  # 5% of transactions will be anomalies

# Transaction categories
regular_merchants = ['Pharmacy', 'Supermarket', 'Electricity Bill', 'Water Bill',
                     'Medical Checkup', 'Rent', 'Insurance']
suspicious_merchants = ['Gift Card Store', 'Unknown Transfer', 'Crypto Exchange',
                        'Late Night ATM', 'Online Casino']

transaction_types = ['debit', 'credit', 'atm_withdrawal']

# Generate customers
customer_ids = [f"CUST{1000 + i}" for i in range(num_customers)]
ages = np.random.randint(65, 90, size=num_customers)  # elderly age range

# Generate transactions
data = []

for _ in range(num_transactions):
    customer_idx = np.random.randint(0, num_customers)
    customer_id = customer_ids[customer_idx]
    age = ages[customer_idx]

    timestamp = fake.date_time_between(start_date='-180d', end_date='now')

    is_anomalous = np.random.rand() < suspicious_ratio

    if is_anomalous:
        merchant = random.choice(suspicious_merchants)
        amount = round(np.random.uniform(200, 5000), 2)
        transaction_type = random.choice(['debit', 'atm_withdrawal'])
    else:
        merchant = random.choice(regular_merchants)
        amount = round(np.random.uniform(10, 300), 2)
        transaction_type = random.choice(transaction_types)

    data.append({
        'customer_id': customer_id,
        'age': age,
        'transaction_id': fake.uuid4(),
        'timestamp': timestamp,
        'merchant': merchant,
        'amount': amount,
        'transaction_type': transaction_type,
        'is_anomalous': int(is_anomalous)
    })

# Convert to DataFrame and sort chronologically
df = pd.DataFrame(data)
df = df.sort_values(by='timestamp')

# Save to CSV
df.to_csv('synthetic_elderly_transactions.csv', index=False)
print("✅ Dataset created and saved as 'synthetic_elderly_transactions.csv'")
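A quick sanity check (a hedged addition, not in the script): since each row is an independent Bernoulli draw with p = 0.05, the realised anomaly share should land close to the configured ratio.

# Optional sanity check on the generated labels.
print(f"Anomaly share: {df['is_anomalous'].mean():.3f} (target: {suspicious_ratio})")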
synthetic_elderly_transactions.csv
ADDED
The diff for this file is too large to render.