Ujeshhh committed on
Commit
d16c0f6
·
verified ·
1 Parent(s): c204a34

Upload 10 files

Browse files
EDA.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+
5
# Exploratory dashboard for the synthetic elderly-transactions dataset:
# five plots (class balance, amount distribution, transaction types,
# merchant frequency, daily volume) on a single figure.

# Emoji-capable font so the calendar glyph in the time-series title renders.
# NOTE(review): 'Segoe UI Emoji' normally ships only with Windows; confirm
# the fallback rendering is acceptable on other platforms.
plt.rcParams['font.family'] = 'Segoe UI Emoji'

# Load data; 'timestamp' is parsed so the .dt accessor works below.
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])
df['date'] = df['timestamp'].dt.date  # calendar day, used for daily counts

# Set style
sns.set_style("whitegrid")

# Create dashboard layout: 3 rows x 2 columns, bottom row slightly taller
fig = plt.figure(figsize=(20, 24), constrained_layout=True)
gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1.2])

# --- Plot 1: Normal vs Suspicious ---
ax1 = fig.add_subplot(gs[0, 0])
# Pin the category order so position 0 is always class 0 and position 1 is
# always class 1; the fixed tick labels below rely on that mapping.
sns.countplot(
    data=df,
    x='is_anomalous',
    hue='is_anomalous',
    order=[0, 1],
    hue_order=[0, 1],
    palette='Set2',
    legend=False,
    ax=ax1,
)
ax1.set_title("Normal vs Suspicious Transactions", fontsize=16, pad=15)
ax1.set_xticks([0, 1])
ax1.set_xticklabels(['Normal', 'Suspicious'], fontsize=12)
ax1.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax1.set_ylabel("Count", fontsize=13, labelpad=10)

# --- Plot 2: Transaction Amount Distribution ---
ax2 = fig.add_subplot(gs[0, 1])
sns.histplot(
    data=df,
    x='amount',
    hue='is_anomalous',
    kde=True,
    bins=50,
    palette='coolwarm',
    ax=ax2,
)
ax2.set_title("Transaction Amount Distribution", fontsize=16, pad=15)
ax2.set_xlabel("Amount", fontsize=13, labelpad=10)
ax2.set_ylabel("Count", fontsize=13, labelpad=10)
ax2.tick_params(axis='x', rotation=30)

# --- Plot 3: Transaction Types ---
ax3 = fig.add_subplot(gs[1, 0])
sns.countplot(data=df, x='transaction_type', hue='is_anomalous', palette='muted', ax=ax3)
ax3.set_title("Transaction Types by Normal/Suspicious", fontsize=16, pad=15)
ax3.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax3.set_ylabel("Count", fontsize=13, labelpad=10)
ax3.tick_params(axis='x', rotation=30)

# --- Plot 4: Top Merchants ---
ax4 = fig.add_subplot(gs[1, 1])
# Merchants ordered from most to least frequent.
merchant_order = df['merchant'].value_counts().index
# seaborn deprecates passing `palette` without `hue`; map hue to the same
# variable (legend suppressed) to keep the per-bar colours without the warning.
sns.countplot(
    data=df,
    y='merchant',
    hue='merchant',
    order=merchant_order,
    hue_order=merchant_order,
    palette='viridis',
    legend=False,
    ax=ax4,
)
ax4.set_title("Most Frequent Merchants", fontsize=16, pad=15)
ax4.set_xlabel("Count", fontsize=13, labelpad=10)
ax4.set_ylabel("Merchant", fontsize=13, labelpad=10)
ax4.tick_params(axis='y', labelsize=11)

# --- Plot 5: Time Series ---
ax5 = fig.add_subplot(gs[2, :])  # spans both columns
transactions_per_day = df.groupby('date').size()
ax5.plot(
    transactions_per_day.index,
    transactions_per_day.values,
    color='tab:blue',
    linewidth=2,
)
ax5.set_title("🗓️ Transactions Over Time", fontsize=18, pad=20)
ax5.set_xlabel("Date", fontsize=13, labelpad=10)
ax5.set_ylabel("Number of Transactions", fontsize=13, labelpad=10)
ax5.tick_params(axis='x', rotation=45)

# Show the finished dashboard
plt.show()
anomaly_detector_rf_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e52f8138004330847e1a71fd3bb99216d843b201296d746cba8d2b95bda791
3
+ size 89129
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import joblib
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+
7
# Load trained model & feature order.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
# were produced by this project's own training script.
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')


def detect_anomalies(file):
    """Score an uploaded transaction CSV and summarise the predictions.

    Args:
        file: The value Gradio passes for the ``gr.File`` input. Either a
            filesystem path or an object exposing the path as ``.name``.

    Returns:
        A 3-tuple ``(preview, summary, plot_path)``:
        the first rows of the feature frame with an added ``Prediction``
        column, a one-line summary string, and the path of a PNG bar chart
        of the prediction counts.

    Raises:
        gr.Error: If no file was supplied or the CSV lacks a column listed
            in ``feature_order``.
    """
    import os
    import tempfile

    if file is None:
        raise gr.Error("Please upload a CSV file.")

    # Older Gradio versions hand over a temp-file wrapper whose path is in
    # `.name`; newer ones pass the path string itself.
    csv_source = getattr(file, "name", file)
    uploaded = pd.read_csv(csv_source)

    missing = [col for col in feature_order if col not in uploaded.columns]
    if missing:
        raise gr.Error(f"Uploaded CSV is missing required columns: {missing}")

    # Ensure correct feature order; copy so adding a column below does not
    # write into a view of the uploaded frame.
    df = uploaded[feature_order].copy()

    # Get predictions (0 = Normal, 1 = Anomalous)
    df['Prediction'] = model.predict(df[feature_order])

    # Count anomalies
    anomaly_count = int(df['Prediction'].sum())
    total = len(df)

    # Visualization
    fig, ax = plt.subplots(figsize=(5, 3))
    try:
        # Pin both classes on the axis so the bar at position 0 is always
        # "Normal" and position 1 is always "Anomalous", even when the
        # predictions contain only one class.
        sns.countplot(
            data=df,
            x='Prediction',
            hue='Prediction',
            order=[0, 1],
            hue_order=[0, 1],
            palette=['green', 'red'],
            legend=False,
            ax=ax,
        )
        ax.set_xticks([0, 1])
        ax.set_xticklabels(["Normal", "Anomalous"])
        ax.set_title("Anomaly Distribution")
        ax.set_xlabel("Transaction Type")
        ax.set_ylabel("Count")
        fig.tight_layout()

        # Save the plot to a unique file so concurrent requests cannot
        # overwrite each other's chart.
        fd, plot_path = tempfile.mkstemp(prefix="anomaly_plot_", suffix=".png")
        os.close(fd)
        fig.savefig(plot_path)
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive for
        # the lifetime of the server process.
        plt.close(fig)

    # Return table, summary and plot
    return (
        df.head(),
        f"Detected {anomaly_count} anomalies out of {total} transactions.",
        plot_path,
    )
42
+
43
+
44
+ # Gradio Interface
45
+ interface = gr.Interface(
46
+ fn=detect_anomalies,
47
+ inputs=gr.File(label="Upload Transaction CSV"),
48
+ outputs=[
49
+ gr.Dataframe(label="Predictions"),
50
+ gr.Text(label="Summary"),
51
+ gr.Image(label="Anomaly Chart")
52
+ ],
53
+ title="Financial Anomaly Detector",
54
+ description="Upload a CSV file with transactions, and the model will detect suspicious activities."
55
+ )
56
+
57
+ # Launch app
58
+ interface.launch()
export_model.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+
3
# Sanity check: both saved artifacts can be loaded, then show the feature list.
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')

# One print call, one item per line (same output as three separate prints).
print(
    "✅ Model and features loaded successfully!",
    "🔹 Feature columns used:",
    feature_order,
    sep="\n",
)
feature_engineered_transactions.csv ADDED
The diff for this file is too large to render. See raw diff
 
feature_engineering.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import LabelEncoder
4
+
5
# Build the model-ready feature table from the raw synthetic transactions.

# Load the dataset; 'timestamp' is parsed so the .dt accessor works below.
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])

# --- A. TEMPORAL FEATURES ---
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# --- B. BEHAVIORAL FEATURES ---
# Mean amount of all transactions at the same merchant.
df['merchant_avg_amount'] = df.groupby('merchant')['amount'].transform('mean')
# Standard score of the amount relative to the whole dataset.
df['amount_zscore'] = (df['amount'] - df['amount'].mean()) / df['amount'].std()
# log(1 + amount), computed on the whole column at once instead of calling a
# Python lambda per row.
df['log_amount'] = np.log1p(df['amount'])

# --- C. ENCODING ---
# One-hot encode the transaction type into 'type_<value>' columns.
df = pd.get_dummies(df, columns=['transaction_type'], prefix='type')

# Integer-code the merchant name.
le = LabelEncoder()
df['merchant_encoded'] = le.fit_transform(df['merchant'])

# --- D. DROP UNUSED COLUMNS ---
df = df.drop(columns=['timestamp', 'merchant', 'transaction_id', 'customer_id', 'age'])

# Print feature columns
print("✅ Final features:")
print(df.columns)

# Save to CSV
df.to_csv('feature_engineered_transactions.csv', index=False)
print("✅ Feature engineered dataset saved as 'feature_engineered_transactions.csv'")
feature_order.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a6861a92cdf5a61cd90851209d85a71c053e35cbb549aec6dd1e1a370bc173e
3
+ size 178
modeling.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.metrics import classification_report, confusion_matrix
7
+
8
import joblib

# Train a random-forest anomaly classifier on the feature-engineered data,
# report its hold-out performance, save the artifacts, then plot diagnostics.

# -------------------------------
# Load the feature-engineered dataset
# -------------------------------
df = pd.read_csv('feature_engineered_transactions.csv')

# -------------------------------
# Split into features and labels
# -------------------------------
X = df.drop(columns=['is_anomalous'])
y = df['is_anomalous']

# -------------------------------
# Train-test split (stratified so both splits keep the class ratio)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# Train Random Forest Classifier
# -------------------------------
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# -------------------------------
# Make predictions
# -------------------------------
y_pred = clf.predict(X_test)

# -------------------------------
# Evaluation Report
# -------------------------------
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

# -------------------------------
# Persist artifacts
# -------------------------------
# Saved before plotting: plt.show() can block until the window is closed,
# and an interrupted plot step should not cost the trained model.
joblib.dump(clf, 'anomaly_detector_rf_model.pkl')

# Column order the model was trained with, needed again at inference time.
joblib.dump(list(X.columns), 'feature_order.pkl')

print("✅ Model and feature list saved!")

# -------------------------------
# Create one page with subplots
# -------------------------------
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle("Anomaly Detection Results", fontsize=16, fontweight='bold')

# --- A. Confusion Matrix ---
# Pin the label order so the matrix is always 2x2 and lines up with the
# fixed "Normal"/"Suspicious" tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Normal", "Suspicious"],
    yticklabels=["Normal", "Suspicious"],
    ax=axes[0],
)
axes[0].set_title("Confusion Matrix")
axes[0].set_xlabel("Predicted")
axes[0].set_ylabel("Actual")

# --- B. Feature Importance ---
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_importances = importances.head(10)
sns.barplot(
    x=top_importances.values,
    y=top_importances.index,
    color='skyblue',
    ax=axes[1],
)
axes[1].set_title("Top 10 Feature Importances")
axes[1].set_xlabel("Importance")
axes[1].set_ylabel("Feature")

# --- Layout ---
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave headroom for the suptitle
plt.show()
synthetic dataset.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from faker import Faker
4
+ import random
5
+ from datetime import datetime, timedelta
6
+
7
# Generate a labelled synthetic dataset of elderly customers' transactions
# and write it to 'synthetic_elderly_transactions.csv'.

fake = Faker()

# Seed every random source this script draws from. Seeding NumPy alone left
# the merchant/type choices (stdlib `random`) and the Faker-generated
# timestamps/UUIDs different on each run.
# NOTE(review): timestamps are drawn relative to 'now', so their absolute
# values presumably still shift with the run date - confirm if exact
# reproducibility across days is required.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
Faker.seed(SEED)

# Parameters
num_customers = 50
num_transactions = 3000
suspicious_ratio = 0.05  # 5% transactions will be anomalies

# Transaction categories
regular_merchants = ['Pharmacy', 'Supermarket', 'Electricity Bill', 'Water Bill', 'Medical Checkup', 'Rent',
                     'Insurance']
suspicious_merchants = ['Gift Card Store', 'Unknown Transfer', 'Crypto Exchange', 'Late Night ATM', 'Online Casino']

transaction_types = ['debit', 'credit', 'atm_withdrawal']

# Generate customers
customer_ids = [f"CUST{1000 + i}" for i in range(num_customers)]
ages = np.random.randint(65, 90, size=num_customers)  # ages 65-89 (upper bound exclusive)

# Generate transactions
data = []

for _ in range(num_transactions):
    customer_idx = np.random.randint(0, num_customers)
    customer_id = customer_ids[customer_idx]
    age = ages[customer_idx]

    timestamp = fake.date_time_between(start_date='-180d', end_date='now')

    is_anomalous = np.random.rand() < suspicious_ratio

    if is_anomalous:
        # Anomalies: suspicious merchant, larger amount, never a credit.
        merchant = random.choice(suspicious_merchants)
        amount = round(np.random.uniform(200, 5000), 2)
        transaction_type = random.choice(['debit', 'atm_withdrawal'])
    else:
        merchant = random.choice(regular_merchants)
        amount = round(np.random.uniform(10, 300), 2)
        transaction_type = random.choice(transaction_types)

    data.append({
        'customer_id': customer_id,
        'age': age,
        'transaction_id': fake.uuid4(),
        'timestamp': timestamp,
        'merchant': merchant,
        'amount': amount,
        'transaction_type': transaction_type,
        'is_anomalous': int(is_anomalous),
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Sort by timestamp
df = df.sort_values(by='timestamp')

# Save to CSV
df.to_csv('synthetic_elderly_transactions.csv', index=False)
print("✅ Dataset created and saved as 'synthetic_elderly_transactions.csv'")
synthetic_elderly_transactions.csv ADDED
The diff for this file is too large to render. See raw diff