Upload 10 files
- EDA.py +69 -0
- anomaly_detector_rf_model.pkl +3 -0
- app.py +58 -0
- export_model.py +9 -0
- feature_engineered_transactions.csv +0 -0
- feature_engineering.py +33 -0
- feature_order.pkl +3 -0
- modeling.py +88 -0
- synthetic dataset.py +66 -0
- synthetic_elderly_transactions.csv +0 -0
EDA.py
ADDED
@@ -0,0 +1,69 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set an emoji-capable font (optional; only needed for the emoji in titles)
plt.rcParams['font.family'] = 'Segoe UI Emoji'

# Load data
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])
df['date'] = df['timestamp'].dt.date

# Set style
sns.set_style("whitegrid")

# Create dashboard layout with extra space
fig = plt.figure(figsize=(20, 24), constrained_layout=True)
gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1.2])  # bottom plot gets more room

# --- Plot 1: Normal vs Suspicious ---
ax1 = fig.add_subplot(gs[0, 0])
sns.countplot(data=df, x='is_anomalous', hue='is_anomalous', palette='Set2', legend=False, ax=ax1)
ax1.set_title("Normal vs Suspicious Transactions", fontsize=16, pad=15)
ax1.set_xticks([0, 1])
ax1.set_xticklabels(['Normal', 'Suspicious'], fontsize=12)
ax1.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax1.set_ylabel("Count", fontsize=13, labelpad=10)

# --- Plot 2: Transaction Amount Distribution ---
ax2 = fig.add_subplot(gs[0, 1])
sns.histplot(data=df, x='amount', hue='is_anomalous', kde=True, bins=50, palette='coolwarm', ax=ax2)
ax2.set_title("Transaction Amount Distribution", fontsize=16, pad=15)
ax2.set_xlabel("Amount", fontsize=13, labelpad=10)
ax2.set_ylabel("Count", fontsize=13, labelpad=10)
ax2.tick_params(axis='x', rotation=30)

# --- Plot 3: Transaction Types ---
ax3 = fig.add_subplot(gs[1, 0])
sns.countplot(data=df, x='transaction_type', hue='is_anomalous', palette='muted', ax=ax3)
ax3.set_title("Transaction Types by Normal/Suspicious", fontsize=16, pad=15)
ax3.set_xlabel("Transaction Type", fontsize=13, labelpad=10)
ax3.set_ylabel("Count", fontsize=13, labelpad=10)
ax3.tick_params(axis='x', rotation=30)

# --- Plot 4: Top Merchants ---
ax4 = fig.add_subplot(gs[1, 1])
sns.countplot(
    data=df,
    y='merchant',
    order=df['merchant'].value_counts().index,
    hue='merchant',   # pair palette with hue to avoid the deprecation warning in newer seaborn
    legend=False,
    palette='viridis',
    ax=ax4
)
ax4.set_title("Most Frequent Merchants", fontsize=16, pad=15)
ax4.set_xlabel("Count", fontsize=13, labelpad=10)
ax4.set_ylabel("Merchant", fontsize=13, labelpad=10)
ax4.tick_params(axis='y', labelsize=11)

# --- Plot 5: Time Series ---
ax5 = fig.add_subplot(gs[2, :])  # full width
transactions_per_day = df.groupby('date').size()
ax5.plot(transactions_per_day.index, transactions_per_day.values, color='tab:blue', linewidth=2)
ax5.set_title("🗓️ Transactions Over Time", fontsize=18, pad=20)
ax5.set_xlabel("Date", fontsize=13, labelpad=10)
ax5.set_ylabel("Number of Transactions", fontsize=13, labelpad=10)
ax5.tick_params(axis='x', rotation=45)

# Show the dashboard
plt.show()
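If the dashboard should be kept as an artifact rather than only shown interactively, the figure object is already in hand; a one-line addition (the filename is an assumption, not in the script) suffices:

# Optional: persist the dashboard to disk (hypothetical filename).
fig.savefig('eda_dashboard.png', dpi=150, bbox_inches='tight')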
anomaly_detector_rf_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35e52f8138004330847e1a71fd3bb99216d843b201296d746cba8d2b95bda791
size 89129
app.py
ADDED
@@ -0,0 +1,58 @@
import gradio as gr
import pandas as pd
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# Load trained model & feature order
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')


# Function to make predictions
def detect_anomalies(file):
    # Read uploaded CSV
    df = pd.read_csv(file)

    # Ensure correct feature order (extra columns such as labels are dropped)
    df = df[feature_order]

    # Get predictions (0 = Normal, 1 = Anomalous)
    df['Prediction'] = model.predict(df)

    # Count anomalies
    anomaly_count = int(df['Prediction'].sum())
    total = len(df)

    # Visualization
    plt.figure(figsize=(5, 3))
    sns.countplot(x=df['Prediction'], hue=df['Prediction'], palette=['green', 'red'], legend=False)
    plt.xticks([0, 1], ["Normal", "Anomalous"])
    plt.title("Anomaly Distribution")
    plt.xlabel("Transaction Type")
    plt.ylabel("Count")
    plt.tight_layout()

    # Save the plot, then close the figure so figures don't accumulate across requests
    plot_path = "anomaly_plot.png"
    plt.savefig(plot_path)
    plt.close()

    # Return table, summary, and plot
    return df.head(), f"Detected {anomaly_count} anomalies out of {total} transactions.", plot_path


# Gradio interface
interface = gr.Interface(
    fn=detect_anomalies,
    inputs=gr.File(label="Upload Transaction CSV"),
    outputs=[
        gr.Dataframe(label="Predictions"),
        gr.Text(label="Summary"),
        gr.Image(label="Anomaly Chart")
    ],
    title="Financial Anomaly Detector",
    description="Upload a CSV file with transactions, and the model will detect suspicious activities."
)

# Launch app
interface.launch()
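As a quick local check before deploying (a minimal sketch, not part of the app): detect_anomalies accepts a plain file path, and since it selects the feature_order columns itself, the feature-engineered CSV can be passed directly even though it still contains the is_anomalous label column.

# Minimal local smoke test (assumes the pickled artifacts and the CSV
# produced by feature_engineering.py sit in the working directory).
preview, summary, plot_path = detect_anomalies('feature_engineered_transactions.csv')
print(summary)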
export_model.py
ADDED
@@ -0,0 +1,9 @@
import joblib

# Load model and feature list
model = joblib.load('anomaly_detector_rf_model.pkl')
feature_order = joblib.load('feature_order.pkl')

print("✅ Model and features loaded successfully!")
print("🔹 Feature columns used:")
print(feature_order)
feature_engineered_transactions.csv
ADDED
The diff for this file is too large to render.
feature_engineering.py
ADDED
@@ -0,0 +1,33 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])

# --- A. TEMPORAL FEATURES ---
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# --- B. BEHAVIORAL FEATURES ---
df['merchant_avg_amount'] = df.groupby('merchant')['amount'].transform('mean')
df['amount_zscore'] = (df['amount'] - df['amount'].mean()) / df['amount'].std()
df['log_amount'] = np.log1p(df['amount'])

# --- C. ENCODING ---
df = pd.get_dummies(df, columns=['transaction_type'], prefix='type')

le = LabelEncoder()
df['merchant_encoded'] = le.fit_transform(df['merchant'])

# --- D. DROP UNUSED COLUMNS ---
df = df.drop(columns=['timestamp', 'merchant', 'transaction_id', 'customer_id', 'age'])

# Print feature columns
print("✅ Final features:")
print(df.columns)

# Save to CSV
df.to_csv('feature_engineered_transactions.csv', index=False)
print("✅ Feature engineered dataset saved as 'feature_engineered_transactions.csv'")
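One caveat worth flagging: the fitted LabelEncoder, the per-merchant means, and the global amount mean/std live only in this script's memory, so raw transactions cannot be scored by app.py unless the same transforms are reapplied with the same fitted state. A sketch of how that state could be persisted alongside the model (the dict keys and filename are hypothetical, not part of the repo):

import joblib

# Hypothetical addition: capture the fitted transform state from the raw
# columns (before they are dropped) so inference can rebuild features exactly.
raw = pd.read_csv('synthetic_elderly_transactions.csv', parse_dates=['timestamp'])
transform_state = {
    'merchant_means': raw.groupby('merchant')['amount'].mean().to_dict(),
    'amount_mean': raw['amount'].mean(),
    'amount_std': raw['amount'].std(),
    'merchant_classes': list(le.classes_),  # from the LabelEncoder fitted above
}
joblib.dump(transform_state, 'transform_state.pkl')  # hypothetical filename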
feature_order.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a6861a92cdf5a61cd90851209d85a71c053e35cbb549aec6dd1e1a370bc173e
size 178
modeling.py
ADDED
@@ -0,0 +1,88 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------------
# Load the feature-engineered dataset
# -------------------------------
df = pd.read_csv('feature_engineered_transactions.csv')

# -------------------------------
# Split into features and labels
# -------------------------------
X = df.drop(columns=['is_anomalous'])
y = df['is_anomalous']

# -------------------------------
# Train-test split (stratified to preserve the ~5% anomaly rate)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# Train Random Forest classifier
# -------------------------------
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# -------------------------------
# Make predictions
# -------------------------------
y_pred = clf.predict(X_test)

# -------------------------------
# Evaluation report
# -------------------------------
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

# -------------------------------
# One page with subplots
# -------------------------------
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
plt.suptitle("Anomaly Detection Results", fontsize=16, fontweight='bold')

# --- A. Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Normal", "Suspicious"],
    yticklabels=["Normal", "Suspicious"],
    ax=axes[0]
)
axes[0].set_title("Confusion Matrix")
axes[0].set_xlabel("Predicted")
axes[0].set_ylabel("Actual")

# --- B. Feature Importance ---
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
sns.barplot(
    x=importances.values[:10],
    y=importances.index[:10],
    color='skyblue',
    ax=axes[1]
)
axes[1].set_title("Top 10 Feature Importances")
axes[1].set_xlabel("Importance")
axes[1].set_ylabel("Feature")

# --- Layout ---
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave room for the suptitle
plt.show()

# -------------------------------
# Save model and feature order for the app
# -------------------------------
joblib.dump(clf, 'anomaly_detector_rf_model.pkl')
joblib.dump(list(X.columns), 'feature_order.pkl')

print("✅ Model and feature list saved!")
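Because only about 5% of rows are anomalous, accuracy alone can look deceptively high here. A hedged addition (not part of the original script) that reports a threshold-free, imbalance-aware metric alongside the classification report:

# Optional extra metric: average precision (PR-AUC) is more informative
# than accuracy when positives are rare (~5% in this dataset).
from sklearn.metrics import average_precision_score

proba = clf.predict_proba(X_test)[:, 1]  # probability of the anomalous class
print(f"Average precision (PR-AUC): {average_precision_score(y_test, proba):.4f}")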
synthetic dataset.py
ADDED
@@ -0,0 +1,66 @@
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
np.random.seed(42)
random.seed(42)  # random.choice is used below, so seed this module too
Faker.seed(42)   # makes the fake timestamps/UUIDs reproducible as well

# Parameters
num_customers = 50
num_transactions = 3000
suspicious_ratio = 0.05  # 5% of transactions will be anomalies

# Transaction categories
regular_merchants = ['Pharmacy', 'Supermarket', 'Electricity Bill', 'Water Bill',
                     'Medical Checkup', 'Rent', 'Insurance']
suspicious_merchants = ['Gift Card Store', 'Unknown Transfer', 'Crypto Exchange',
                        'Late Night ATM', 'Online Casino']

transaction_types = ['debit', 'credit', 'atm_withdrawal']

# Generate customers
customer_ids = [f"CUST{1000 + i}" for i in range(num_customers)]
ages = np.random.randint(65, 90, size=num_customers)  # elderly age range

# Generate transactions
data = []

for _ in range(num_transactions):
    customer_idx = np.random.randint(0, num_customers)
    customer_id = customer_ids[customer_idx]
    age = ages[customer_idx]

    timestamp = fake.date_time_between(start_date='-180d', end_date='now')

    is_anomalous = np.random.rand() < suspicious_ratio

    if is_anomalous:
        merchant = random.choice(suspicious_merchants)
        amount = round(np.random.uniform(200, 5000), 2)
        transaction_type = random.choice(['debit', 'atm_withdrawal'])
    else:
        merchant = random.choice(regular_merchants)
        amount = round(np.random.uniform(10, 300), 2)
        transaction_type = random.choice(transaction_types)

    data.append({
        'customer_id': customer_id,
        'age': age,
        'transaction_id': fake.uuid4(),
        'timestamp': timestamp,
        'merchant': merchant,
        'amount': amount,
        'transaction_type': transaction_type,
        'is_anomalous': int(is_anomalous)
    })

# Convert to DataFrame and sort chronologically
df = pd.DataFrame(data)
df = df.sort_values(by='timestamp')

# Save to CSV
df.to_csv('synthetic_elderly_transactions.csv', index=False)
print("✅ Dataset created and saved as 'synthetic_elderly_transactions.csv'")
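A quick sanity check (a hedged addition, not in the script): since each row is an independent Bernoulli draw with p = 0.05, the realised anomaly share should land close to the configured ratio.

# Optional sanity check on the generated labels.
print(f"Anomaly share: {df['is_anomalous'].mean():.3f} (target: {suspicious_ratio})")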
synthetic_elderly_transactions.csv
ADDED
The diff for this file is too large to render.