# File size: 4,451 Bytes
# 93aa575
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import re

st.title("Expense Category Prediction")

# Load data from CSV.
# Fields are split on runs of two or more whitespace characters, so the
# separator is a regular expression (which requires engine='python'). A raw
# string is used so that "\s" is not treated as an (invalid) string escape.
df = pd.read_csv("financial_data.csv", sep=r'\s\s+', engine='python')

# Data Preprocessing
def preprocess_data(df):
    """Clean the raw expense table and build the model inputs.

    The frame is modified in place (columns are overwritten/added and rows
    without a usable date are dropped), so pass a copy if the original must
    be preserved.

    Args:
        df: DataFrame with 'Date', 'Expense_Category', 'Description' and
            'Amount' columns.

    Returns:
        A tuple ``(X, y, label_encoder, df)`` where ``X`` is the scaled
        feature matrix returned by ``StandardScaler.fit_transform``, ``y`` is
        the integer-encoded 'Expense_Category' Series, ``label_encoder`` is
        the fitted ``LabelEncoder`` and ``df`` is the cleaned frame including
        the engineered columns.

    Raises:
        ValueError: If no row has a usable date after cleaning.
    """
    # Keep only the YYYY-MM-DD part of the date column; expand=False makes
    # extract() return a Series rather than a one-column DataFrame.
    df['Date'] = df['Date'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

    # Forward fill missing dates, then drop rows that still have none
    # (i.e. rows before the first valid date).
    df['Date'] = df['Date'].ffill()
    df.dropna(subset=['Date'], inplace=True)
    if df.empty:
        raise ValueError("No rows with a valid 'Date' remain after cleaning.")

    # Convert 'Date' to datetime objects
    df['Date'] = pd.to_datetime(df['Date'])

    # Fill missing values in 'Expense_Category' and 'Description' with
    # 'Unknown'. Descriptions are also coerced to str so that numeric-looking
    # values do not break the text cleaning below.
    df['Expense_Category'] = df['Expense_Category'].fillna('Unknown')
    df['Description'] = df['Description'].fillna('Unknown').astype(str)

    # Convert 'Amount' to numeric; unparseable or missing values become 0
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)

    # Date feature engineering
    df['Month'] = df['Date'].dt.month
    df['DayOfWeek'] = df['Date'].dt.dayofweek

    # Description text processing: lower-case and strip everything except
    # letters, digits and whitespace.
    df['Description_Cleaned'] = (
        df['Description']
        .str.lower()
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    )

    # TF-IDF vectorization (limited to 100 features for simplicity).
    # NOTE: the vectorizer and the scaler below are fitted on every row that
    # is passed in, i.e. before any train/test split done by the caller.
    tfidf_vectorizer = TfidfVectorizer(max_features=100)
    tfidf_features = tfidf_vectorizer.fit_transform(
        df['Description_Cleaned']).toarray()
    # Use string column names: mixing the default integer labels with the
    # string-named numeric columns makes scikit-learn's feature-name
    # validation reject the combined frame.
    tfidf_columns = [f"tfidf_{i}" for i in range(tfidf_features.shape[1])]
    tfidf_df = pd.DataFrame(
        tfidf_features, index=df.index, columns=tfidf_columns)

    # Combine numeric/date features with the text features
    features_df = pd.concat(
        [df[['Amount', 'Month', 'DayOfWeek']], tfidf_df], axis=1)

    # Encode the target variable
    label_encoder = LabelEncoder()
    df['Expense_Category_Encoded'] = label_encoder.fit_transform(
        df['Expense_Category'])
    y = df['Expense_Category_Encoded']

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(features_df)

    return X, y, label_encoder, df  # df is the cleaned frame with new columns

X, y, label_encoder, df = preprocess_data(df.copy())

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# --- Models ---
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Encoded class ids and their original names, both in encoder order.
# LabelEncoder assigns id i to classes_[i], so passing these two lists
# together keeps every report row / matrix axis aligned with the right name,
# even when a class is absent from the test split.
class_labels = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

# --- Streamlit Tabs ---
tabs = st.tabs(list(models))

for tab, (model_name, model) in zip(tabs, models.items()):
    with tab:
        st.header(model_name)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # --- Confusion Matrix ---
        st.subheader("Confusion Matrix")
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # Close the figure so figures do not accumulate across script reruns.
        plt.close(fig)

        # --- Classification Report ---
        st.subheader("Classification Report")
        cr = classification_report(y_test, y_pred,
                                   labels=class_labels,
                                   target_names=class_names,
                                   zero_division=0)
        st.text(cr)

        # --- Remarks ---
        st.subheader("Remarks")
        st.write("Model Performance Analysis:")
        st.write(
            f"The {model_name} model's performance in predicting Expense Categories is shown above.")
        st.write("Key Metrics:")
        st.write(
            "-   The model uses a combination of expense amount, time-based features, and text descriptions to predict the expense category."
        )
        st.write(
            "-   The classification report provides insights into the model's precision, recall, and F1-score for each expense category."
        )