|
import streamlit as st
|
|
import pandas as pd
|
|
import numpy as np
|
|
import joblib
|
|
import re
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.svm import LinearSVC
|
|
from sklearn.metrics import classification_report, accuracy_score
|
|
from sklearn.model_selection import train_test_split
|
|
import nltk
|
|
from nltk.corpus import stopwords
|
|
from nltk.stem import WordNetLemmatizer
|
|
import plotly.express as px
|
|
import plotly.graph_objects as go
|
|
from plotly.subplots import make_subplots
|
|
|
|
|
|
@st.cache_resource
def download_nltk_data():
    """Make sure the NLTK resources used by this app are available locally.

    Every resource is looked up on its own and only the ones that are
    missing are downloaded. A single shared ``try`` would re-download
    everything as soon as one lookup failed, and would never notice a
    missing ``omw-1.4`` because that resource was not part of the lookup.

    Returns:
        None. The function is called for its side effect only.
    """
    # (lookup path passed to nltk.data.find, package id passed to nltk.download)
    required_resources = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('corpora/omw-1.4', 'omw-1.4'),
    )

    for resource_path, package_id in required_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_id, quiet=True)
|
|
|
|
# Fetch any missing NLTK resources before the corpus-backed objects below
# are created.
download_nltk_data()

# Module-level text-processing resources shared by preprocess_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
|
|
|
|
def preprocess_text(text):
    """Normalise a free-text transaction description for vectorisation.

    The text is lower-cased, punctuation and digits are replaced by spaces,
    and the result is split on whitespace. Tokens that are English stop
    words or at most two characters long are dropped; the remaining tokens
    are lemmatized and joined with single spaces.

    Args:
        text: The raw description. Any value is accepted and converted with
            ``str()``; values for which ``pd.isna`` is true yield ``""``.

    Returns:
        str: The cleaned, space-separated tokens (possibly empty).
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Substitute a space rather than the empty string: deleting the
    # characters outright would glue neighbouring words together, e.g.
    # "rent/utilities" would become the single token "rentutilities".
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    letters_only = re.sub(r'\d+', ' ', without_punctuation)

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words and len(token) > 2
    )
|
|
|
|
|
|
@st.cache_data
def create_sample_data():
    """Build the built-in demo dataset of labelled transaction descriptions.

    Returns:
        pandas.DataFrame: One row per example, in the order listed below,
        with the columns ``purpose_text`` (the description) and
        ``transaction_type`` (its category label).
    """
    # (description, category) pairs; the row order is kept exactly as written.
    labelled_examples = [
        ("Monthly apartment rent payment", "rent"),
        ("Grocery shopping at walmart", "groceries"),
        ("Electric bill payment", "utilities"),
        ("Netflix monthly subscription", "subscription"),
        ("Gas station fuel", "transportation"),
        ("Restaurant dinner", "dining"),
        ("Apartment rent for december", "rent"),
        ("Weekly grocery shopping", "groceries"),
        ("Water bill payment", "utilities"),
        ("Spotify premium subscription", "subscription"),
        ("Bus fare to work", "transportation"),
        ("Coffee shop breakfast", "dining"),
        ("Monthly rent payment", "rent"),
        ("Food shopping at target", "groceries"),
        ("Internet bill", "utilities"),
        ("Amazon Prime membership", "subscription"),
        ("Uber ride home", "transportation"),
        ("Pizza delivery", "dining"),
        ("Rent for apartment", "rent"),
        ("Supermarket groceries", "groceries"),
        ("Phone bill payment", "utilities"),
        ("YouTube premium", "subscription"),
        ("Train ticket", "transportation"),
        ("Fast food lunch", "dining"),
        ("Office supplies", "shopping"),
        ("Medical appointment", "healthcare"),
        ("Gym membership", "fitness"),
        ("Book purchase", "shopping"),
        ("Doctor visit", "healthcare"),
        ("Fitness class", "fitness"),
        ("Clothing purchase", "shopping"),
        ("Pharmacy prescription", "healthcare"),
        ("Personal trainer", "fitness"),
        ("Electronics store", "shopping"),
        ("Dentist appointment", "healthcare"),
        ("Yoga class", "fitness"),
        ("Gift for friend", "shopping"),
        ("Eye exam", "healthcare"),
        ("Swimming pool fee", "fitness"),
        ("Home improvement", "shopping"),
    ]

    return pd.DataFrame(
        labelled_examples,
        columns=['purpose_text', 'transaction_type'],
    )
|
|
|
|
@st.cache_resource
def train_models(df):
    """Train several text classifiers and pick the most accurate one.

    The descriptions are cleaned with ``preprocess_text``, split 80/20 into
    train and test sets (``random_state=42``), vectorised with TF-IDF over
    unigrams and bigrams, and used to fit Naive Bayes, Logistic Regression
    and LinearSVC. Each model is scored by accuracy on the test split.

    Args:
        df: DataFrame with a ``purpose_text`` column (raw descriptions) and
            a ``transaction_type`` column (labels). It is not modified.

    Returns:
        tuple: ``(best_model, vectorizer, results, trained_models)`` where
            * ``best_model`` is the fitted model with the highest test
              accuracy (the first one in definition order on a tie),
            * ``vectorizer`` is the fitted ``TfidfVectorizer``,
            * ``results`` maps model name to a dict with the keys
              ``'accuracy'``, ``'predictions'`` and ``'actual'``,
            * ``trained_models`` maps model name to the fitted model.
    """
    # Keep the cleaned text in a local Series instead of writing a new
    # column onto the caller's DataFrame.
    cleaned_purpose = df['purpose_text'].apply(preprocess_text)
    labels = df['transaction_type']

    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_purpose, labels, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only; the test split is
    # merely transformed with it.
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    models = {
        "Naive Bayes": MultinomialNB(),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "SVM (LinearSVC)": LinearSVC(random_state=42),
    }

    results = {}
    trained_models = {}

    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'predictions': y_pred,
            'actual': y_test,
        }
        trained_models[name] = model

    best_model_name = max(results, key=lambda name: results[name]['accuracy'])

    return trained_models[best_model_name], vectorizer, results, trained_models
|
|
|
|
def _class_scores(model, features):
    """Return per-class scores for a single vectorised sample.

    Not every model trained by this app offers ``predict_proba`` (LinearSVC
    does not), so calling it unconditionally raises ``AttributeError`` when
    such a model is the best one. In that case the ``decision_function``
    margins are turned into a normalised score vector with a softmax.

    Args:
        model: A fitted classifier exposing ``predict_proba`` or, failing
            that, ``decision_function``.
        features: Vectorised input holding one sample.

    Returns:
        tuple: ``(scores, is_probability)``. ``scores`` is a 1-D array with
        one entry per class, in the order of ``model.classes_``.
        ``is_probability`` is True when the values came from
        ``predict_proba`` and False when they are softmax-normalised margins.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[0], True

    margins = np.atleast_2d(model.decision_function(features))
    if margins.shape[1] == 1:
        # Binary problems yield one margin per sample (positive means the
        # second class); expand it to one column per class.
        margins = np.hstack([-margins, margins])

    row = margins[0]
    # Subtract the maximum before exponentiating for numerical stability.
    exponentials = np.exp(row - row.max())
    return exponentials / exponentials.sum(), False


def _render_home(df):
    """Render the landing page: overview text, data preview and label mix."""
    st.header("Welcome to Transaction Classification System")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("π Project Overview")
        st.write("""
        This system classifies financial transactions based on their purpose text using machine learning.

        **Features:**
        - Multiple ML models (Naive Bayes, Logistic Regression, SVM)
        - Text preprocessing with NLTK
        - Interactive model comparison
        - Real-time transaction classification
        """)

    with col2:
        st.subheader("π Sample Data")
        st.dataframe(df.head(10))

    st.subheader("π·οΈ Transaction Types")
    type_counts = df['transaction_type'].value_counts()
    fig = px.pie(values=type_counts.values, names=type_counts.index, title="Distribution of Transaction Types")
    st.plotly_chart(fig, use_container_width=True)


def _render_training(df):
    """Train the models, show their scores and store them in session state."""
    st.header("Model Training & Evaluation")

    with st.spinner("Training models..."):
        best_model, vectorizer, results, trained_models = train_models(df)

    best_model_name = max(results, key=lambda name: results[name]['accuracy'])

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("π Model Performance")

        results_df = pd.DataFrame({
            'Model': list(results),
            'Accuracy': [entry['accuracy'] for entry in results.values()],
        })

        fig = px.bar(results_df, x='Model', y='Accuracy', title="Model Accuracy Comparison")
        fig.update_layout(yaxis_range=[0, 1])
        st.plotly_chart(fig, use_container_width=True)

        st.dataframe(results_df)

    with col2:
        st.subheader("π― Best Model Details")
        st.success(f"**Best Model:** {best_model_name}")
        st.metric("Accuracy", f"{results[best_model_name]['accuracy']:.3f}")

        st.subheader("π Classification Report")
        y_test = results[best_model_name]['actual']
        y_pred = results[best_model_name]['predictions']

        # The test split is tiny, so some classes get no predictions;
        # zero_division=0 reports 0 for them without emitting warnings.
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        report_df = pd.DataFrame(report).transpose()
        st.dataframe(report_df.round(3))

    # The Classification and Model Comparison pages read these entries.
    st.session_state.best_model = best_model
    st.session_state.vectorizer = vectorizer
    st.session_state.trained_models = trained_models


def _render_classification():
    """Classify a user-entered description with the stored best model."""
    st.header("Classify New Transaction")

    if 'best_model' not in st.session_state:
        st.warning("Please train the models first by visiting the 'Model Training' page.")
        return

    with st.form("classification_form"):
        purpose_text = st.text_area(
            "Enter transaction purpose:",
            placeholder="e.g., Monthly apartment rent payment",
            height=100,
        )
        submitted = st.form_submit_button("Classify Transaction")

    if not (submitted and purpose_text):
        return

    model = st.session_state.best_model
    cleaned_text = preprocess_text(purpose_text)
    vectorized_text = st.session_state.vectorizer.transform([cleaned_text])
    prediction = model.predict(vectorized_text)[0]
    scores, is_probability = _class_scores(model, vectorized_text)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("π― Classification Result")
        st.success(f"**Predicted Type:** {prediction}")
        st.info(f"**Original Text:** {purpose_text}")
        st.info(f"**Processed Text:** {cleaned_text}")

    with col2:
        st.subheader("π Prediction Confidence")
        proba_df = pd.DataFrame({
            'Transaction Type': model.classes_,
            'Probability': scores,
        }).sort_values('Probability', ascending=False)

        fig = px.bar(proba_df, x='Probability', y='Transaction Type',
                     orientation='h', title="Prediction Probabilities")
        st.plotly_chart(fig, use_container_width=True)

        if not is_probability:
            st.caption(
                "This model does not provide probabilities; the values shown "
                "are normalised decision scores."
            )


def _render_comparison():
    """Compare the stored models on fixed sample texts and show LLM notes."""
    st.header("Detailed Model Comparison")

    if 'trained_models' not in st.session_state:
        st.warning("Please train the models first by visiting the 'Model Training' page.")
        return

    st.subheader("π Model Analysis")

    sample_texts = [
        "Monthly rent payment",
        "Grocery shopping",
        "Netflix subscription",
        "Gas station",
        "Restaurant dinner",
    ]

    vectorizer = st.session_state.vectorizer
    trained_models = st.session_state.trained_models

    # One row per sample text, one column per trained model.
    comparison_data = []
    for text in sample_texts:
        cleaned = preprocess_text(text)
        vectorized = vectorizer.transform([cleaned])

        row = {'Text': text, 'Cleaned': cleaned}
        for model_name, model in trained_models.items():
            row[model_name] = model.predict(vectorized)[0]

        comparison_data.append(row)

    comparison_df = pd.DataFrame(comparison_data)
    st.dataframe(comparison_df, use_container_width=True)

    st.subheader("π€ Large Language Model Approach")

    with st.expander("Click to see LLM implementation strategy"):
        st.markdown("""
        ### Using Transformer Models for Transaction Classification

        **Approach:**
        1. **Pre-trained Model Selection**: Use `bert-base-uncased` or `distilbert-base-uncased`
        2. **Tokenization**: Use HuggingFace's tokenizer for the selected model
        3. **Model Architecture**: Add a classification head on top of the transformer
        4. **Fine-tuning**: Train on labeled transaction data

        **Code Example:**
        ```python
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        from transformers import Trainer, TrainingArguments

        # Load pre-trained model
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        model = AutoModelForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(unique_labels)
        )

        # Tokenize data
        def tokenize_function(examples):
            return tokenizer(examples['purpose_text'], truncation=True, padding=True)

        # Fine-tune model
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        trainer.train()
        ```

        **Benefits:**
        - Better semantic understanding
        - Handles context better than TF-IDF
        - Can capture complex patterns
        - State-of-the-art performance

        **Drawbacks:**
        - Requires more computational resources
        - Longer training time
        - More complex deployment
        """)


def main():
    """Configure the Streamlit page and render the page picked in the sidebar.

    The sample dataset is loaded once per run and handed to the pages that
    need it. The Classification and Model Comparison pages depend on models
    that the Model Training page stores in ``st.session_state``.
    """
    # NOTE(review): the leading "π…" fragments in the labels and headings of
    # this app look like emoji decoded with the wrong codec. They are kept
    # unchanged here - confirm the intended glyphs and restore them.
    st.set_page_config(
        page_title="Transaction Classification System",
        page_icon="π³",
        layout="wide",
    )

    st.title("π³ Transaction Purpose Classification")
    st.markdown("---")

    # Each label is written once so the radio options and the dispatch
    # below cannot drift apart.
    home_page = "π  Home"
    training_page = "π Model Training"
    classification_page = "π Classification"
    comparison_page = "π Model Comparison"

    st.sidebar.title("Navigation")
    page = st.sidebar.radio(
        "Choose a page:",
        [home_page, training_page, classification_page, comparison_page],
    )

    df = create_sample_data()

    if page == home_page:
        _render_home(df)
    elif page == training_page:
        _render_training(df)
    elif page == classification_page:
        _render_classification()
    elif page == comparison_page:
        _render_comparison()
|
|
|
|
# Script entry point: build the UI only when this file is run as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()