train / app.py
luohoa97's picture
Create app.py
40510d6 verified
raw
history blame
5.51 kB
import streamlit as st
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import json
# Dataset class for PyTorch
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to an
            indexable collection with one entry per sample. Entries may be
            plain Python sequences or tensors.
        labels: Indexable collection with one label per sample.

    Each item is a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return *value* as a new tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning,
        so tensors are copied with ``clone().detach()`` instead; anything
        else goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the encoding fields and label of sample *idx* as tensors."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # The label is stored under 'labels' alongside the encoding fields.
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)
# Function to load configuration
def load_config(config_path='config.json'):
    """Load and return the JSON configuration stored at *config_path*.

    Args:
        config_path: Path of the JSON file to read. Defaults to
            ``'config.json'`` relative to the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding, open() falls back to
    # the locale default and can mis-decode non-ASCII values.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Main function
def main():
    """Run the Streamlit page: upload CSVs, fine-tune a classifier, save it.

    Steps:
      1. Read optional settings through ``load_config`` (``num_workers``).
      2. Let the user upload CSV files; every row becomes one text sample
         made of all its cells joined by spaces.
      3. Load a local model chosen by the user or a fresh
         ``bert-base-uncased`` classifier with two labels.
      4. Tokenize, attach dummy all-zero labels, split 80/20 and train for
         one epoch with the Hugging Face ``Trainer``.
      5. Save model and tokenizer to a user-chosen directory.

    Returns early (after showing a warning where relevant) when no files are
    uploaded, when fewer than two rows are available, or when the local
    model path is invalid.
    """
    st.title("CSV Data Processing and Model Training 🧠")

    # The config only carries optional tuning values (read below with a
    # default), so a missing file falls back to defaults rather than
    # crashing the page. Malformed JSON still raises.
    try:
        config = load_config()
    except FileNotFoundError:
        config = {}

    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")
    if not uploaded_files:
        return

    # One text sample per CSV row, across all uploaded files.
    combined_texts = []
    for uploaded_file in uploaded_files:
        combined_texts.extend(_rows_to_texts(pd.read_csv(uploaded_file)))

    # Show the first 5 samples so the user can verify the input.
    st.write("Combined text for training:", combined_texts[:5])

    # An 80/20 split needs at least one sample on each side.
    if len(combined_texts) < 2:
        st.warning("Need at least two rows of data to train and validate a model.")
        return

    use_existing_model = st.checkbox("Load an existing local model?", value=False)
    if use_existing_model:
        model_path = st.text_input("Enter the path to the local model directory:", value="")
        if not (model_path and os.path.exists(model_path)):
            st.warning("Please provide a valid model directory path.")
            return
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        st.write(f"Loaded model from {model_path} successfully! 🎉")
    else:
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # NOTE(review): the tokenizer is always 'bert-base-uncased', even when a
    # local model was loaded above — confirm the two vocabularies match.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Dummy labels: every sample gets class 0.
    labels = [0] * len(combined_texts)

    # Split row indices instead of a single tensor so that every encoding
    # field (not just input_ids) can be sliced with the same rows.
    row_ids = list(range(len(combined_texts)))
    train_ids, val_ids = train_test_split(row_ids, test_size=0.2, random_state=42)

    train_dataset = _build_dataset(inputs, labels, train_ids)
    val_dataset = _build_dataset(inputs, labels, val_ids)

    # Trainer builds its own DataLoaders, so the configured worker count is
    # handed to it through TrainingArguments.
    num_workers = config.get('num_workers', 4)

    training_args = TrainingArguments(
        output_dir='./results',             # output directory
        num_train_epochs=1,                 # total number of training epochs
        per_device_train_batch_size=8,      # batch size per device during training
        per_device_eval_batch_size=8,       # batch size for evaluation
        warmup_steps=500,                   # warmup steps for the learning rate scheduler
        weight_decay=0.01,                  # strength of weight decay
        logging_dir='./logs',               # directory for storing logs
        logging_steps=10,
        evaluation_strategy="epoch",
        dataloader_num_workers=num_workers  # worker processes for data loading
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()

    # NOTE(review): Streamlit reruns the script on widget interaction, so
    # editing this field presumably triggers another training run — confirm
    # and consider gating training behind a button.
    save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        st.write(f"Model saved successfully to {save_path}! 🎉")
    else:
        st.warning("Please provide a valid directory path to save the model.")

    st.success("Training completed successfully! 🚀")


def _rows_to_texts(df):
    """Return one string per row of *df*: all cells as text, space-joined."""
    return [' '.join(row) for row in df.astype(str).itertuples(index=False, name=None)]


def _build_dataset(encodings, labels, row_ids):
    """Build a TextDataset for *row_ids*, keeping input_ids and attention_mask."""
    selected = {
        key: encodings[key][row_ids]
        for key in ('input_ids', 'attention_mask')
    }
    return TextDataset(encodings=selected, labels=[labels[i] for i in row_ids])
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()