Spaces:

luohoa97
/

train

Sleeping

App Files Files Community

train / app.py

luohoa97

Create app.py

40510d6 verified about 1 year ago

raw

history blame

5.51 kB

	import streamlit as st
	import pandas as pd
	import torch
	from torch.utils.data import Dataset, DataLoader
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
	from sklearn.model_selection import train_test_split
	import os
	import json

	# Dataset class for PyTorch
	class TextDataset(Dataset):
	    """Map-style dataset pairing tokenizer encodings with per-example labels.

	    Args:
	        encodings: Mapping from feature name (e.g. ``'input_ids'``) to an
	            indexable collection holding one entry per example.
	        labels: Indexable collection of labels, one per example.
	    """

	    def __init__(self, encodings, labels):
	        self.encodings = encodings
	        self.labels = labels

	    @staticmethod
	    def _to_tensor(value):
	        """Return ``value`` as a new tensor that does not share storage with it."""
	        if isinstance(value, torch.Tensor):
	            # Calling torch.tensor() on an existing tensor emits a
	            # copy-construct UserWarning; detach().clone() is the
	            # recommended way to make the same independent copy.
	            return value.detach().clone()
	        return torch.tensor(value)

	    def __getitem__(self, idx):
	        """Return a dict with every encoding feature at ``idx`` plus ``'labels'``."""
	        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
	        # The label is stored under 'labels' alongside the features.
	        item['labels'] = self._to_tensor(self.labels[idx])
	        return item

	    def __len__(self):
	        """Return the number of examples, taken from the label collection."""
	        return len(self.labels)

	# Function to load configuration
	def load_config(config_path='config.json'):
	    """Read a JSON configuration file and return its parsed content.

	    Args:
	        config_path: Path of the JSON file to read.

	    Returns:
	        The decoded JSON value (a ``dict`` when the file holds a JSON object).

	    Raises:
	        FileNotFoundError: If ``config_path`` does not exist.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    # Pin the encoding so non-ASCII values decode the same way regardless of
	    # the platform's locale default.
	    with open(config_path, 'r', encoding='utf-8') as f:
	        return json.load(f)

	# Main function
	def main():
	    """Render the Streamlit page: upload CSVs, fine-tune a classifier, save it.

	    Flow:
	        1. Read optional settings from ``config.json`` (``num_workers``).
	        2. Let the user upload CSV files; each row becomes one text sample
	           made of all its columns joined by spaces.
	        3. Let the user choose between a local model directory and a fresh
	           ``bert-base-uncased`` classifier, and pick a save directory.
	        4. On "Start training", tokenize, split 80/20, train for one epoch
	           with ``Trainer`` and save model and tokenizer.

	    All samples receive the dummy label ``0``.
	    """
	    st.title("CSV Data Processing and Model Training 🧠")

	    # config.json is optional: the only setting read below has a default.
	    try:
	        config = load_config()
	    except FileNotFoundError:
	        config = {}

	    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")
	    if not uploaded_files:
	        return

	    # Combine all columns into a single text string for each row.
	    combined_texts = []
	    for uploaded_file in uploaded_files:
	        df = pd.read_csv(uploaded_file)
	        combined_texts.extend(df.astype(str).agg(' '.join, axis=1))

	    # Show the first 5 samples for verification.
	    st.write("Combined text for training:", combined_texts[:5])

	    # A train/validation split needs at least one sample on each side.
	    if len(combined_texts) < 2:
	        st.warning("At least two rows of data are required to create training and validation sets.")
	        return

	    use_existing_model = st.checkbox("Load an existing local model?", value=False)
	    model_path = ""
	    if use_existing_model:
	        model_path = st.text_input("Enter the path to the local model directory:", value="")
	        if not (model_path and os.path.exists(model_path)):
	            st.warning("Please provide a valid model directory path.")
	            return

	    # Collect the save directory BEFORE training: Streamlit reruns this
	    # script on every widget interaction, so asking afterwards would
	    # trigger a full retrain whenever the field is edited.
	    save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")

	    # Gate the expensive work behind an explicit click so that ordinary
	    # reruns (ticking the checkbox, typing a path) do not start training.
	    if not st.button("Start training"):
	        return

	    if use_existing_model:
	        model = AutoModelForSequenceClassification.from_pretrained(model_path)
	        st.write(f"Loaded model from {model_path} successfully! 🎉")
	    else:
	        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

	    # NOTE(review): the tokenizer is always bert-base-uncased, even when a
	    # local model is loaded — confirm the loaded model shares its vocabulary.
	    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
	    inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

	    # Dummy labels (0 for every entry).
	    labels = [0] * len(combined_texts)

	    # Split ids, attention mask and labels together so rows stay aligned.
	    # The mask must travel with the ids; otherwise the model attends to
	    # the padding added by the tokenizer.
	    (train_ids, val_ids,
	     train_mask, val_mask,
	     train_labels, val_labels) = train_test_split(
	        inputs['input_ids'], inputs['attention_mask'], labels,
	        test_size=0.2, random_state=42
	    )

	    train_dataset = TextDataset(
	        encodings={'input_ids': train_ids, 'attention_mask': train_mask},
	        labels=train_labels,
	    )
	    val_dataset = TextDataset(
	        encodings={'input_ids': val_ids, 'attention_mask': val_mask},
	        labels=val_labels,
	    )

	    # Trainer builds its own DataLoaders, so the worker count from the
	    # config is passed through TrainingArguments rather than to
	    # hand-made loaders.
	    num_workers = config.get('num_workers', 4)

	    training_args = TrainingArguments(
	        output_dir='./results',            # output directory
	        num_train_epochs=1,                # total number of training epochs
	        per_device_train_batch_size=8,     # batch size per device during training
	        per_device_eval_batch_size=8,      # batch size for evaluation
	        warmup_steps=500,                  # warmup steps for the learning rate scheduler
	        weight_decay=0.01,                 # strength of weight decay
	        logging_dir='./logs',              # directory for storing logs
	        logging_steps=10,
	        # NOTE(review): recent transformers releases rename this argument
	        # to `eval_strategy` — confirm against the installed version.
	        evaluation_strategy="epoch",
	        dataloader_num_workers=num_workers,
	    )

	    trainer = Trainer(
	        model=model,                  # the instantiated 🤗 Transformers model to be trained
	        args=training_args,           # training arguments, defined above
	        train_dataset=train_dataset,  # training dataset
	        eval_dataset=val_dataset,     # evaluation dataset
	    )

	    trainer.train()

	    if save_path:
	        os.makedirs(save_path, exist_ok=True)
	        model.save_pretrained(save_path)
	        tokenizer.save_pretrained(save_path)
	        st.write(f"Model saved successfully to {save_path}! 🎉")
	    else:
	        st.warning("Please provide a valid directory path to save the model.")

	    # Notify user of training completion.
	    st.success("Training completed successfully! 🚀")

	# Run the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()