Spaces:

limitedonly41
/

csv_manipulations

Sleeping

App Files Files Community

csv_manipulations / app.py

limitedonly41

Rename csv-manipulator.py to app.py

ad61191 verified 23 days ago

raw

history blame

51.8 kB

	#!/usr/bin/env python3
	"""
	Advanced CSV Manipulation Tool with Gradio Interface
	Commercial-ready application for powerful CSV data processing

	Features:
	- File upload with 1GB limit
	- Data preview with selectable rows
	- Value replacement based on conditions
	- CSV concatenation with column selection
	- Advanced statistical analysis and visualization
	- Data validation and quality checks
	- Export to CSV, Excel, JSON
	- Batch operations and operation recipes
	- Undo/Redo functionality
	- Memory-efficient large file processing
	"""

	import gradio as gr
	import pandas as pd
	import numpy as np
	import json
	import io
	import zipfile
	from datetime import datetime, timedelta
	import re
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import warnings
	import os
	from typing import Dict, List, Tuple, Optional, Any
	import hashlib
	import pickle
	from pathlib import Path

	# Suppress every warning category for the whole process; note that this also
	# hides deprecation notices emitted by pandas and the plotting libraries.
	warnings.filterwarnings('ignore')
	# Select the 'seaborn-v0_8' matplotlib style sheet and the "husl" seaborn
	# colour palette for the plots created later in this module.
	plt.style.use('seaborn-v0_8')
	sns.set_palette("husl")

	class CSVProcessor:
	    """Keep the loaded dataset, the working copy and a bounded edit history.

	    Attributes:
	        original_df: DataFrame as read by ``load_data`` (None until a load succeeds).
	        current_df: Working DataFrame; callers replace it after each operation.
	        history: Snapshot dicts with the keys 'operation', 'timestamp' and 'df'.
	        recipes: Empty dict created in ``__init__``; no method of this class uses it.
	        batch_files: Empty list created in ``__init__``; no method of this class uses it.
	    """

	    # save_state() trims the history to the newest HISTORY_KEEP snapshots
	    # once more than HISTORY_LIMIT of them have accumulated.
	    HISTORY_LIMIT = 50
	    HISTORY_KEEP = 25

	    def __init__(self):
	        self.original_df = None
	        self.current_df = None
	        self.history = []
	        self.recipes = {}
	        self.batch_files = []

	    @staticmethod
	    def _read_csv(file_path, encoding):
	        """Read a CSV trying ``encoding`` first, then the fallbacks.

	        Returns None when every candidate encoding fails to decode the file.
	        """
	        # dict.fromkeys removes repeated encodings while keeping their order.
	        for enc in dict.fromkeys([encoding, 'utf-8', 'latin-1', 'cp1252']):
	            try:
	                return pd.read_csv(file_path, encoding=enc, low_memory=False)
	            except UnicodeDecodeError:
	                continue
	        return None

	    def load_data(self, file, preview_rows=100, encoding='utf-8'):
	        """Load a CSV, Excel, JSON or Parquet file and reset the processor state.

	        Args:
	            file: A path, or an object exposing the path as ``.name``.
	            preview_rows: Number of leading rows returned as preview; a value
	                of 0 or less returns the whole frame.
	            encoding: First encoding tried for CSV files before the fallbacks.

	        Returns:
	            Always a 3-tuple ``(preview, message, info)``. On any failure
	            ``preview`` is None and ``info`` is an empty dict, so callers can
	            unpack three values unconditionally.
	        """
	        if file is None:
	            return None, "No file provided", {}

	        try:
	            file_path = file.name if hasattr(file, 'name') else str(file)
	            file_extension = Path(file_path).suffix.lower()

	            if file_extension == '.csv':
	                df = self._read_csv(file_path, encoding)
	                if df is None:
	                    return None, "Failed to decode file with supported encodings", {}
	            elif file_extension in ['.xlsx', '.xls']:
	                df = pd.read_excel(file_path)
	            elif file_extension == '.json':
	                df = pd.read_json(file_path)
	            elif file_extension == '.parquet':
	                df = pd.read_parquet(file_path)
	            else:
	                return None, f"Unsupported file format: {file_extension}", {}

	            # A successful load replaces both copies and clears the history.
	            self.original_df = df.copy()
	            self.current_df = df.copy()
	            self.history = []

	            # int() guards against a float coming from a UI slider.
	            preview = df.head(int(preview_rows)) if preview_rows > 0 else df

	            memory_mb = df.memory_usage(deep=True).sum() / 1024**2
	            info = {
	                'rows': len(df),
	                'columns': len(df.columns),
	                'memory_usage': f"{memory_mb:.2f} MB",
	                'dtypes': dict(df.dtypes.astype(str)),
	                'null_counts': dict(df.isnull().sum()),
	                'duplicates': df.duplicated().sum()
	            }

	            success_msg = f"✅ File loaded successfully!\n"
	            success_msg += f"📊 {info['rows']:,} rows × {info['columns']} columns\n"
	            success_msg += f"💾 Memory usage: {info['memory_usage']}\n"
	            success_msg += f"🔄 Duplicates: {info['duplicates']:,}\n"
	            success_msg += f"❌ Missing values: {sum(info['null_counts'].values()):,}"

	            return preview, success_msg, info

	        except Exception as e:
	            return None, f"❌ Error loading file: {str(e)}", {}

	    def save_state(self, operation_name: str):
	        """Append a snapshot of ``current_df`` labelled ``operation_name``.

	        The history is cut back to the newest HISTORY_KEEP entries before the
	        append whenever it already holds more than HISTORY_LIMIT snapshots.
	        """
	        if len(self.history) > self.HISTORY_LIMIT:
	            self.history = self.history[-self.HISTORY_KEEP:]

	        self.history.append({
	            'operation': operation_name,
	            'timestamp': datetime.now(),
	            'df': self.current_df.copy() if self.current_df is not None else None
	        })

	    def undo_operation(self):
	        """Drop the newest snapshot and restore the state that preceded it.

	        Returns:
	            ``(current_df, message)``. The message names the operation that was
	            undone, or states that the data was reset to the original when the
	            last remaining snapshot was removed.
	        """
	        if not self.history:
	            return self.current_df, "❌ No operations to undo"

	        undone = self.history.pop()
	        if self.history:
	            previous = self.history[-1]['df']
	            # Snapshots may hold None when save_state ran without data.
	            self.current_df = previous.copy() if previous is not None else None
	            return self.current_df, f"✅ Undone: {undone['operation']}"

	        # No snapshot left: fall back to the data as loaded (may be None when
	        # the history was filled without load_data having been called).
	        self.current_df = self.original_df.copy() if self.original_df is not None else None
	        return self.current_df, "✅ Reset to original data"

	    def reset_to_original(self):
	        """Discard the history and restore a copy of the originally loaded data."""
	        if self.original_df is not None:
	            self.current_df = self.original_df.copy()
	            self.history = []
	            return self.current_df, "✅ Reset to original data"
	        return None, "❌ No original data available"

	# Global processor instance: the single CSVProcessor used by the data
	# manipulation functions in this module, which store their result in
	# processor.current_df and record it through processor.save_state().
	# NOTE(review): as a module-level object its state is presumably shared by
	# every user session of the app — confirm whether that is intended.
	processor = CSVProcessor()

	def create_download_file(df: pd.DataFrame, format_type: str, filename: str = "processed_data"):
	    """Serialise ``df`` into the requested download format.

	    Args:
	        df: Frame to export.
	        format_type: One of "csv", "excel" or "json".
	        filename: Base name; a ``_YYYYMMDD_HHMMSS`` timestamp and the
	            extension are appended.

	    Returns:
	        ``(payload, file_name)`` where ``payload`` is a str for csv/json and
	        bytes for excel. On failure, including an unsupported ``format_type``,
	        ``(None, error_message)`` — the result is always a 2-tuple.
	    """
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    filename_with_timestamp = f"{filename}_{timestamp}"

	    try:
	        if format_type == "csv":
	            return df.to_csv(index=False), f"{filename_with_timestamp}.csv"

	        if format_type == "excel":
	            buffer = io.BytesIO()
	            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
	                df.to_excel(writer, index=False, sheet_name='Data')
	            # getvalue() returns the whole buffer regardless of position.
	            return buffer.getvalue(), f"{filename_with_timestamp}.xlsx"

	        if format_type == "json":
	            json_data = df.to_json(orient='records', indent=2, date_format='iso')
	            return json_data, f"{filename_with_timestamp}.json"
	    except Exception as e:
	        return None, f"Error creating {format_type} file: {str(e)}"

	    # Previously an unknown format fell through and returned bare None,
	    # which broke callers unpacking two values.
	    return None, f"Error creating {format_type} file: unsupported format"

	def get_data_info(df: pd.DataFrame) -> str:
	    """Build a multi-line text overview of ``df``.

	    Returns "No data loaded" for None or an empty frame; otherwise one
	    ``label: value`` line each for shape, memory, duplicate rows, missing
	    values and the number of numeric, object and datetime columns.
	    """
	    if df is None or df.empty:
	        return "No data loaded"

	    def column_count(kinds):
	        # Number of columns whose dtype falls into the given selection.
	        return len(df.select_dtypes(include=kinds).columns)

	    row_total, col_total = df.shape
	    megabytes = df.memory_usage(deep=True).sum() / 1024**2

	    entries = [
	        ('📊 Shape', f"{row_total:,} rows × {col_total} columns"),
	        ('💾 Memory', f"{megabytes:.2f} MB"),
	        ('🔄 Duplicates', f"{df.duplicated().sum():,}"),
	        ('❌ Missing Values', f"{df.isnull().sum().sum():,}"),
	        ('📈 Numeric Columns', f"{column_count([np.number])}"),
	        ('📝 Text Columns', f"{column_count(['object'])}"),
	        ('📅 Date Columns', f"{column_count(['datetime64'])}"),
	    ]

	    return "\n".join(f"{label}: {value}" for label, value in entries)

	def get_column_options(df: pd.DataFrame) -> List[str]:
	    """Return the column labels of ``df`` as a list, or [] when ``df`` is None."""
	    if df is None:
	        return []
	    return [*df.columns]

	# ===========================================
	# CORE DATA MANIPULATION FUNCTIONS
	# ===========================================

	def rename_values_conditional(df: pd.DataFrame, target_col: str, condition_col: str,
	                              condition_value: str, new_value: str, match_type: str = "exact") -> Tuple[pd.DataFrame, str]:
	    """Write ``new_value`` into ``target_col`` on rows where ``condition_col`` matches.

	    Args:
	        df: Source frame; it is not modified, a copy is returned.
	        target_col: Column receiving ``new_value``.
	        condition_col: Column tested against ``condition_value``.
	        condition_value: Text to match.
	        new_value: Replacement written to the matching rows.
	        match_type: "exact", "contains", "starts_with", "ends_with" (all
	            literal comparisons) or "regex" (pattern anchored at the start of
	            the value, as done by ``Series.str.match``).

	    Returns:
	        ``(frame, message)``. On success the updated copy, which is also
	        stored on the global ``processor`` and recorded in its history. On
	        any failure the unchanged input frame and an error message.
	    """
	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        if target_col not in df.columns or condition_col not in df.columns:
	            return df, "❌ One or more columns not found"

	        df_result = df.copy()
	        source = df_result[condition_col]

	        if match_type == "exact":
	            mask = _exact_match_mask(source, condition_value)
	        elif match_type == "contains":
	            # regex=False: the value is literal text, "regex" is a separate mode.
	            mask = source.astype(str).str.contains(condition_value, na=False, regex=False)
	        elif match_type == "regex":
	            mask = source.astype(str).str.match(condition_value, na=False)
	        elif match_type == "starts_with":
	            mask = source.astype(str).str.startswith(condition_value, na=False)
	        elif match_type == "ends_with":
	            mask = source.astype(str).str.endswith(condition_value, na=False)
	        else:
	            # Without this branch ``mask`` stayed unbound and surfaced as an
	            # opaque UnboundLocalError message.
	            return df, f"❌ Unknown match type: {match_type}"

	        affected_rows = mask.sum()
	        df_result.loc[mask, target_col] = new_value

	        processor.current_df = df_result
	        processor.save_state(f"Renamed values in '{target_col}' based on '{condition_col}'")

	        return df_result, f"✅ Updated {affected_rows:,} rows in column '{target_col}'"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"


	def _exact_match_mask(series: pd.Series, value) -> pd.Series:
	    """Mask of rows equal to ``value`` by raw, textual or numeric comparison.

	    The UI always supplies text, so a raw ``==`` alone never matches numeric
	    columns; missing entries never match.
	    """
	    mask = (series == value) | ((series.astype(str) == value) & series.notna())
	    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
	        try:
	            mask = mask | (series == float(value))
	        except (TypeError, ValueError):
	            pass  # value is not a number; only the textual comparison applies
	    return mask.fillna(False).astype(bool)

	def concatenate_csvs(files: List, selected_columns: str, join_type: str = "outer") -> Tuple[pd.DataFrame, str]:
	    """Stack several CSV/Excel files into one frame.

	    Args:
	        files: Paths, or objects exposing the path as ``.name``.
	        selected_columns: Comma-separated column names to keep; empty keeps
	            all columns. Files containing none of the names are skipped.
	        join_type: "inner" keeps only columns common to all files; any other
	            value performs an outer join.

	    Returns:
	        ``(frame, message)``. Each row carries the stem of its file in the
	        added ``_source_file`` column. The frame is also stored on the global
	        ``processor``. On failure ``(None, error_message)``.
	    """
	    try:
	        if not files:
	            return None, "❌ No files provided"

	        columns_to_use = [col.strip() for col in selected_columns.split(",") if col.strip()] if selected_columns else None

	        dfs = []
	        for file in files:
	            # Same path resolution as CSVProcessor.load_data, so plain path
	            # strings are accepted as well as upload objects.
	            file_path = file.name if hasattr(file, 'name') else str(file)
	            suffix = Path(file_path).suffix.lower()  # case-insensitive: ".CSV" works too

	            if suffix == '.csv':
	                try:
	                    df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
	                except UnicodeDecodeError:
	                    # latin-1 maps every byte, so this retry cannot fail on decoding.
	                    df = pd.read_csv(file_path, encoding='latin-1', low_memory=False)
	            elif suffix in ('.xlsx', '.xls'):
	                df = pd.read_excel(file_path)
	            else:
	                continue

	            if columns_to_use:
	                available_cols = [col for col in columns_to_use if col in df.columns]
	                if not available_cols:
	                    continue
	                # copy() so the assignment below targets an independent frame
	                # rather than a slice of the file's frame.
	                df = df[available_cols].copy()

	            df['_source_file'] = Path(file_path).stem
	            dfs.append(df)

	        if not dfs:
	            return None, "❌ No valid files found or columns don't exist"

	        join = 'inner' if join_type == "inner" else 'outer'
	        result_df = pd.concat(dfs, ignore_index=True, join=join)

	        processor.current_df = result_df
	        processor.save_state(f"Concatenated {len(dfs)} files")

	        return result_df, f"✅ Successfully concatenated {len(dfs)} files with {len(result_df):,} total rows"

	    except Exception as e:
	        return None, f"❌ Error concatenating files: {str(e)}"

	def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 20, normalize: bool = False) -> Tuple[pd.DataFrame, str]:
	    """Tabulate the most frequent values of ``column``.

	    Missing values are counted as their own entry. The result has a 'Value'
	    column plus either 'Count' or, when ``normalize`` is set, 'Percentage'
	    formatted as text with two decimals. Returns ``(table, message)``, or
	    ``(None, error_message)`` when there is no data, the column is unknown
	    or an exception occurs.
	    """
	    try:
	        if df is None or df.empty:
	            return None, "❌ No data available"

	        if column not in df.columns:
	            return None, f"❌ Column '{column}' not found"

	        frequencies = df[column].value_counts(normalize=normalize, dropna=False).head(top_n)

	        # The metric column is named after what it holds.
	        metric_label = 'Percentage' if normalize else 'Count'
	        table = pd.DataFrame({
	            'Value': frequencies.index,
	            metric_label: frequencies.values
	        })

	        if normalize:
	            table['Percentage'] = table['Percentage'].map(lambda share: f"{share:.2%}")

	        shown = min(top_n, len(table))
	        return table, f"✅ Value counts for '{column}' (Top {shown})"

	    except Exception as e:
	        return None, f"❌ Error: {str(e)}"

	def filter_data(df: pd.DataFrame, column: str, condition: str, value: str) -> Tuple[pd.DataFrame, str]:
	    """Keep only the rows of ``df`` whose ``column`` satisfies ``condition``.

	    Args:
	        df: Source frame (left unmodified).
	        column: Column to test.
	        condition: "equals", "not_equals", "contains", "not_contains",
	            "starts_with", "ends_with", "greater_than", "less_than",
	            "is_null" or "is_not_null".
	        value: Comparison text. Text conditions compare literally (no regex);
	            "greater_than"/"less_than" convert it with ``float``.

	    Returns:
	        ``(frame, message)``. On success the filtered frame, which is also
	        stored on the global ``processor``. On failure the unchanged input
	        frame and an error message.
	    """
	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        if column not in df.columns:
	            return df, f"❌ Column '{column}' not found"

	        series = df[column]

	        if condition == "equals":
	            mask = _equals_mask(series, value)
	        elif condition == "not_equals":
	            mask = ~_equals_mask(series, value)
	        elif condition == "contains":
	            # regex=False so characters such as "(" or "." are taken literally.
	            mask = series.astype(str).str.contains(value, na=False, regex=False)
	        elif condition == "not_contains":
	            mask = ~series.astype(str).str.contains(value, na=False, regex=False)
	        elif condition == "starts_with":
	            mask = series.astype(str).str.startswith(value, na=False)
	        elif condition == "ends_with":
	            mask = series.astype(str).str.endswith(value, na=False)
	        elif condition == "greater_than":
	            mask = pd.to_numeric(series, errors='coerce') > float(value)
	        elif condition == "less_than":
	            mask = pd.to_numeric(series, errors='coerce') < float(value)
	        elif condition == "is_null":
	            mask = series.isnull()
	        elif condition == "is_not_null":
	            mask = series.notnull()
	        else:
	            return df, f"❌ Unknown condition: {condition}"

	        # Boolean indexing already yields a new frame; no full copy needed.
	        filtered_df = df[mask]

	        processor.current_df = filtered_df
	        processor.save_state(f"Filtered data: {column} {condition} {value}")

	        return filtered_df, f"✅ Filtered to {len(filtered_df):,} rows (removed {len(df) - len(filtered_df):,} rows)"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"


	def _equals_mask(series: pd.Series, value) -> pd.Series:
	    """Mask of rows equal to ``value`` by raw, textual or numeric comparison.

	    ``value`` arrives as text from the UI, so a raw ``==`` alone never
	    matches numeric columns; missing entries never count as equal.
	    """
	    mask = (series == value) | ((series.astype(str) == value) & series.notna())
	    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
	        try:
	            mask = mask | (series == float(value))
	        except (TypeError, ValueError):
	            pass  # value is not a number; only the textual comparison applies
	    return mask.fillna(False).astype(bool)

	def handle_missing_values(df: pd.DataFrame, column: str, method: str, fill_value: str = "") -> Tuple[pd.DataFrame, str]:
	    """Drop or fill missing values in one column or in every column.

	    Args:
	        df: Source frame (left unmodified).
	        column: Column name, or "ALL" to process every column.
	        method: "drop_rows", "fill_value", "fill_mean", "fill_median",
	            "fill_mode", "forward_fill" or "backward_fill". Mean and median
	            only touch numeric columns.
	        fill_value: Replacement used by "fill_value".

	    Returns:
	        ``(frame, message)``. On success the processed copy, which is also
	        stored on the global ``processor``; the message reports the missing
	        count before and after. On failure the unchanged input frame and an
	        error message.
	    """
	    known_methods = ("drop_rows", "fill_value", "fill_mean", "fill_median",
	                     "fill_mode", "forward_fill", "backward_fill")
	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        if column != "ALL" and column not in df.columns:
	            return df, f"❌ Column '{column}' not found"

	        if method not in known_methods:
	            # Previously an unknown method did nothing yet reported success.
	            return df, f"❌ Unknown method: {method}"

	        df_result = df.copy()
	        columns_to_process = [column] if column != "ALL" else df_result.columns.tolist()

	        total_missing_before = df_result.isnull().sum().sum()

	        if method == "drop_rows":
	            # One call drops every row missing a value in any processed column.
	            df_result = df_result.dropna(subset=columns_to_process)
	        else:
	            for col in columns_to_process:
	                series = df_result[col]
	                if not series.isna().any():
	                    continue  # nothing to fill in this column

	                if method == "fill_value":
	                    df_result[col] = series.fillna(fill_value)
	                elif method in ("fill_mean", "fill_median"):
	                    numeric = pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)
	                    if not numeric:
	                        continue
	                    if pd.api.types.is_integer_dtype(series):
	                        # Only nullable integers can hold missing values; widen
	                        # them so a fractional mean/median can be stored.
	                        series = series.astype('Float64')
	                    statistic = series.mean() if method == "fill_mean" else series.median()
	                    df_result[col] = series.fillna(statistic)
	                elif method == "fill_mode":
	                    modes = series.mode()
	                    if len(modes) > 0:
	                        df_result[col] = series.fillna(modes.iloc[0])
	                elif method == "forward_fill":
	                    # ffill()/bfill() replace the deprecated fillna(method=...).
	                    df_result[col] = series.ffill()
	                else:  # "backward_fill"
	                    df_result[col] = series.bfill()

	        total_missing_after = df_result.isnull().sum().sum()

	        processor.current_df = df_result
	        processor.save_state(f"Handle missing values: {method}")

	        return df_result, f"✅ Processed missing values. Before: {total_missing_before:,}, After: {total_missing_after:,}"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"

	def detect_and_remove_duplicates(df: pd.DataFrame, columns: str = "", keep: str = "first") -> Tuple[pd.DataFrame, str]:
	    """Remove duplicate rows, optionally judged on a subset of columns.

	    Args:
	        df: Source frame (left unmodified).
	        columns: Comma-separated column names defining a duplicate; names not
	            present in ``df`` are ignored and an empty selection means all
	            columns.
	        keep: "first" or "last" to retain one row per duplicate group, or
	            "false"/"none"/False to drop every member of a group.

	    Returns:
	        ``(frame, message)``. When rows were removed the de-duplicated frame
	        is also stored on the global ``processor``. On failure the unchanged
	        input frame and an error message.
	    """
	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        # The UI offers the text "false", but pandas expects the boolean False.
	        if isinstance(keep, str):
	            keep_text = keep.strip().lower()
	            keep = False if keep_text in ("false", "none") else keep_text

	        requested = [col.strip() for col in (columns or "").split(",")]
	        subset = [col for col in requested if col in df.columns] or None

	        # Use the same ``keep`` for counting as for dropping so the reported
	        # number equals the rows actually removed.
	        duplicates_before = df.duplicated(subset=subset, keep=keep).sum()

	        if duplicates_before == 0:
	            return df.copy(), "✅ No duplicate rows found"

	        df_result = df.drop_duplicates(subset=subset, keep=keep)

	        processor.current_df = df_result
	        processor.save_state(f"Removed {duplicates_before:,} duplicate rows")

	        return df_result, f"✅ Removed {duplicates_before:,} duplicate rows. Remaining: {len(df_result):,} rows"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"

	def perform_column_operations(df: pd.DataFrame, operation: str, col1: str, col2: str = "",
	                              new_col_name: str = "", constant: str = "") -> Tuple[pd.DataFrame, str]:
	    """Derive a new column from ``col1`` with an arithmetic or string operation.

	    Args:
	        df: Source frame (left unmodified).
	        operation: "add", "subtract", "multiply", "divide", "concatenate"
	            (these need ``col2`` or ``constant``), or "extract_numbers",
	            "upper", "lower", "title", "length" (single column).
	        col1: Primary column.
	        col2: Optional second column; takes precedence over ``constant``.
	        new_col_name: Name of the result column; defaults to
	            ``"{col1}_{operation}"``.
	        constant: Optional constant operand; converted with ``float`` for
	            arithmetic, appended as text for "concatenate".

	    Returns:
	        ``(frame, message)``. On success the extended copy, which is also
	        stored on the global ``processor``. On failure the unchanged input
	        frame and an error message.
	    """
	    # Arithmetic operations and the pandas Series method implementing each.
	    arithmetic = {"add": "add", "subtract": "sub", "multiply": "mul", "divide": "truediv"}
	    # Single-column transforms applied to the text form of col1.
	    text_transforms = {
	        "extract_numbers": lambda text: text.str.extract(r'(\d+)')[0],
	        "upper": lambda text: text.str.upper(),
	        "lower": lambda text: text.str.lower(),
	        "title": lambda text: text.str.title(),
	        "length": lambda text: text.str.len(),
	    }

	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        if col1 not in df.columns:
	            return df, f"❌ Column '{col1}' not found"

	        needs_operand = operation in arithmetic or operation == "concatenate"
	        if not needs_operand and operation not in text_transforms:
	            return df, f"❌ Unknown operation: {operation}"

	        use_col2 = bool(col2) and col2 in df.columns
	        if needs_operand and not use_col2 and not constant:
	            # Previously this case created nothing but still reported success.
	            return df, f"❌ Operation '{operation}' needs a second column or a constant value"

	        df_result = df.copy()

	        if not new_col_name:
	            new_col_name = f"{col1}_{operation}"

	        if operation in arithmetic:
	            left = pd.to_numeric(df_result[col1], errors='coerce')
	            if use_col2:
	                right = pd.to_numeric(df_result[col2], errors='coerce')
	            else:
	                right = float(constant)
	            df_result[new_col_name] = getattr(left, arithmetic[operation])(right)
	        elif operation == "concatenate":
	            left = df_result[col1].astype(str)
	            if use_col2:
	                df_result[new_col_name] = left + " " + df_result[col2].astype(str)
	            else:
	                df_result[new_col_name] = left + constant
	        else:
	            df_result[new_col_name] = text_transforms[operation](df_result[col1].astype(str))

	        processor.current_df = df_result
	        processor.save_state(f"Column operation: {operation} on {col1}")

	        return df_result, f"✅ Created new column '{new_col_name}' using {operation} operation"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"

	def convert_data_types(df: pd.DataFrame, column: str, target_type: str) -> Tuple[pd.DataFrame, str]:
	    """Convert ``column`` to another data type.

	    Args:
	        df: Source frame (left unmodified).
	        column: Column to convert.
	        target_type: "string", "integer" (nullable Int64), "float",
	            "datetime", "boolean" (nullable boolean) or "category". Values
	            that cannot be parsed as number/date/boolean become missing.

	    Returns:
	        ``(frame, message)``. On success the converted copy, which is also
	        stored on the global ``processor``. On failure the unchanged input
	        frame and an error message.
	    """
	    try:
	        if df is None or df.empty:
	            return df, "❌ No data available"

	        if column not in df.columns:
	            return df, f"❌ Column '{column}' not found"

	        df_result = df.copy()
	        series = df_result[column]

	        if target_type == "string":
	            converted = series.astype(str)
	        elif target_type == "integer":
	            converted = pd.to_numeric(series, errors='coerce').astype('Int64')
	        elif target_type == "float":
	            converted = pd.to_numeric(series, errors='coerce')
	        elif target_type == "datetime":
	            converted = pd.to_datetime(series, errors='coerce')
	        elif target_type == "boolean":
	            # astype(bool) would turn every non-empty string ("false", "0")
	            # and every NaN into True.
	            converted = _coerce_to_boolean(series)
	        elif target_type == "category":
	            converted = series.astype('category')
	        else:
	            return df, f"❌ Unknown data type: {target_type}"

	        df_result[column] = converted

	        processor.current_df = df_result
	        processor.save_state(f"Converted '{column}' to {target_type}")

	        return df_result, f"✅ Converted column '{column}' to {target_type}"

	    except Exception as e:
	        return df, f"❌ Error: {str(e)}"


	def _coerce_to_boolean(series: pd.Series) -> pd.Series:
	    """Convert a column to the nullable 'boolean' dtype.

	    Numeric columns map non-zero to True and keep missing entries missing.
	    Other columns are compared as lower-cased, stripped text:
	    true/t/yes/y/1 become True, false/f/no/n/0 become False and anything
	    else (including missing entries) becomes <NA>.
	    """
	    if pd.api.types.is_bool_dtype(series):
	        return series
	    if pd.api.types.is_numeric_dtype(series):
	        return series.ne(0).astype('boolean').mask(series.isna())

	    lookup = {
	        **dict.fromkeys(('true', 't', 'yes', 'y', '1'), True),
	        **dict.fromkeys(('false', 'f', 'no', 'n', '0'), False),
	    }
	    text = series.astype(str).str.strip().str.lower()
	    # Unmapped text yields NaN, which the nullable dtype stores as <NA>.
	    return text.map(lookup).astype('boolean')

	# ===========================================
	# ANALYSIS AND VISUALIZATION FUNCTIONS
	# ===========================================

	def generate_statistical_summary(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
	    """Describe the numeric columns of ``df``.

	    The result is ``DataFrame.describe()`` for the numeric columns extended
	    with 'variance', 'skewness', 'kurtosis' and 'missing' rows, rounded to
	    four decimals. Returns ``(table, message)``, or ``(None, error_message)``
	    when there is no data, no numeric column, or an exception occurs.
	    """
	    try:
	        if df is None or df.empty:
	            return None, "❌ No data available"

	        numeric_names = df.select_dtypes(include=[np.number]).columns
	        if len(numeric_names) == 0:
	            return None, "❌ No numeric columns found"

	        numeric = df[numeric_names]
	        summary = numeric.describe()

	        # Rows appended below the standard describe() output, in this order.
	        extra_rows = {
	            'variance': numeric.var(),
	            'skewness': numeric.skew(),
	            'kurtosis': numeric.kurtosis(),
	            'missing': numeric.isnull().sum(),
	        }
	        for label, values in extra_rows.items():
	            summary.loc[label] = values

	        return summary.round(4), "✅ Statistical summary generated"

	    except Exception as e:
	        return None, f"❌ Error: {str(e)}"

	def create_correlation_matrix(df: pd.DataFrame) -> Tuple[Optional[str], str]:
	    """Render the correlation heatmap of the numeric columns to a PNG file.

	    Returns:
	        ``('correlation_matrix.png', message)`` on success; the image is
	        written to the current working directory. ``(None, error_message)``
	        when there is no data, fewer than two numeric columns, or plotting
	        fails.
	    """
	    output_path = 'correlation_matrix.png'
	    try:
	        if df is None or df.empty:
	            return None, "❌ No data available"

	        numeric_cols = df.select_dtypes(include=[np.number]).columns

	        if len(numeric_cols) < 2:
	            return None, "❌ Need at least 2 numeric columns for correlation"

	        corr_matrix = df[numeric_cols].corr()

	        fig = plt.figure(figsize=(12, 8))
	        try:
	            # Hide the upper triangle: it mirrors the lower one.
	            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
	            sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
	                        square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
	            plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
	            plt.tight_layout()
	            plt.savefig(output_path, dpi=300, bbox_inches='tight')
	        finally:
	            # Close the figure even when drawing or saving raised, so failed
	            # attempts do not accumulate open figures.
	            plt.close(fig)

	        return output_path, "✅ Correlation matrix created"

	    except Exception as e:
	        return None, f"❌ Error: {str(e)}"

	def create_distribution_plots(df: pd.DataFrame, column: str, plot_type: str = "histogram") -> Tuple[Optional[str], str]:
	    """Plot the distribution of ``column`` next to a box plot and save it as PNG.

	    Args:
	        df: Source frame.
	        column: Column to plot.
	        plot_type: "histogram" or "density".

	    Returns:
	        ``(path, message)`` on success, where ``path`` is
	        ``distribution_<column>_<plot_type>.png`` in the current working
	        directory, with characters outside ``A-Za-z0-9_.-`` in the column
	        name replaced by "_". ``(None, error_message)`` when there is no
	        data, the column or plot type is unknown, or plotting fails.
	    """
	    try:
	        if df is None or df.empty:
	            return None, "❌ No data available"

	        if column not in df.columns:
	            return None, f"❌ Column '{column}' not found"

	        if plot_type not in ("histogram", "density"):
	            # Previously an unknown type saved an empty figure and reported success.
	            return None, f"❌ Unknown plot type: {plot_type}"

	        # Column names come from user files and may hold "/" or other
	        # characters that are invalid or unsafe inside a file name.
	        safe_column = re.sub(r'[^A-Za-z0-9_.-]+', '_', str(column))
	        output_path = f'distribution_{safe_column}_{plot_type}.png'

	        fig = plt.figure(figsize=(12, 6))
	        try:
	            if plot_type == "histogram":
	                plt.subplot(1, 2, 1)
	                df[column].hist(bins=30, edgecolor='black', alpha=0.7)
	                plt.title(f'Histogram of {column}')
	                plt.xlabel(column)
	                plt.ylabel('Frequency')

	                plt.subplot(1, 2, 2)
	                df.boxplot(column=column)
	                plt.title(f'Box Plot of {column}')
	            else:  # "density"
	                plt.subplot(1, 2, 1)
	                df[column].plot(kind='density')
	                plt.title(f'Density Plot of {column}')
	                plt.xlabel(column)

	                plt.subplot(1, 2, 2)
	                df[column].plot(kind='box')
	                plt.title(f'Box Plot of {column}')

	            plt.tight_layout()
	            plt.savefig(output_path, dpi=300, bbox_inches='tight')
	        finally:
	            # Always release the figure, including when plotting raised.
	            plt.close(fig)

	        return output_path, f"✅ Distribution plot created for {column}"

	    except Exception as e:
	        return None, f"❌ Error: {str(e)}"

	# ===========================================
	# GRADIO INTERFACE SETUP
	# ===========================================

	def create_interface():
	"""Create the main Gradio interface"""

	with gr.Blocks(title="Advanced CSV Manipulation Tool", theme=gr.themes.Soft()) as demo:

	gr.HTML("""
	<div style="text-align: center; padding: 20px;">
	<h1 style="color: #2e7d32; margin-bottom: 10px;">🔥 Advanced CSV Manipulation Tool</h1>
	<p style="font-size: 18px; color: #666;">Commercial-ready data processing with advanced analytics</p>
	<hr style="margin: 20px 0;">
	</div>
	""")

	# Global state variables
	current_data = gr.State(None)
	data_info = gr.State({})

	with gr.Tabs():

	# ===== FILE UPLOAD TAB =====
	with gr.TabItem("📁 File Upload & Preview"):
	with gr.Row():
	with gr.Column(scale=1):
	file_upload = gr.File(
	label="Upload CSV/Excel/JSON file (Max 1GB)",
	file_types=[".csv", ".xlsx", ".xls", ".json"],
	file_count="single"
	)
	preview_rows = gr.Slider(
	minimum=0,
	maximum=1000,
	value=100,
	step=50,
	label="Preview Rows (0 = All)",
	info="Number of rows to display in preview"
	)
	upload_btn = gr.Button("📊 Load & Analyze Data", variant="primary", size="lg")

	with gr.Column(scale=2):
	upload_status = gr.Textbox(label="Status", lines=5, interactive=False)
	data_info_display = gr.Textbox(label="Data Information", lines=8, interactive=False)

	data_preview = gr.DataFrame(label="Data Preview", interactive=False, height=400)

	def load_file_handler(file, rows):
	if file is None:
	return None, "Please upload a file first", "", None, {}

	preview, status, info = processor.load_data(file, rows)
	info_text = get_data_info(processor.current_df) if processor.current_df is not None else ""

	return preview, status, info_text, processor.current_df, info

	upload_btn.click(
	load_file_handler,
	inputs=[file_upload, preview_rows],
	outputs=[data_preview, upload_status, data_info_display, current_data, data_info]
	)

	# ===== VALUE REPLACEMENT TAB =====
	with gr.TabItem("🔄 Value Replacement"):
	gr.HTML("<h3>Replace values in one column based on conditions in another column</h3>")

	with gr.Row():
	with gr.Column():
	target_col = gr.Dropdown(label="Target Column (to modify)", choices=[], interactive=True)
	condition_col = gr.Dropdown(label="Condition Column (to check)", choices=[], interactive=True)
	condition_value = gr.Textbox(label="Condition Value", placeholder="Value to match in condition column")
	new_value = gr.Textbox(label="New Value", placeholder="Replacement value for target column")
	match_type = gr.Radio(
	choices=["exact", "contains", "starts_with", "ends_with", "regex"],
	value="exact",
	label="Match Type"
	)
	replace_btn = gr.Button("🔄 Replace Values", variant="primary")

	with gr.Column():
	replace_status = gr.Textbox(label="Status", lines=3, interactive=False)

	# Update column choices when data changes
	def update_columns(df):
	if df is not None:
	cols = list(df.columns)
	return gr.Dropdown(choices=cols), gr.Dropdown(choices=cols)
	return gr.Dropdown(choices=[]), gr.Dropdown(choices=[])

	current_data.change(
	update_columns,
	inputs=[current_data],
	outputs=[target_col, condition_col]
	)

	def replace_values_handler(df, tcol, ccol, cval, nval, mtype):
	if df is None:
	return None, "❌ No data loaded", ""

	result_df, status = rename_values_conditional(df, tcol, ccol, cval, nval, mtype)
	info_text = get_data_info(result_df) if result_df is not None else ""

	return result_df, status, info_text

	replace_btn.click(
	replace_values_handler,
	inputs=[current_data, target_col, condition_col, condition_value, new_value, match_type],
	outputs=[current_data, replace_status, data_info_display]
	)

	# ===== CSV CONCATENATION TAB =====
	with gr.TabItem("📋 CSV Concatenation"):
	gr.HTML("<h3>Combine multiple CSV files with column selection</h3>")

	with gr.Row():
	with gr.Column():
	multi_files = gr.File(
	label="Upload Multiple Files",
	file_types=[".csv", ".xlsx", ".xls"],
	file_count="multiple"
	)
	selected_columns = gr.Textbox(
	label="Columns to Include",
	placeholder="column1, column2, column3 (leave empty for all)",
	info="Comma-separated list of column names"
	)
	join_type = gr.Radio(
	choices=["outer", "inner"],
	value="outer",
	label="Join Type",
	info="Outer: keep all columns, Inner: only common columns"
	)
	concat_btn = gr.Button("📋 Concatenate Files", variant="primary")

	with gr.Column():
	concat_status = gr.Textbox(label="Status", lines=5, interactive=False)

	def concat_handler(files, cols, jtype):
	if not files:
	return None, "❌ Please upload files first", ""

	result_df, status = concatenate_csvs(files, cols, jtype)
	info_text = get_data_info(result_df) if result_df is not None else ""

	return result_df, status, info_text

	concat_btn.click(
	concat_handler,
	inputs=[multi_files, selected_columns, join_type],
	outputs=[current_data, concat_status, data_info_display]
	)

	# ===== VALUE COUNTS TAB =====
	with gr.TabItem("📊 Value Analysis"):
	gr.HTML("<h3>Analyze value frequencies and distributions</h3>")

	with gr.Row():
	with gr.Column():
	analysis_col = gr.Dropdown(label="Column to Analyze", choices=[], interactive=True)
	top_n = gr.Slider(minimum=5, maximum=100, value=20, step=5, label="Top N Values")
	normalize_counts = gr.Checkbox(label="Show Percentages", value=False)
	analyze_btn = gr.Button("📊 Analyze Values", variant="primary")

	with gr.Column():
	analysis_status = gr.Textbox(label="Status", lines=3, interactive=False)

	analysis_results = gr.DataFrame(label="Value Counts", height=400)

	# Update analysis column choices
	current_data.change(
	lambda df: gr.Dropdown(choices=list(df.columns) if df is not None else []),
	inputs=[current_data],
	outputs=[analysis_col]
	)

	def analysis_handler(df, col, n, norm):
	if df is None:
	return None, "❌ No data loaded"

	return get_value_counts(df, col, n, norm)

	analyze_btn.click(
	analysis_handler,
	inputs=[current_data, analysis_col, top_n, normalize_counts],
	outputs=[analysis_results, analysis_status]
	)

	# ===== DATA CLEANING TAB =====
	with gr.TabItem("🧹 Data Cleaning"):
	gr.HTML("<h3>Clean and preprocess your data</h3>")

	with gr.Tabs():
	# Missing Values
	with gr.TabItem("Missing Values"):
	with gr.Row():
	with gr.Column():
	missing_col = gr.Dropdown(label="Column", choices=["ALL"], value="ALL", interactive=True)
	missing_method = gr.Radio(
	choices=["drop_rows", "fill_value", "fill_mean", "fill_median", "fill_mode", "forward_fill", "backward_fill"],
	value="drop_rows",
	label="Method"
	)
	fill_value_input = gr.Textbox(label="Fill Value", placeholder="For fill_value method")
	missing_btn = gr.Button("🧹 Handle Missing Values", variant="primary")

	with gr.Column():
	missing_status = gr.Textbox(label="Status", lines=4, interactive=False)

	# Duplicates
	with gr.TabItem("Duplicates"):
	with gr.Row():
	with gr.Column():
	duplicate_cols = gr.Textbox(
	label="Columns to Check",
	placeholder="column1, column2 (empty = all columns)"
	)
	keep_method = gr.Radio(
	choices=["first", "last", "false"],
	value="first",
	label="Keep Method"
	)
	duplicate_btn = gr.Button("🗑️ Remove Duplicates", variant="primary")

	with gr.Column():
	duplicate_status = gr.Textbox(label="Status", lines=4, interactive=False)

	# Data Filtering
	with gr.TabItem("Filtering"):
	with gr.Row():
	with gr.Column():
	filter_col = gr.Dropdown(label="Column", choices=[], interactive=True)
	filter_condition = gr.Dropdown(
	choices=["equals", "not_equals", "contains", "not_contains", "starts_with", "ends_with",
	"greater_than", "less_than", "is_null", "is_not_null"],
	value="equals",
	label="Condition"
	)
	filter_value = gr.Textbox(label="Value")
	filter_btn = gr.Button("🔍 Filter Data", variant="primary")

	with gr.Column():
	filter_status = gr.Textbox(label="Status", lines=4, interactive=False)

	# Update dropdown choices
	current_data.change(
	lambda df: (
	gr.Dropdown(choices=["ALL"] + list(df.columns) if df is not None else ["ALL"]),
	gr.Dropdown(choices=list(df.columns) if df is not None else [])
	),
	inputs=[current_data],
	outputs=[missing_col, filter_col]
	)

	# Event handlers
	missing_btn.click(
	lambda df, col, method, val: handle_missing_values(df, col, method, val)[1] if df is not None else "❌ No data",
	inputs=[current_data, missing_col, missing_method, fill_value_input],
	outputs=[missing_status]
	).then(
	lambda: processor.current_df,
	outputs=[current_data]
	).then(
	lambda df: get_data_info(df),
	inputs=[current_data],
	outputs=[data_info_display]
	)

	duplicate_btn.click(
	lambda df, cols, keep: detect_and_remove_duplicates(df, cols, keep)[1] if df is not None else "❌ No data",
	inputs=[current_data, duplicate_cols, keep_method],
	outputs=[duplicate_status]
	).then(
	lambda: processor.current_df,
	outputs=[current_data]
	).then(
	lambda df: get_data_info(df),
	inputs=[current_data],
	outputs=[data_info_display]
	)

	filter_btn.click(
	lambda df, col, cond, val: filter_data(df, col, cond, val)[1] if df is not None else "❌ No data",
	inputs=[current_data, filter_col, filter_condition, filter_value],
	outputs=[filter_status]
	).then(
	lambda: processor.current_df,
	outputs=[current_data]
	).then(
	lambda df: get_data_info(df),
	inputs=[current_data],
	outputs=[data_info_display]
	)

	# ===== COLUMN OPERATIONS TAB =====
	with gr.TabItem("⚙️ Column Operations"):
	gr.HTML("<h3>Perform operations on columns</h3>")

	with gr.Row():
	with gr.Column():
	op_type = gr.Dropdown(
	choices=["add", "subtract", "multiply", "divide", "concatenate",
	"extract_numbers", "upper", "lower", "title", "length"],
	value="add",
	label="Operation"
	)
	op_col1 = gr.Dropdown(label="Primary Column", choices=[], interactive=True)
	op_col2 = gr.Dropdown(label="Second Column (optional)", choices=[], interactive=True)
	op_constant = gr.Textbox(label="Constant Value (optional)")
	op_new_name = gr.Textbox(label="New Column Name")
	op_btn = gr.Button("⚙️ Execute Operation", variant="primary")

	with gr.Column():
	op_status = gr.Textbox(label="Status", lines=5, interactive=False)

	# Data type conversion
	gr.HTML("<hr><h4>Data Type Conversion</h4>")
	convert_col = gr.Dropdown(label="Column", choices=[], interactive=True)
	convert_type = gr.Dropdown(
	choices=["string", "integer", "float", "datetime", "boolean", "category"],
	value="string",
	label="Target Type"
	)
	convert_btn = gr.Button("🔄 Convert Type", variant="secondary")
	convert_status = gr.Textbox(label="Conversion Status", lines=2, interactive=False)

	# Update column choices
	current_data.change(
	lambda df: (
	gr.Dropdown(choices=list(df.columns) if df is not None else []),
	gr.Dropdown(choices=list(df.columns) if df is not None else []),
	gr.Dropdown(choices=list(df.columns) if df is not None else [])
	),
	inputs=[current_data],
	outputs=[op_col1, op_col2, convert_col]
	)

	# Event handlers
	def operation_handler(df, op, col1, col2, const, new_name):
	if df is None:
	return None, "❌ No data loaded", ""

	result_df, status = perform_column_operations(df, op, col1, col2, new_name, const)
	info_text = get_data_info(result_df) if result_df is not None else ""

	return result_df, status, info_text

	op_btn.click(
	operation_handler,
	inputs=[current_data, op_type, op_col1, op_col2, op_constant, op_new_name],
	outputs=[current_data, op_status, data_info_display]
	)

	def convert_handler(df, col, target_type):
	    """Convert one column to ``target_type`` and rebuild the data summary.

	    Returns a ``(dataframe, status, info)`` triple. When no data is
	    loaded the triple is ``(None, "❌ No data loaded", "")``.
	    """
	    # Guard clause: nothing to convert yet.
	    if df is None:
	        return None, "❌ No data loaded", ""

	    converted_df, message = convert_data_types(df, col, target_type)

	    # Only summarise when the helper actually handed back a dataframe.
	    if converted_df is None:
	        summary = ""
	    else:
	        summary = get_data_info(converted_df)

	    return converted_df, message, summary

	convert_btn.click(
	convert_handler,
	inputs=[current_data, convert_col, convert_type],
	outputs=[current_data, convert_status, data_info_display]
	)

	# ===== STATISTICS TAB =====
	with gr.TabItem("📈 Statistics & Analysis"):
	gr.HTML("<h3>Statistical analysis and insights</h3>")

	with gr.Row():
	with gr.Column():
	stats_btn = gr.Button("📊 Generate Statistical Summary", variant="primary")
	corr_btn = gr.Button("🔗 Create Correlation Matrix", variant="secondary")

	# Distribution plots
	gr.HTML("<hr><h4>Distribution Analysis</h4>")
	dist_col = gr.Dropdown(label="Column", choices=[], interactive=True)
	plot_type = gr.Radio(choices=["histogram", "density"], value="histogram", label="Plot Type")
	dist_btn = gr.Button("📈 Create Distribution Plot", variant="secondary")

	with gr.Column():
	stats_status = gr.Textbox(label="Status", lines=3, interactive=False)
	plot_output = gr.Image(label="Visualization")

	stats_results = gr.DataFrame(label="Statistical Summary", height=400)

	# Update column choices
	current_data.change(
	lambda df: gr.Dropdown(choices=list(df.select_dtypes(include=[np.number]).columns) if df is not None else []),
	inputs=[current_data],
	outputs=[dist_col]
	)

	# Event handlers
	stats_btn.click(
	lambda df: generate_statistical_summary(df) if df is not None else (None, "❌ No data"),
	inputs=[current_data],
	outputs=[stats_results, stats_status]
	)

	corr_btn.click(
	lambda df: create_correlation_matrix(df) if df is not None else (None, "❌ No data"),
	inputs=[current_data],
	outputs=[plot_output, stats_status]
	)

	dist_btn.click(
	lambda df, col, ptype: create_distribution_plots(df, col, ptype) if df is not None else (None, "❌ No data"),
	inputs=[current_data, dist_col, plot_type],
	outputs=[plot_output, stats_status]
	)

	# ===== EXPORT TAB =====
	with gr.TabItem("💾 Export & Download"):
	gr.HTML("<h3>Export your processed data</h3>")

	with gr.Row():
	with gr.Column():
	export_format = gr.Radio(
	choices=["csv", "excel", "json"],
	value="csv",
	label="Export Format"
	)
	export_filename = gr.Textbox(
	label="Filename (without extension)",
	value="processed_data",
	placeholder="Enter filename"
	)
	export_btn = gr.Button("💾 Create Download File", variant="primary", size="lg")

	with gr.Column():
	export_status = gr.Textbox(label="Status", lines=3, interactive=False)
	download_file = gr.File(label="Download", visible=False)

	# History and Undo/Redo
	with gr.Row():
	with gr.Column():
	gr.HTML("<hr><h4>History & Undo Operations</h4>")
	undo_btn = gr.Button("↶ Undo Last Operation", variant="secondary")
	reset_btn = gr.Button("🔄 Reset to Original", variant="secondary")

	with gr.Column():
	history_status = gr.Textbox(label="History Status", lines=3, interactive=False)

	def export_handler(df, fmt, filename):
	    """Serialise the current data to a temporary file and offer it for download.

	    Args:
	        df: Current dataframe, or None when nothing is loaded.
	        fmt: Export format selected in the UI ("csv", "excel" or "json").
	        filename: Base filename requested by the user.

	    Returns:
	        A ``(file_component, status)`` pair: an updated download
	        component (hidden on failure) and a status message.
	    """
	    import tempfile  # local import: only this handler needs it

	    if df is None:
	        return gr.File(visible=False), "❌ No data to export"

	    try:
	        file_data, file_name = create_download_file(df, fmt, filename)

	        # Keep only the last path component so a user-supplied name such as
	        # "../../x" cannot escape the export directory, and write into a
	        # fresh temp directory so concurrent exports neither overwrite each
	        # other nor litter the working directory.
	        safe_name = os.path.basename(str(file_name)) or "export"
	        target_path = os.path.join(tempfile.mkdtemp(prefix="csv_export_"), safe_name)

	        # Choose the write mode from the payload itself rather than from
	        # the format name, so bytes are never pushed through a text handle.
	        is_binary = isinstance(file_data, (bytes, bytearray))
	        with open(
	            target_path,
	            'wb' if is_binary else 'w',
	            encoding=None if is_binary else 'utf-8',
	        ) as f:
	            f.write(file_data)

	        return (
	            gr.File(value=target_path, visible=True),
	            f"✅ File created successfully: {safe_name}",
	        )

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of crashing.
	        return gr.File(visible=False), f"❌ Export error: {str(e)}"

	# Each output component is listed exactly once; the handler returns one
	# value per component.
	export_btn.click(
	    export_handler,
	    inputs=[current_data, export_format, export_filename],
	    outputs=[download_file, export_status]
	)

	def undo_handler():
	    """Ask ``processor`` to undo one operation and rebuild the data summary.

	    Returns a ``(dataframe, status, info)`` triple; ``info`` is an empty
	    string when no dataframe comes back.
	    """
	    previous_df, message = processor.undo_operation()
	    summary = "" if previous_df is None else get_data_info(previous_df)
	    return previous_df, message, summary

	def reset_handler():
	    """Ask ``processor`` to restore the original data and rebuild the summary.

	    Returns a ``(dataframe, status, info)`` triple; ``info`` is an empty
	    string when no dataframe comes back.
	    """
	    original_df, message = processor.reset_to_original()
	    summary = "" if original_df is None else get_data_info(original_df)
	    return original_df, message, summary

	undo_btn.click(
	undo_handler,
	outputs=[current_data, history_status, data_info_display]
	)

	reset_btn.click(
	reset_handler,
	outputs=[current_data, history_status, data_info_display]
	)

	# Footer. Separators are plain "|" characters: a backslash-escaped pipe is
	# not a valid Python escape sequence (SyntaxWarning on 3.12+) and would
	# render a literal backslash in the page.
	gr.HTML("""
	<div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #ddd;">
	<p style="color: #666; font-size: 14px;">
	🚀 <strong>Advanced CSV Manipulation Tool</strong> |
	Commercial-ready data processing with enterprise features |
	Built with Gradio & Python
	</p>
	</div>
	""")

	return demo

	if __name__ == "__main__":
	    # Script entry point: build the Gradio app, then start the server.
	    demo = create_interface()

	    # Launch settings gathered in one place and unpacked into launch().
	    launch_options = {
	        "share": True,
	        "inbrowser": True,
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "max_file_size": "1gb",
	    }
	    demo.launch(**launch_options)