Upload csv-manipulator.py

csv-manipulator.py (ADDED, +1142 lines)
#!/usr/bin/env python3
"""
Advanced CSV Manipulation Tool with Gradio Interface
Commercial-ready application for powerful CSV data processing

Features:
- File upload with 1GB limit
- Data preview with selectable rows
- Value replacement based on conditions
- CSV concatenation with column selection
- Advanced statistical analysis and visualization
- Data validation and quality checks
- Export to CSV, Excel, JSON
- Batch operations and operation recipes
- Undo/Redo functionality
- Memory-efficient large file processing
"""

import gradio as gr
import pandas as pd
import numpy as np
import json
import io
import zipfile
from datetime import datetime, timedelta
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
from typing import Dict, List, Tuple, Optional, Any
import hashlib
import pickle
from pathlib import Path

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
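
# NOTE (inferred from the imports above; versions are an assumption, nothing is
# pinned in the original): the third-party dependencies are gradio, pandas, numpy,
# matplotlib, seaborn and plotly, plus openpyxl for Excel export and a Parquet
# engine such as pyarrow for pd.read_parquet(). The launch(max_file_size=...)
# argument at the bottom of the file appears to require Gradio 4+.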
class CSVProcessor:
    """Advanced CSV processing class with state management and history"""

    def __init__(self):
        self.original_df = None
        self.current_df = None
        self.history = []
        self.recipes = {}
        self.batch_files = []

    def load_data(self, file, preview_rows=100, encoding='utf-8'):
        """Load a data file with error handling and memory optimization"""
        try:
            if file is None:
                return None, "No file provided", {}

            file_path = file.name if hasattr(file, 'name') else str(file)
            file_extension = Path(file_path).suffix.lower()

            if file_extension == '.csv':
                # Try different encodings until one decodes cleanly
                encodings = [encoding, 'utf-8', 'latin-1', 'cp1252']
                df = None
                for enc in encodings:
                    try:
                        df = pd.read_csv(file_path, encoding=enc, low_memory=False)
                        break
                    except UnicodeDecodeError:
                        continue
                if df is None:
                    return None, "Failed to decode file with supported encodings", {}

            elif file_extension in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
            elif file_extension == '.json':
                df = pd.read_json(file_path)
            elif file_extension == '.parquet':
                df = pd.read_parquet(file_path)
            else:
                return None, f"Unsupported file format: {file_extension}", {}

            self.original_df = df.copy()
            self.current_df = df.copy()
            self.history = []

            # Create preview
            preview = df.head(preview_rows) if preview_rows > 0 else df

            # Memory and performance info
            memory_mb = df.memory_usage(deep=True).sum() / 1024**2
            info = {
                'rows': len(df),
                'columns': len(df.columns),
                'memory_usage': f"{memory_mb:.2f} MB",
                'dtypes': dict(df.dtypes.astype(str)),
                'null_counts': dict(df.isnull().sum()),
                'duplicates': df.duplicated().sum()
            }

            success_msg = "✅ File loaded successfully!\n"
            success_msg += f"📊 {info['rows']:,} rows × {info['columns']} columns\n"
            success_msg += f"💾 Memory usage: {info['memory_usage']}\n"
            success_msg += f"🔁 Duplicates: {info['duplicates']:,}\n"
            success_msg += f"❓ Missing values: {sum(info['null_counts'].values()):,}"

            return preview, success_msg, info

        except Exception as e:
            return None, f"❌ Error loading file: {str(e)}", {}

    def save_state(self, operation_name: str):
        """Save the current state to history with memory management"""
        if len(self.history) > 50:  # limit history to prevent memory issues
            self.history = self.history[-25:]  # keep the last 25 operations

        self.history.append({
            'operation': operation_name,
            'timestamp': datetime.now(),
            'df': self.current_df.copy() if self.current_df is not None else None
        })

    def undo_operation(self):
        """Undo the last operation"""
        if len(self.history) > 1:
            undone = self.history.pop()
            self.current_df = self.history[-1]['df'].copy()
            return self.current_df, f"✅ Undone: {undone['operation']}"
        elif len(self.history) == 1:
            self.history = []
            self.current_df = self.original_df.copy()
            return self.current_df, "✅ Reset to original data"
        else:
            return self.current_df, "❌ No operations to undo"

    def reset_to_original(self):
        """Reset to the original data"""
        if self.original_df is not None:
            self.current_df = self.original_df.copy()
            self.history = []
            return self.current_df, "✅ Reset to original data"
        return None, "❌ No original data available"

# Global processor instance
processor = CSVProcessor()
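
# Illustrative flow (hypothetical file name; the Gradio handlers below drive this
# singleton for you, this is orientation only):
#   preview, status, info = processor.load_data(open("sales.csv"))
#   filtered, msg = filter_data(processor.current_df, "region", "equals", "EU")
#   df, msg = processor.undo_operation()   # step back to the state before the filter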

def create_download_file(df: pd.DataFrame, format_type: str, filename: str = "processed_data"):
    """Create a downloadable file in the specified format"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename_with_timestamp = f"{filename}_{timestamp}"

    try:
        if format_type == "csv":
            csv_data = df.to_csv(index=False)
            return csv_data, f"{filename_with_timestamp}.csv"
        elif format_type == "excel":
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                df.to_excel(writer, index=False, sheet_name='Data')
            buffer.seek(0)
            return buffer.getvalue(), f"{filename_with_timestamp}.xlsx"
        elif format_type == "json":
            json_data = df.to_json(orient='records', indent=2, date_format='iso')
            return json_data, f"{filename_with_timestamp}.json"
        else:
            return None, f"Unsupported export format: {format_type}"
    except Exception as e:
        return None, f"Error creating {format_type} file: {str(e)}"

def get_data_info(df: pd.DataFrame) -> str:
    """Get comprehensive data information"""
    if df is None or df.empty:
        return "No data loaded"

    info_dict = {
        '📊 Shape': f"{df.shape[0]:,} rows × {df.shape[1]} columns",
        '💾 Memory': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
        '🔁 Duplicates': f"{df.duplicated().sum():,}",
        '❓ Missing Values': f"{df.isnull().sum().sum():,}",
        '🔢 Numeric Columns': f"{len(df.select_dtypes(include=[np.number]).columns)}",
        '📝 Text Columns': f"{len(df.select_dtypes(include=['object']).columns)}",
        '📅 Date Columns': f"{len(df.select_dtypes(include=['datetime64']).columns)}"
    }

    return "\n".join([f"{k}: {v}" for k, v in info_dict.items()])

def get_column_options(df: pd.DataFrame) -> List[str]:
    """Get the list of column names for dropdowns"""
    return list(df.columns) if df is not None else []

# ===========================================
# CORE DATA MANIPULATION FUNCTIONS
# ===========================================

def rename_values_conditional(df: pd.DataFrame, target_col: str, condition_col: str,
                              condition_value: str, new_value: str, match_type: str = "exact") -> Tuple[pd.DataFrame, str]:
    """Rename values in the target column based on a condition in another column"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        if target_col not in df.columns or condition_col not in df.columns:
            return df, "❌ One or more columns not found"

        df_result = df.copy()

        if match_type == "exact":
            mask = df_result[condition_col] == condition_value
        elif match_type == "contains":
            mask = df_result[condition_col].astype(str).str.contains(condition_value, na=False)
        elif match_type == "regex":
            mask = df_result[condition_col].astype(str).str.match(condition_value, na=False)
        elif match_type == "starts_with":
            mask = df_result[condition_col].astype(str).str.startswith(condition_value, na=False)
        elif match_type == "ends_with":
            mask = df_result[condition_col].astype(str).str.endswith(condition_value, na=False)
        else:
            return df, f"❌ Unknown match type: {match_type}"

        affected_rows = mask.sum()
        df_result.loc[mask, target_col] = new_value

        processor.current_df = df_result
        processor.save_state(f"Renamed values in '{target_col}' based on '{condition_col}'")

        return df_result, f"✅ Updated {affected_rows:,} rows in column '{target_col}'"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"
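
# Illustrative call (hypothetical column names): write "Germany" into "country_name"
# wherever "country_code" equals "DE":
#   df, msg = rename_values_conditional(df, "country_name", "country_code", "DE", "Germany")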

def concatenate_csvs(files: List, selected_columns: str, join_type: str = "outer") -> Tuple[pd.DataFrame, str]:
    """Concatenate multiple CSV files with column selection"""
    try:
        if not files:
            return None, "❌ No files provided"

        dfs = []
        columns_to_use = [col.strip() for col in selected_columns.split(",") if col.strip()] if selected_columns else None

        for file in files:
            file_path = file.name if hasattr(file, 'name') else str(file)
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
            elif file_path.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path)
            else:
                continue

            # Select specific columns if specified
            if columns_to_use:
                available_cols = [col for col in columns_to_use if col in df.columns]
                if available_cols:
                    df = df[available_cols]
                else:
                    continue

            # Add a source-file identifier
            df['_source_file'] = Path(file_path).stem
            dfs.append(df)

        if not dfs:
            return None, "❌ No valid files found or columns don't exist"

        # Concatenate with the specified join type
        join = 'inner' if join_type == "inner" else 'outer'
        result_df = pd.concat(dfs, ignore_index=True, join=join)

        processor.current_df = result_df
        processor.save_state(f"Concatenated {len(dfs)} files")

        return result_df, f"✅ Successfully concatenated {len(dfs)} files with {len(result_df):,} total rows"

    except Exception as e:
        return None, f"❌ Error concatenating files: {str(e)}"

def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 20, normalize: bool = False) -> Tuple[pd.DataFrame, str]:
    """Get value counts for the specified column"""
    try:
        if df is None or df.empty:
            return None, "❌ No data available"

        if column not in df.columns:
            return None, f"❌ Column '{column}' not found"

        value_counts = df[column].value_counts(normalize=normalize, dropna=False).head(top_n)

        # Convert to a DataFrame for better display
        result_df = pd.DataFrame({
            'Value': value_counts.index,
            'Percentage' if normalize else 'Count': value_counts.values
        })

        if normalize:
            result_df['Percentage'] = result_df['Percentage'].map(lambda x: f"{x:.2%}")

        return result_df, f"✅ Value counts for '{column}' (Top {min(top_n, len(result_df))})"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

def filter_data(df: pd.DataFrame, column: str, condition: str, value: str) -> Tuple[pd.DataFrame, str]:
    """Filter data based on conditions"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        if column not in df.columns:
            return df, f"❌ Column '{column}' not found"

        df_result = df.copy()

        if condition == "equals":
            mask = df_result[column] == value
        elif condition == "not_equals":
            mask = df_result[column] != value
        elif condition == "contains":
            mask = df_result[column].astype(str).str.contains(value, na=False)
        elif condition == "not_contains":
            mask = ~df_result[column].astype(str).str.contains(value, na=False)
        elif condition == "starts_with":
            mask = df_result[column].astype(str).str.startswith(value, na=False)
        elif condition == "ends_with":
            mask = df_result[column].astype(str).str.endswith(value, na=False)
        elif condition == "greater_than":
            mask = pd.to_numeric(df_result[column], errors='coerce') > float(value)
        elif condition == "less_than":
            mask = pd.to_numeric(df_result[column], errors='coerce') < float(value)
        elif condition == "is_null":
            mask = df_result[column].isnull()
        elif condition == "is_not_null":
            mask = df_result[column].notnull()
        else:
            return df, f"❌ Unknown condition: {condition}"

        filtered_df = df_result[mask]

        processor.current_df = filtered_df
        processor.save_state(f"Filtered data: {column} {condition} {value}")

        return filtered_df, f"✅ Filtered to {len(filtered_df):,} rows (removed {len(df) - len(filtered_df):,} rows)"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"

def handle_missing_values(df: pd.DataFrame, column: str, method: str, fill_value: str = "") -> Tuple[pd.DataFrame, str]:
    """Handle missing values in the specified column (or all columns)"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        if column != "ALL" and column not in df.columns:
            return df, f"❌ Column '{column}' not found"

        df_result = df.copy()
        columns_to_process = [column] if column != "ALL" else df_result.columns.tolist()

        total_missing_before = df_result.isnull().sum().sum()

        for col in columns_to_process:
            if method == "drop_rows":
                df_result = df_result.dropna(subset=[col])
            elif method == "fill_value":
                df_result[col] = df_result[col].fillna(fill_value)
            elif method == "fill_mean":
                if pd.api.types.is_numeric_dtype(df_result[col]):
                    df_result[col] = df_result[col].fillna(df_result[col].mean())
            elif method == "fill_median":
                if pd.api.types.is_numeric_dtype(df_result[col]):
                    df_result[col] = df_result[col].fillna(df_result[col].median())
            elif method == "fill_mode":
                mode_val = df_result[col].mode()
                if len(mode_val) > 0:
                    df_result[col] = df_result[col].fillna(mode_val[0])
            elif method == "forward_fill":
                df_result[col] = df_result[col].ffill()  # fillna(method=...) is deprecated
            elif method == "backward_fill":
                df_result[col] = df_result[col].bfill()

        total_missing_after = df_result.isnull().sum().sum()

        processor.current_df = df_result
        processor.save_state(f"Handle missing values: {method}")

        return df_result, f"✅ Processed missing values. Before: {total_missing_before:,}, After: {total_missing_after:,}"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"

def detect_and_remove_duplicates(df: pd.DataFrame, columns: str = "", keep: str = "first") -> Tuple[pd.DataFrame, str]:
    """Detect and remove duplicate rows"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        df_result = df.copy()

        # Parse the column subset
        if columns.strip():
            cols_list = [col.strip() for col in columns.split(",") if col.strip() in df.columns]
            subset = cols_list if cols_list else None
        else:
            subset = None

        # The UI passes "false" as a string; pandas expects the boolean False
        if keep == "false":
            keep = False

        duplicates_before = df_result.duplicated(subset=subset).sum()

        if duplicates_before == 0:
            return df_result, "✅ No duplicate rows found"

        df_result = df_result.drop_duplicates(subset=subset, keep=keep)

        processor.current_df = df_result
        processor.save_state(f"Removed {duplicates_before:,} duplicate rows")

        return df_result, f"✅ Removed {duplicates_before:,} duplicate rows. Remaining: {len(df_result):,} rows"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"

def perform_column_operations(df: pd.DataFrame, operation: str, col1: str, col2: str = "",
                              new_col_name: str = "", constant: str = "") -> Tuple[pd.DataFrame, str]:
    """Perform mathematical and string operations on columns"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        if col1 not in df.columns:
            return df, f"❌ Column '{col1}' not found"

        df_result = df.copy()

        if not new_col_name:
            new_col_name = f"{col1}_{operation}"

        if operation == "add":
            if col2 and col2 in df.columns:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') + pd.to_numeric(df_result[col2], errors='coerce')
            elif constant:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') + float(constant)

        elif operation == "subtract":
            if col2 and col2 in df.columns:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') - pd.to_numeric(df_result[col2], errors='coerce')
            elif constant:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') - float(constant)

        elif operation == "multiply":
            if col2 and col2 in df.columns:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') * pd.to_numeric(df_result[col2], errors='coerce')
            elif constant:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') * float(constant)

        elif operation == "divide":
            if col2 and col2 in df.columns:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') / pd.to_numeric(df_result[col2], errors='coerce')
            elif constant:
                df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') / float(constant)

        elif operation == "concatenate":
            if col2 and col2 in df.columns:
                df_result[new_col_name] = df_result[col1].astype(str) + " " + df_result[col2].astype(str)
            elif constant:
                df_result[new_col_name] = df_result[col1].astype(str) + constant

        elif operation == "extract_numbers":
            df_result[new_col_name] = df_result[col1].astype(str).str.extract(r'(\d+)')[0]

        elif operation == "upper":
            df_result[new_col_name] = df_result[col1].astype(str).str.upper()

        elif operation == "lower":
            df_result[new_col_name] = df_result[col1].astype(str).str.lower()

        elif operation == "title":
            df_result[new_col_name] = df_result[col1].astype(str).str.title()

        elif operation == "length":
            df_result[new_col_name] = df_result[col1].astype(str).str.len()

        else:
            return df, f"❌ Unknown operation: {operation}"

        # Arithmetic/concatenate branches are no-ops without a second column or constant
        if new_col_name not in df_result.columns:
            return df, "❌ Provide a second column or a constant for this operation"

        processor.current_df = df_result
        processor.save_state(f"Column operation: {operation} on {col1}")

        return df_result, f"✅ Created new column '{new_col_name}' using {operation} operation"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"
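
# Illustrative calls (hypothetical column names):
#   df, msg = perform_column_operations(df, "multiply", "price", constant="1.19",
#                                       new_col_name="price_gross")   # e.g. add 19% VAT
#   df, msg = perform_column_operations(df, "concatenate", "first_name", col2="last_name",
#                                       new_col_name="full_name")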

def convert_data_types(df: pd.DataFrame, column: str, target_type: str) -> Tuple[pd.DataFrame, str]:
    """Convert a column's data type"""
    try:
        if df is None or df.empty:
            return df, "❌ No data available"

        if column not in df.columns:
            return df, f"❌ Column '{column}' not found"

        df_result = df.copy()

        if target_type == "string":
            df_result[column] = df_result[column].astype(str)
        elif target_type == "integer":
            df_result[column] = pd.to_numeric(df_result[column], errors='coerce').astype('Int64')
        elif target_type == "float":
            df_result[column] = pd.to_numeric(df_result[column], errors='coerce')
        elif target_type == "datetime":
            df_result[column] = pd.to_datetime(df_result[column], errors='coerce')
        elif target_type == "boolean":
            # Note: under astype(bool), any non-empty string becomes True
            df_result[column] = df_result[column].astype(bool)
        elif target_type == "category":
            df_result[column] = df_result[column].astype('category')
        else:
            return df, f"❌ Unknown data type: {target_type}"

        processor.current_df = df_result
        processor.save_state(f"Converted '{column}' to {target_type}")

        return df_result, f"✅ Converted column '{column}' to {target_type}"

    except Exception as e:
        return df, f"❌ Error: {str(e)}"

# ===========================================
# ANALYSIS AND VISUALIZATION FUNCTIONS
# ===========================================

def generate_statistical_summary(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
    """Generate a comprehensive statistical summary"""
    try:
        if df is None or df.empty:
            return None, "❌ No data available"

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            return None, "❌ No numeric columns found"

        stats_df = df[numeric_cols].describe()

        # Add additional statistics
        stats_df.loc['variance'] = df[numeric_cols].var()
        stats_df.loc['skewness'] = df[numeric_cols].skew()
        stats_df.loc['kurtosis'] = df[numeric_cols].kurtosis()
        stats_df.loc['missing'] = df[numeric_cols].isnull().sum()

        return stats_df.round(4), "✅ Statistical summary generated"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

def create_correlation_matrix(df: pd.DataFrame) -> Tuple[str, str]:
    """Create a correlation-matrix heatmap"""
    try:
        if df is None or df.empty:
            return None, "❌ No data available"

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) < 2:
            return None, "❌ Need at least 2 numeric columns for correlation"

        # Calculate the correlation matrix
        corr_matrix = df[numeric_cols].corr()

        # Create the heatmap (upper triangle masked to avoid redundancy)
        plt.figure(figsize=(12, 8))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
        plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
        plt.tight_layout()

        # Save the plot
        plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
        plt.close()

        return 'correlation_matrix.png', "✅ Correlation matrix created"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

def create_distribution_plots(df: pd.DataFrame, column: str, plot_type: str = "histogram") -> Tuple[str, str]:
    """Create distribution plots"""
    try:
        if df is None or df.empty:
            return None, "❌ No data available"

        if column not in df.columns:
            return None, f"❌ Column '{column}' not found"

        plt.figure(figsize=(12, 6))

        if plot_type == "histogram":
            plt.subplot(1, 2, 1)
            df[column].hist(bins=30, edgecolor='black', alpha=0.7)
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')

            plt.subplot(1, 2, 2)
            df.boxplot(column=column)
            plt.title(f'Box Plot of {column}')

        elif plot_type == "density":
            plt.subplot(1, 2, 1)
            df[column].plot(kind='density')
            plt.title(f'Density Plot of {column}')
            plt.xlabel(column)

            plt.subplot(1, 2, 2)
            df[column].plot(kind='box')
            plt.title(f'Box Plot of {column}')

        plt.tight_layout()
        plt.savefig(f'distribution_{column}_{plot_type}.png', dpi=300, bbox_inches='tight')
        plt.close()

        return f'distribution_{column}_{plot_type}.png', f"✅ Distribution plot created for {column}"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

# ===========================================
# GRADIO INTERFACE SETUP
# ===========================================

def create_interface():
|
633 |
+
"""Create the main Gradio interface"""
|
634 |
+
|
635 |
+
with gr.Blocks(title="Advanced CSV Manipulation Tool", theme=gr.themes.Soft()) as demo:
|
636 |
+
|
637 |
+
gr.HTML("""
|
638 |
+
<div style="text-align: center; padding: 20px;">
|
639 |
+
<h1 style="color: #2e7d32; margin-bottom: 10px;">π₯ Advanced CSV Manipulation Tool</h1>
|
640 |
+
<p style="font-size: 18px; color: #666;">Commercial-ready data processing with advanced analytics</p>
|
641 |
+
<hr style="margin: 20px 0;">
|
642 |
+
</div>
|
643 |
+
""")
|
644 |
+
|
645 |
+
# Global state variables
|
646 |
+
current_data = gr.State(None)
|
647 |
+
data_info = gr.State({})
|
648 |
+
|
649 |
+
with gr.Tabs():
|
650 |
+
|
651 |
+
# ===== FILE UPLOAD TAB =====
|
652 |
+
with gr.TabItem("π File Upload & Preview"):
|
653 |
+
with gr.Row():
|
654 |
+
with gr.Column(scale=1):
|
655 |
+
file_upload = gr.File(
|
656 |
+
label="Upload CSV/Excel/JSON file (Max 1GB)",
|
657 |
+
file_types=[".csv", ".xlsx", ".xls", ".json"],
|
658 |
+
file_count="single"
|
659 |
+
)
|
660 |
+
preview_rows = gr.Slider(
|
661 |
+
minimum=0,
|
662 |
+
maximum=1000,
|
663 |
+
value=100,
|
664 |
+
step=50,
|
665 |
+
label="Preview Rows (0 = All)",
|
666 |
+
info="Number of rows to display in preview"
|
667 |
+
)
|
668 |
+
upload_btn = gr.Button("π Load & Analyze Data", variant="primary", size="lg")
|
669 |
+
|
670 |
+
with gr.Column(scale=2):
|
671 |
+
upload_status = gr.Textbox(label="Status", lines=5, interactive=False)
|
672 |
+
data_info_display = gr.Textbox(label="Data Information", lines=8, interactive=False)
|
673 |
+
|
674 |
+
data_preview = gr.DataFrame(label="Data Preview", interactive=False, height=400)
|
675 |
+
|
676 |
+
def load_file_handler(file, rows):
|
677 |
+
if file is None:
|
678 |
+
return None, "Please upload a file first", "", None, {}
|
679 |
+
|
680 |
+
preview, status, info = processor.load_data(file, rows)
|
681 |
+
info_text = get_data_info(processor.current_df) if processor.current_df is not None else ""
|
682 |
+
|
683 |
+
return preview, status, info_text, processor.current_df, info
|
684 |
+
|
685 |
+
upload_btn.click(
|
686 |
+
load_file_handler,
|
687 |
+
inputs=[file_upload, preview_rows],
|
688 |
+
outputs=[data_preview, upload_status, data_info_display, current_data, data_info]
|
689 |
+
)
|
690 |
+
|
691 |
+
# ===== VALUE REPLACEMENT TAB =====
|
692 |
+
with gr.TabItem("π Value Replacement"):
|
693 |
+
gr.HTML("<h3>Replace values in one column based on conditions in another column</h3>")
|
694 |
+
|
695 |
+
with gr.Row():
|
696 |
+
with gr.Column():
|
697 |
+
target_col = gr.Dropdown(label="Target Column (to modify)", choices=[], interactive=True)
|
698 |
+
condition_col = gr.Dropdown(label="Condition Column (to check)", choices=[], interactive=True)
|
699 |
+
condition_value = gr.Textbox(label="Condition Value", placeholder="Value to match in condition column")
|
700 |
+
new_value = gr.Textbox(label="New Value", placeholder="Replacement value for target column")
|
701 |
+
match_type = gr.Radio(
|
702 |
+
choices=["exact", "contains", "starts_with", "ends_with", "regex"],
|
703 |
+
value="exact",
|
704 |
+
label="Match Type"
|
705 |
+
)
|
706 |
+
replace_btn = gr.Button("π Replace Values", variant="primary")
|
707 |
+
|
708 |
+
with gr.Column():
|
709 |
+
replace_status = gr.Textbox(label="Status", lines=3, interactive=False)
|
710 |
+
|
711 |
+
# Update column choices when data changes
|
712 |
+
def update_columns(df):
|
713 |
+
if df is not None:
|
714 |
+
cols = list(df.columns)
|
715 |
+
return gr.Dropdown(choices=cols), gr.Dropdown(choices=cols)
|
716 |
+
return gr.Dropdown(choices=[]), gr.Dropdown(choices=[])
|
717 |
+
|
718 |
+
current_data.change(
|
719 |
+
update_columns,
|
720 |
+
inputs=[current_data],
|
721 |
+
outputs=[target_col, condition_col]
|
722 |
+
)
|
723 |
+
|
724 |
+
def replace_values_handler(df, tcol, ccol, cval, nval, mtype):
|
725 |
+
if df is None:
|
726 |
+
return None, "β No data loaded", ""
|
727 |
+
|
728 |
+
result_df, status = rename_values_conditional(df, tcol, ccol, cval, nval, mtype)
|
729 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
730 |
+
|
731 |
+
return result_df, status, info_text
|
732 |
+
|
733 |
+
replace_btn.click(
|
734 |
+
replace_values_handler,
|
735 |
+
inputs=[current_data, target_col, condition_col, condition_value, new_value, match_type],
|
736 |
+
outputs=[current_data, replace_status, data_info_display]
|
737 |
+
)
|
738 |
+
|
739 |
+
# ===== CSV CONCATENATION TAB =====
|
740 |
+
with gr.TabItem("π CSV Concatenation"):
|
741 |
+
gr.HTML("<h3>Combine multiple CSV files with column selection</h3>")
|
742 |
+
|
743 |
+
with gr.Row():
|
744 |
+
with gr.Column():
|
745 |
+
multi_files = gr.File(
|
746 |
+
label="Upload Multiple Files",
|
747 |
+
file_types=[".csv", ".xlsx", ".xls"],
|
748 |
+
file_count="multiple"
|
749 |
+
)
|
750 |
+
selected_columns = gr.Textbox(
|
751 |
+
label="Columns to Include",
|
752 |
+
placeholder="column1, column2, column3 (leave empty for all)",
|
753 |
+
info="Comma-separated list of column names"
|
754 |
+
)
|
755 |
+
join_type = gr.Radio(
|
756 |
+
choices=["outer", "inner"],
|
757 |
+
value="outer",
|
758 |
+
label="Join Type",
|
759 |
+
info="Outer: keep all columns, Inner: only common columns"
|
760 |
+
)
|
761 |
+
concat_btn = gr.Button("π Concatenate Files", variant="primary")
|
762 |
+
|
763 |
+
with gr.Column():
|
764 |
+
concat_status = gr.Textbox(label="Status", lines=5, interactive=False)
|
765 |
+
|
766 |
+
def concat_handler(files, cols, jtype):
|
767 |
+
if not files:
|
768 |
+
return None, "β Please upload files first", ""
|
769 |
+
|
770 |
+
result_df, status = concatenate_csvs(files, cols, jtype)
|
771 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
772 |
+
|
773 |
+
return result_df, status, info_text
|
774 |
+
|
775 |
+
concat_btn.click(
|
776 |
+
concat_handler,
|
777 |
+
inputs=[multi_files, selected_columns, join_type],
|
778 |
+
outputs=[current_data, concat_status, data_info_display]
|
779 |
+
)
|
780 |
+
|
781 |
+
# ===== VALUE COUNTS TAB =====
|
782 |
+
with gr.TabItem("π Value Analysis"):
|
783 |
+
gr.HTML("<h3>Analyze value frequencies and distributions</h3>")
|
784 |
+
|
785 |
+
with gr.Row():
|
786 |
+
with gr.Column():
|
787 |
+
analysis_col = gr.Dropdown(label="Column to Analyze", choices=[], interactive=True)
|
788 |
+
top_n = gr.Slider(minimum=5, maximum=100, value=20, step=5, label="Top N Values")
|
789 |
+
normalize_counts = gr.Checkbox(label="Show Percentages", value=False)
|
790 |
+
analyze_btn = gr.Button("π Analyze Values", variant="primary")
|
791 |
+
|
792 |
+
with gr.Column():
|
793 |
+
analysis_status = gr.Textbox(label="Status", lines=3, interactive=False)
|
794 |
+
|
795 |
+
analysis_results = gr.DataFrame(label="Value Counts", height=400)
|
796 |
+
|
797 |
+
# Update analysis column choices
|
798 |
+
current_data.change(
|
799 |
+
lambda df: gr.Dropdown(choices=list(df.columns) if df is not None else []),
|
800 |
+
inputs=[current_data],
|
801 |
+
outputs=[analysis_col]
|
802 |
+
)
|
803 |
+
|
804 |
+
def analysis_handler(df, col, n, norm):
|
805 |
+
if df is None:
|
806 |
+
return None, "β No data loaded"
|
807 |
+
|
808 |
+
return get_value_counts(df, col, n, norm)
|
809 |
+
|
810 |
+
analyze_btn.click(
|
811 |
+
analysis_handler,
|
812 |
+
inputs=[current_data, analysis_col, top_n, normalize_counts],
|
813 |
+
outputs=[analysis_results, analysis_status]
|
814 |
+
)
|
815 |
+
|
816 |
+
# ===== DATA CLEANING TAB =====
|
817 |
+
with gr.TabItem("π§Ή Data Cleaning"):
|
818 |
+
gr.HTML("<h3>Clean and preprocess your data</h3>")
|
819 |
+
|
820 |
+
with gr.Tabs():
|
821 |
+
# Missing Values
|
822 |
+
with gr.TabItem("Missing Values"):
|
823 |
+
with gr.Row():
|
824 |
+
with gr.Column():
|
825 |
+
missing_col = gr.Dropdown(label="Column", choices=["ALL"], value="ALL", interactive=True)
|
826 |
+
missing_method = gr.Radio(
|
827 |
+
choices=["drop_rows", "fill_value", "fill_mean", "fill_median", "fill_mode", "forward_fill", "backward_fill"],
|
828 |
+
value="drop_rows",
|
829 |
+
label="Method"
|
830 |
+
)
|
831 |
+
fill_value_input = gr.Textbox(label="Fill Value", placeholder="For fill_value method")
|
832 |
+
missing_btn = gr.Button("π§Ή Handle Missing Values", variant="primary")
|
833 |
+
|
834 |
+
with gr.Column():
|
835 |
+
missing_status = gr.Textbox(label="Status", lines=4, interactive=False)
|
836 |
+
|
837 |
+
# Duplicates
|
838 |
+
with gr.TabItem("Duplicates"):
|
839 |
+
with gr.Row():
|
840 |
+
with gr.Column():
|
841 |
+
duplicate_cols = gr.Textbox(
|
842 |
+
label="Columns to Check",
|
843 |
+
placeholder="column1, column2 (empty = all columns)"
|
844 |
+
)
|
845 |
+
keep_method = gr.Radio(
|
846 |
+
choices=["first", "last", "false"],
|
847 |
+
value="first",
|
848 |
+
label="Keep Method"
|
849 |
+
)
|
850 |
+
duplicate_btn = gr.Button("ποΈ Remove Duplicates", variant="primary")
|
851 |
+
|
852 |
+
with gr.Column():
|
853 |
+
duplicate_status = gr.Textbox(label="Status", lines=4, interactive=False)
|
854 |
+
|
855 |
+
# Data Filtering
|
856 |
+
with gr.TabItem("Filtering"):
|
857 |
+
with gr.Row():
|
858 |
+
with gr.Column():
|
859 |
+
filter_col = gr.Dropdown(label="Column", choices=[], interactive=True)
|
860 |
+
filter_condition = gr.Dropdown(
|
861 |
+
choices=["equals", "not_equals", "contains", "not_contains", "starts_with", "ends_with",
|
862 |
+
"greater_than", "less_than", "is_null", "is_not_null"],
|
863 |
+
value="equals",
|
864 |
+
label="Condition"
|
865 |
+
)
|
866 |
+
filter_value = gr.Textbox(label="Value")
|
867 |
+
filter_btn = gr.Button("π Filter Data", variant="primary")
|
868 |
+
|
869 |
+
with gr.Column():
|
870 |
+
filter_status = gr.Textbox(label="Status", lines=4, interactive=False)
|
871 |
+
|
872 |
+
# Update dropdown choices
|
873 |
+
current_data.change(
|
874 |
+
lambda df: (
|
875 |
+
gr.Dropdown(choices=["ALL"] + list(df.columns) if df is not None else ["ALL"]),
|
876 |
+
gr.Dropdown(choices=list(df.columns) if df is not None else [])
|
877 |
+
),
|
878 |
+
inputs=[current_data],
|
879 |
+
outputs=[missing_col, filter_col]
|
880 |
+
)
|
881 |
+
|
882 |
+
# Event handlers
|
883 |
+
missing_btn.click(
|
884 |
+
lambda df, col, method, val: handle_missing_values(df, col, method, val)[1] if df is not None else "β No data",
|
885 |
+
inputs=[current_data, missing_col, missing_method, fill_value_input],
|
886 |
+
outputs=[missing_status]
|
887 |
+
).then(
|
888 |
+
lambda: processor.current_df,
|
889 |
+
outputs=[current_data]
|
890 |
+
).then(
|
891 |
+
lambda df: get_data_info(df),
|
892 |
+
inputs=[current_data],
|
893 |
+
outputs=[data_info_display]
|
894 |
+
)
|
895 |
+
|
896 |
+
duplicate_btn.click(
|
897 |
+
lambda df, cols, keep: detect_and_remove_duplicates(df, cols, keep)[1] if df is not None else "β No data",
|
898 |
+
inputs=[current_data, duplicate_cols, keep_method],
|
899 |
+
outputs=[duplicate_status]
|
900 |
+
).then(
|
901 |
+
lambda: processor.current_df,
|
902 |
+
outputs=[current_data]
|
903 |
+
).then(
|
904 |
+
lambda df: get_data_info(df),
|
905 |
+
inputs=[current_data],
|
906 |
+
outputs=[data_info_display]
|
907 |
+
)
|
908 |
+
|
909 |
+
filter_btn.click(
|
910 |
+
lambda df, col, cond, val: filter_data(df, col, cond, val)[1] if df is not None else "β No data",
|
911 |
+
inputs=[current_data, filter_col, filter_condition, filter_value],
|
912 |
+
outputs=[filter_status]
|
913 |
+
).then(
|
914 |
+
lambda: processor.current_df,
|
915 |
+
outputs=[current_data]
|
916 |
+
).then(
|
917 |
+
lambda df: get_data_info(df),
|
918 |
+
inputs=[current_data],
|
919 |
+
outputs=[data_info_display]
|
920 |
+
)
|
921 |
+
|
922 |
+
# ===== COLUMN OPERATIONS TAB =====
|
923 |
+
with gr.TabItem("βοΈ Column Operations"):
|
924 |
+
gr.HTML("<h3>Perform operations on columns</h3>")
|
925 |
+
|
926 |
+
with gr.Row():
|
927 |
+
with gr.Column():
|
928 |
+
op_type = gr.Dropdown(
|
929 |
+
choices=["add", "subtract", "multiply", "divide", "concatenate",
|
930 |
+
"extract_numbers", "upper", "lower", "title", "length"],
|
931 |
+
value="add",
|
932 |
+
label="Operation"
|
933 |
+
)
|
934 |
+
op_col1 = gr.Dropdown(label="Primary Column", choices=[], interactive=True)
|
935 |
+
op_col2 = gr.Dropdown(label="Second Column (optional)", choices=[], interactive=True)
|
936 |
+
op_constant = gr.Textbox(label="Constant Value (optional)")
|
937 |
+
op_new_name = gr.Textbox(label="New Column Name")
|
938 |
+
op_btn = gr.Button("βοΈ Execute Operation", variant="primary")
|
939 |
+
|
940 |
+
with gr.Column():
|
941 |
+
op_status = gr.Textbox(label="Status", lines=5, interactive=False)
|
942 |
+
|
943 |
+
# Data type conversion
|
944 |
+
gr.HTML("<hr><h4>Data Type Conversion</h4>")
|
945 |
+
convert_col = gr.Dropdown(label="Column", choices=[], interactive=True)
|
946 |
+
convert_type = gr.Dropdown(
|
947 |
+
choices=["string", "integer", "float", "datetime", "boolean", "category"],
|
948 |
+
value="string",
|
949 |
+
label="Target Type"
|
950 |
+
)
|
951 |
+
convert_btn = gr.Button("π Convert Type", variant="secondary")
|
952 |
+
convert_status = gr.Textbox(label="Conversion Status", lines=2, interactive=False)
|
953 |
+
|
954 |
+
# Update column choices
|
955 |
+
current_data.change(
|
956 |
+
lambda df: (
|
957 |
+
gr.Dropdown(choices=list(df.columns) if df is not None else []),
|
958 |
+
gr.Dropdown(choices=list(df.columns) if df is not None else []),
|
959 |
+
gr.Dropdown(choices=list(df.columns) if df is not None else [])
|
960 |
+
),
|
961 |
+
inputs=[current_data],
|
962 |
+
outputs=[op_col1, op_col2, convert_col]
|
963 |
+
)
|
964 |
+
|
965 |
+
# Event handlers
|
966 |
+
def operation_handler(df, op, col1, col2, const, new_name):
|
967 |
+
if df is None:
|
968 |
+
return None, "β No data loaded", ""
|
969 |
+
|
970 |
+
result_df, status = perform_column_operations(df, op, col1, col2, new_name, const)
|
971 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
972 |
+
|
973 |
+
return result_df, status, info_text
|
974 |
+
|
975 |
+
op_btn.click(
|
976 |
+
operation_handler,
|
977 |
+
inputs=[current_data, op_type, op_col1, op_col2, op_constant, op_new_name],
|
978 |
+
outputs=[current_data, op_status, data_info_display]
|
979 |
+
)
|
980 |
+
|
981 |
+
def convert_handler(df, col, target_type):
|
982 |
+
if df is None:
|
983 |
+
return None, "β No data loaded", ""
|
984 |
+
|
985 |
+
result_df, status = convert_data_types(df, col, target_type)
|
986 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
987 |
+
|
988 |
+
return result_df, status, info_text
|
989 |
+
|
990 |
+
convert_btn.click(
|
991 |
+
convert_handler,
|
992 |
+
inputs=[current_data, convert_col, convert_type],
|
993 |
+
outputs=[current_data, convert_status, data_info_display]
|
994 |
+
)
|
995 |
+
|
996 |
+
# ===== STATISTICS TAB =====
|
997 |
+
with gr.TabItem("π Statistics & Analysis"):
|
998 |
+
gr.HTML("<h3>Statistical analysis and insights</h3>")
|
999 |
+
|
1000 |
+
with gr.Row():
|
1001 |
+
with gr.Column():
|
1002 |
+
stats_btn = gr.Button("π Generate Statistical Summary", variant="primary")
|
1003 |
+
corr_btn = gr.Button("π Create Correlation Matrix", variant="secondary")
|
1004 |
+
|
1005 |
+
# Distribution plots
|
1006 |
+
gr.HTML("<hr><h4>Distribution Analysis</h4>")
|
1007 |
+
dist_col = gr.Dropdown(label="Column", choices=[], interactive=True)
|
1008 |
+
plot_type = gr.Radio(choices=["histogram", "density"], value="histogram", label="Plot Type")
|
1009 |
+
dist_btn = gr.Button("π Create Distribution Plot", variant="secondary")
|
1010 |
+
|
1011 |
+
with gr.Column():
|
1012 |
+
stats_status = gr.Textbox(label="Status", lines=3, interactive=False)
|
1013 |
+
plot_output = gr.Image(label="Visualization")
|
1014 |
+
|
1015 |
+
stats_results = gr.DataFrame(label="Statistical Summary", height=400)
|
1016 |
+
|
1017 |
+
# Update column choices
|
1018 |
+
current_data.change(
|
1019 |
+
lambda df: gr.Dropdown(choices=list(df.select_dtypes(include=[np.number]).columns) if df is not None else []),
|
1020 |
+
inputs=[current_data],
|
1021 |
+
outputs=[dist_col]
|
1022 |
+
)
|
1023 |
+
|
1024 |
+
# Event handlers
|
1025 |
+
stats_btn.click(
|
1026 |
+
lambda df: generate_statistical_summary(df) if df is not None else (None, "β No data"),
|
1027 |
+
inputs=[current_data],
|
1028 |
+
outputs=[stats_results, stats_status]
|
1029 |
+
)
|
1030 |
+
|
1031 |
+
corr_btn.click(
|
1032 |
+
lambda df: create_correlation_matrix(df) if df is not None else (None, "β No data"),
|
1033 |
+
inputs=[current_data],
|
1034 |
+
outputs=[plot_output, stats_status]
|
1035 |
+
)
|
1036 |
+
|
1037 |
+
dist_btn.click(
|
1038 |
+
lambda df, col, ptype: create_distribution_plots(df, col, ptype) if df is not None else (None, "β No data"),
|
1039 |
+
inputs=[current_data, dist_col, plot_type],
|
1040 |
+
outputs=[plot_output, stats_status]
|
1041 |
+
)
|
1042 |
+
|
1043 |
+
# ===== EXPORT TAB =====
|
1044 |
+
with gr.TabItem("πΎ Export & Download"):
|
1045 |
+
gr.HTML("<h3>Export your processed data</h3>")
|
1046 |
+
|
1047 |
+
with gr.Row():
|
1048 |
+
with gr.Column():
|
1049 |
+
export_format = gr.Radio(
|
1050 |
+
choices=["csv", "excel", "json"],
|
1051 |
+
value="csv",
|
1052 |
+
label="Export Format"
|
1053 |
+
)
|
1054 |
+
export_filename = gr.Textbox(
|
1055 |
+
label="Filename (without extension)",
|
1056 |
+
value="processed_data",
|
1057 |
+
placeholder="Enter filename"
|
1058 |
+
)
|
1059 |
+
export_btn = gr.Button("πΎ Create Download File", variant="primary", size="lg")
|
1060 |
+
|
1061 |
+
with gr.Column():
|
1062 |
+
export_status = gr.Textbox(label="Status", lines=3, interactive=False)
|
1063 |
+
download_file = gr.File(label="Download", visible=False)
|
1064 |
+
|
1065 |
+
# History and Undo/Redo
|
1066 |
+
with gr.Row():
|
1067 |
+
with gr.Column():
|
1068 |
+
gr.HTML("<hr><h4>History & Undo Operations</h4>")
|
1069 |
+
undo_btn = gr.Button("βΆ Undo Last Operation", variant="secondary")
|
1070 |
+
reset_btn = gr.Button("π Reset to Original", variant="secondary")
|
1071 |
+
|
1072 |
+
with gr.Column():
|
1073 |
+
history_status = gr.Textbox(label="History Status", lines=3, interactive=False)
|
1074 |
+
|
1075 |
+
def export_handler(df, fmt, filename):
|
1076 |
+
if df is None:
|
1077 |
+
return None, "β No data to export", gr.File(visible=False)
|
1078 |
+
|
1079 |
+
try:
|
1080 |
+
file_data, file_name = create_download_file(df, fmt, filename)
|
1081 |
+
|
1082 |
+
# Save file temporarily
|
1083 |
+
with open(file_name, 'wb' if fmt == 'excel' else 'w', encoding=None if fmt == 'excel' else 'utf-8') as f:
|
1084 |
+
if fmt == 'excel':
|
1085 |
+
f.write(file_data)
|
1086 |
+
else:
|
1087 |
+
f.write(file_data)
|
1088 |
+
|
1089 |
+
return file_name, f"β
File created successfully: {file_name}", gr.File(value=file_name, visible=True)
|
1090 |
+
|
1091 |
+
except Exception as e:
|
1092 |
+
return None, f"β Export error: {str(e)}", gr.File(visible=False)
|
1093 |
+
|
1094 |
+
export_btn.click(
|
1095 |
+
export_handler,
|
1096 |
+
inputs=[current_data, export_format, export_filename],
|
1097 |
+
outputs=[download_file, export_status, download_file]
|
1098 |
+
)
|
1099 |
+
|
1100 |
+
def undo_handler():
|
1101 |
+
result_df, status = processor.undo_operation()
|
1102 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
1103 |
+
return result_df, status, info_text
|
1104 |
+
|
1105 |
+
def reset_handler():
|
1106 |
+
result_df, status = processor.reset_to_original()
|
1107 |
+
info_text = get_data_info(result_df) if result_df is not None else ""
|
1108 |
+
return result_df, status, info_text
|
1109 |
+
|
1110 |
+
undo_btn.click(
|
1111 |
+
undo_handler,
|
1112 |
+
outputs=[current_data, history_status, data_info_display]
|
1113 |
+
)
|
1114 |
+
|
1115 |
+
reset_btn.click(
|
1116 |
+
reset_handler,
|
1117 |
+
outputs=[current_data, history_status, data_info_display]
|
1118 |
+
)
|
1119 |
+
|
1120 |
+
# Footer
|
1121 |
+
gr.HTML("""
|
1122 |
+
<div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #ddd;">
|
1123 |
+
<p style="color: #666; font-size: 14px;">
|
1124 |
+
π <strong>Advanced CSV Manipulation Tool</strong> |
|
1125 |
+
Commercial-ready data processing with enterprise features |
|
1126 |
+
Built with Gradio & Python
|
1127 |
+
</p>
|
1128 |
+
</div>
|
1129 |
+
""")
|
1130 |
+
|
1131 |
+
return demo
|
1132 |
+
|
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        share=True,
        inbrowser=True,
        server_name="0.0.0.0",
        server_port=7860,
        max_file_size="1gb"
    )