limitedonly41 committed on
Commit 67fa2c6 · verified · 1 Parent(s): dd03f94

Upload csv-manipulator.py

Files changed (1)
  1. csv-manipulator.py +1142 -0
csv-manipulator.py ADDED
@@ -0,0 +1,1142 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Advanced CSV Manipulation Tool with Gradio Interface
4
+ Commercial-ready application for powerful CSV data processing
5
+
6
+ Features:
7
+ - File upload with 1GB limit
8
+ - Data preview with selectable rows
9
+ - Value replacement based on conditions
10
+ - CSV concatenation with column selection
11
+ - Advanced statistical analysis and visualization
12
+ - Data validation and quality checks
13
+ - Export to CSV, Excel, JSON
14
+ - Batch operations and operation recipes
15
+ - Undo/Redo functionality
16
+ - Memory-efficient large file processing
17
+ """
18
+
19
+ import gradio as gr
20
+ import pandas as pd
21
+ import numpy as np
22
+ import json
23
+ import io
24
+ import zipfile
25
+ from datetime import datetime, timedelta
26
+ import re
27
+ import matplotlib.pyplot as plt
28
+ import seaborn as sns
29
+ import plotly.express as px
30
+ import plotly.graph_objects as go
31
+ from plotly.subplots import make_subplots
32
+ import warnings
33
+ import os
34
+ from typing import Dict, List, Tuple, Optional, Any
35
+ import hashlib
36
+ import pickle
37
+ from pathlib import Path
38
+
39
+ warnings.filterwarnings('ignore')
40
+ plt.style.use('seaborn-v0_8')
41
+ sns.set_palette("husl")
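+ # Note: besides the imports above, Excel export uses openpyxl (via ExcelWriter below), .parquet loading
+ # needs a parquet engine such as pyarrow, and the density plots rely on scipy for KDE.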
42
+
43
+ class CSVProcessor:
44
+ """Advanced CSV processing class with state management and history"""
45
+
46
+ def __init__(self):
47
+ self.original_df = None
48
+ self.current_df = None
49
+ self.history = []
50
+ self.recipes = {}
51
+ self.batch_files = []
52
+
53
+ def load_data(self, file, preview_rows=100, encoding='utf-8'):
54
+ """Load data file with error handling and memory optimization"""
55
+ try:
56
+ if file is None:
+ return None, "No file provided", {}
58
+
59
+ file_path = file.name if hasattr(file, 'name') else str(file)
60
+ file_extension = Path(file_path).suffix.lower()
61
+
62
+ # Chunked reading for large files
63
+ if file_extension == '.csv':
64
+ # Try different encodings
65
+ encodings = [encoding, 'utf-8', 'latin-1', 'cp1252']
66
+ df = None
67
+ for enc in encodings:
68
+ try:
69
+ df = pd.read_csv(file_path, encoding=enc, low_memory=False)
70
+ break
71
+ except UnicodeDecodeError:
72
+ continue
73
+ if df is None:
+ return None, "Failed to decode file with supported encodings", {}
75
+
76
+ elif file_extension in ['.xlsx', '.xls']:
77
+ df = pd.read_excel(file_path)
78
+ elif file_extension == '.json':
79
+ df = pd.read_json(file_path)
80
+ elif file_extension == '.parquet':
81
+ df = pd.read_parquet(file_path)
82
+ else:
83
+ return None, f"Unsupported file format: {file_extension}"
84
+
85
+ self.original_df = df.copy()
86
+ self.current_df = df.copy()
87
+ self.history = []
88
+
89
+ # Create preview
90
+ if preview_rows > 0:
91
+ preview = df.head(preview_rows)
92
+ else:
93
+ preview = df
94
+
95
+ # Memory and performance info
96
+ memory_mb = df.memory_usage(deep=True).sum() / 1024**2
97
+ info = {
98
+ 'rows': len(df),
99
+ 'columns': len(df.columns),
100
+ 'memory_usage': f"{memory_mb:.2f} MB",
101
+ 'dtypes': dict(df.dtypes.astype(str)),
102
+ 'null_counts': dict(df.isnull().sum()),
103
+ 'duplicates': df.duplicated().sum()
104
+ }
105
+
106
+ success_msg = f"βœ… File loaded successfully!\n"
107
+ success_msg += f"πŸ“Š {info['rows']:,} rows Γ— {info['columns']} columns\n"
108
+ success_msg += f"πŸ’Ύ Memory usage: {info['memory_usage']}\n"
109
+ success_msg += f"πŸ”„ Duplicates: {info['duplicates']:,}\n"
110
+ success_msg += f"❌ Missing values: {sum(info['null_counts'].values()):,}"
111
+
112
+ return preview, success_msg, info
113
+
114
+ except Exception as e:
115
+ return None, f"❌ Error loading file: {str(e)}", {}
116
+
117
+ def save_state(self, operation_name: str):
118
+ """Save current state to history with memory management"""
119
+ if len(self.history) > 50: # Limit history to prevent memory issues
120
+ self.history = self.history[-25:] # Keep last 25 operations
121
+
122
+ self.history.append({
123
+ 'operation': operation_name,
124
+ 'timestamp': datetime.now(),
125
+ 'df': self.current_df.copy() if self.current_df is not None else None
126
+ })
127
+
128
+ def undo_operation(self):
129
+ """Undo last operation"""
130
+ if len(self.history) > 1:
131
+ undone = self.history.pop()
+ self.current_df = self.history[-1]['df'].copy()
+ return self.current_df, f"✅ Undone: {undone['operation']}"
134
+ elif len(self.history) == 1:
135
+ self.current_df = self.original_df.copy()
136
+ self.history = []
137
+ return self.current_df, "βœ… Reset to original data"
138
+ else:
139
+ return self.current_df, "❌ No operations to undo"
140
+
141
+ def reset_to_original(self):
142
+ """Reset to original data"""
143
+ if self.original_df is not None:
144
+ self.current_df = self.original_df.copy()
145
+ self.history = []
146
+ return self.current_df, "βœ… Reset to original data"
147
+ return None, "❌ No original data available"
148
+
149
+ # Global processor instance
150
+ processor = CSVProcessor()
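+ # Minimal headless usage sketch (assumes a local "data.csv"; the Gradio UI below drives this same shared object):
+ #   preview, msg, info = processor.load_data("data.csv")
+ #   df, msg = processor.undo_operation()  # steps back one saved state, or reports that there is nothing to undo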
151
+
152
+ def create_download_file(df: pd.DataFrame, format_type: str, filename: str = "processed_data"):
153
+ """Create downloadable file in specified format"""
154
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
155
+ filename_with_timestamp = f"{filename}_{timestamp}"
156
+
157
+ try:
158
+ if format_type == "csv":
159
+ csv_data = df.to_csv(index=False)
160
+ return csv_data, f"{filename_with_timestamp}.csv"
161
+ elif format_type == "excel":
162
+ buffer = io.BytesIO()
163
+ with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
164
+ df.to_excel(writer, index=False, sheet_name='Data')
165
+ buffer.seek(0)
166
+ return buffer.getvalue(), f"{filename_with_timestamp}.xlsx"
167
+ elif format_type == "json":
168
+ json_data = df.to_json(orient='records', indent=2, date_format='iso')
169
+ return json_data, f"{filename_with_timestamp}.json"
+ else:
+ return None, f"Unsupported export format: {format_type}"
+ except Exception as e:
171
+ return None, f"Error creating {format_type} file: {str(e)}"
172
+
173
+ def get_data_info(df: pd.DataFrame) -> str:
174
+ """Get comprehensive data information"""
175
+ if df is None or df.empty:
176
+ return "No data loaded"
177
+
178
+ info_dict = {
179
+ 'πŸ“Š Shape': f"{df.shape[0]:,} rows Γ— {df.shape[1]} columns",
180
+ 'πŸ’Ύ Memory': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
181
+ 'πŸ”„ Duplicates': f"{df.duplicated().sum():,}",
182
+ '❌ Missing Values': f"{df.isnull().sum().sum():,}",
183
+ 'πŸ“ˆ Numeric Columns': f"{len(df.select_dtypes(include=[np.number]).columns)}",
184
+ 'πŸ“ Text Columns': f"{len(df.select_dtypes(include=['object']).columns)}",
185
+ 'πŸ“… Date Columns': f"{len(df.select_dtypes(include=['datetime64']).columns)}"
186
+ }
187
+
188
+ return "\n".join([f"{k}: {v}" for k, v in info_dict.items()])
189
+
190
+ def get_column_options(df: pd.DataFrame) -> List[str]:
191
+ """Get list of column names for dropdowns"""
192
+ return list(df.columns) if df is not None else []
193
+
194
+ # ===========================================
195
+ # CORE DATA MANIPULATION FUNCTIONS
196
+ # ===========================================
197
+
198
+ def rename_values_conditional(df: pd.DataFrame, target_col: str, condition_col: str,
199
+ condition_value: str, new_value: str, match_type: str = "exact") -> Tuple[pd.DataFrame, str]:
200
+ """Rename values in target column based on condition in another column"""
201
+ try:
202
+ if df is None or df.empty:
203
+ return df, "❌ No data available"
204
+
205
+ if target_col not in df.columns or condition_col not in df.columns:
206
+ return df, "❌ One or more columns not found"
207
+
208
+ df_result = df.copy()
209
+
210
+ if match_type == "exact":
211
+ mask = df_result[condition_col] == condition_value
212
+ elif match_type == "contains":
213
+ mask = df_result[condition_col].astype(str).str.contains(condition_value, na=False)
214
+ elif match_type == "regex":
215
+ mask = df_result[condition_col].astype(str).str.match(condition_value, na=False)
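+ # str.match anchors the pattern at the start of each value; a search anywhere in the string would need
+ # .str.contains(condition_value, regex=True, na=False) instead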
216
+ elif match_type == "starts_with":
217
+ mask = df_result[condition_col].astype(str).str.startswith(condition_value, na=False)
218
+ elif match_type == "ends_with":
+ mask = df_result[condition_col].astype(str).str.endswith(condition_value, na=False)
+ else:
+ return df, f"❌ Unknown match type: {match_type}"
+
221
+ affected_rows = mask.sum()
222
+ df_result.loc[mask, target_col] = new_value
223
+
224
+ processor.current_df = df_result
225
+ processor.save_state(f"Renamed values in '{target_col}' based on '{condition_col}'")
226
+
227
+ return df_result, f"βœ… Updated {affected_rows:,} rows in column '{target_col}'"
228
+
229
+ except Exception as e:
230
+ return df, f"❌ Error: {str(e)}"
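+ # Illustrative call (column names are made up): rename_values_conditional(df, "status", "country", "US", "domestic")
+ # sets status to "domestic" on every row whose country equals "US".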
231
+
232
+ def concatenate_csvs(files: List, selected_columns: str, join_type: str = "outer") -> Tuple[pd.DataFrame, str]:
233
+ """Concatenate multiple CSV files with column selection"""
234
+ try:
235
+ if not files:
236
+ return None, "❌ No files provided"
237
+
238
+ dfs = []
239
+ columns_to_use = [col.strip() for col in selected_columns.split(",") if col.strip()] if selected_columns else None
240
+
241
+ for file in files:
242
+ if hasattr(file, 'name'):
243
+ file_path = file.name
244
+ if file_path.endswith('.csv'):
245
+ df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
246
+ elif file_path.endswith(('.xlsx', '.xls')):
247
+ df = pd.read_excel(file_path)
248
+ else:
249
+ continue
250
+
251
+ # Select specific columns if specified
252
+ if columns_to_use:
253
+ available_cols = [col for col in columns_to_use if col in df.columns]
254
+ if available_cols:
255
+ df = df[available_cols]
256
+ else:
257
+ continue
258
+
259
+ # Add source file identifier
260
+ df['_source_file'] = Path(file_path).stem
261
+ dfs.append(df)
262
+
263
+ if not dfs:
264
+ return None, "❌ No valid files found or columns don't exist"
265
+
266
+ # Concatenate with specified join type
267
+ if join_type == "inner":
268
+ result_df = pd.concat(dfs, ignore_index=True, join='inner')
269
+ else:
270
+ result_df = pd.concat(dfs, ignore_index=True, join='outer')
271
+
272
+ processor.current_df = result_df
273
+ processor.save_state(f"Concatenated {len(dfs)} files")
274
+
275
+ return result_df, f"βœ… Successfully concatenated {len(dfs)} files with {len(result_df):,} total rows"
276
+
277
+ except Exception as e:
278
+ return None, f"❌ Error concatenating files: {str(e)}"
279
+
280
+ def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 20, normalize: bool = False) -> Tuple[pd.DataFrame, str]:
281
+ """Get value counts for specified column"""
282
+ try:
283
+ if df is None or df.empty:
284
+ return None, "❌ No data available"
285
+
286
+ if column not in df.columns:
287
+ return None, f"❌ Column '{column}' not found"
288
+
289
+ value_counts = df[column].value_counts(normalize=normalize, dropna=False).head(top_n)
290
+
291
+ # Convert to DataFrame for better display
292
+ result_df = pd.DataFrame({
293
+ 'Value': value_counts.index,
294
+ 'Count' if not normalize else 'Percentage': value_counts.values
295
+ })
296
+
297
+ if normalize:
298
+ result_df['Percentage'] = result_df['Percentage'].map(lambda x: f"{x:.2%}")
299
+
300
+ return result_df, f"βœ… Value counts for '{column}' (Top {min(top_n, len(result_df))})"
301
+
302
+ except Exception as e:
303
+ return None, f"❌ Error: {str(e)}"
304
+
305
+ def filter_data(df: pd.DataFrame, column: str, condition: str, value: str) -> Tuple[pd.DataFrame, str]:
306
+ """Filter data based on conditions"""
307
+ try:
308
+ if df is None or df.empty:
309
+ return df, "❌ No data available"
310
+
311
+ if column not in df.columns:
312
+ return df, f"❌ Column '{column}' not found"
313
+
314
+ df_result = df.copy()
315
+
316
+ if condition == "equals":
317
+ mask = df_result[column] == value
318
+ elif condition == "not_equals":
319
+ mask = df_result[column] != value
320
+ elif condition == "contains":
321
+ mask = df_result[column].astype(str).str.contains(value, na=False)
322
+ elif condition == "not_contains":
323
+ mask = ~df_result[column].astype(str).str.contains(value, na=False)
324
+ elif condition == "starts_with":
325
+ mask = df_result[column].astype(str).str.startswith(value, na=False)
326
+ elif condition == "ends_with":
327
+ mask = df_result[column].astype(str).str.endswith(value, na=False)
328
+ elif condition == "greater_than":
329
+ mask = pd.to_numeric(df_result[column], errors='coerce') > float(value)
330
+ elif condition == "less_than":
331
+ mask = pd.to_numeric(df_result[column], errors='coerce') < float(value)
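+ # non-numeric entries become NaN under errors='coerce', and NaN comparisons are False, so such rows are dropped by these two filters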
332
+ elif condition == "is_null":
333
+ mask = df_result[column].isnull()
334
+ elif condition == "is_not_null":
335
+ mask = df_result[column].notnull()
336
+ else:
337
+ return df, f"❌ Unknown condition: {condition}"
338
+
339
+ filtered_df = df_result[mask]
340
+
341
+ processor.current_df = filtered_df
342
+ processor.save_state(f"Filtered data: {column} {condition} {value}")
343
+
344
+ return filtered_df, f"βœ… Filtered to {len(filtered_df):,} rows (removed {len(df) - len(filtered_df):,} rows)"
345
+
346
+ except Exception as e:
347
+ return df, f"❌ Error: {str(e)}"
348
+
349
+ def handle_missing_values(df: pd.DataFrame, column: str, method: str, fill_value: str = "") -> Tuple[pd.DataFrame, str]:
350
+ """Handle missing values in specified column"""
351
+ try:
352
+ if df is None or df.empty:
353
+ return df, "❌ No data available"
354
+
355
+ if column != "ALL" and column not in df.columns:
356
+ return df, f"❌ Column '{column}' not found"
357
+
358
+ df_result = df.copy()
359
+ columns_to_process = [column] if column != "ALL" else df_result.columns.tolist()
360
+
361
+ total_missing_before = df_result.isnull().sum().sum()
362
+
363
+ for col in columns_to_process:
364
+ if method == "drop_rows":
365
+ df_result = df_result.dropna(subset=[col])
366
+ elif method == "fill_value":
367
+ df_result[col] = df_result[col].fillna(fill_value)
368
+ elif method == "fill_mean":
369
+ if df_result[col].dtype in ['int64', 'float64']:
370
+ df_result[col] = df_result[col].fillna(df_result[col].mean())
371
+ elif method == "fill_median":
372
+ if df_result[col].dtype in ['int64', 'float64']:
373
+ df_result[col] = df_result[col].fillna(df_result[col].median())
374
+ elif method == "fill_mode":
375
+ mode_val = df_result[col].mode()
376
+ if len(mode_val) > 0:
377
+ df_result[col] = df_result[col].fillna(mode_val[0])
378
+ elif method == "forward_fill":
+ df_result[col] = df_result[col].ffill()
+ elif method == "backward_fill":
+ df_result[col] = df_result[col].bfill()
382
+
383
+ total_missing_after = df_result.isnull().sum().sum()
384
+
385
+ processor.current_df = df_result
386
+ processor.save_state(f"Handle missing values: {method}")
387
+
388
+ return df_result, f"βœ… Processed missing values. Before: {total_missing_before:,}, After: {total_missing_after:,}"
389
+
390
+ except Exception as e:
391
+ return df, f"❌ Error: {str(e)}"
392
+
393
+ def detect_and_remove_duplicates(df: pd.DataFrame, columns: str = "", keep: str = "first") -> Tuple[pd.DataFrame, str]:
394
+ """Detect and remove duplicate rows"""
395
+ try:
396
+ if df is None or df.empty:
397
+ return df, "❌ No data available"
398
+
399
+ df_result = df.copy()
400
+
401
+ # Parse columns
402
+ if columns.strip():
403
+ cols_list = [col.strip() for col in columns.split(",") if col.strip() in df.columns]
404
+ subset = cols_list if cols_list else None
405
+ else:
406
+ subset = None
407
+
408
+ duplicates_before = df_result.duplicated(subset=subset).sum()
409
+
410
+ if duplicates_before == 0:
411
+ return df_result, "βœ… No duplicate rows found"
412
+
413
+ # the UI exposes keep as the strings "first"/"last"/"false"; drop_duplicates expects the boolean False for the last case
+ keep_arg = False if str(keep).lower() == "false" else keep
+ df_result = df_result.drop_duplicates(subset=subset, keep=keep_arg)
414
+
415
+ processor.current_df = df_result
416
+ processor.save_state(f"Removed {duplicates_before:,} duplicate rows")
417
+
418
+ return df_result, f"βœ… Removed {duplicates_before:,} duplicate rows. Remaining: {len(df_result):,} rows"
419
+
420
+ except Exception as e:
421
+ return df, f"❌ Error: {str(e)}"
422
+
423
+ def perform_column_operations(df: pd.DataFrame, operation: str, col1: str, col2: str = "",
424
+ new_col_name: str = "", constant: str = "") -> Tuple[pd.DataFrame, str]:
425
+ """Perform mathematical and string operations on columns"""
426
+ try:
427
+ if df is None or df.empty:
428
+ return df, "❌ No data available"
429
+
430
+ if col1 not in df.columns:
431
+ return df, f"❌ Column '{col1}' not found"
432
+
433
+ df_result = df.copy()
434
+
435
+ if not new_col_name:
436
+ new_col_name = f"{col1}_{operation}"
437
+
438
+ if operation == "add":
439
+ if col2 and col2 in df.columns:
440
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') + pd.to_numeric(df_result[col2], errors='coerce')
441
+ elif constant:
442
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') + float(constant)
443
+
444
+ elif operation == "subtract":
445
+ if col2 and col2 in df.columns:
446
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') - pd.to_numeric(df_result[col2], errors='coerce')
447
+ elif constant:
448
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') - float(constant)
449
+
450
+ elif operation == "multiply":
451
+ if col2 and col2 in df.columns:
452
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') * pd.to_numeric(df_result[col2], errors='coerce')
453
+ elif constant:
454
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') * float(constant)
455
+
456
+ elif operation == "divide":
457
+ if col2 and col2 in df.columns:
458
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') / pd.to_numeric(df_result[col2], errors='coerce')
459
+ elif constant:
460
+ df_result[new_col_name] = pd.to_numeric(df_result[col1], errors='coerce') / float(constant)
461
+
462
+ elif operation == "concatenate":
463
+ if col2 and col2 in df.columns:
464
+ df_result[new_col_name] = df_result[col1].astype(str) + " " + df_result[col2].astype(str)
465
+ elif constant:
466
+ df_result[new_col_name] = df_result[col1].astype(str) + constant
467
+
468
+ elif operation == "extract_numbers":
469
+ df_result[new_col_name] = df_result[col1].astype(str).str.extract(r'(\d+)')[0]
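+ # captures only the first run of digits in each value and keeps it as a string (NaN where no digits are found)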
470
+
471
+ elif operation == "upper":
472
+ df_result[new_col_name] = df_result[col1].astype(str).str.upper()
473
+
474
+ elif operation == "lower":
475
+ df_result[new_col_name] = df_result[col1].astype(str).str.lower()
476
+
477
+ elif operation == "title":
478
+ df_result[new_col_name] = df_result[col1].astype(str).str.title()
479
+
480
+ elif operation == "length":
481
+ df_result[new_col_name] = df_result[col1].astype(str).str.len()
482
+
483
+ else:
484
+ return df, f"❌ Unknown operation: {operation}"
485
+
486
+ processor.current_df = df_result
487
+ processor.save_state(f"Column operation: {operation} on {col1}")
488
+
489
+ return df_result, f"βœ… Created new column '{new_col_name}' using {operation} operation"
490
+
491
+ except Exception as e:
492
+ return df, f"❌ Error: {str(e)}"
493
+
494
+ def convert_data_types(df: pd.DataFrame, column: str, target_type: str) -> Tuple[pd.DataFrame, str]:
495
+ """Convert column data types"""
496
+ try:
497
+ if df is None or df.empty:
498
+ return df, "❌ No data available"
499
+
500
+ if column not in df.columns:
501
+ return df, f"❌ Column '{column}' not found"
502
+
503
+ df_result = df.copy()
504
+
505
+ if target_type == "string":
506
+ df_result[column] = df_result[column].astype(str)
507
+ elif target_type == "integer":
508
+ df_result[column] = pd.to_numeric(df_result[column], errors='coerce').astype('Int64')
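+ # the nullable 'Int64' dtype lets unparseable entries stay as <NA> instead of breaking the integer cast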
509
+ elif target_type == "float":
510
+ df_result[column] = pd.to_numeric(df_result[column], errors='coerce')
511
+ elif target_type == "datetime":
512
+ df_result[column] = pd.to_datetime(df_result[column], errors='coerce')
513
+ elif target_type == "boolean":
514
+ df_result[column] = df_result[column].astype(bool)
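+ # caution: astype(bool) marks any non-empty string (even "False") and NaN as True; textual true/false values would need an explicit mapping first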
515
+ elif target_type == "category":
516
+ df_result[column] = df_result[column].astype('category')
517
+ else:
518
+ return df, f"❌ Unknown data type: {target_type}"
519
+
520
+ processor.current_df = df_result
521
+ processor.save_state(f"Converted '{column}' to {target_type}")
522
+
523
+ return df_result, f"βœ… Converted column '{column}' to {target_type}"
524
+
525
+ except Exception as e:
526
+ return df, f"❌ Error: {str(e)}"
527
+
528
+ # ===========================================
529
+ # ANALYSIS AND VISUALIZATION FUNCTIONS
530
+ # ===========================================
531
+
532
+ def generate_statistical_summary(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
533
+ """Generate comprehensive statistical summary"""
534
+ try:
535
+ if df is None or df.empty:
536
+ return None, "❌ No data available"
537
+
538
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
539
+
540
+ if len(numeric_cols) == 0:
541
+ return None, "❌ No numeric columns found"
542
+
543
+ stats_df = df[numeric_cols].describe()
544
+
545
+ # Add additional statistics
546
+ stats_df.loc['variance'] = df[numeric_cols].var()
547
+ stats_df.loc['skewness'] = df[numeric_cols].skew()
548
+ stats_df.loc['kurtosis'] = df[numeric_cols].kurtosis()
549
+ stats_df.loc['missing'] = df[numeric_cols].isnull().sum()
550
+
551
+ return stats_df.round(4), "βœ… Statistical summary generated"
552
+
553
+ except Exception as e:
554
+ return None, f"❌ Error: {str(e)}"
555
+
556
+ def create_correlation_matrix(df: pd.DataFrame) -> Tuple[str, str]:
557
+ """Create correlation matrix visualization"""
558
+ try:
559
+ if df is None or df.empty:
560
+ return None, "❌ No data available"
561
+
562
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
563
+
564
+ if len(numeric_cols) < 2:
565
+ return None, "❌ Need at least 2 numeric columns for correlation"
566
+
567
+ # Calculate correlation matrix
568
+ corr_matrix = df[numeric_cols].corr()
569
+
570
+ # Create heatmap
571
+ plt.figure(figsize=(12, 8))
572
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
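+ # hide the redundant upper triangle so each column pair appears only once in the heatmap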
573
+ sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
574
+ square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
575
+ plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
576
+ plt.tight_layout()
577
+
578
+ # Save plot
579
+ plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
580
+ plt.close()
581
+
582
+ return 'correlation_matrix.png', "βœ… Correlation matrix created"
583
+
584
+ except Exception as e:
585
+ return None, f"❌ Error: {str(e)}"
586
+
587
+ def create_distribution_plots(df: pd.DataFrame, column: str, plot_type: str = "histogram") -> Tuple[str, str]:
588
+ """Create distribution plots"""
589
+ try:
590
+ if df is None or df.empty:
591
+ return None, "❌ No data available"
592
+
593
+ if column not in df.columns:
594
+ return None, f"❌ Column '{column}' not found"
595
+
596
+ plt.figure(figsize=(12, 6))
597
+
598
+ if plot_type == "histogram":
599
+ plt.subplot(1, 2, 1)
600
+ df[column].hist(bins=30, edgecolor='black', alpha=0.7)
601
+ plt.title(f'Histogram of {column}')
602
+ plt.xlabel(column)
603
+ plt.ylabel('Frequency')
604
+
605
+ plt.subplot(1, 2, 2)
606
+ df.boxplot(column=column)
607
+ plt.title(f'Box Plot of {column}')
608
+
609
+ elif plot_type == "density":
610
+ plt.subplot(1, 2, 1)
611
+ df[column].plot(kind='density')
612
+ plt.title(f'Density Plot of {column}')
613
+ plt.xlabel(column)
614
+
615
+ plt.subplot(1, 2, 2)
616
+ df[column].plot(kind='box')
617
+ plt.title(f'Box Plot of {column}')
618
+
619
+ plt.tight_layout()
620
+ plt.savefig(f'distribution_{column}_{plot_type}.png', dpi=300, bbox_inches='tight')
621
+ plt.close()
622
+
623
+ return f'distribution_{column}_{plot_type}.png', f"βœ… Distribution plot created for {column}"
624
+
625
+ except Exception as e:
626
+ return None, f"❌ Error: {str(e)}"
627
+
628
+ # ===========================================
629
+ # GRADIO INTERFACE SETUP
630
+ # ===========================================
631
+
632
+ def create_interface():
633
+ """Create the main Gradio interface"""
634
+
635
+ with gr.Blocks(title="Advanced CSV Manipulation Tool", theme=gr.themes.Soft()) as demo:
636
+
637
+ gr.HTML("""
638
+ <div style="text-align: center; padding: 20px;">
639
+ <h1 style="color: #2e7d32; margin-bottom: 10px;">πŸ”₯ Advanced CSV Manipulation Tool</h1>
640
+ <p style="font-size: 18px; color: #666;">Commercial-ready data processing with advanced analytics</p>
641
+ <hr style="margin: 20px 0;">
642
+ </div>
643
+ """)
644
+
645
+ # Global state variables
646
+ current_data = gr.State(None)
647
+ data_info = gr.State({})
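+ # Note: these gr.State values are per-session, while the module-level `processor` is a single shared
+ # instance, so its history and current_df are shared across concurrent users.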
648
+
649
+ with gr.Tabs():
650
+
651
+ # ===== FILE UPLOAD TAB =====
652
+ with gr.TabItem("πŸ“ File Upload & Preview"):
653
+ with gr.Row():
654
+ with gr.Column(scale=1):
655
+ file_upload = gr.File(
656
+ label="Upload CSV/Excel/JSON file (Max 1GB)",
657
+ file_types=[".csv", ".xlsx", ".xls", ".json"],
658
+ file_count="single"
659
+ )
660
+ preview_rows = gr.Slider(
661
+ minimum=0,
662
+ maximum=1000,
663
+ value=100,
664
+ step=50,
665
+ label="Preview Rows (0 = All)",
666
+ info="Number of rows to display in preview"
667
+ )
668
+ upload_btn = gr.Button("πŸ“Š Load & Analyze Data", variant="primary", size="lg")
669
+
670
+ with gr.Column(scale=2):
671
+ upload_status = gr.Textbox(label="Status", lines=5, interactive=False)
672
+ data_info_display = gr.Textbox(label="Data Information", lines=8, interactive=False)
673
+
674
+ data_preview = gr.DataFrame(label="Data Preview", interactive=False, height=400)
675
+
676
+ def load_file_handler(file, rows):
677
+ if file is None:
678
+ return None, "Please upload a file first", "", None, {}
679
+
680
+ preview, status, info = processor.load_data(file, rows)
681
+ info_text = get_data_info(processor.current_df) if processor.current_df is not None else ""
682
+
683
+ return preview, status, info_text, processor.current_df, info
684
+
685
+ upload_btn.click(
686
+ load_file_handler,
687
+ inputs=[file_upload, preview_rows],
688
+ outputs=[data_preview, upload_status, data_info_display, current_data, data_info]
689
+ )
690
+
691
+ # ===== VALUE REPLACEMENT TAB =====
692
+ with gr.TabItem("πŸ”„ Value Replacement"):
693
+ gr.HTML("<h3>Replace values in one column based on conditions in another column</h3>")
694
+
695
+ with gr.Row():
696
+ with gr.Column():
697
+ target_col = gr.Dropdown(label="Target Column (to modify)", choices=[], interactive=True)
698
+ condition_col = gr.Dropdown(label="Condition Column (to check)", choices=[], interactive=True)
699
+ condition_value = gr.Textbox(label="Condition Value", placeholder="Value to match in condition column")
700
+ new_value = gr.Textbox(label="New Value", placeholder="Replacement value for target column")
701
+ match_type = gr.Radio(
702
+ choices=["exact", "contains", "starts_with", "ends_with", "regex"],
703
+ value="exact",
704
+ label="Match Type"
705
+ )
706
+ replace_btn = gr.Button("πŸ”„ Replace Values", variant="primary")
707
+
708
+ with gr.Column():
709
+ replace_status = gr.Textbox(label="Status", lines=3, interactive=False)
710
+
711
+ # Update column choices when data changes
712
+ def update_columns(df):
713
+ if df is not None:
714
+ cols = list(df.columns)
715
+ return gr.Dropdown(choices=cols), gr.Dropdown(choices=cols)
716
+ return gr.Dropdown(choices=[]), gr.Dropdown(choices=[])
717
+
718
+ current_data.change(
719
+ update_columns,
720
+ inputs=[current_data],
721
+ outputs=[target_col, condition_col]
722
+ )
723
+
724
+ def replace_values_handler(df, tcol, ccol, cval, nval, mtype):
725
+ if df is None:
726
+ return None, "❌ No data loaded", ""
727
+
728
+ result_df, status = rename_values_conditional(df, tcol, ccol, cval, nval, mtype)
729
+ info_text = get_data_info(result_df) if result_df is not None else ""
730
+
731
+ return result_df, status, info_text
732
+
733
+ replace_btn.click(
734
+ replace_values_handler,
735
+ inputs=[current_data, target_col, condition_col, condition_value, new_value, match_type],
736
+ outputs=[current_data, replace_status, data_info_display]
737
+ )
738
+
739
+ # ===== CSV CONCATENATION TAB =====
740
+ with gr.TabItem("πŸ“‹ CSV Concatenation"):
741
+ gr.HTML("<h3>Combine multiple CSV files with column selection</h3>")
742
+
743
+ with gr.Row():
744
+ with gr.Column():
745
+ multi_files = gr.File(
746
+ label="Upload Multiple Files",
747
+ file_types=[".csv", ".xlsx", ".xls"],
748
+ file_count="multiple"
749
+ )
750
+ selected_columns = gr.Textbox(
751
+ label="Columns to Include",
752
+ placeholder="column1, column2, column3 (leave empty for all)",
753
+ info="Comma-separated list of column names"
754
+ )
755
+ join_type = gr.Radio(
756
+ choices=["outer", "inner"],
757
+ value="outer",
758
+ label="Join Type",
759
+ info="Outer: keep all columns, Inner: only common columns"
760
+ )
761
+ concat_btn = gr.Button("πŸ“‹ Concatenate Files", variant="primary")
762
+
763
+ with gr.Column():
764
+ concat_status = gr.Textbox(label="Status", lines=5, interactive=False)
765
+
766
+ def concat_handler(files, cols, jtype):
767
+ if not files:
768
+ return None, "❌ Please upload files first", ""
769
+
770
+ result_df, status = concatenate_csvs(files, cols, jtype)
771
+ info_text = get_data_info(result_df) if result_df is not None else ""
772
+
773
+ return result_df, status, info_text
774
+
775
+ concat_btn.click(
776
+ concat_handler,
777
+ inputs=[multi_files, selected_columns, join_type],
778
+ outputs=[current_data, concat_status, data_info_display]
779
+ )
780
+
781
+ # ===== VALUE COUNTS TAB =====
782
+ with gr.TabItem("πŸ“Š Value Analysis"):
783
+ gr.HTML("<h3>Analyze value frequencies and distributions</h3>")
784
+
785
+ with gr.Row():
786
+ with gr.Column():
787
+ analysis_col = gr.Dropdown(label="Column to Analyze", choices=[], interactive=True)
788
+ top_n = gr.Slider(minimum=5, maximum=100, value=20, step=5, label="Top N Values")
789
+ normalize_counts = gr.Checkbox(label="Show Percentages", value=False)
790
+ analyze_btn = gr.Button("πŸ“Š Analyze Values", variant="primary")
791
+
792
+ with gr.Column():
793
+ analysis_status = gr.Textbox(label="Status", lines=3, interactive=False)
794
+
795
+ analysis_results = gr.DataFrame(label="Value Counts", height=400)
796
+
797
+ # Update analysis column choices
798
+ current_data.change(
799
+ lambda df: gr.Dropdown(choices=list(df.columns) if df is not None else []),
800
+ inputs=[current_data],
801
+ outputs=[analysis_col]
802
+ )
803
+
804
+ def analysis_handler(df, col, n, norm):
805
+ if df is None:
806
+ return None, "❌ No data loaded"
807
+
808
+ return get_value_counts(df, col, n, norm)
809
+
810
+ analyze_btn.click(
811
+ analysis_handler,
812
+ inputs=[current_data, analysis_col, top_n, normalize_counts],
813
+ outputs=[analysis_results, analysis_status]
814
+ )
815
+
816
+ # ===== DATA CLEANING TAB =====
817
+ with gr.TabItem("🧹 Data Cleaning"):
818
+ gr.HTML("<h3>Clean and preprocess your data</h3>")
819
+
820
+ with gr.Tabs():
821
+ # Missing Values
822
+ with gr.TabItem("Missing Values"):
823
+ with gr.Row():
824
+ with gr.Column():
825
+ missing_col = gr.Dropdown(label="Column", choices=["ALL"], value="ALL", interactive=True)
826
+ missing_method = gr.Radio(
827
+ choices=["drop_rows", "fill_value", "fill_mean", "fill_median", "fill_mode", "forward_fill", "backward_fill"],
828
+ value="drop_rows",
829
+ label="Method"
830
+ )
831
+ fill_value_input = gr.Textbox(label="Fill Value", placeholder="For fill_value method")
832
+ missing_btn = gr.Button("🧹 Handle Missing Values", variant="primary")
833
+
834
+ with gr.Column():
835
+ missing_status = gr.Textbox(label="Status", lines=4, interactive=False)
836
+
837
+ # Duplicates
838
+ with gr.TabItem("Duplicates"):
839
+ with gr.Row():
840
+ with gr.Column():
841
+ duplicate_cols = gr.Textbox(
842
+ label="Columns to Check",
843
+ placeholder="column1, column2 (empty = all columns)"
844
+ )
845
+ keep_method = gr.Radio(
846
+ choices=["first", "last", "false"],
847
+ value="first",
848
+ label="Keep Method"
849
+ )
850
+ duplicate_btn = gr.Button("πŸ—‘οΈ Remove Duplicates", variant="primary")
851
+
852
+ with gr.Column():
853
+ duplicate_status = gr.Textbox(label="Status", lines=4, interactive=False)
854
+
855
+ # Data Filtering
856
+ with gr.TabItem("Filtering"):
857
+ with gr.Row():
858
+ with gr.Column():
859
+ filter_col = gr.Dropdown(label="Column", choices=[], interactive=True)
860
+ filter_condition = gr.Dropdown(
861
+ choices=["equals", "not_equals", "contains", "not_contains", "starts_with", "ends_with",
862
+ "greater_than", "less_than", "is_null", "is_not_null"],
863
+ value="equals",
864
+ label="Condition"
865
+ )
866
+ filter_value = gr.Textbox(label="Value")
867
+ filter_btn = gr.Button("πŸ” Filter Data", variant="primary")
868
+
869
+ with gr.Column():
870
+ filter_status = gr.Textbox(label="Status", lines=4, interactive=False)
871
+
872
+ # Update dropdown choices
873
+ current_data.change(
874
+ lambda df: (
875
+ gr.Dropdown(choices=["ALL"] + list(df.columns) if df is not None else ["ALL"]),
876
+ gr.Dropdown(choices=list(df.columns) if df is not None else [])
877
+ ),
878
+ inputs=[current_data],
879
+ outputs=[missing_col, filter_col]
880
+ )
881
+
882
+ # Event handlers
883
+ missing_btn.click(
884
+ lambda df, col, method, val: handle_missing_values(df, col, method, val)[1] if df is not None else "❌ No data",
885
+ inputs=[current_data, missing_col, missing_method, fill_value_input],
886
+ outputs=[missing_status]
887
+ ).then(
888
+ lambda: processor.current_df,
889
+ outputs=[current_data]
890
+ ).then(
891
+ lambda df: get_data_info(df),
892
+ inputs=[current_data],
893
+ outputs=[data_info_display]
894
+ )
895
+
896
+ duplicate_btn.click(
897
+ lambda df, cols, keep: detect_and_remove_duplicates(df, cols, keep)[1] if df is not None else "❌ No data",
898
+ inputs=[current_data, duplicate_cols, keep_method],
899
+ outputs=[duplicate_status]
900
+ ).then(
901
+ lambda: processor.current_df,
902
+ outputs=[current_data]
903
+ ).then(
904
+ lambda df: get_data_info(df),
905
+ inputs=[current_data],
906
+ outputs=[data_info_display]
907
+ )
908
+
909
+ filter_btn.click(
910
+ lambda df, col, cond, val: filter_data(df, col, cond, val)[1] if df is not None else "❌ No data",
911
+ inputs=[current_data, filter_col, filter_condition, filter_value],
912
+ outputs=[filter_status]
913
+ ).then(
914
+ lambda: processor.current_df,
915
+ outputs=[current_data]
916
+ ).then(
917
+ lambda df: get_data_info(df),
918
+ inputs=[current_data],
919
+ outputs=[data_info_display]
920
+ )
921
+
922
+ # ===== COLUMN OPERATIONS TAB =====
923
+ with gr.TabItem("βš™οΈ Column Operations"):
924
+ gr.HTML("<h3>Perform operations on columns</h3>")
925
+
926
+ with gr.Row():
927
+ with gr.Column():
928
+ op_type = gr.Dropdown(
929
+ choices=["add", "subtract", "multiply", "divide", "concatenate",
930
+ "extract_numbers", "upper", "lower", "title", "length"],
931
+ value="add",
932
+ label="Operation"
933
+ )
934
+ op_col1 = gr.Dropdown(label="Primary Column", choices=[], interactive=True)
935
+ op_col2 = gr.Dropdown(label="Second Column (optional)", choices=[], interactive=True)
936
+ op_constant = gr.Textbox(label="Constant Value (optional)")
937
+ op_new_name = gr.Textbox(label="New Column Name")
938
+ op_btn = gr.Button("βš™οΈ Execute Operation", variant="primary")
939
+
940
+ with gr.Column():
941
+ op_status = gr.Textbox(label="Status", lines=5, interactive=False)
942
+
943
+ # Data type conversion
944
+ gr.HTML("<hr><h4>Data Type Conversion</h4>")
945
+ convert_col = gr.Dropdown(label="Column", choices=[], interactive=True)
946
+ convert_type = gr.Dropdown(
947
+ choices=["string", "integer", "float", "datetime", "boolean", "category"],
948
+ value="string",
949
+ label="Target Type"
950
+ )
951
+ convert_btn = gr.Button("πŸ”„ Convert Type", variant="secondary")
952
+ convert_status = gr.Textbox(label="Conversion Status", lines=2, interactive=False)
953
+
954
+ # Update column choices
955
+ current_data.change(
956
+ lambda df: (
957
+ gr.Dropdown(choices=list(df.columns) if df is not None else []),
958
+ gr.Dropdown(choices=list(df.columns) if df is not None else []),
959
+ gr.Dropdown(choices=list(df.columns) if df is not None else [])
960
+ ),
961
+ inputs=[current_data],
962
+ outputs=[op_col1, op_col2, convert_col]
963
+ )
964
+
965
+ # Event handlers
966
+ def operation_handler(df, op, col1, col2, const, new_name):
967
+ if df is None:
968
+ return None, "❌ No data loaded", ""
969
+
970
+ result_df, status = perform_column_operations(df, op, col1, col2, new_name, const)
971
+ info_text = get_data_info(result_df) if result_df is not None else ""
972
+
973
+ return result_df, status, info_text
974
+
975
+ op_btn.click(
976
+ operation_handler,
977
+ inputs=[current_data, op_type, op_col1, op_col2, op_constant, op_new_name],
978
+ outputs=[current_data, op_status, data_info_display]
979
+ )
980
+
981
+ def convert_handler(df, col, target_type):
982
+ if df is None:
983
+ return None, "❌ No data loaded", ""
984
+
985
+ result_df, status = convert_data_types(df, col, target_type)
986
+ info_text = get_data_info(result_df) if result_df is not None else ""
987
+
988
+ return result_df, status, info_text
989
+
990
+ convert_btn.click(
991
+ convert_handler,
992
+ inputs=[current_data, convert_col, convert_type],
993
+ outputs=[current_data, convert_status, data_info_display]
994
+ )
995
+
996
+ # ===== STATISTICS TAB =====
997
+ with gr.TabItem("πŸ“ˆ Statistics & Analysis"):
998
+ gr.HTML("<h3>Statistical analysis and insights</h3>")
999
+
1000
+ with gr.Row():
1001
+ with gr.Column():
1002
+ stats_btn = gr.Button("πŸ“Š Generate Statistical Summary", variant="primary")
1003
+ corr_btn = gr.Button("πŸ”— Create Correlation Matrix", variant="secondary")
1004
+
1005
+ # Distribution plots
1006
+ gr.HTML("<hr><h4>Distribution Analysis</h4>")
1007
+ dist_col = gr.Dropdown(label="Column", choices=[], interactive=True)
1008
+ plot_type = gr.Radio(choices=["histogram", "density"], value="histogram", label="Plot Type")
1009
+ dist_btn = gr.Button("πŸ“ˆ Create Distribution Plot", variant="secondary")
1010
+
1011
+ with gr.Column():
1012
+ stats_status = gr.Textbox(label="Status", lines=3, interactive=False)
1013
+ plot_output = gr.Image(label="Visualization")
1014
+
1015
+ stats_results = gr.DataFrame(label="Statistical Summary", height=400)
1016
+
1017
+ # Update column choices
1018
+ current_data.change(
1019
+ lambda df: gr.Dropdown(choices=list(df.select_dtypes(include=[np.number]).columns) if df is not None else []),
1020
+ inputs=[current_data],
1021
+ outputs=[dist_col]
1022
+ )
1023
+
1024
+ # Event handlers
1025
+ stats_btn.click(
1026
+ lambda df: generate_statistical_summary(df) if df is not None else (None, "❌ No data"),
1027
+ inputs=[current_data],
1028
+ outputs=[stats_results, stats_status]
1029
+ )
1030
+
1031
+ corr_btn.click(
1032
+ lambda df: create_correlation_matrix(df) if df is not None else (None, "❌ No data"),
1033
+ inputs=[current_data],
1034
+ outputs=[plot_output, stats_status]
1035
+ )
1036
+
1037
+ dist_btn.click(
1038
+ lambda df, col, ptype: create_distribution_plots(df, col, ptype) if df is not None else (None, "❌ No data"),
1039
+ inputs=[current_data, dist_col, plot_type],
1040
+ outputs=[plot_output, stats_status]
1041
+ )
1042
+
1043
+ # ===== EXPORT TAB =====
1044
+ with gr.TabItem("πŸ’Ύ Export & Download"):
1045
+ gr.HTML("<h3>Export your processed data</h3>")
1046
+
1047
+ with gr.Row():
1048
+ with gr.Column():
1049
+ export_format = gr.Radio(
1050
+ choices=["csv", "excel", "json"],
1051
+ value="csv",
1052
+ label="Export Format"
1053
+ )
1054
+ export_filename = gr.Textbox(
1055
+ label="Filename (without extension)",
1056
+ value="processed_data",
1057
+ placeholder="Enter filename"
1058
+ )
1059
+ export_btn = gr.Button("πŸ’Ύ Create Download File", variant="primary", size="lg")
1060
+
1061
+ with gr.Column():
1062
+ export_status = gr.Textbox(label="Status", lines=3, interactive=False)
1063
+ download_file = gr.File(label="Download", visible=False)
1064
+
1065
+ # History and Undo/Redo
1066
+ with gr.Row():
1067
+ with gr.Column():
1068
+ gr.HTML("<hr><h4>History & Undo Operations</h4>")
1069
+ undo_btn = gr.Button("β†Ά Undo Last Operation", variant="secondary")
1070
+ reset_btn = gr.Button("πŸ”„ Reset to Original", variant="secondary")
1071
+
1072
+ with gr.Column():
1073
+ history_status = gr.Textbox(label="History Status", lines=3, interactive=False)
1074
+
1075
+ def export_handler(df, fmt, filename):
1076
+ if df is None:
1077
+ return None, "❌ No data to export", gr.File(visible=False)
1078
+
1079
+ try:
1080
+ file_data, file_name = create_download_file(df, fmt, filename)
1081
+
1082
+ # Save file temporarily
1083
+ mode = 'wb' if fmt == 'excel' else 'w'
+ with open(file_name, mode, encoding=None if fmt == 'excel' else 'utf-8') as f:
+ f.write(file_data)
1088
+
1089
+ return file_name, f"βœ… File created successfully: {file_name}", gr.File(value=file_name, visible=True)
1090
+
1091
+ except Exception as e:
1092
+ return None, f"❌ Export error: {str(e)}", gr.File(visible=False)
1093
+
1094
+ export_btn.click(
1095
+ export_handler,
1096
+ inputs=[current_data, export_format, export_filename],
1097
+ outputs=[download_file, export_status, download_file]
1098
+ )
1099
+
1100
+ def undo_handler():
1101
+ result_df, status = processor.undo_operation()
1102
+ info_text = get_data_info(result_df) if result_df is not None else ""
1103
+ return result_df, status, info_text
1104
+
1105
+ def reset_handler():
1106
+ result_df, status = processor.reset_to_original()
1107
+ info_text = get_data_info(result_df) if result_df is not None else ""
1108
+ return result_df, status, info_text
1109
+
1110
+ undo_btn.click(
1111
+ undo_handler,
1112
+ outputs=[current_data, history_status, data_info_display]
1113
+ )
1114
+
1115
+ reset_btn.click(
1116
+ reset_handler,
1117
+ outputs=[current_data, history_status, data_info_display]
1118
+ )
1119
+
1120
+ # Footer
1121
+ gr.HTML("""
1122
+ <div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #ddd;">
1123
+ <p style="color: #666; font-size: 14px;">
1124
+ πŸš€ <strong>Advanced CSV Manipulation Tool</strong> |
1125
+ Commercial-ready data processing with enterprise features |
1126
+ Built with Gradio & Python
1127
+ </p>
1128
+ </div>
1129
+ """)
1130
+
1131
+ return demo
1132
+
1133
+ if __name__ == "__main__":
1134
+ # Create and launch the interface
1135
+ demo = create_interface()
1136
+ demo.launch(
1137
+ share=True,
1138
+ inbrowser=True,
1139
+ server_name="0.0.0.0",
1140
+ server_port=7860,
1141
+ max_file_size="1gb"
1142
+ )