mike23415 committed
Commit e35f53d · verified · 1 Parent(s): de52c57

Update app.py

Files changed (1):
  1. app.py +595 -270
app.py CHANGED
@@ -1,307 +1,632 @@
 from flask import Flask, request, jsonify, send_file
 from flask_cors import CORS
-import pandas as pd
-import os
 import threading
 import time
-import re

 app = Flask(__name__)
 CORS(app)

-UPLOAD_FOLDER = "/tmp"
-SESSION_KEY_PREFIX = "data_tool_session_id"
-app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
-app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024  # 512 MB

-# === Root Route (Required for Hugging Face) ===
-@app.route("/", methods=["GET"])
-def root():
-    return jsonify({
-        "message": "Data Processing API is running",
-        "status": "healthy",
-        "endpoints": {
-            "POST /process": "Upload and process CSV/Excel files",
-            "GET /download/<filename>": "Download processed file with session_id parameter",
-            "GET /health": "Health check"
-        },
-        "version": "1.0"
-    })

-# === Health Check Route ===
-@app.route("/health", methods=["GET"])
-def health_check():
-    return jsonify({"status": "healthy", "timestamp": time.time()})

-# === Cleanup Thread: delete files older than 60 minutes ===
-def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
-    def cleanup_loop():
-        while True:
-            now = time.time()
-            try:
-                if os.path.exists(folder):
-                    for f in os.listdir(folder):
-                        path = os.path.join(folder, f)
-                        if os.path.isfile(path):
-                            if now - os.path.getmtime(path) > max_age * 60:
-                                try:
-                                    os.remove(path)
-                                    print(f"[Cleanup] Deleted: {path}")
-                                except Exception as e:
-                                    print(f"[Cleanup Error] {e}")
-            except Exception as e:
-                print(f"[Cleanup Error] {e}")
-            time.sleep(600)  # Every 10 minutes

-    threading.Thread(target=cleanup_loop, daemon=True).start()
-
-# Start cleanup thread
-clean_old_files()
-
-# === Instruction Parser ===
-def apply_instruction(df, instruction):
-    instruction = instruction.lower().strip()
-
-    if not instruction:
-        return df, "No instruction provided"

     try:
-        # Drop column
-        match = re.search(r"drop column (\w+)", instruction)
-        if match:
-            col_name = match.group(1)
-            if col_name in df.columns:
-                df = df.drop(columns=[col_name])
-                return df, f"Dropped column '{col_name}'"
-            else:
-                return df, f"Error: Column '{col_name}' not found"
-
-        # Remove duplicates
-        if "remove duplicates" in instruction:
-            original_count = len(df)
-            df = df.drop_duplicates()
-            removed_count = original_count - len(df)
-            return df, f"Removed {removed_count} duplicate rows"
-
-        # Drop missing values
-        if "drop missing" in instruction or "remove null" in instruction:
-            original_count = len(df)
-            df = df.dropna()
-            removed_count = original_count - len(df)
-            return df, f"Removed {removed_count} rows with missing values"
-
-        # Fill missing values
-        match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
-        if match:
-            val = match.group(1)
-            try:
-                val = float(val)
-            except:
-                pass
-            missing_count = df.isnull().sum().sum()
-            df = df.fillna(val)
-            return df, f"Filled {missing_count} missing values with '{val}'"
-
-        # Sort by column
-        match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            ascending = not bool(match.group(2))
-            df = df.sort_values(by=col, ascending=ascending)
-            order = "descending" if not ascending else "ascending"
-            return df, f"Sorted by '{col}' in {order} order"
-
-        # Rename column
-        match = re.search(r"rename column (\w+) to (\w+)", instruction)
-        if match:
-            old_name, new_name = match.group(1), match.group(2)
-            if old_name not in df.columns:
-                return df, f"Error: Column '{old_name}' not found"
-            df = df.rename(columns={old_name: new_name})
-            return df, f"Renamed column '{old_name}' to '{new_name}'"
-
-        # Filter rows
-        match = re.search(r"filter where (\w+) > (\d+)", instruction)
-        if match:
-            col, val = match.group(1), float(match.group(2))
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            original_count = len(df)
-            df = df[df[col] > val]
-            kept_count = len(df)
-            return df, f"Filtered data: kept {kept_count} rows where {col} > {val}"
-
-        # Group by and sum
-        match = re.search(r"group by (\w+) and sum (\w+)", instruction)
-        if match:
-            group_col, sum_col = match.group(1), match.group(2)
-            if group_col not in df.columns:
-                return df, f"Error: Column '{group_col}' not found"
-            if sum_col not in df.columns:
-                return df, f"Error: Column '{sum_col}' not found"
-            df = df.groupby(group_col)[sum_col].sum().reset_index()
-            return df, f"Grouped by '{group_col}' and summed '{sum_col}'"
-
-        # Add column (sum of two columns)
-        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
-        if match:
-            new_col, col1, col2 = match.group(1), match.group(2), match.group(3)
-            if col1 not in df.columns:
-                return df, f"Error: Column '{col1}' not found"
-            if col2 not in df.columns:
-                return df, f"Error: Column '{col2}' not found"
-            df[new_col] = df[col1] + df[col2]
-            return df, f"Added column '{new_col}' as sum of '{col1}' and '{col2}'"
-
-        # Normalize column
-        match = re.search(r"normalize column (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            if not pd.api.types.is_numeric_dtype(df[col]):
-                return df, f"Error: Column '{col}' is not numeric"
-            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
-            return df, f"Normalized column '{col}' using min-max scaling"
-
-        # Standardize column
-        match = re.search(r"standardize column (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            if not pd.api.types.is_numeric_dtype(df[col]):
-                return df, f"Error: Column '{col}' is not numeric"
-            df[col] = (df[col] - df[col].mean()) / df[col].std()
-            return df, f"Standardized column '{col}' using z-score"
-
-        # Split column by comma
-        match = re.search(r"split column (\w+) by comma", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
-            return df, f"Split column '{col}' by comma into '{col}_1' and '{col}_2'"
-
-        # Remove special characters
-        match = re.search(r"remove special characters from (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
-            return df, f"Removed special characters from column '{col}'"
-
-        # If no instruction matched
-        return df, f"Instruction '{instruction}' not recognized"
-
     except Exception as e:
-        return df, f"Error: {str(e)}"

-# === File Processor Endpoint ===
-@app.route("/process", methods=["POST"])
-def process_file():
-    try:
-        # Validate request
-        if "file" not in request.files:
-            return jsonify({"error": "No file provided"}), 400
-        if "instruction" not in request.form:
-            return jsonify({"error": "No instruction provided"}), 400
-        if "session_id" not in request.form:
-            return jsonify({"error": "No session_id provided"}), 400

-        file = request.files["file"]
-        instruction = request.form["instruction"]
-        session_id = request.form["session_id"]

-        if file.filename == '':
-            return jsonify({"error": "No file selected"}), 400

-        # Read file
-        try:
-            if file.filename.lower().endswith('.csv'):
-                df = pd.read_csv(file)
-            elif file.filename.lower().endswith(('.xlsx', '.xls')):
-                df = pd.read_excel(file)
             else:
-                return jsonify({"error": "Unsupported file format. Use CSV or Excel files."}), 400
-        except Exception as e:
-            return jsonify({"error": f"File reading error: {str(e)}"}), 400
-
-        # Apply instruction
-        df_processed, status = apply_instruction(df, instruction)

-        # Save processed file
-        original_name = file.filename.rsplit('.', 1)[0]  # Remove extension
-        filename = f"processed_{session_id}_{original_name}.csv"
-        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-
-        try:
-            df_processed.to_csv(filepath, index=False)
-        except Exception as e:
-            return jsonify({"error": f"File saving error: {str(e)}"}), 500

-        # Generate preview (first 5 rows)
-        preview = df_processed.head(5).to_dict(orient="records")

         return jsonify({
-            "success": True,
-            "message": status,
-            "preview": preview,
-            "download_url": f"/download/{filename}",
-            "original_rows": len(df),
-            "processed_rows": len(df_processed),
-            "columns": list(df_processed.columns),
-            "filename": filename
         })
-
     except Exception as e:
-        return jsonify({"error": f"Processing error: {str(e)}"}), 500

-# === File Download with Session ID Verification ===
-@app.route("/download/<filename>", methods=["GET"])
-def download_file(filename):
     try:
-        session_id = request.args.get("session_id")

-        # Validate session
-        if not session_id:
-            return jsonify({"error": "session_id parameter required"}), 400

-        if f"_{session_id}_" not in filename:
-            return jsonify({"error": "Invalid session or unauthorized access"}), 403

-        # Check file exists
-        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-        if not os.path.exists(filepath):
-            return jsonify({"error": "File not found or expired"}), 404

-        return send_file(filepath, as_attachment=True, download_name=filename)
-
     except Exception as e:
-        return jsonify({"error": f"Download error: {str(e)}"}), 500

-# === Error Handlers ===
-@app.errorhandler(404)
-def not_found(error):
-    return jsonify({"error": "Endpoint not found"}), 404

-@app.errorhandler(413)
-def too_large(error):
-    return jsonify({"error": "File too large (max 512MB)"}), 413

-@app.errorhandler(500)
-def internal_error(error):
-    return jsonify({"error": "Internal server error"}), 500

-# === Run on Port 7860 for Hugging Face ===
-if __name__ == "__main__":
-    print("🚀 Starting Data Processing API on port 7860...")
-    print("📊 API Endpoints:")
-    print("  POST /process - Process files")
-    print("  GET /download/<filename> - Download processed files")
-    print("  GET /health - Health check")
-    app.run(host="0.0.0.0", port=7860, debug=False)
+import os
+import uuid
+import json
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
 from flask import Flask, request, jsonify, send_file
 from flask_cors import CORS
+from werkzeug.utils import secure_filename
 import threading
 import time
+import logging
+from scipy import stats
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import base64
+from apscheduler.schedulers.background import BackgroundScheduler
+import atexit
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = Flask(__name__)
 CORS(app)

+# Configuration
+UPLOAD_FOLDER = '/tmp/uploads'
+PROCESSED_FOLDER = '/tmp/processed'
+MAX_FILE_SIZE = 512 * 1024 * 1024  # 512MB
+ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv'}
+FILE_EXPIRY_HOURS = 1

+# Ensure directories exist
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(PROCESSED_FOLDER, exist_ok=True)

+# File storage to track sessions and files
+file_storage = {}

+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

+def get_file_age(filepath):
+    """Get file age in hours"""
+    if os.path.exists(filepath):
+        file_time = os.path.getmtime(filepath)
+        return (time.time() - file_time) / 3600
+    return float('inf')

+def cleanup_old_files():
+    """Remove files older than FILE_EXPIRY_HOURS"""
     try:
+        for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER]:
+            for root, dirs, files in os.walk(folder):
+                for file in files:
+                    filepath = os.path.join(root, file)
+                    if get_file_age(filepath) > FILE_EXPIRY_HOURS:
+                        os.remove(filepath)
+                        logger.info(f"Cleaned up old file: {filepath}")
+
+        # Clean up file_storage entries
+        current_time = datetime.now()
+        sessions_to_remove = []
+        for session_id, files in file_storage.items():
+            files_to_remove = []
+            for file_id, file_info in files.items():
+                file_time = datetime.fromisoformat(file_info['timestamp'])
+                if (current_time - file_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
+                    files_to_remove.append(file_id)
+
+            for file_id in files_to_remove:
+                del files[file_id]
+
+            if not files:
+                sessions_to_remove.append(session_id)
+
+        for session_id in sessions_to_remove:
+            del file_storage[session_id]
+
     except Exception as e:
+        logger.error(f"Error during cleanup: {str(e)}")

+# Setup scheduler for automatic cleanup
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=cleanup_old_files, trigger="interval", minutes=15)
+scheduler.start()
+atexit.register(lambda: scheduler.shutdown())

+def load_data_file(filepath, filename):
+    """Load data from various file formats"""
+    try:
+        file_ext = filename.rsplit('.', 1)[1].lower()
+
+        if file_ext == 'csv':
+            return pd.read_csv(filepath)
+        elif file_ext in ['xlsx', 'xls']:
+            return pd.read_excel(filepath)
+        elif file_ext == 'json':
+            return pd.read_json(filepath)
+        elif file_ext == 'parquet':
+            return pd.read_parquet(filepath)
+        elif file_ext == 'tsv':
+            return pd.read_csv(filepath, sep='\t')
+        else:
+            raise ValueError(f"Unsupported file format: {file_ext}")
+    except Exception as e:
+        raise Exception(f"Error loading file: {str(e)}")

+def perform_basic_statistics(df, columns=None):
+    """Perform basic statistical analysis"""
+    if columns:
+        df = df[columns]
+
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+
+    result = {
+        'numeric_summary': {},
+        'categorical_summary': {},
+        'general_info': {
+            'total_rows': len(df),
+            'total_columns': len(df.columns),
+            'numeric_columns': len(numeric_cols),
+            'categorical_columns': len(categorical_cols),
+            'missing_values': df.isnull().sum().to_dict()
+        }
+    }
+
+    # Numeric statistics
+    if numeric_cols:
+        numeric_stats = df[numeric_cols].describe()
+        result['numeric_summary'] = numeric_stats.to_dict()
+
+    # Categorical statistics
+    if categorical_cols:
+        for col in categorical_cols:
+            result['categorical_summary'][col] = {
+                'unique_values': df[col].nunique(),
+                'top_values': df[col].value_counts().head(10).to_dict(),
+                'missing_count': df[col].isnull().sum()
+            }
+
+    return result
+
+def perform_groupby_analysis(df, group_column, target_column, operation='mean', filters=None):
+    """Perform group by analysis"""
+    # Apply filters if provided
+    if filters:
+        for f in filters:
+            col, op, val = f['column'], f['operator'], f['value']
+            if op == '>':
+                df = df[df[col] > val]
+            elif op == '<':
+                df = df[df[col] < val]
+            elif op == '==':
+                df = df[df[col] == val]
+            elif op == '!=':
+                df = df[df[col] != val]
+            elif op == '>=':
+                df = df[df[col] >= val]
+            elif op == '<=':
+                df = df[df[col] <= val]
+
+    # Perform groupby operation
+    grouped = df.groupby(group_column)[target_column]
+
+    if operation == 'mean':
+        result = grouped.mean()
+    elif operation == 'sum':
+        result = grouped.sum()
+    elif operation == 'count':
+        result = grouped.count()
+    elif operation == 'max':
+        result = grouped.max()
+    elif operation == 'min':
+        result = grouped.min()
+    elif operation == 'std':
+        result = grouped.std()
+    else:
+        raise ValueError(f"Unsupported operation: {operation}")
+
+    return {
+        'result': result.to_dict(),
+        'operation': operation,
+        'group_column': group_column,
+        'target_column': target_column,
+        'total_groups': len(result)
+    }
+
+def perform_correlation_analysis(df, columns=None, method='pearson'):
+    """Perform correlation analysis"""
+    if columns:
+        df = df[columns]
+
+    # Only numeric columns
+    numeric_df = df.select_dtypes(include=[np.number])
+
+    if numeric_df.empty:
+        raise ValueError("No numeric columns found for correlation analysis")
+
+    correlation_matrix = numeric_df.corr(method=method)
+
+    return {
+        'correlation_matrix': correlation_matrix.to_dict(),
+        'method': method,
+        'columns': numeric_df.columns.tolist()
+    }
+
+def detect_outliers(df, columns=None, method='iqr'):
+    """Detect outliers in numeric columns"""
+    if columns:
+        df = df[columns]
+
+    numeric_df = df.select_dtypes(include=[np.number])
+    outliers = {}
+
+    for col in numeric_df.columns:
+        if method == 'iqr':
+            Q1 = numeric_df[col].quantile(0.25)
+            Q3 = numeric_df[col].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+
+            outlier_indices = numeric_df[(numeric_df[col] < lower_bound) |
+                                         (numeric_df[col] > upper_bound)].index.tolist()
+
+        elif method == 'zscore':
+            # Compute z-scores on the non-null values and map the mask back
+            # through that same index, so dropped NaN rows cannot misalign it
+            col_data = numeric_df[col].dropna()
+            z_scores = np.abs(stats.zscore(col_data))
+            outlier_indices = col_data.index[z_scores > 3].tolist()
+
+        outliers[col] = {
+            'count': len(outlier_indices),
+            'indices': outlier_indices[:100],  # Limit to first 100
+            'percentage': (len(outlier_indices) / len(numeric_df)) * 100
+        }
+
+    return outliers

+def generate_visualization(df, chart_type, x_column, y_column=None, group_column=None):
+    """Generate visualization and return base64 encoded image"""
+    plt.figure(figsize=(10, 6))
+
+    try:
+        if chart_type == 'histogram':
+            plt.hist(df[x_column], bins=30, alpha=0.7)
+            plt.xlabel(x_column)
+            plt.ylabel('Frequency')
+            plt.title(f'Histogram of {x_column}')
+
+        elif chart_type == 'scatter':
+            if not y_column:
+                raise ValueError("Y column required for scatter plot")
+            plt.scatter(df[x_column], df[y_column], alpha=0.6)
+            plt.xlabel(x_column)
+            plt.ylabel(y_column)
+            plt.title(f'{x_column} vs {y_column}')
+
+        elif chart_type == 'bar':
+            if group_column:
+                grouped = df.groupby(group_column)[x_column].mean() if pd.api.types.is_numeric_dtype(df[x_column]) else df[group_column].value_counts()
             else:
+                grouped = df[x_column].value_counts().head(20)
+
+            grouped.plot(kind='bar')
+            plt.xlabel(group_column or x_column)
+            plt.ylabel('Count' if not pd.api.types.is_numeric_dtype(df[x_column]) else f'Mean {x_column}')
+            plt.title('Bar Chart')
+            plt.xticks(rotation=45)
+
+        elif chart_type == 'line':
+            if y_column:
+                plt.plot(df[x_column], df[y_column])
+                plt.xlabel(x_column)
+                plt.ylabel(y_column)
+            else:
+                df[x_column].plot()
+                plt.ylabel(x_column)
+            plt.title('Line Chart')
+
+        elif chart_type == 'box':
+            if group_column:
+                df.boxplot(column=x_column, by=group_column)
+            else:
+                df.boxplot(column=x_column)
+            plt.title('Box Plot')
+
+        plt.tight_layout()
+
+        # Convert plot to base64 string
+        img_buffer = io.BytesIO()
+        plt.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
+        img_buffer.seek(0)
+        img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
+        plt.close()
+
+        return img_base64
+
+    except Exception as e:
+        plt.close()
+        raise Exception(f"Error generating visualization: {str(e)}")

+def parse_natural_language_query(query, df_columns):
+    """Simple natural language query parser"""
+    query_lower = query.lower()
+
+    # Define operation keywords
+    operations = {
+        'average': 'mean', 'mean': 'mean', 'avg': 'mean',
+        'sum': 'sum', 'total': 'sum',
+        'count': 'count', 'number': 'count',
+        'max': 'max', 'maximum': 'max', 'highest': 'max',
+        'min': 'min', 'minimum': 'min', 'lowest': 'min'
+    }
+
+    # Find operation
+    operation = 'mean'  # default
+    for keyword, op in operations.items():
+        if keyword in query_lower:
+            operation = op
+            break
+
+    # Find columns mentioned in query
+    mentioned_columns = [col for col in df_columns if col.lower() in query_lower]
+
+    # Simple parsing patterns
+    if 'by' in query_lower and len(mentioned_columns) >= 2:
+        # Group by analysis
+        target_col = mentioned_columns[0]
+        group_col = mentioned_columns[-1]
+
+        return {
+            'analysisType': 'groupby',
+            'parameters': {
+                'groupByColumn': group_col,
+                'targetColumn': target_col,
+                'operation': operation
+            }
+        }
+    elif 'correlation' in query_lower:
+        return {
+            'analysisType': 'correlation',
+            'parameters': {
+                'columns': mentioned_columns if mentioned_columns else None
+            }
+        }
+    elif any(word in query_lower for word in ['chart', 'plot', 'graph', 'visualize']):
+        chart_type = 'bar'  # default
+        if 'scatter' in query_lower:
+            chart_type = 'scatter'
+        elif 'line' in query_lower:
+            chart_type = 'line'
+        elif 'histogram' in query_lower:
+            chart_type = 'histogram'
+
+        return {
+            'analysisType': 'visualization',
+            'parameters': {
+                'chartType': chart_type,
+                'xColumn': mentioned_columns[0] if mentioned_columns else None,
+                'yColumn': mentioned_columns[1] if len(mentioned_columns) > 1 else None
+            }
+        }
+    else:
+        # Default to basic statistics
+        return {
+            'analysisType': 'statistics',
+            'parameters': {
+                'columns': mentioned_columns if mentioned_columns else None
+            }
+        }
+
+@app.route('/api/health', methods=['GET'])
+def health_check():
+    return jsonify({'status': 'healthy', 'timestamp': datetime.now().isoformat()})

+@app.route('/api/upload', methods=['POST'])
+def upload_file():
+    try:
+        if 'file' not in request.files:
+            return jsonify({'error': 'No file provided'}), 400
+
+        file = request.files['file']
+        session_id = request.form.get('sessionId')
+
+        if not session_id:
+            return jsonify({'error': 'Session ID required'}), 400
+
+        if file.filename == '':
+            return jsonify({'error': 'No file selected'}), 400
+
+        if not allowed_file(file.filename):
+            return jsonify({'error': 'File type not supported'}), 400
+
+        # Check file size
+        file.seek(0, 2)  # Seek to end
+        file_size = file.tell()
+        file.seek(0)  # Reset to beginning
+
+        if file_size > MAX_FILE_SIZE:
+            return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
+
+        # Generate unique file ID and secure filename
+        file_id = str(uuid.uuid4())
+        filename = secure_filename(file.filename)
+
+        # Create session directory
+        session_dir = os.path.join(UPLOAD_FOLDER, session_id)
+        os.makedirs(session_dir, exist_ok=True)
+
+        # Save file
+        filepath = os.path.join(session_dir, f"{file_id}_{filename}")
+        file.save(filepath)
+
+        # Store file info
+        if session_id not in file_storage:
+            file_storage[session_id] = {}
+
+        file_storage[session_id][file_id] = {
+            'filename': filename,
+            'filepath': filepath,
+            'size': file_size,
+            'timestamp': datetime.now().isoformat()
+        }

         return jsonify({
+            'fileId': file_id,
+            'filename': filename,
+            'size': file_size,
+            'message': 'File uploaded successfully'
         })
+
     except Exception as e:
+        logger.error(f"Upload error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/preview/<file_id>', methods=['GET'])
+def preview_file(file_id):
     try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400

+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+
+        # Load data and get preview
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        preview_data = {
+            'columns': df.columns.tolist(),
+            'dtypes': df.dtypes.astype(str).to_dict(),
+            'shape': df.shape,
+            'head': df.head(5).to_dict('records'),
+            'missing_values': df.isnull().sum().to_dict()
+        }

+        return jsonify(preview_data)
+
+    except Exception as e:
+        logger.error(f"Preview error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/analyze', methods=['POST'])
+def analyze_data():
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        analysis_type = data.get('analysisType')
+        parameters = data.get('parameters', {})
+        natural_query = data.get('naturalQuery')
+
+        if not all([session_id, file_id]):
+            return jsonify({'error': 'Session ID and File ID required'}), 400
+
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        # Handle natural language query
+        if natural_query and not analysis_type:
+            parsed_query = parse_natural_language_query(natural_query, df.columns.tolist())
+            analysis_type = parsed_query['analysisType']
+            parameters = parsed_query['parameters']
+
+        result = {}
+
+        if analysis_type == 'statistics':
+            result = perform_basic_statistics(df, parameters.get('columns'))

+        elif analysis_type == 'groupby':
+            result = perform_groupby_analysis(
+                df,
+                parameters.get('groupByColumn'),
+                parameters.get('targetColumn'),
+                parameters.get('operation', 'mean'),
+                parameters.get('filters')
+            )
+
+        elif analysis_type == 'correlation':
+            result = perform_correlation_analysis(
+                df,
+                parameters.get('columns'),
+                parameters.get('method', 'pearson')
+            )
+
+        elif analysis_type == 'outliers':
+            result = detect_outliers(
+                df,
+                parameters.get('columns'),
+                parameters.get('method', 'iqr')
+            )
+
+        elif analysis_type == 'visualization':
+            chart_base64 = generate_visualization(
+                df,
+                parameters.get('chartType', 'bar'),
+                parameters.get('xColumn'),
+                parameters.get('yColumn'),
+                parameters.get('groupColumn')
+            )
+            result = {
+                'chart': chart_base64,
+                'chartType': parameters.get('chartType', 'bar')
+            }
+
+        else:
+            return jsonify({'error': 'Invalid analysis type'}), 400
+
+        # Save result to processed folder
+        result_id = str(uuid.uuid4())
+        result_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(result_dir, exist_ok=True)
+
+        result_filepath = os.path.join(result_dir, f"{result_id}_result.json")
+        with open(result_filepath, 'w') as f:
+            json.dump(result, f, indent=2, default=str)
+
+        return jsonify({
+            'resultId': result_id,
+            'result': result,
+            'analysisType': analysis_type,
+            'timestamp': datetime.now().isoformat()
+        })
+
     except Exception as e:
+        logger.error(f"Analysis error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/files/<session_id>', methods=['GET'])
+def list_files(session_id):
+    try:
+        if session_id not in file_storage:
+            return jsonify({'files': []})
+
+        files = []
+        for file_id, file_info in file_storage[session_id].items():
+            # Check if file still exists
+            if os.path.exists(file_info['filepath']):
+                files.append({
+                    'fileId': file_id,
+                    'filename': file_info['filename'],
+                    'size': file_info['size'],
+                    'timestamp': file_info['timestamp']
+                })
+
+        return jsonify({'files': files})
+
+    except Exception as e:
+        logger.error(f"List files error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/file/<file_id>', methods=['DELETE'])
+def delete_file(file_id):
+    try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400
+
+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+
+        # Remove file from filesystem
+        if os.path.exists(file_info['filepath']):
+            os.remove(file_info['filepath'])
+
+        # Remove from storage
+        del file_storage[session_id][file_id]
+
+        return jsonify({'message': 'File deleted successfully'})
+
+    except Exception as e:
+        logger.error(f"Delete error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/download/<result_id>', methods=['GET'])
+def download_result(result_id):
+    try:
+        session_id = request.args.get('sessionId')
+        format_type = request.args.get('format', 'json')
+
+        if not session_id:
+            return jsonify({'error': 'Session ID required'}), 400
+
+        result_filepath = os.path.join(PROCESSED_FOLDER, session_id, f"{result_id}_result.json")
+
+        if not os.path.exists(result_filepath):
+            return jsonify({'error': 'Result not found'}), 404
+
+        if format_type == 'json':
+            return send_file(result_filepath, as_attachment=True,
+                             download_name=f"analysis_result_{result_id}.json")
+        else:
+            return jsonify({'error': 'Format not supported'}), 400
+
+    except Exception as e:
+        logger.error(f"Download error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)
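
Below is a minimal client-side sketch of the new upload-and-analyze flow. It assumes the app is reachable at http://localhost:7860; the endpoint paths and the sessionId / fileId / analysisType field names are taken from the handlers in this commit, while the base URL, sales.csv, and the region / revenue column names are illustrative placeholders.

# Hypothetical client for the API above; base URL, dataset, and column
# names are placeholders, field names mirror upload_file()/analyze_data().
import uuid

import requests

BASE = "http://localhost:7860"      # placeholder deployment URL
session_id = str(uuid.uuid4())      # any opaque client-chosen session ID works

# 1. Upload a dataset under this session (multipart form field "file").
with open("sales.csv", "rb") as f:  # placeholder file
    uploaded = requests.post(
        f"{BASE}/api/upload",
        files={"file": f},
        data={"sessionId": session_id},
    ).json()

# 2. Aggregate a column per group on the uploaded file.
analysis = requests.post(
    f"{BASE}/api/analyze",
    json={
        "sessionId": session_id,
        "fileId": uploaded["fileId"],
        "analysisType": "groupby",
        "parameters": {
            "groupByColumn": "region",    # placeholder column names
            "targetColumn": "revenue",
            "operation": "sum",
        },
    },
).json()
print(analysis["result"])

The same endpoint also accepts a naturalQuery string (e.g. "average revenue by region"), which parse_natural_language_query() maps onto one of the analysis types. Note that file_storage is a plain in-process dict, so uploads are visible only to the worker that received them and are lost on restart; the 15-minute scheduler additionally expires files after FILE_EXPIRY_HOURS (one hour).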