Update app.py
app.py
CHANGED
@@ -1,307 +1,632 @@
 from flask import Flask, request, jsonify, send_file
 from flask_cors import CORS
-
-import os
 import threading
 import time
-import

 app = Flask(__name__)
 CORS(app)

-
-
-
-

-#
-
-
-    return jsonify({
-        "message": "Data Processing API is running",
-        "status": "healthy",
-        "endpoints": {
-            "POST /process": "Upload and process CSV/Excel files",
-            "GET /download/<filename>": "Download processed file with session_id parameter",
-            "GET /health": "Health check"
-        },
-        "version": "1.0"
-    })

-#
-
-def health_check():
-    return jsonify({"status": "healthy", "timestamp": time.time()})

-
-
-def cleanup_loop():
-    while True:
-        now = time.time()
-        try:
-            if os.path.exists(folder):
-                for f in os.listdir(folder):
-                    path = os.path.join(folder, f)
-                    if os.path.isfile(path):
-                        if now - os.path.getmtime(path) > max_age * 60:
-                            try:
-                                os.remove(path)
-                                print(f"[Cleanup] Deleted: {path}")
-                            except Exception as e:
-                                print(f"[Cleanup Error] {e}")
-        except Exception as e:
-            print(f"[Cleanup Error] {e}")
-        time.sleep(600)  # Every 10 minutes

-
-
-
-
-
-
-def apply_instruction(df, instruction):
-    instruction = instruction.lower().strip()
-
-    if not instruction:
-        return df, "No instruction provided"

     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            val = match.group(1)
-            try:
-                val = float(val)
-            except:
-                pass
-            missing_count = df.isnull().sum().sum()
-            df = df.fillna(val)
-            return df, f"Filled {missing_count} missing values with '{val}'"
-
-        # Sort by column
-        match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            ascending = not bool(match.group(2))
-            df = df.sort_values(by=col, ascending=ascending)
-            order = "descending" if not ascending else "ascending"
-            return df, f"Sorted by '{col}' in {order} order"
-
-        # Rename column
-        match = re.search(r"rename column (\w+) to (\w+)", instruction)
-        if match:
-            old_name, new_name = match.group(1), match.group(2)
-            if old_name not in df.columns:
-                return df, f"Error: Column '{old_name}' not found"
-            df = df.rename(columns={old_name: new_name})
-            return df, f"Renamed column '{old_name}' to '{new_name}'"
-
-        # Filter rows
-        match = re.search(r"filter where (\w+) > (\d+)", instruction)
-        if match:
-            col, val = match.group(1), float(match.group(2))
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            original_count = len(df)
-            df = df[df[col] > val]
-            kept_count = len(df)
-            return df, f"Filtered data: kept {kept_count} rows where {col} > {val}"
-
-        # Group by and sum
-        match = re.search(r"group by (\w+) and sum (\w+)", instruction)
-        if match:
-            group_col, sum_col = match.group(1), match.group(2)
-            if group_col not in df.columns:
-                return df, f"Error: Column '{group_col}' not found"
-            if sum_col not in df.columns:
-                return df, f"Error: Column '{sum_col}' not found"
-            df = df.groupby(group_col)[sum_col].sum().reset_index()
-            return df, f"Grouped by '{group_col}' and summed '{sum_col}'"
-
-        # Add column (sum of two columns)
-        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
-        if match:
-            new_col, col1, col2 = match.group(1), match.group(2), match.group(3)
-            if col1 not in df.columns:
-                return df, f"Error: Column '{col1}' not found"
-            if col2 not in df.columns:
-                return df, f"Error: Column '{col2}' not found"
-            df[new_col] = df[col1] + df[col2]
-            return df, f"Added column '{new_col}' as sum of '{col1}' and '{col2}'"
-
-        # Normalize column
-        match = re.search(r"normalize column (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            if not pd.api.types.is_numeric_dtype(df[col]):
-                return df, f"Error: Column '{col}' is not numeric"
-            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
-            return df, f"Normalized column '{col}' using min-max scaling"
-
-        # Standardize column
-        match = re.search(r"standardize column (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            if not pd.api.types.is_numeric_dtype(df[col]):
-                return df, f"Error: Column '{col}' is not numeric"
-            df[col] = (df[col] - df[col].mean()) / df[col].std()
-            return df, f"Standardized column '{col}' using z-score"
-
-        # Split column by comma
-        match = re.search(r"split column (\w+) by comma", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
-            return df, f"Split column '{col}' by comma into '{col}_1' and '{col}_2'"
-
-        # Remove special characters
-        match = re.search(r"remove special characters from (\w+)", instruction)
-        if match:
-            col = match.group(1)
-            if col not in df.columns:
-                return df, f"Error: Column '{col}' not found"
-            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
-            return df, f"Removed special characters from column '{col}'"
-
-        # If no instruction matched
-        return df, f"Instruction '{instruction}' not recognized"
-
     except Exception as e:
-

-#
-
-
-
-
-    if "file" not in request.files:
-        return jsonify({"error": "No file provided"}), 400
-    if "instruction" not in request.form:
-        return jsonify({"error": "No instruction provided"}), 400
-    if "session_id" not in request.form:
-        return jsonify({"error": "No session_id provided"}), 400

-
-
-

-
-

-
-
-
-
-

         else:
-
-
-
-
-
-

-
-
-
-
-
-
-
-
-

-
-

         return jsonify({
-
-
-
-
-            "original_rows": len(df),
-            "processed_rows": len(df_processed),
-            "columns": list(df_processed.columns),
-            "filename": filename
         })
-
     except Exception as e:
-

-
-
-def download_file(filename):
     try:
-        session_id = request.args.get(

-
-
-

-
-

-
-
-
-

-
-
     except Exception as e:
-

-
-
-
-

-@app.
-def
-

-@app.
-def
-

-
-
-    print("🚀 Starting Data Processing API on port 7860...")
-    print("📊 API Endpoints:")
-    print("   POST /process - Process files")
-    print("   GET  /download/<filename> - Download processed files")
-    print("   GET  /health - Health check")
-    app.run(host="0.0.0.0", port=7860, debug=False)
+import os
+import uuid
+import json
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
 from flask import Flask, request, jsonify, send_file
 from flask_cors import CORS
+from werkzeug.utils import secure_filename
 import threading
 import time
+import logging
+from scipy import stats
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import base64
+from apscheduler.schedulers.background import BackgroundScheduler
+import atexit
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = Flask(__name__)
 CORS(app)

+# Configuration
+UPLOAD_FOLDER = '/tmp/uploads'
+PROCESSED_FOLDER = '/tmp/processed'
+MAX_FILE_SIZE = 512 * 1024 * 1024  # 512MB
+ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls', 'json', 'parquet', 'tsv'}
+FILE_EXPIRY_HOURS = 1

+# Ensure directories exist
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(PROCESSED_FOLDER, exist_ok=True)

+# File storage to track sessions and files
+file_storage = {}

+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

+def get_file_age(filepath):
+    """Get file age in hours"""
+    if os.path.exists(filepath):
+        file_time = os.path.getmtime(filepath)
+        return (time.time() - file_time) / 3600
+    return float('inf')

+def cleanup_old_files():
+    """Remove files older than FILE_EXPIRY_HOURS"""
     try:
+        for folder in [UPLOAD_FOLDER, PROCESSED_FOLDER]:
+            for root, dirs, files in os.walk(folder):
+                for file in files:
+                    filepath = os.path.join(root, file)
+                    if get_file_age(filepath) > FILE_EXPIRY_HOURS:
+                        os.remove(filepath)
+                        logger.info(f"Cleaned up old file: {filepath}")
+
+        # Clean up file_storage entries
+        current_time = datetime.now()
+        sessions_to_remove = []
+        for session_id, files in file_storage.items():
+            files_to_remove = []
+            for file_id, file_info in files.items():
+                file_time = datetime.fromisoformat(file_info['timestamp'])
+                if (current_time - file_time).total_seconds() > FILE_EXPIRY_HOURS * 3600:
+                    files_to_remove.append(file_id)
+
+            for file_id in files_to_remove:
+                del files[file_id]
+
+            if not files:
+                sessions_to_remove.append(session_id)
+
+        for session_id in sessions_to_remove:
+            del file_storage[session_id]
+
     except Exception as e:
+        logger.error(f"Error during cleanup: {str(e)}")

+# Setup scheduler for automatic cleanup
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=cleanup_old_files, trigger="interval", minutes=15)
+scheduler.start()
+atexit.register(lambda: scheduler.shutdown())

+def load_data_file(filepath, filename):
+    """Load data from various file formats"""
+    try:
+        file_ext = filename.rsplit('.', 1)[1].lower()
+
+        if file_ext == 'csv':
+            return pd.read_csv(filepath)
+        elif file_ext in ['xlsx', 'xls']:
+            return pd.read_excel(filepath)
+        elif file_ext == 'json':
+            return pd.read_json(filepath)
+        elif file_ext == 'parquet':
+            return pd.read_parquet(filepath)
+        elif file_ext == 'tsv':
+            return pd.read_csv(filepath, sep='\t')
+        else:
+            raise ValueError(f"Unsupported file format: {file_ext}")
+    except Exception as e:
+        raise Exception(f"Error loading file: {str(e)}")

+def perform_basic_statistics(df, columns=None):
+    """Perform basic statistical analysis"""
+    if columns:
+        df = df[columns]
+
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+
+    result = {
+        'numeric_summary': {},
+        'categorical_summary': {},
+        'general_info': {
+            'total_rows': len(df),
+            'total_columns': len(df.columns),
+            'numeric_columns': len(numeric_cols),
+            'categorical_columns': len(categorical_cols),
+            'missing_values': df.isnull().sum().to_dict()
+        }
+    }
+
+    # Numeric statistics
+    if numeric_cols:
+        numeric_stats = df[numeric_cols].describe()
+        result['numeric_summary'] = numeric_stats.to_dict()
+
+    # Categorical statistics
+    if categorical_cols:
+        for col in categorical_cols:
+            result['categorical_summary'][col] = {
+                'unique_values': df[col].nunique(),
+                'top_values': df[col].value_counts().head(10).to_dict(),
+                'missing_count': df[col].isnull().sum()
+            }
+
+    return result
+
+def perform_groupby_analysis(df, group_column, target_column, operation='mean', filters=None):
+    """Perform group by analysis"""
+    # Apply filters if provided
+    if filters:
+        for f in filters:
+            col, op, val = f['column'], f['operator'], f['value']
+            if op == '>':
+                df = df[df[col] > val]
+            elif op == '<':
+                df = df[df[col] < val]
+            elif op == '==':
+                df = df[df[col] == val]
+            elif op == '!=':
+                df = df[df[col] != val]
+            elif op == '>=':
+                df = df[df[col] >= val]
+            elif op == '<=':
+                df = df[df[col] <= val]
+
+    # Perform groupby operation
+    grouped = df.groupby(group_column)[target_column]
+
+    if operation == 'mean':
+        result = grouped.mean()
+    elif operation == 'sum':
+        result = grouped.sum()
+    elif operation == 'count':
+        result = grouped.count()
+    elif operation == 'max':
+        result = grouped.max()
+    elif operation == 'min':
+        result = grouped.min()
+    elif operation == 'std':
+        result = grouped.std()
+    else:
+        raise ValueError(f"Unsupported operation: {operation}")
+
+    return {
+        'result': result.to_dict(),
+        'operation': operation,
+        'group_column': group_column,
+        'target_column': target_column,
+        'total_groups': len(result)
+    }
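
As a sanity check, a minimal sketch of calling perform_groupby_analysis directly, the same way the /api/analyze route below does; the DataFrame and the 'region'/'sales' column names are invented for illustration:

# Hypothetical usage; assumes perform_groupby_analysis above is in scope.
import pandas as pd

df = pd.DataFrame({'region': ['N', 'S', 'N', 'S'], 'sales': [10, 20, 30, 40]})
out = perform_groupby_analysis(df, 'region', 'sales', operation='sum',
                               filters=[{'column': 'sales', 'operator': '>', 'value': 15}])
print(out['result'])        # {'N': 30, 'S': 60} after the filter drops the 10
print(out['total_groups'])  # 2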
+
+def perform_correlation_analysis(df, columns=None, method='pearson'):
+    """Perform correlation analysis"""
+    if columns:
+        df = df[columns]
+
+    # Only numeric columns
+    numeric_df = df.select_dtypes(include=[np.number])
+
+    if numeric_df.empty:
+        raise ValueError("No numeric columns found for correlation analysis")
+
+    correlation_matrix = numeric_df.corr(method=method)
+
+    return {
+        'correlation_matrix': correlation_matrix.to_dict(),
+        'method': method,
+        'columns': numeric_df.columns.tolist()
+    }
+
+def detect_outliers(df, columns=None, method='iqr'):
+    """Detect outliers in numeric columns"""
+    if columns:
+        df = df[columns]
+
+    numeric_df = df.select_dtypes(include=[np.number])
+    outliers = {}
+
+    for col in numeric_df.columns:
+        if method == 'iqr':
+            Q1 = numeric_df[col].quantile(0.25)
+            Q3 = numeric_df[col].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+
+            outlier_indices = numeric_df[(numeric_df[col] < lower_bound) |
+                                         (numeric_df[col] > upper_bound)].index.tolist()
+
+        elif method == 'zscore':
+            z_scores = np.abs(stats.zscore(numeric_df[col].dropna()))
+            outlier_indices = numeric_df[z_scores > 3].index.tolist()
+
+        outliers[col] = {
+            'count': len(outlier_indices),
+            'indices': outlier_indices[:100],  # Limit to first 100
+            'percentage': (len(outlier_indices) / len(numeric_df)) * 100
+        }
+
+    return outliers
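
A quick check of the IQR rule above on a throwaway frame (the 'value' column is made up for illustration):

# Hypothetical usage; assumes detect_outliers above is in scope.
import pandas as pd

toy = pd.DataFrame({'value': [1, 2, 2, 3, 3, 3, 4, 100]})  # 100 lies far outside the IQR fences
print(detect_outliers(toy))
# {'value': {'count': 1, 'indices': [7], 'percentage': 12.5}}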

+def generate_visualization(df, chart_type, x_column, y_column=None, group_column=None):
+    """Generate visualization and return base64 encoded image"""
+    plt.figure(figsize=(10, 6))
+
+    try:
+        if chart_type == 'histogram':
+            plt.hist(df[x_column], bins=30, alpha=0.7)
+            plt.xlabel(x_column)
+            plt.ylabel('Frequency')
+            plt.title(f'Histogram of {x_column}')
+
+        elif chart_type == 'scatter':
+            if not y_column:
+                raise ValueError("Y column required for scatter plot")
+            plt.scatter(df[x_column], df[y_column], alpha=0.6)
+            plt.xlabel(x_column)
+            plt.ylabel(y_column)
+            plt.title(f'{x_column} vs {y_column}')
+
+        elif chart_type == 'bar':
+            if group_column:
+                grouped = df.groupby(group_column)[x_column].mean() if pd.api.types.is_numeric_dtype(df[x_column]) else df[group_column].value_counts()
             else:
+                grouped = df[x_column].value_counts().head(20)
+
+            grouped.plot(kind='bar')
+            plt.xlabel(group_column or x_column)
+            plt.ylabel('Count' if not pd.api.types.is_numeric_dtype(df[x_column]) else f'Mean {x_column}')
+            plt.title(f'Bar Chart')
+            plt.xticks(rotation=45)
+
+        elif chart_type == 'line':
+            if y_column:
+                plt.plot(df[x_column], df[y_column])
+                plt.xlabel(x_column)
+                plt.ylabel(y_column)
+            else:
+                df[x_column].plot()
+                plt.ylabel(x_column)
+            plt.title('Line Chart')
+
+        elif chart_type == 'box':
+            if group_column:
+                df.boxplot(column=x_column, by=group_column)
+            else:
+                df.boxplot(column=x_column)
+            plt.title('Box Plot')
+
+        plt.tight_layout()
+
+        # Convert plot to base64 string
+        img_buffer = io.BytesIO()
+        plt.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
+        img_buffer.seek(0)
+        img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
+        plt.close()
+
+        return img_base64
+
+    except Exception as e:
+        plt.close()
+        raise Exception(f"Error generating visualization: {str(e)}")

+def parse_natural_language_query(query, df_columns):
+    """Simple natural language query parser"""
+    query_lower = query.lower()
+
+    # Define operation keywords
+    operations = {
+        'average': 'mean', 'mean': 'mean', 'avg': 'mean',
+        'sum': 'sum', 'total': 'sum',
+        'count': 'count', 'number': 'count',
+        'max': 'max', 'maximum': 'max', 'highest': 'max',
+        'min': 'min', 'minimum': 'min', 'lowest': 'min'
+    }
+
+    # Find operation
+    operation = 'mean'  # default
+    for keyword, op in operations.items():
+        if keyword in query_lower:
+            operation = op
+            break
+
+    # Find columns mentioned in query
+    mentioned_columns = [col for col in df_columns if col.lower() in query_lower]
+
+    # Simple parsing patterns
+    if 'by' in query_lower and len(mentioned_columns) >= 2:
+        # Group by analysis
+        target_col = mentioned_columns[0]
+        group_col = mentioned_columns[-1]
+
+        return {
+            'analysisType': 'groupby',
+            'parameters': {
+                'groupByColumn': group_col,
+                'targetColumn': target_col,
+                'operation': operation
+            }
+        }
+    elif 'correlation' in query_lower:
+        return {
+            'analysisType': 'correlation',
+            'parameters': {
+                'columns': mentioned_columns if mentioned_columns else None
+            }
+        }
+    elif any(word in query_lower for word in ['chart', 'plot', 'graph', 'visualize']):
+        chart_type = 'bar'  # default
+        if 'scatter' in query_lower:
+            chart_type = 'scatter'
+        elif 'line' in query_lower:
+            chart_type = 'line'
+        elif 'histogram' in query_lower:
+            chart_type = 'histogram'
+
+        return {
+            'analysisType': 'visualization',
+            'parameters': {
+                'chartType': chart_type,
+                'xColumn': mentioned_columns[0] if mentioned_columns else None,
+                'yColumn': mentioned_columns[1] if len(mentioned_columns) > 1 else None
+            }
+        }
+    else:
+        # Default to basic statistics
+        return {
+            'analysisType': 'statistics',
+            'parameters': {
+                'columns': mentioned_columns if mentioned_columns else None
+            }
+        }
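
To illustrate how the parser resolves a query, assuming a frame whose columns include 'sales' and 'region' (hypothetical names):

# Hypothetical usage; assumes parse_natural_language_query above is in scope.
print(parse_natural_language_query('average sales by region', ['sales', 'region']))
# {'analysisType': 'groupby',
#  'parameters': {'groupByColumn': 'region', 'targetColumn': 'sales', 'operation': 'mean'}}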
+
+@app.route('/api/health', methods=['GET'])
+def health_check():
+    return jsonify({'status': 'healthy', 'timestamp': datetime.now().isoformat()})

+@app.route('/api/upload', methods=['POST'])
+def upload_file():
+    try:
+        if 'file' not in request.files:
+            return jsonify({'error': 'No file provided'}), 400
+
+        file = request.files['file']
+        session_id = request.form.get('sessionId')
+
+        if not session_id:
+            return jsonify({'error': 'Session ID required'}), 400
+
+        if file.filename == '':
+            return jsonify({'error': 'No file selected'}), 400
+
+        if not allowed_file(file.filename):
+            return jsonify({'error': 'File type not supported'}), 400
+
+        # Check file size
+        file.seek(0, 2)  # Seek to end
+        file_size = file.tell()
+        file.seek(0)  # Reset to beginning
+
+        if file_size > MAX_FILE_SIZE:
+            return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400
+
+        # Generate unique file ID and secure filename
+        file_id = str(uuid.uuid4())
+        filename = secure_filename(file.filename)
+
+        # Create session directory
+        session_dir = os.path.join(UPLOAD_FOLDER, session_id)
+        os.makedirs(session_dir, exist_ok=True)
+
+        # Save file
+        filepath = os.path.join(session_dir, f"{file_id}_{filename}")
+        file.save(filepath)
+
+        # Store file info
+        if session_id not in file_storage:
+            file_storage[session_id] = {}
+
+        file_storage[session_id][file_id] = {
+            'filename': filename,
+            'filepath': filepath,
+            'size': file_size,
+            'timestamp': datetime.now().isoformat()
+        }

         return jsonify({
+            'fileId': file_id,
+            'filename': filename,
+            'size': file_size,
+            'message': 'File uploaded successfully'
         })
+
     except Exception as e:
+        logger.error(f"Upload error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
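
A client-side sketch of the upload flow, assuming the requests package; the base URL, file name, and session id are placeholders:

# Hypothetical client; 'data.csv' and the base URL are stand-ins.
import uuid
import requests

session = str(uuid.uuid4())
with open('data.csv', 'rb') as fh:
    r = requests.post('http://localhost:7860/api/upload',
                      files={'file': fh}, data={'sessionId': session})
file_id = r.json()['fileId']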

+@app.route('/api/preview/<file_id>', methods=['GET'])
+def preview_file(file_id):
     try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400

+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+
+        # Load data and get preview
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        preview_data = {
+            'columns': df.columns.tolist(),
+            'dtypes': df.dtypes.astype(str).to_dict(),
+            'shape': df.shape,
+            'head': df.head(5).to_dict('records'),
+            'missing_values': df.isnull().sum().to_dict()
+        }

+        return jsonify(preview_data)
+
+    except Exception as e:
+        logger.error(f"Preview error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/analyze', methods=['POST'])
+def analyze_data():
+    try:
+        data = request.get_json()
+        session_id = data.get('sessionId')
+        file_id = data.get('fileId')
+        analysis_type = data.get('analysisType')
+        parameters = data.get('parameters', {})
+        natural_query = data.get('naturalQuery')
+
+        if not all([session_id, file_id]):
+            return jsonify({'error': 'Session ID and File ID required'}), 400
+
+        if session_id not in file_storage or file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+        df = load_data_file(file_info['filepath'], file_info['filename'])
+
+        # Handle natural language query
+        if natural_query and not analysis_type:
+            parsed_query = parse_natural_language_query(natural_query, df.columns.tolist())
+            analysis_type = parsed_query['analysisType']
+            parameters = parsed_query['parameters']
+
+        result = {}
+
+        if analysis_type == 'statistics':
+            result = perform_basic_statistics(df, parameters.get('columns'))

+        elif analysis_type == 'groupby':
+            result = perform_groupby_analysis(
+                df,
+                parameters.get('groupByColumn'),
+                parameters.get('targetColumn'),
+                parameters.get('operation', 'mean'),
+                parameters.get('filters')
+            )
+
+        elif analysis_type == 'correlation':
+            result = perform_correlation_analysis(
+                df,
+                parameters.get('columns'),
+                parameters.get('method', 'pearson')
+            )
+
+        elif analysis_type == 'outliers':
+            result = detect_outliers(
+                df,
+                parameters.get('columns'),
+                parameters.get('method', 'iqr')
+            )
+
+        elif analysis_type == 'visualization':
+            chart_base64 = generate_visualization(
+                df,
+                parameters.get('chartType', 'bar'),
+                parameters.get('xColumn'),
+                parameters.get('yColumn'),
+                parameters.get('groupColumn')
+            )
+            result = {
+                'chart': chart_base64,
+                'chartType': parameters.get('chartType', 'bar')
+            }
+
+        else:
+            return jsonify({'error': 'Invalid analysis type'}), 400
+
+        # Save result to processed folder
+        result_id = str(uuid.uuid4())
+        result_dir = os.path.join(PROCESSED_FOLDER, session_id)
+        os.makedirs(result_dir, exist_ok=True)
+
+        result_filepath = os.path.join(result_dir, f"{result_id}_result.json")
+        with open(result_filepath, 'w') as f:
+            json.dump(result, f, indent=2, default=str)
+
+        return jsonify({
+            'resultId': result_id,
+            'result': result,
+            'analysisType': analysis_type,
+            'timestamp': datetime.now().isoformat()
+        })
+
     except Exception as e:
+        logger.error(f"Analysis error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
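
Continuing the client sketch above, an explicit groupby request against /api/analyze (column names again hypothetical):

# Hypothetical client request reusing session and file_id from the upload sketch.
payload = {
    'sessionId': session,
    'fileId': file_id,
    'analysisType': 'groupby',
    'parameters': {'groupByColumn': 'region', 'targetColumn': 'sales',
                   'operation': 'sum'}
}
r = requests.post('http://localhost:7860/api/analyze', json=payload)
print(r.json()['result'])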

+@app.route('/api/files/<session_id>', methods=['GET'])
+def list_files(session_id):
+    try:
+        if session_id not in file_storage:
+            return jsonify({'files': []})
+
+        files = []
+        for file_id, file_info in file_storage[session_id].items():
+            # Check if file still exists
+            if os.path.exists(file_info['filepath']):
+                files.append({
+                    'fileId': file_id,
+                    'filename': file_info['filename'],
+                    'size': file_info['size'],
+                    'timestamp': file_info['timestamp']
+                })
+
+        return jsonify({'files': files})
+
+    except Exception as e:
+        logger.error(f"List files error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/file/<file_id>', methods=['DELETE'])
+def delete_file(file_id):
+    try:
+        session_id = request.args.get('sessionId')
+        if not session_id or session_id not in file_storage:
+            return jsonify({'error': 'Invalid session'}), 400
+
+        if file_id not in file_storage[session_id]:
+            return jsonify({'error': 'File not found'}), 404
+
+        file_info = file_storage[session_id][file_id]
+
+        # Remove file from filesystem
+        if os.path.exists(file_info['filepath']):
+            os.remove(file_info['filepath'])
+
+        # Remove from storage
+        del file_storage[session_id][file_id]
+
+        return jsonify({'message': 'File deleted successfully'})
+
+    except Exception as e:
+        logger.error(f"Delete error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+@app.route('/api/download/<result_id>', methods=['GET'])
+def download_result(result_id):
+    try:
+        session_id = request.args.get('sessionId')
+        format_type = request.args.get('format', 'json')
+
+        if not session_id:
+            return jsonify({'error': 'Session ID required'}), 400
+
+        result_filepath = os.path.join(PROCESSED_FOLDER, session_id, f"{result_id}_result.json")
+
+        if not os.path.exists(result_filepath):
+            return jsonify({'error': 'Result not found'}), 404
+
+        if format_type == 'json':
+            return send_file(result_filepath, as_attachment=True,
+                             download_name=f"analysis_result_{result_id}.json")
+        else:
+            return jsonify({'error': 'Format not supported'}), 400
+
+    except Exception as e:
+        logger.error(f"Download error: {str(e)}")
+        return jsonify({'error': str(e)}), 500

+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)
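
The same endpoint can also be driven through the natural-language path instead of explicit parameters; a sketch reusing the client state from the upload example:

# Hypothetical client request; the query text is free-form user input.
r = requests.post('http://localhost:7860/api/analyze',
                  json={'sessionId': session, 'fileId': file_id,
                        'naturalQuery': 'average sales by region'})
print(r.json()['analysisType'])  # 'groupby', per the parser above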