Update app.py
app.py
CHANGED
@@ -82,68 +82,310 @@ def apply_instruction(df, instruction):
Before (old lines 82-149):

        match = re.search(r"normalize column (\w+)", instruction)
        if match:
            col = match.group(1)
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

        match = re.search(r"standardize column (\w+)", instruction)
        if match:
            col = match.group(1)
            df[col] = (df[col] - df[col].mean()) / df[col].std()

        match = re.search(r"split column (\w+) by comma", instruction)
        if match:
            col = match.group(1)
            df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)

        match = re.search(r"remove special characters from (\w+)", instruction)
        if match:
            col = match.group(1)
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

-
-        return df, f"

-

# === File Processor Endpoint ===
@app.route("/process", methods=["POST"])
def process_file():
-
-

-
-
-

-
-
-    except Exception as e:
-        return jsonify({"error": f"File read error: {str(e)}"}), 400

-

-
-
-    df.to_csv(filepath, index=False)

-
-
-    "
-
-
-

# === File Download with Session ID Verification ===
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
-
-
-

-
-
-
-        return jsonify({"error": "File not found"}), 404

# === Run on Port 7860 for Hugging Face ===
if __name__ == "__main__":
-

After (new lines 82-391):

        match = re.search(r"normalize column (\w+)", instruction)
        if match:
            col = match.group(1)
+from flask import Flask, request, jsonify, send_file
+from flask_cors import CORS
+import pandas as pd
+import os
+import threading
+import time
+import re
+
+app = Flask(__name__)
+CORS(app)
+
+UPLOAD_FOLDER = "/tmp"
+SESSION_KEY_PREFIX = "data_tool_session_id"
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024  # 512 MB
+
+# === Root Route (Required for Hugging Face) ===
+@app.route("/", methods=["GET"])
+def root():
+    return jsonify({
+        "message": "Data Processing API is running",
+        "status": "healthy",
+        "endpoints": {
+            "POST /process": "Upload and process CSV/Excel files",
+            "GET /download/<filename>": "Download processed file with session_id parameter",
+            "GET /health": "Health check"
+        },
+        "version": "1.0"
+    })
+
+# === Health Check Route ===
+@app.route("/health", methods=["GET"])
+def health_check():
+    return jsonify({"status": "healthy", "timestamp": time.time()})
+
+# === Cleanup Thread: delete files older than 60 minutes ===
+def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
+    def cleanup_loop():
+        while True:
+            now = time.time()
+            try:
+                if os.path.exists(folder):
+                    for f in os.listdir(folder):
+                        path = os.path.join(folder, f)
+                        if os.path.isfile(path):
+                            if now - os.path.getmtime(path) > max_age * 60:
+                                try:
+                                    os.remove(path)
+                                    print(f"[Cleanup] Deleted: {path}")
+                                except Exception as e:
+                                    print(f"[Cleanup Error] {e}")
+            except Exception as e:
+                print(f"[Cleanup Error] {e}")
+            time.sleep(600)  # Every 10 minutes
+
+    threading.Thread(target=cleanup_loop, daemon=True).start()
+
+# Start cleanup thread
+clean_old_files()
+
+# === Instruction Parser ===
+def apply_instruction(df, instruction):
+    instruction = instruction.lower().strip()
+
+    if not instruction:
+        return df, "No instruction provided"
+
+    try:
+        # Drop column
+        match = re.search(r"drop column (\w+)", instruction)
+        if match:
+            col_name = match.group(1)
+            if col_name in df.columns:
+                df = df.drop(columns=[col_name])
+                return df, f"Dropped column '{col_name}'"
+            else:
+                return df, f"Error: Column '{col_name}' not found"
+
+        # Remove duplicates
+        if "remove duplicates" in instruction:
+            original_count = len(df)
+            df = df.drop_duplicates()
+            removed_count = original_count - len(df)
+            return df, f"Removed {removed_count} duplicate rows"
+
+        # Drop missing values
+        if "drop missing" in instruction or "remove null" in instruction:
+            original_count = len(df)
+            df = df.dropna()
+            removed_count = original_count - len(df)
+            return df, f"Removed {removed_count} rows with missing values"
+
+        # Fill missing values
+        match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
+        if match:
+            val = match.group(1)
+            try:
+                val = float(val)
+            except:
+                pass
+            missing_count = df.isnull().sum().sum()
+            df = df.fillna(val)
+            return df, f"Filled {missing_count} missing values with '{val}'"
+
+        # Sort by column
+        match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
+        if match:
+            col = match.group(1)
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
+            ascending = not bool(match.group(2))
+            df = df.sort_values(by=col, ascending=ascending)
+            order = "descending" if not ascending else "ascending"
+            return df, f"Sorted by '{col}' in {order} order"
+
+        # Rename column
+        match = re.search(r"rename column (\w+) to (\w+)", instruction)
+        if match:
+            old_name, new_name = match.group(1), match.group(2)
+            if old_name not in df.columns:
+                return df, f"Error: Column '{old_name}' not found"
+            df = df.rename(columns={old_name: new_name})
+            return df, f"Renamed column '{old_name}' to '{new_name}'"
+
+        # Filter rows
+        match = re.search(r"filter where (\w+) > (\d+)", instruction)
+        if match:
+            col, val = match.group(1), float(match.group(2))
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
+            original_count = len(df)
+            df = df[df[col] > val]
+            kept_count = len(df)
+            return df, f"Filtered data: kept {kept_count} rows where {col} > {val}"
+
+        # Group by and sum
+        match = re.search(r"group by (\w+) and sum (\w+)", instruction)
+        if match:
+            group_col, sum_col = match.group(1), match.group(2)
+            if group_col not in df.columns:
+                return df, f"Error: Column '{group_col}' not found"
+            if sum_col not in df.columns:
+                return df, f"Error: Column '{sum_col}' not found"
+            df = df.groupby(group_col)[sum_col].sum().reset_index()
+            return df, f"Grouped by '{group_col}' and summed '{sum_col}'"
+
+        # Add column (sum of two columns)
+        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
+        if match:
+            new_col, col1, col2 = match.group(1), match.group(2), match.group(3)
+            if col1 not in df.columns:
+                return df, f"Error: Column '{col1}' not found"
+            if col2 not in df.columns:
+                return df, f"Error: Column '{col2}' not found"
+            df[new_col] = df[col1] + df[col2]
+            return df, f"Added column '{new_col}' as sum of '{col1}' and '{col2}'"
+
+        # Normalize column
+        match = re.search(r"normalize column (\w+)", instruction)
+        if match:
+            col = match.group(1)
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
+            if not pd.api.types.is_numeric_dtype(df[col]):
+                return df, f"Error: Column '{col}' is not numeric"
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
+            return df, f"Normalized column '{col}' using min-max scaling"

+        # Standardize column
        match = re.search(r"standardize column (\w+)", instruction)
        if match:
            col = match.group(1)
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
+            if not pd.api.types.is_numeric_dtype(df[col]):
+                return df, f"Error: Column '{col}' is not numeric"
            df[col] = (df[col] - df[col].mean()) / df[col].std()
+            return df, f"Standardized column '{col}' using z-score"

+        # Split column by comma
        match = re.search(r"split column (\w+) by comma", instruction)
        if match:
            col = match.group(1)
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
            df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
+            return df, f"Split column '{col}' by comma into '{col}_1' and '{col}_2'"

+        # Remove special characters
        match = re.search(r"remove special characters from (\w+)", instruction)
        if match:
            col = match.group(1)
+            if col not in df.columns:
+                return df, f"Error: Column '{col}' not found"
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
+            return df, f"Removed special characters from column '{col}'"

+        # If no instruction matched
+        return df, f"Instruction '{instruction}' not recognized"

+    except Exception as e:
+        return df, f"Error: {str(e)}"

# === File Processor Endpoint ===
@app.route("/process", methods=["POST"])
def process_file():
+    try:
+        # Validate request
+        if "file" not in request.files:
+            return jsonify({"error": "No file provided"}), 400
+        if "instruction" not in request.form:
+            return jsonify({"error": "No instruction provided"}), 400
+        if "session_id" not in request.form:
+            return jsonify({"error": "No session_id provided"}), 400

+        file = request.files["file"]
+        instruction = request.form["instruction"]
+        session_id = request.form["session_id"]

+        if file.filename == '':
+            return jsonify({"error": "No file selected"}), 400

+        # Read file
+        try:
+            if file.filename.lower().endswith('.csv'):
+                df = pd.read_csv(file)
+            elif file.filename.lower().endswith(('.xlsx', '.xls')):
+                df = pd.read_excel(file)
+            else:
+                return jsonify({"error": "Unsupported file format. Use CSV or Excel files."}), 400
+        except Exception as e:
+            return jsonify({"error": f"File reading error: {str(e)}"}), 400

+        # Apply instruction
+        df_processed, status = apply_instruction(df, instruction)

+        # Save processed file
+        original_name = file.filename.rsplit('.', 1)[0]  # Remove extension
+        filename = f"processed_{session_id}_{original_name}.csv"
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+
+        try:
+            df_processed.to_csv(filepath, index=False)
+        except Exception as e:
+            return jsonify({"error": f"File saving error: {str(e)}"}), 500
+
+        # Generate preview (first 5 rows)
+        preview = df_processed.head(5).to_dict(orient="records")
+
+        return jsonify({
+            "success": True,
+            "message": status,
+            "preview": preview,
+            "download_url": f"/download/{filename}",
+            "original_rows": len(df),
+            "processed_rows": len(df_processed),
+            "columns": list(df_processed.columns),
+            "filename": filename
+        })
+
+    except Exception as e:
+        return jsonify({"error": f"Processing error: {str(e)}"}), 500

# === File Download with Session ID Verification ===
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
+    try:
+        session_id = request.args.get("session_id")
+
+        # Validate session
+        if not session_id:
+            return jsonify({"error": "session_id parameter required"}), 400
+
+        if f"_{session_id}_" not in filename:
+            return jsonify({"error": "Invalid session or unauthorized access"}), 403
+
+        # Check file exists
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        if not os.path.exists(filepath):
+            return jsonify({"error": "File not found or expired"}), 404
+
+        return send_file(filepath, as_attachment=True, download_name=filename)
+
+    except Exception as e:
+        return jsonify({"error": f"Download error: {str(e)}"}), 500
+
+# === Error Handlers ===
+@app.errorhandler(404)
+def not_found(error):
+    return jsonify({"error": "Endpoint not found"}), 404
+
+@app.errorhandler(413)
+def too_large(error):
+    return jsonify({"error": "File too large (max 512MB)"}), 413

+@app.errorhandler(500)
+def internal_error(error):
+    return jsonify({"error": "Internal server error"}), 500

# === Run on Port 7860 for Hugging Face ===
if __name__ == "__main__":
+    print("Starting Data Processing API on port 7860...")
+    print("API Endpoints:")
+    print("    POST /process - Process files")
+    print("    GET /download/<filename> - Download processed files")
+    print("    GET /health - Health check")
+    app.run(host="0.0.0.0", port=7860, debug=False)
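
As a rough usage sketch of the API this commit adds: the snippet below posts a CSV to /process with one of the phrases apply_instruction recognizes, then fetches the result from /download. The base URL, the data.csv file name, the demo123 session_id, the example column names, and the use of the requests library are illustrative assumptions, not part of the commit.

# Hypothetical client sketch; adjust BASE_URL to the deployed Space.
import requests

BASE_URL = "http://localhost:7860"   # assumption: app running locally on port 7860
SESSION_ID = "demo123"               # caller-chosen token; embedded in the saved filename

# Upload a CSV and apply a recognized instruction, e.g. "remove duplicates",
# "drop column age", or "sort by price descending" (column names are examples).
with open("data.csv", "rb") as fh:
    resp = requests.post(
        f"{BASE_URL}/process",
        files={"file": ("data.csv", fh)},
        data={"instruction": "remove duplicates", "session_id": SESSION_ID},
    )
result = resp.json()
print(result.get("message"), result.get("processed_rows"))

# /download serves the file only if the session_id appears in the stored filename.
if result.get("success"):
    dl = requests.get(
        f"{BASE_URL}{result['download_url']}",
        params={"session_id": SESSION_ID},
    )
    with open("processed.csv", "wb") as out:
        out.write(dl.content)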