mike23415 committed on
Commit
682de52
·
verified ·
1 Parent(s): 2b74b13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -30
app.py CHANGED
@@ -82,68 +82,310 @@ def apply_instruction(df, instruction):
82
  match = re.search(r"normalize column (\w+)", instruction)
83
  if match:
84
  col = match.group(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
 
86
 
 
87
  match = re.search(r"standardize column (\w+)", instruction)
88
  if match:
89
  col = match.group(1)
 
 
 
 
90
  df[col] = (df[col] - df[col].mean()) / df[col].std()
 
91
 
 
92
  match = re.search(r"split column (\w+) by comma", instruction)
93
  if match:
94
  col = match.group(1)
 
 
95
  df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
 
96
 
 
97
  match = re.search(r"remove special characters from (\w+)", instruction)
98
  if match:
99
  col = match.group(1)
 
 
100
  df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
 
101
 
102
- except Exception as e:
103
- return df, f"Error: {e}"
104
 
105
- return df, "success"
 
106
 
107
  # === File Processor Endpoint ===
108
@app.route("/process", methods=["POST"])
def process_file():
    """Apply one instruction to an uploaded CSV/Excel file and save the result.

    Expects a multipart form carrying ``file``, ``instruction`` and
    ``session_id``. The processed frame is written as CSV into
    ``app.config['UPLOAD_FOLDER']`` and a JSON body with a preview, the
    download URL and the status returned by ``apply_instruction`` is sent back.
    """
    has_all_fields = (
        "file" in request.files
        and "instruction" in request.form
        and "session_id" in request.form
    )
    if not has_all_fields:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id ends up inside a file name on disk and inside a URL, so only
    # a conservative character set is accepted.
    if not re.fullmatch(r"[A-Za-z0-9_-]+", session_id):
        return jsonify({"error": "Invalid session_id"}), 400

    # Keep only the last path component of the client-supplied name so the
    # output can never be written outside the upload folder.
    upload_name = os.path.basename((file.filename or "").replace("\\", "/"))

    try:
        # Extension check is case-insensitive so "DATA.CSV" is read as CSV.
        if upload_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"File read error: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # The result is always CSV, so the stored name always ends in ".csv"
    # regardless of the extension of the upload.
    stem = re.sub(r"[^\w.-]", "_", upload_name.rsplit(".", 1)[0]) or "file"
    filename = f"cleaned_{session_id}_{stem}.csv"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(filepath, index=False)

    # NaN is not valid JSON; hand missing cells to the client as null.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # === File Download with Session ID Verification ===
136
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file to the session that produced it.

    The ``session_id`` query parameter must match the session embedded at the
    start of *filename* (``cleaned_<session_id>_...``).
    """
    session_id = request.args.get("session_id")

    # Anchor the check at the start of the name: a substring test would accept
    # any session_id that happens to appear between two underscores anywhere
    # in the file name.
    authorized = bool(session_id) and filename.startswith(f"cleaned_{session_id}_")
    # Reject anything that is not a bare file name.
    if not authorized or os.path.basename(filename) != filename:
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile (not exists) so a directory of the same name is never served.
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
146
 
147
  # === Run on Port 7860 for Hugging Face ===
148
if __name__ == "__main__":
    # Script entry point: serve on all interfaces, port 7860.
    bind_host, bind_port = "0.0.0.0", 7860
    app.run(host=bind_host, port=bind_port)
 
 
 
 
 
 
82
  match = re.search(r"normalize column (\w+)", instruction)
83
  if match:
84
  col = match.group(1)
85
+ from flask import Flask, request, jsonify, send_file
86
+ from flask_cors import CORS
87
+ import pandas as pd
88
+ import os
89
+ import threading
90
+ import time
91
+ import re
92
+
93
# Module-level settings.
UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"

# Application object; CORS is applied to the whole app.
app = Flask(__name__)
CORS(app)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)
100
+
101
+ # === Root Route (Required for Hugging Face) ===
102
@app.route("/", methods=["GET"])
def root():
    """Return a JSON description of the service and its endpoints."""
    endpoints = {
        "POST /process": "Upload and process CSV/Excel files",
        "GET /download/<filename>": "Download processed file with session_id parameter",
        "GET /health": "Health check",
    }
    payload = {
        "message": "Data Processing API is running",
        "status": "healthy",
        "endpoints": endpoints,
        "version": "1.0",
    }
    return jsonify(payload)
114
+
115
+ # === Health Check Route ===
116
@app.route("/health", methods=["GET"])
def health_check():
    """Return a JSON health report stamped with the current ``time.time()``."""
    report = {"status": "healthy", "timestamp": time.time()}
    return jsonify(report)
119
+
120
+ # === Cleanup Thread: delete files older than 60 minutes ===
121
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600, prefix=None):
    """Start a daemon thread that periodically deletes stale files in *folder*.

    Args:
        folder: Directory to sweep.
        max_age: Age threshold in minutes; a file is removed once its
            modification time is older than this.
        interval: Seconds to sleep between sweeps.
        prefix: When given, only files whose name starts with this string are
            considered. With the default ``None`` every regular file in
            *folder* is eligible.

    Returns:
        The started ``threading.Thread``.
    """
    # NOTE(review): with prefix=None and a shared folder, files that were not
    # written by this application are deleted too -- confirm that is intended.
    max_age_seconds = max_age * 60

    def sweep_once():
        """Remove every eligible file; one bad entry never stops the sweep."""
        now = time.time()
        try:
            names = os.listdir(folder)
        except FileNotFoundError:
            return  # nothing to clean while the folder does not exist
        except OSError as e:
            print(f"[Cleanup Error] {e}")
            return

        for name in names:
            if prefix is not None and not name.startswith(prefix):
                continue
            path = os.path.join(folder, name)
            try:
                if not os.path.isfile(path):
                    continue
                if now - os.path.getmtime(path) > max_age_seconds:
                    os.remove(path)
                    print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                # The file may have vanished between listing and inspection;
                # report it and carry on with the remaining entries.
                print(f"[Cleanup Error] {e}")

    def cleanup_loop():
        while True:
            try:
                sweep_once()
            except Exception as e:
                # Last line of defence so the background thread stays alive.
                print(f"[Cleanup Error] {e}")
            time.sleep(interval)

    worker = threading.Thread(target=cleanup_loop, daemon=True)
    worker.start()
    return worker


# Start cleanup thread
clean_old_files()
144
+
145
+ # === Instruction Parser ===
146
_COLUMN_NOT_FOUND = "Error: Column '{}' not found"


def _resolve_column(df, name):
    """Return the label in ``df.columns`` that *name* refers to, or ``None``.

    An exact match wins; otherwise a case-insensitive match is accepted when
    exactly one column matches.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    hits = [label for label in df.columns if str(label).lower() == wanted]
    return hits[0] if len(hits) == 1 else None


def apply_instruction(df, instruction):
    """Apply a single plain-English instruction to *df*.

    Keywords are matched case-insensitively, while column names, new column
    names and fill values keep the case the user typed. Only the first
    recognised operation is applied. The frame passed in is never modified.

    Args:
        df: The ``pandas.DataFrame`` to transform.
        instruction: Text such as ``"drop column age"`` or
            ``"filter where price > 9.5"``.

    Returns:
        A ``(frame, message)`` tuple. On success ``frame`` is the transformed
        data; when the instruction is empty, unrecognised or fails, ``frame``
        is *df* unchanged and ``message`` explains why (failures start with
        ``"Error:"``).
    """
    text = (instruction or "").strip()
    lowered = text.lower()

    if not text:
        return df, "No instruction provided"

    def find(pattern):
        # Match on the original text so captured names keep their case.
        return re.search(pattern, text, flags=re.IGNORECASE)

    try:
        # Drop column
        match = find(r"drop column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            return df.drop(columns=[col]), f"Dropped column '{col}'"

        # Remove duplicates
        if "remove duplicates" in lowered:
            deduped = df.drop_duplicates()
            return deduped, f"Removed {len(df) - len(deduped)} duplicate rows"

        # Drop missing values
        if "drop missing" in lowered or "remove null" in lowered:
            complete = df.dropna()
            return complete, f"Removed {len(df) - len(complete)} rows with missing values"

        # Fill missing values
        match = find(r"fill missing.*with ([\w\.]+)")
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                pass  # not a number: fill with the text exactly as typed
            missing_count = df.isnull().sum().sum()
            return df.fillna(val), f"Filled {missing_count} missing values with '{val}'"

        # Sort by column
        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            ascending = match.group(2) is None
            order = "ascending" if ascending else "descending"
            return (
                df.sort_values(by=col, ascending=ascending),
                f"Sorted by '{col}' in {order} order",
            )

        # Rename column
        match = find(r"rename column (\w+) to (\w+)")
        if match:
            old_name = _resolve_column(df, match.group(1))
            new_name = match.group(2)
            if old_name is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            return (
                df.rename(columns={old_name: new_name}),
                f"Renamed column '{old_name}' to '{new_name}'",
            )

        # Filter rows; the threshold may be negative and/or fractional
        match = find(r"filter where (\w+)\s*>\s*(-?\d+(?:\.\d+)?)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            val = float(match.group(2))
            kept = df[df[col] > val]
            return kept, f"Filtered data: kept {len(kept)} rows where {col} > {val}"

        # Group by and sum
        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            group_col = _resolve_column(df, match.group(1))
            if group_col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            if sum_col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(2))
            grouped = df.groupby(group_col)[sum_col].sum().reset_index()
            return grouped, f"Grouped by '{group_col}' and summed '{sum_col}'"

        # Add column (sum of two columns)
        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            if col1 is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(2))
            col2 = _resolve_column(df, match.group(3))
            if col2 is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(3))
            result = df.copy()
            result[new_col] = df[col1] + df[col2]
            return result, f"Added column '{new_col}' as sum of '{col1}' and '{col2}'"

        # Normalize column
        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            if not pd.api.types.is_numeric_dtype(df[col]):
                return df, f"Error: Column '{col}' is not numeric"
            low = df[col].min()
            span = df[col].max() - low
            # A constant (or all-missing) column has no range; dividing by it
            # would turn every value into NaN.
            if pd.isna(span) or span == 0:
                return df, f"Error: Column '{col}' has no value range to normalize"
            result = df.copy()
            result[col] = (df[col] - low) / span
            return result, f"Normalized column '{col}' using min-max scaling"

        # Standardize column
        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            if not pd.api.types.is_numeric_dtype(df[col]):
                return df, f"Error: Column '{col}' is not numeric"
            spread = df[col].std()
            # std() is NaN for fewer than two values and 0 for a constant
            # column; neither can be divided by.
            if pd.isna(spread) or spread == 0:
                return df, f"Error: Column '{col}' has zero variance and cannot be standardized"
            result = df.copy()
            result[col] = (df[col] - df[col].mean()) / spread
            return result, f"Standardized column '{col}' using z-score"

        # Split column by comma
        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            # Split on the first comma only and force exactly two output
            # columns, so rows with zero or several commas do not break the
            # assignment below.
            parts = df[col].str.split(",", n=1, expand=True).reindex(columns=[0, 1])
            result = df.copy()
            result[f"{col}_1"] = parts[0]
            result[f"{col}_2"] = parts[1]
            return result, f"Split column '{col}' by comma into '{col}_1' and '{col}_2'"

        # Remove special characters
        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, _COLUMN_NOT_FOUND.format(match.group(1))
            source = df[col]
            cleaned = source.astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
            result = df.copy()
            # Keep missing cells missing instead of the text astype(str)
            # produces for them.
            result[col] = cleaned.where(source.notna())
            return result, f"Removed special characters from column '{col}'"

        # If no instruction matched
        return df, f"Instruction '{lowered}' not recognized"

    except Exception as e:
        # Boundary for the request handler: report the failure as a status
        # message instead of raising.
        return df, f"Error: {str(e)}"
287
 
288
  # === File Processor Endpoint ===
289
@app.route("/process", methods=["POST"])
def process_file():
    """Apply one instruction to an uploaded CSV/Excel file and save the result.

    Expects a multipart form carrying ``file``, ``instruction`` and
    ``session_id``. The processed frame is written as CSV into
    ``app.config['UPLOAD_FOLDER']`` and a JSON summary is returned: status
    message, preview rows, download URL, row counts, column names and the
    stored file name. Validation problems answer 400; unexpected failures
    answer 500.
    """
    try:
        # Validate request
        if "file" not in request.files:
            return jsonify({"error": "No file provided"}), 400
        if "instruction" not in request.form:
            return jsonify({"error": "No instruction provided"}), 400
        if "session_id" not in request.form:
            return jsonify({"error": "No session_id provided"}), 400

        file = request.files["file"]
        instruction = request.form["instruction"]
        session_id = request.form["session_id"]

        # Covers both an empty and an absent file name.
        if not file.filename:
            return jsonify({"error": "No file selected"}), 400

        # session_id ends up inside a file name on disk and inside a URL, so
        # only a conservative character set is accepted.
        if not re.fullmatch(r"[A-Za-z0-9_-]+", session_id):
            return jsonify({"error": "Invalid session_id"}), 400

        # Keep only the last path component of the client-supplied name so
        # the output can never be written outside the upload folder.
        upload_name = os.path.basename(file.filename.replace("\\", "/"))
        lowered_name = upload_name.lower()

        # Read file
        try:
            if lowered_name.endswith('.csv'):
                df = pd.read_csv(file)
            elif lowered_name.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file)
            else:
                return jsonify({"error": "Unsupported file format. Use CSV or Excel files."}), 400
        except Exception as e:
            return jsonify({"error": f"File reading error: {str(e)}"}), 400

        # Apply instruction
        df_processed, status = apply_instruction(df, instruction)

        # Save processed file
        stem = upload_name.rsplit('.', 1)[0]  # Remove extension
        stem = re.sub(r"[^\w.-]", "_", stem) or "file"
        filename = f"processed_{session_id}_{stem}.csv"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)

        try:
            df_processed.to_csv(filepath, index=False)
        except Exception as e:
            return jsonify({"error": f"File saving error: {str(e)}"}), 500

        # Generate preview (first 5 rows). NaN is not valid JSON, so missing
        # cells are sent as null.
        head = df_processed.head(5)
        preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")

        return jsonify({
            # apply_instruction reports failures as messages starting with
            # "Error"; do not advertise those as successes.
            "success": not status.startswith("Error"),
            "message": status,
            "preview": preview,
            "download_url": f"/download/{filename}",
            "original_rows": len(df),
            "processed_rows": len(df_processed),
            "columns": list(df_processed.columns),
            "filename": filename
        })

    except Exception as e:
        return jsonify({"error": f"Processing error: {str(e)}"}), 500
347
 
348
  # === File Download with Session ID Verification ===
349
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a processed file to the session that produced it.

    The ``session_id`` query parameter must match the session embedded at the
    start of *filename* (``processed_<session_id>_...``). Answers 400 when the
    parameter is missing, 403 on a mismatch, 404 when the file is gone and
    500 on unexpected failures.
    """
    try:
        session_id = request.args.get("session_id")

        # Validate session
        if not session_id:
            return jsonify({"error": "session_id parameter required"}), 400

        # Anchor the check at the start of the name: a substring test would
        # accept any session_id that happens to appear between two
        # underscores anywhere in the file name.
        # NOTE(review): a session_id that is an underscore-delimited prefix of
        # another one still matches that other session's files.
        if not filename.startswith(f"processed_{session_id}_"):
            return jsonify({"error": "Invalid session or unauthorized access"}), 403

        # Reject anything that is not a bare file name.
        if os.path.basename(filename) != filename:
            return jsonify({"error": "Invalid session or unauthorized access"}), 403

        # Check file exists; isfile so a directory of that name is not served
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        if not os.path.isfile(filepath):
            return jsonify({"error": "File not found or expired"}), 404

        return send_file(filepath, as_attachment=True, download_name=filename)

    except Exception as e:
        return jsonify({"error": f"Download error: {str(e)}"}), 500
370
+
371
+ # === Error Handlers ===
372
@app.errorhandler(404)
def not_found(error):
    """Return a JSON error body with status 404."""
    body = {"error": "Endpoint not found"}
    return jsonify(body), 404
375
+
376
@app.errorhandler(413)
def too_large(error):
    """Return a JSON error body with status 413 naming the upload limit.

    The limit is read from ``app.config['MAX_CONTENT_LENGTH']`` so the
    message stays correct when the configured value changes.
    """
    limit = app.config.get("MAX_CONTENT_LENGTH")
    if limit:
        message = f"File too large (max {limit // (1024 * 1024)}MB)"
    else:
        message = "File too large"
    return jsonify({"error": message}), 413
379
 
380
@app.errorhandler(500)
def internal_error(error):
    """Return a JSON error body with status 500."""
    body = {"error": "Internal server error"}
    return jsonify(body), 500
 
383
 
384
  # === Run on Port 7860 for Hugging Face ===
385
if __name__ == "__main__":
    # Startup banner; the emoji were mis-encoded and are restored here.
    print("🚀 Starting Data Processing API on port 7860...")
    print("📊 API Endpoints:")
    print(" POST /process - Process files")
    print(" GET /download/<filename> - Download processed files")
    print(" GET /health - Health check")
    app.run(host="0.0.0.0", port=7860, debug=False)