"""Flask service that applies plain-text cleaning instructions to tabular files.

POST /process   upload a CSV/Excel file plus an instruction; the transformed
                data is written as CSV into UPLOAD_FOLDER and a preview is
                returned together with a download URL.
GET  /download  fetch a previously produced file (requires the session_id
                that produced it).

A daemon thread removes this service's stale output files periodically.
"""
import os
import re
import threading
import time

import pandas as pd
from flask import Flask, jsonify, request, send_file
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"
# Every file written by this service starts with this prefix; cleanup and
# download authorisation both rely on it.
OUTPUT_PREFIX = "cleaned_"

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024  # 512 MB

# session_id becomes part of a file name, so restrict it to characters that
# can never introduce a path separator.
_SAFE_SESSION_ID = re.compile(r"[A-Za-z0-9_.-]+")
# Signed integer or decimal literal, e.g. "10", "-3", "2.5".
_NUMBER = r"-?\d+(?:\.\d+)?"


def _remove_stale_files(folder, max_age_seconds, prefix):
    """Delete regular files in *folder* named ``prefix*`` older than the limit."""
    now = time.time()
    try:
        names = os.listdir(folder)
    except OSError as e:
        print(f"[Cleanup Error] {e}")
        return

    for name in names:
        # Only touch files this service created; the folder may be shared
        # (the default is the system-wide /tmp).
        if not name.startswith(prefix):
            continue
        path = os.path.join(folder, name)
        try:
            # The file may disappear between listdir() and here, so the
            # stat call is guarded as well as the removal.
            if os.path.isfile(path) and now - os.path.getmtime(path) > max_age_seconds:
                os.remove(path)
                print(f"[Cleanup] Deleted: {path}")
        except OSError as e:
            print(f"[Cleanup Error] {e}")


# Cleanup function runs every 10 mins and deletes files older than 60 mins
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, prefix=OUTPUT_PREFIX, interval=600):
    """Run forever, periodically deleting stale output files.

    Args:
        folder: Directory to scan.
        max_age: Maximum file age in minutes before deletion.
        prefix: Only files whose name starts with this are considered.
        interval: Seconds to sleep between scans.

    Never returns; intended to be run in a daemon thread. Errors during a
    scan are printed and do not stop the loop.
    """
    while True:
        _remove_stale_files(folder, max_age * 60, prefix)
        time.sleep(interval)


# Start cleanup thread at launch
threading.Thread(target=clean_old_files, daemon=True).start()


def _resolve_column(df, name):
    """Map a user-typed column name onto an actual column of *df*.

    An exact match wins; otherwise a unique case-insensitive match is used.
    If neither exists, *name* is returned unchanged so that pandas raises its
    usual KeyError at the point of use.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    candidates = [c for c in df.columns if str(c).lower() == wanted]
    return candidates[0] if len(candidates) == 1 else name


def apply_instruction(df, instruction):
    """Apply every operation recognised in *instruction* to *df*.

    Keywords are matched case-insensitively. Existing column names are
    resolved case-insensitively (see ``_resolve_column``); fill values and
    newly created column names keep the case the user typed.

    Recognised phrases, applied in this order:
        drop column X · remove duplicates · drop missing / remove null ·
        fill missing ... with V · sort by X [desc|descending] ·
        rename column X to Y · filter where X > N ·
        group by X and sum Y · add column Z as X + Y ·
        normalize column X · standardize column X ·
        split column X by comma · remove special characters from X

    Args:
        df: Input DataFrame. Column-assigning operations may modify it in
            place when no earlier operation produced a new frame.
        instruction: Free-text instruction.

    Returns:
        ``(df, status)`` where status is ``"success"`` or ``"Error: <msg>"``.
        On error the frame reflects the operations completed before the
        failure.
    """
    lowered = instruction.lower()
    flags = re.IGNORECASE

    try:
        # Drop column
        match = re.search(r"drop column (\w+)", instruction, flags)
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        # Remove duplicates
        if "remove duplicates" in lowered:
            df = df.drop_duplicates()

        # Drop missing values
        if "drop missing" in lowered or "remove null" in lowered:
            df = df.dropna()

        # Fill missing values (numeric if the value parses as a float)
        match = re.search(r"fill missing.*with (-?[\w\.]+)", instruction, flags)
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                pass  # not a number: fill with the literal text
            df = df.fillna(val)

        # Sort
        match = re.search(r"sort by (\w+)( descending| desc)?", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = not bool(match.group(2))
            df = df.sort_values(by=col, ascending=ascending)

        # Rename
        match = re.search(r"rename column (\w+) to (\w+)", instruction, flags)
        if match:
            old, new = _resolve_column(df, match.group(1)), match.group(2)
            df = df.rename(columns={old: new})

        # Filter where col > val
        match = re.search(rf"filter where (\w+) > ({_NUMBER})", instruction, flags)
        if match:
            col, val = _resolve_column(df, match.group(1)), float(match.group(2))
            df = df[df[col] > val]

        # Group by and sum
        match = re.search(r"group by (\w+) and sum (\w+)", instruction, flags)
        if match:
            group_col = _resolve_column(df, match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()

        # Add column as sum
        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction, flags)
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            col2 = _resolve_column(df, match.group(3))
            df[new_col] = df[col1] + df[col2]

        # Normalize column (min-max scaling)
        match = re.search(r"normalize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

        # Standardize column (z-score)
        match = re.search(r"standardize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = (df[col] - df[col].mean()) / df[col].std()

        # Split column by comma
        match = re.search(r"split column (\w+) by comma", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            # Split on the first comma only and force exactly two result
            # columns, so rows with zero or several commas do not raise.
            parts = df[col].str.split(",", n=1, expand=True).reindex(columns=[0, 1])
            df[f"{col}_1"] = parts[0]
            df[f"{col}_2"] = parts[1]

        # Remove special characters
        match = re.search(r"remove special characters from (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        # Boundary handler: report the failure to the client instead of a 500.
        return df, f"Error: {e}"

    return df, "success"


def _safe_basename(name):
    """Reduce a client-supplied file name to a single safe path component."""
    base = os.path.basename((name or "").replace("\\", "/"))
    base = re.sub(r"[^\w.-]", "_", base)
    return base or "upload"


@app.route("/process", methods=["POST"])
def process_file():
    """Read the uploaded file, apply the instruction and store the result.

    Expects multipart form fields ``file``, ``instruction`` and
    ``session_id``. Responds with JSON containing a 10-row ``preview``, a
    ``download_url`` and the ``status`` from ``apply_instruction``; returns
    400 on missing/invalid input or an unreadable file.
    """
    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id is embedded in the output file name; refuse anything that
    # could escape UPLOAD_FOLDER.
    if not _SAFE_SESSION_ID.fullmatch(session_id):
        return jsonify({"error": "Invalid session_id"}), 400

    original_name = _safe_basename(file.filename)

    try:
        if original_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # The result is always written as CSV, so give it a .csv extension
    # regardless of the upload's format.
    stem = os.path.splitext(original_name)[0]
    filename = f"{OUTPUT_PREFIX}{session_id}_{stem}.csv"
    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(output_path, index=False)

    # NaN/NaT are not valid JSON; convert missing values to null.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")

    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })


@app.route("/download/<path:filename>", methods=["GET"])
def download_file(filename):
    """Send a previously generated file to the session that produced it.

    The ``session_id`` query parameter must match the id embedded in the
    file name. Returns 403 on mismatch or a suspicious name, 404 when the
    file no longer exists.
    """
    session_id = request.args.get("session_id")

    # Only plain names inside UPLOAD_FOLDER are served.
    is_plain_name = "\\" not in filename and os.path.basename(filename) == filename
    if (
        not session_id
        or not is_plain_name
        or not filename.startswith(f"{OUTPUT_PREFIX}{session_id}_")
    ):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)