# NOTE: file-viewer residue from the page this file was scraped from, not part of the program.
# Spaces status: Sleeping. File size: 5,549 bytes. Commit: 77bf716. (Line-number gutter 1-161 omitted.)
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd
import os
import threading
import time
import re
# Directory where processed files are written; also the default folder that
# the background cleanup loop scans.
UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"

# Flask application with cross-origin requests enabled for every route.
app = Flask(__name__)
CORS(app)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)
# Cleanup loop: by default runs every 10 minutes and deletes files older than
# 60 minutes.
def _remove_stale_files(folder, max_age_seconds):
    """Run one cleanup pass over *folder*, deleting regular files older than the limit.

    Every filesystem call is guarded: a missing folder, or a file that
    disappears between listing and removal, is reported and skipped instead
    of raising.
    """
    try:
        names = os.listdir(folder)
    except OSError as e:
        print(f"[Cleanup Error] {e}")
        return

    now = time.time()
    for name in names:
        path = os.path.join(folder, name)
        try:
            if not os.path.isfile(path):
                continue
            if now - os.path.getmtime(path) > max_age_seconds:
                os.remove(path)
                print(f"[Cleanup] Deleted: {path}")
        except OSError as e:
            # Another process may have removed or locked the file mid-pass.
            print(f"[Cleanup Error] {e}")


def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Delete stale files from *folder* forever; intended to run in a background thread.

    Args:
        folder: Directory to scan (not recursive; only regular files are removed).
        max_age: Age limit in minutes, measured from each file's modification time.
        interval: Seconds to sleep between passes (default 600, i.e. 10 minutes).

    This function never returns. Filesystem errors are printed and the loop
    keeps running, so one bad pass cannot silently stop cleanup.

    NOTE(review): every sufficiently old regular file in *folder* is removed,
    not only files written by this application -- confirm that is acceptable
    for a shared directory such as /tmp.
    """
    max_age_seconds = max_age * 60
    while True:
        _remove_stale_files(folder, max_age_seconds)
        time.sleep(interval)
# Launch the cleanup loop in the background as soon as the module is loaded.
# It is a daemon thread, so it does not keep the process alive at shutdown.
_cleanup_worker = threading.Thread(target=clean_old_files, daemon=True)
_cleanup_worker.start()
def _resolve_column(df, name):
    """Return the label in ``df.columns`` that *name* refers to.

    An exact match wins; otherwise the first column whose string form equals
    *name* ignoring case is used. When nothing matches, *name* is returned
    unchanged so the following pandas lookup raises its usual ``KeyError``.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if str(label).lower() == wanted:
            return label
    return name


def apply_instruction(df, instruction):
    """Apply the transformations named in a free-text *instruction* to *df*.

    The instruction is scanned for a fixed set of phrases. Every phrase that
    is present is applied, in this order:

    * ``drop column X``
    * ``remove duplicates``
    * ``drop missing`` / ``remove null``
    * ``fill missing ... with V`` (V is used as a number when it parses as one)
    * ``sort by X`` (optionally followed by ``desc`` / ``descending``)
    * ``rename column X to Y``
    * ``filter where X > N`` (N may be negative and/or decimal)
    * ``group by X and sum Y``
    * ``add column Z as X + Y``
    * ``normalize column X`` (min-max scaling)
    * ``standardize column X`` (subtract mean, divide by standard deviation)
    * ``split column X by comma`` (creates ``X_1``, ``X_2``, ... one per part)
    * ``remove special characters from X``

    Keywords are matched case-insensitively. Column names are matched exactly
    first and then ignoring case; new column names and fill values keep the
    case the user typed.

    Args:
        df: Input DataFrame. It is not modified; work happens on a copy.
        instruction: Free-text instruction string.

    Returns:
        A ``(DataFrame, status)`` tuple. ``status`` is ``"success"`` or
        ``"Error: <message>"``. On error the DataFrame reflects the steps
        that completed before the failure.
    """

    def find(pattern):
        # Search the original text (not a lower-cased copy) so captured
        # column names and values keep their case.
        return re.search(pattern, instruction, re.IGNORECASE)

    keywords = instruction.lower()
    # Work on a copy so the column-assigning steps never alter the caller's frame.
    df = df.copy()
    try:
        # Drop column
        match = find(r"drop column (\w+)")
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        # Remove duplicates
        if "remove duplicates" in keywords:
            df = df.drop_duplicates()

        # Drop missing values
        if "drop missing" in keywords or "remove null" in keywords:
            df = df.dropna()

        # Fill missing values
        match = find(r"fill missing.*with (-?[\w.]+)")
        if match:
            fill_value = match.group(1)
            try:
                fill_value = float(fill_value)
            except ValueError:
                pass  # not numeric: fill with the literal text
            df = df.fillna(fill_value)

        # Sort
        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            df = df.sort_values(
                by=_resolve_column(df, match.group(1)),
                ascending=match.group(2) is None,
            )

        # Rename
        match = find(r"rename column (\w+) to (\w+)")
        if match:
            df = df.rename(
                columns={_resolve_column(df, match.group(1)): match.group(2)}
            )

        # Filter where col > val
        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
        if match:
            col = _resolve_column(df, match.group(1))
            df = df[df[col] > float(match.group(2))]

        # Group by and sum
        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            group_col = _resolve_column(df, match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()

        # Add column as sum
        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            left = _resolve_column(df, match.group(2))
            right = _resolve_column(df, match.group(3))
            df[match.group(1)] = df[left] + df[right]

        # Normalize column
        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            low = df[col].min()
            span = df[col].max() - low
            shifted = df[col] - low
            # A constant column has zero range; keep the zeros instead of
            # dividing by zero and producing an all-NaN column.
            df[col] = shifted if span == 0 else shifted / span

        # Standardize column
        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Same guard as above for a zero standard deviation.
            df[col] = centered if spread == 0 else centered / spread

        # Split column by comma
        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            parts = df[col].str.split(",", expand=True)
            # One output column per part, however many parts the data has.
            for position, label in enumerate(parts.columns, start=1):
                df[f"{col}_{position}"] = parts[label]

        # Remove special characters
        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
    except Exception as e:
        # Best-effort boundary: report the failure to the caller as a status
        # string rather than raising.
        return df, f"Error: {e}"
    return df, "success"
@app.route("/process", methods=["POST"])
def process_file():
    """Apply an instruction to an uploaded CSV/Excel file and store the result.

    Expects multipart form data with a ``file`` part plus ``instruction`` and
    ``session_id`` fields. The transformed data is written as CSV into the
    configured upload folder.

    Returns:
        JSON with ``preview`` (up to the first 10 rows), ``download_url`` and
        ``status`` (the status string from ``apply_instruction``), or a JSON
        ``error`` with HTTP 400 when input is missing, the session id is not
        usable as part of a file name, or the file cannot be parsed.
    """
    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id is embedded in a file name on disk and in the download URL,
    # so refuse anything that could contain a path separator.
    if not re.fullmatch(r"[\w.-]+", session_id):
        return jsonify({"error": "Invalid session_id"}), 400

    uploaded_name = file.filename or ""
    try:
        # Extension check ignores case so "DATA.CSV" is still read as CSV.
        if uploaded_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # The client controls the upload name: keep only its final path component,
    # replace characters outside a safe set, and use a .csv extension because
    # the output is always written as CSV.
    base_name = os.path.basename(uploaded_name.replace("\\", "/"))
    stem = re.sub(r"[^\w.-]", "_", os.path.splitext(base_name)[0]) or "upload"
    filename = f"cleaned_{session_id}_{stem}.csv"
    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(output_path, index=False)

    # Missing values must become None (JSON null); a raw NaN would be emitted
    # as the bare token NaN, which is not valid JSON.
    preview = [
        {key: (None if pd.isna(value) else value) for key, value in row.items()}
        for row in df.head(10).to_dict(orient="records")
    ]
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file back to the session that created it.

    The ``session_id`` query parameter must match the session id embedded in
    the file name (``cleaned_<session_id>_...``).

    Returns:
        The file as an attachment, a JSON error with HTTP 403 when the
        session id is missing or does not match, or a JSON error with
        HTTP 404 when no such regular file exists in the upload folder.

    NOTE(review): the session id is whatever value the client supplied when
    the file was created; this check scopes downloads to that value but is
    not a substitute for real authentication.
    """
    session_id = request.args.get("session_id")
    # Require the full "cleaned_<session_id>_" prefix: a substring match would
    # also accept unrelated files that merely contain "_<session_id>_".
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    # Defense in depth: only a bare file name may be joined onto the folder.
    if os.path.basename(filename) != filename:
        return jsonify({"error": "File not found"}), 404

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile (not exists) so a directory is never handed to send_file.
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
# When executed as a script, serve on all network interfaces on port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)