Spaces:
Sleeping
Sleeping
File size: 5,207 Bytes
77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 2b74b13 77bf716 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd
import os
import threading
import time
import re
# Flask application object; CORS is enabled on it via flask_cors.
app = Flask(__name__)
CORS(app)

# Folder used for generated files, and a session-key prefix constant
# (the prefix is not referenced anywhere in the visible part of this file).
UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"

app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)
# === Cleanup Thread: delete files older than 60 minutes ===
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Start a daemon thread that periodically deletes old files in ``folder``.

    Args:
        folder: Directory to scan. Only regular files directly inside it are
            considered; sub-directories are left alone.
        max_age: Age threshold in minutes. A file is removed once its
            modification time is more than this many minutes in the past.
        interval: Seconds to sleep between two scans (default: 10 minutes).

    Returns:
        None. The thread is a daemon and loops until the process exits.
    """
    max_age_seconds = max_age * 60

    def cleanup_loop():
        while True:
            now = time.time()
            try:
                names = os.listdir(folder)
            except OSError as e:
                # Folder missing or unreadable right now: report it and try
                # again on the next pass instead of letting the thread die.
                print(f"[Cleanup Error] {e}")
                names = []
            for name in names:
                path = os.path.join(folder, name)
                try:
                    # The age check lives inside the try as well: the file can
                    # disappear between the directory listing and getmtime().
                    if os.path.isfile(path) and now - os.path.getmtime(path) > max_age_seconds:
                        os.remove(path)
                        print(f"[Cleanup] Deleted: {path}")
                except OSError as e:
                    print(f"[Cleanup Error] {e}")
            time.sleep(interval)

    threading.Thread(target=cleanup_loop, daemon=True).start()
# Start the background cleanup thread when this module is loaded, using the
# function's default arguments.
clean_old_files()
# === Instruction Parser ===
def _resolve_column(df, name):
    """Map a column name typed in an instruction onto an actual column label.

    An exact match wins; otherwise a unique case-insensitive match is used.
    When nothing matches, ``name`` is returned unchanged so that the following
    pandas lookup raises its usual ``KeyError``.
    """
    if name in df.columns:
        return name
    folded = name.casefold()
    candidates = [
        label for label in df.columns
        if isinstance(label, str) and label.casefold() == folded
    ]
    return candidates[0] if len(candidates) == 1 else name


def apply_instruction(df, instruction):
    """Apply the operations named in a free-text instruction to ``df``.

    Keywords are matched case-insensitively; column names, fill values and
    new column names keep the case the user typed. Every recognised phrase is
    applied, in this fixed order:

    * ``drop column <col>``
    * ``remove duplicates``
    * ``drop missing`` / ``remove null``
    * ``fill missing ... with <value>`` (numeric if it parses as a float)
    * ``sort by <col> [desc|descending]``
    * ``rename column <old> to <new>``
    * ``filter where <col> > <number>`` (integers and decimals)
    * ``group by <col> and sum <col>``
    * ``add column <new> as <a> + <b>``
    * ``normalize column <col>`` (min-max; a constant column becomes 0.0)
    * ``standardize column <col>`` (z-score; zero spread becomes 0.0)
    * ``split column <col> by comma`` (creates ``<col>_1`` ... ``<col>_k``)
    * ``remove special characters from <col>`` (missing values stay missing)

    Args:
        df: The pandas DataFrame to transform. Operations that assign a column
            (add/normalize/standardize/split/remove special characters) write
            to the frame they receive, so the caller's object may be modified.
        instruction: The instruction text.

    Returns:
        A ``(DataFrame, status)`` tuple. ``status`` is ``"success"`` or
        ``"Error: <message>"``; on error the frame reflects the operations
        that completed before the failure.
    """
    lowered = instruction.lower()

    def find(pattern):
        # Search the original text so captured names keep their case.
        return re.search(pattern, instruction, re.IGNORECASE)

    try:
        match = find(r"drop column (\w+)")
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        if "remove duplicates" in lowered:
            df = df.drop_duplicates()

        if "drop missing" in lowered or "remove null" in lowered:
            df = df.dropna()

        match = find(r"fill missing.*with ([\w\.]+)")
        if match:
            fill_value = match.group(1)
            try:
                fill_value = float(fill_value)
            except ValueError:
                pass  # not a number: fill with the literal text
            df = df.fillna(fill_value)

        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            sort_col = _resolve_column(df, match.group(1))
            df = df.sort_values(by=sort_col, ascending=not match.group(2))

        match = find(r"rename column (\w+) to (\w+)")
        if match:
            old_name = _resolve_column(df, match.group(1))
            df = df.rename(columns={old_name: match.group(2)})

        # Accept decimals (and a leading minus) so "> 2.5" is not read as "> 2".
        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
        if match:
            filter_col = _resolve_column(df, match.group(1))
            df = df[df[filter_col] > float(match.group(2))]

        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            key_col = _resolve_column(df, match.group(1))
            value_col = _resolve_column(df, match.group(2))
            df = df.groupby(key_col)[value_col].sum().reset_index()

        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            left = _resolve_column(df, match.group(2))
            right = _resolve_column(df, match.group(3))
            df[match.group(1)] = df[left] + df[right]

        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            lowest = df[col].min()
            span = df[col].max() - lowest
            shifted = df[col] - lowest
            # A constant column has span 0; dividing would turn it into NaN.
            df[col] = shifted / span if span != 0 else shifted.astype(float)

        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Zero spread (constant column): keep the centered zeros.
            df[col] = centered / spread if spread != 0 else centered.astype(float)

        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            parts = df[col].str.split(",", expand=True)
            # One new column per part, however many parts the data contains.
            for position in range(parts.shape[1]):
                df[f"{col}_{position + 1}"] = parts.iloc[:, position]

        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            original = df[col]
            cleaned = original.astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
            # Keep missing entries missing instead of the text "nan"/"None".
            df[col] = cleaned.where(original.notna(), original)
    except Exception as e:
        return df, f"Error: {e}"
    return df, "success"
# === File Processor Endpoint ===
def _output_filename(session_id, original_name):
    """Build the file name under which the cleaned CSV is stored.

    Only the last path component of the client-supplied name is used, every
    character other than word characters, ``.`` and ``-`` is replaced by
    ``_``, and the extension is forced to ``.csv`` because the result is
    always written as CSV.
    """
    base = os.path.basename((original_name or "").replace("\\", "/"))
    stem = os.path.splitext(base)[0]
    stem = re.sub(r"[^\w.\-]", "_", stem) or "upload"
    return f"cleaned_{session_id}_{stem}.csv"


@app.route("/process", methods=["POST"])
def process_file():
    """Apply an instruction to an uploaded table and store the result as CSV.

    Expects a multipart form with ``file`` (read as CSV when its name ends in
    ``.csv``, case-insensitively, otherwise as Excel), ``instruction`` and
    ``session_id``.

    Returns:
        JSON with ``preview`` (first 10 rows, missing values as ``null``),
        ``download_url`` and ``status`` (the status string returned by
        ``apply_instruction``). Responds with HTTP 400 when a field is
        missing, the session id is unusable, or the file cannot be parsed.
    """
    if (
        "file" not in request.files
        or "instruction" not in request.form
        or "session_id" not in request.form
    ):
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id becomes part of a file name, so it must not be empty or
    # contain anything that could act as a path separator.
    if not session_id or any(ch in session_id for ch in ("/", "\\", "\x00")):
        return jsonify({"error": "Invalid session_id"}), 400

    upload_name = file.filename or ""
    try:
        if upload_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"File read error: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    filename = _output_filename(session_id, upload_name)
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(filepath, index=False)

    # NaN is not valid JSON; turn missing cells into None so they serialise
    # as null in the preview.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")

    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })
# === File Download with Session ID Verification ===
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously generated file back to the session that created it.

    The ``session_id`` query parameter must be present and ``filename`` must
    start with ``cleaned_<session_id>_``, which is the naming scheme used when
    results are written. Anything else is rejected with HTTP 403; a name that
    passes the check but is not an existing regular file yields HTTP 404.
    """
    session_id = request.args.get("session_id")
    # Prefix match rather than a substring match: a substring check would let
    # any token that merely appears between two underscores in the file name
    # pass, and would also match files this app did not create.
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile (not exists) so that a directory is never handed to send_file.
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
# === Run on Port 7860 for Hugging Face ===
if __name__ == "__main__":
    # Listen on all network interfaces, port 7860.
    app.run(host="0.0.0.0", port=7860)