Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request, jsonify, send_file
|
2 |
+
from flask_cors import CORS
|
3 |
+
import pandas as pd
|
4 |
+
import os
|
5 |
+
import threading
|
6 |
+
import time
|
7 |
+
import re
|
8 |
+
|
9 |
+
# Flask application object; flask-cors is applied with its default settings.
app = Flask(__name__)
CORS(app)

# Directory that processed files are written to and served from.
UPLOAD_FOLDER = "/tmp"
# NOTE(review): not referenced by any code visible in this file.
SESSION_KEY_PREFIX = "data_tool_session_id"

app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)
|
16 |
+
|
17 |
+
# Cleanup loop: by default sweeps every 10 minutes and deletes files older than 60 minutes.
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Periodically delete regular files in *folder* that are older than *max_age*.

    This function never returns; it is meant to be the target of a daemon
    thread.

    Args:
        folder: Directory to sweep. Only regular files directly inside it are
            considered; subdirectories are never entered or removed.
        max_age: Age threshold in minutes, measured from each file's mtime.
        interval: Seconds to sleep between two sweeps.

    NOTE(review): every sufficiently old file in *folder* is removed, not only
    files written by this application. With the default folder that means any
    old file in /tmp -- confirm this is intended.
    """
    while True:
        cutoff = time.time() - max_age * 60
        try:
            names = os.listdir(folder)
        except OSError as e:
            # A missing or unreadable folder must not kill the cleanup thread;
            # log it and try again on the next sweep.
            print(f"[Cleanup Error] {e}")
            names = []

        for name in names:
            path = os.path.join(folder, name)
            try:
                # The mtime lookup sits inside the try on purpose: a file that
                # disappears between listdir() and here is logged, not fatal.
                if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                    os.remove(path)
                    print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                print(f"[Cleanup Error] {e}")

        time.sleep(interval)
|
31 |
+
|
32 |
+
# Launch the cleanup loop in a daemon thread as soon as this module is loaded.
_cleanup_thread = threading.Thread(target=clean_old_files, daemon=True)
_cleanup_thread.start()
|
34 |
+
|
35 |
+
def apply_instruction(df, instruction):
    """Apply every transformation named in a free-text instruction to *df*.

    Keywords are matched case-insensitively. Column names are looked up
    exactly first and then case-insensitively, so both ``drop column Age`` and
    ``drop column age`` find a column called ``Age``. New column names and
    fill values keep the case the user typed.

    Recognised phrases (an instruction may contain several; they are applied
    in this order):

        drop column <col>
        remove duplicates
        drop missing | remove null
        fill missing ... with <value>
        sort by <col> [desc | descending]
        rename column <old> to <new>
        filter where <col> > <number>
        group by <col> and sum <col>
        add column <new> as <a> + <b>
        normalize column <col>
        standardize column <col>
        split column <col> by comma
        remove special characters from <col>

    Args:
        df: pandas DataFrame to transform. Steps that assign a column write
            into the frame they receive, so the caller's frame may be modified.
        instruction: User-supplied instruction text.

    Returns:
        A ``(df, status)`` tuple. ``status`` is ``"success"`` or
        ``"Error: <message>"``. On error the frame is returned in whatever
        state it had reached before the failing step.
    """
    # Lower-cased copy is used only for plain keyword checks; the regexes run
    # on the original text so identifiers and values keep their case.
    keywords = instruction.lower()

    def find(pattern):
        return re.search(pattern, instruction, re.IGNORECASE)

    try:
        # Drop column
        match = find(r"drop column (\w+)")
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        # Remove duplicates
        if "remove duplicates" in keywords:
            df = df.drop_duplicates()

        # Drop missing values
        if "drop missing" in keywords or "remove null" in keywords:
            df = df.dropna()

        # Fill missing values (an optional leading '-' allows negative numbers)
        match = find(r"fill missing.*with (-?[\w\.]+)")
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                pass  # not numeric: fill with the literal text instead
            df = df.fillna(val)

        # Sort
        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = not bool(match.group(2))
            df = df.sort_values(by=col, ascending=ascending)

        # Rename
        match = find(r"rename column (\w+) to (\w+)")
        if match:
            old, new = _resolve_column(df, match.group(1)), match.group(2)
            df = df.rename(columns={old: new})

        # Filter where col > val (threshold may be negative and/or decimal)
        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
        if match:
            col = _resolve_column(df, match.group(1))
            val = float(match.group(2))
            df = df[df[col] > val]

        # Group by and sum
        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            group_col = _resolve_column(df, match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()

        # Add column as sum
        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            col2 = _resolve_column(df, match.group(3))
            df[new_col] = df[col1] + df[col2]

        # Normalize column (min-max)
        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            low = df[col].min()
            span = df[col].max() - low
            shifted = df[col] - low
            # On a constant column span is 0 and `shifted` is already all
            # zeros; skip the division so the result is 0 rather than 0/0 NaN.
            df[col] = shifted / span if span else shifted

        # Standardize column (z-score)
        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Same zero-denominator guard as for normalisation.
            df[col] = centered / spread if spread else centered

        # Split column by comma into <col>_1, <col>_2, ... (one per part)
        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            parts = df[col].str.split(",", expand=True)
            for position in parts.columns:
                df[f"{col}_{position + 1}"] = parts[position]

        # Remove special characters
        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        # Deliberately broad: any pandas failure is reported to the caller as
        # a status string instead of propagating.
        return df, f"Error: {e}"

    return df, "success"


def _resolve_column(df, name):
    """Return the label in ``df.columns`` that matches *name*.

    An exact match wins; otherwise the first case-insensitive match is used.
    If nothing matches, *name* is returned unchanged so that the subsequent
    pandas call raises its usual error for an unknown column.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if str(label).lower() == wanted:
            return label
    return name
|
121 |
+
|
122 |
+
@app.route("/process", methods=["POST"])
def process_file():
    """Read an uploaded table, apply the instruction, and store the result as CSV.

    Expects a multipart form with a ``file`` part plus ``instruction`` and
    ``session_id`` fields. Files whose name ends in ``.csv`` (any case) are
    read with ``pd.read_csv``; everything else goes to ``pd.read_excel``.

    The result is written to the upload folder as
    ``cleaned_<session_id>_<name>.csv``. The extension is always ``.csv``
    because the output is always written with ``to_csv``.

    Returns:
        JSON with ``preview`` (first 10 rows as records, missing values as
        null), ``download_url`` and ``status`` (as returned by
        ``apply_instruction``); or a JSON ``error`` with HTTP 400 when input is
        missing, the session id contains a path separator, or the file cannot
        be parsed.
    """
    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id becomes part of a filesystem path, so it must not be able to
    # step outside the upload folder.
    if any(ch in session_id for ch in ("/", "\\", "\0")):
        return jsonify({"error": "Invalid session_id"}), 400

    # Keep only the final path component of the client-supplied name
    # (backslashes are normalised first so Windows-style paths are covered).
    upload_name = os.path.basename((file.filename or "").replace("\\", "/"))

    try:
        if upload_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # Reduce the stem to characters that are safe in both a path and a URL.
    stem = re.sub(r"[^\w.\-]+", "_", os.path.splitext(upload_name)[0]) or "file"
    filename = f"cleaned_{session_id}_{stem}.csv"
    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(output_path, index=False)

    # NaN would be serialised as the bare token NaN, which is not valid JSON;
    # convert missing values to None so they are emitted as null.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })
|
148 |
+
|
149 |
+
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file back to the session that created it.

    The caller must pass its ``session_id`` as a query parameter, and the
    requested name must start with ``cleaned_<session_id>_``.

    Returns:
        The file as an attachment; JSON ``error`` with HTTP 403 when the
        session id is missing or does not own the file; JSON ``error`` with
        HTTP 404 when no such regular file exists in the upload folder.
    """
    session_id = request.args.get("session_id")
    # Prefix match, not substring match: otherwise any underscore-delimited
    # token of another user's filename would pass as a valid session id.
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    # Refuse anything that is not a bare file name, so the joined path cannot
    # leave the upload folder.
    if os.path.basename(filename) != filename:
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile (not exists) so that a directory is never handed to send_file.
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
|
159 |
+
|
160 |
+
if __name__ == "__main__":
    # Executed directly as a script: serve on all interfaces, port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    app.run(**server_options)
|