Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,101 +14,86 @@ SESSION_KEY_PREFIX = "data_tool_session_id"
|
|
14 |
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
15 |
app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024 # 512 MB
|
16 |
|
17 |
-
# Cleanup
|
18 |
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
if
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
threading.Thread(target=
|
34 |
-
|
|
|
|
|
|
|
|
|
35 |
def apply_instruction(df, instruction):
|
36 |
instruction = instruction.lower()
|
37 |
|
38 |
try:
|
39 |
-
# Drop column
|
40 |
match = re.search(r"drop column (\w+)", instruction)
|
41 |
if match:
|
42 |
df = df.drop(columns=[match.group(1)])
|
43 |
|
44 |
-
# Remove duplicates
|
45 |
if "remove duplicates" in instruction:
|
46 |
df = df.drop_duplicates()
|
47 |
|
48 |
-
# Drop missing values
|
49 |
if "drop missing" in instruction or "remove null" in instruction:
|
50 |
df = df.dropna()
|
51 |
|
52 |
-
# Fill missing values
|
53 |
match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
|
54 |
if match:
|
55 |
val = match.group(1)
|
56 |
-
try:
|
57 |
-
|
58 |
-
except:
|
59 |
-
pass
|
60 |
df = df.fillna(val)
|
61 |
|
62 |
-
# Sort
|
63 |
match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
|
64 |
if match:
|
65 |
col = match.group(1)
|
66 |
ascending = not bool(match.group(2))
|
67 |
df = df.sort_values(by=col, ascending=ascending)
|
68 |
|
69 |
-
# Rename
|
70 |
match = re.search(r"rename column (\w+) to (\w+)", instruction)
|
71 |
if match:
|
72 |
-
|
73 |
-
df = df.rename(columns={old: new})
|
74 |
|
75 |
-
# Filter where col > val
|
76 |
match = re.search(r"filter where (\w+) > (\d+)", instruction)
|
77 |
if match:
|
78 |
-
|
79 |
-
df = df[df[col] > val]
|
80 |
|
81 |
-
# Group by and sum
|
82 |
match = re.search(r"group by (\w+) and sum (\w+)", instruction)
|
83 |
if match:
|
84 |
-
|
85 |
-
df = df.groupby(group_col)[sum_col].sum().reset_index()
|
86 |
|
87 |
-
# Add column as sum
|
88 |
match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
|
89 |
if match:
|
90 |
-
|
91 |
-
df[new_col] = df[col1] + df[col2]
|
92 |
|
93 |
-
# Normalize column
|
94 |
match = re.search(r"normalize column (\w+)", instruction)
|
95 |
if match:
|
96 |
col = match.group(1)
|
97 |
df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
|
98 |
|
99 |
-
# Standardize column
|
100 |
match = re.search(r"standardize column (\w+)", instruction)
|
101 |
if match:
|
102 |
col = match.group(1)
|
103 |
df[col] = (df[col] - df[col].mean()) / df[col].std()
|
104 |
|
105 |
-
# Split column by comma
|
106 |
match = re.search(r"split column (\w+) by comma", instruction)
|
107 |
if match:
|
108 |
col = match.group(1)
|
109 |
df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
|
110 |
|
111 |
-
# Remove special characters
|
112 |
match = re.search(r"remove special characters from (\w+)", instruction)
|
113 |
if match:
|
114 |
col = match.group(1)
|
@@ -119,6 +104,7 @@ def apply_instruction(df, instruction):
|
|
119 |
|
120 |
return df, "success"
|
121 |
|
|
|
122 |
@app.route("/process", methods=["POST"])
|
123 |
def process_file():
|
124 |
if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
|
@@ -131,13 +117,13 @@ def process_file():
|
|
131 |
try:
|
132 |
df = pd.read_csv(file) if file.filename.endswith(".csv") else pd.read_excel(file)
|
133 |
except Exception as e:
|
134 |
-
return jsonify({"error": f"
|
135 |
|
136 |
df, status = apply_instruction(df, instruction)
|
137 |
|
138 |
filename = f"cleaned_{session_id}_{file.filename}"
|
139 |
-
|
140 |
-
df.to_csv(
|
141 |
|
142 |
preview = df.head(10).to_dict(orient="records")
|
143 |
return jsonify({
|
@@ -146,6 +132,7 @@ def process_file():
|
|
146 |
"status": status
|
147 |
})
|
148 |
|
|
|
149 |
@app.route("/download/<filename>", methods=["GET"])
|
150 |
def download_file(filename):
|
151 |
session_id = request.args.get("session_id")
|
@@ -157,5 +144,6 @@ def download_file(filename):
|
|
157 |
return send_file(path, as_attachment=True)
|
158 |
return jsonify({"error": "File not found"}), 404
|
159 |
|
|
|
160 |
if __name__ == "__main__":
|
161 |
app.run(host="0.0.0.0", port=7860)
|
|
|
14 |
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
15 |
app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024 # 512 MB
|
16 |
|
17 |
+
# === Cleanup Thread: delete files older than 60 minutes ===
|
18 |
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
|
19 |
+
def cleanup_loop():
|
20 |
+
while True:
|
21 |
+
now = time.time()
|
22 |
+
for f in os.listdir(folder):
|
23 |
+
path = os.path.join(folder, f)
|
24 |
+
if os.path.isfile(path):
|
25 |
+
if now - os.path.getmtime(path) > max_age * 60:
|
26 |
+
try:
|
27 |
+
os.remove(path)
|
28 |
+
print(f"[Cleanup] Deleted: {path}")
|
29 |
+
except Exception as e:
|
30 |
+
print(f"[Cleanup Error] {e}")
|
31 |
+
time.sleep(600) # Every 10 minutes
|
32 |
+
|
33 |
+
threading.Thread(target=cleanup_loop, daemon=True).start()
|
34 |
+
|
35 |
+
# Start cleanup thread
|
36 |
+
clean_old_files()
|
37 |
+
|
38 |
+
# === Instruction Parser ===
|
39 |
def apply_instruction(df, instruction):
|
40 |
instruction = instruction.lower()
|
41 |
|
42 |
try:
|
|
|
43 |
match = re.search(r"drop column (\w+)", instruction)
|
44 |
if match:
|
45 |
df = df.drop(columns=[match.group(1)])
|
46 |
|
|
|
47 |
if "remove duplicates" in instruction:
|
48 |
df = df.drop_duplicates()
|
49 |
|
|
|
50 |
if "drop missing" in instruction or "remove null" in instruction:
|
51 |
df = df.dropna()
|
52 |
|
|
|
53 |
match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
|
54 |
if match:
|
55 |
val = match.group(1)
|
56 |
+
try: val = float(val)
|
57 |
+
except: pass
|
|
|
|
|
58 |
df = df.fillna(val)
|
59 |
|
|
|
60 |
match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
|
61 |
if match:
|
62 |
col = match.group(1)
|
63 |
ascending = not bool(match.group(2))
|
64 |
df = df.sort_values(by=col, ascending=ascending)
|
65 |
|
|
|
66 |
match = re.search(r"rename column (\w+) to (\w+)", instruction)
|
67 |
if match:
|
68 |
+
df = df.rename(columns={match.group(1): match.group(2)})
|
|
|
69 |
|
|
|
70 |
match = re.search(r"filter where (\w+) > (\d+)", instruction)
|
71 |
if match:
|
72 |
+
df = df[df[match.group(1)] > float(match.group(2))]
|
|
|
73 |
|
|
|
74 |
match = re.search(r"group by (\w+) and sum (\w+)", instruction)
|
75 |
if match:
|
76 |
+
df = df.groupby(match.group(1))[match.group(2)].sum().reset_index()
|
|
|
77 |
|
|
|
78 |
match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
|
79 |
if match:
|
80 |
+
df[match.group(1)] = df[match.group(2)] + df[match.group(3)]
|
|
|
81 |
|
|
|
82 |
match = re.search(r"normalize column (\w+)", instruction)
|
83 |
if match:
|
84 |
col = match.group(1)
|
85 |
df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
|
86 |
|
|
|
87 |
match = re.search(r"standardize column (\w+)", instruction)
|
88 |
if match:
|
89 |
col = match.group(1)
|
90 |
df[col] = (df[col] - df[col].mean()) / df[col].std()
|
91 |
|
|
|
92 |
match = re.search(r"split column (\w+) by comma", instruction)
|
93 |
if match:
|
94 |
col = match.group(1)
|
95 |
df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
|
96 |
|
|
|
97 |
match = re.search(r"remove special characters from (\w+)", instruction)
|
98 |
if match:
|
99 |
col = match.group(1)
|
|
|
104 |
|
105 |
return df, "success"
|
106 |
|
107 |
+
# === File Processor Endpoint ===
|
108 |
@app.route("/process", methods=["POST"])
|
109 |
def process_file():
|
110 |
if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
|
|
|
117 |
try:
|
118 |
df = pd.read_csv(file) if file.filename.endswith(".csv") else pd.read_excel(file)
|
119 |
except Exception as e:
|
120 |
+
return jsonify({"error": f"File read error: {str(e)}"}), 400
|
121 |
|
122 |
df, status = apply_instruction(df, instruction)
|
123 |
|
124 |
filename = f"cleaned_{session_id}_{file.filename}"
|
125 |
+
filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
|
126 |
+
df.to_csv(filepath, index=False)
|
127 |
|
128 |
preview = df.head(10).to_dict(orient="records")
|
129 |
return jsonify({
|
|
|
132 |
"status": status
|
133 |
})
|
134 |
|
135 |
+
# === File Download with Session ID Verification ===
|
136 |
@app.route("/download/<filename>", methods=["GET"])
|
137 |
def download_file(filename):
|
138 |
session_id = request.args.get("session_id")
|
|
|
144 |
return send_file(path, as_attachment=True)
|
145 |
return jsonify({"error": "File not found"}), 404
|
146 |
|
147 |
+
# === Run on Port 7860 for Hugging Face ===
|
148 |
if __name__ == "__main__":
|
149 |
app.run(host="0.0.0.0", port=7860)
|