mike23415 committed on
Commit
77bf716
·
verified ·
1 Parent(s): 401329d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, send_file
2
+ from flask_cors import CORS
3
+ import pandas as pd
4
+ import os
5
+ import threading
6
+ import time
7
+ import re
8
+
9
# Flask application object; CORS is enabled with flask_cors' default settings.
app = Flask(__name__)
CORS(app)

# Directory where processed files are written and later served from.
UPLOAD_FOLDER = "/tmp"
# NOTE(review): not referenced anywhere in the visible part of this file.
SESSION_KEY_PREFIX = "data_tool_session_id"
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Upper bound on the size of an incoming request body.
app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024  # 512 MB
16
+
17
+ # Cleanup function runs every 10 mins and deletes files older than 60 mins
18
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Periodically delete stale regular files from ``folder``; never returns.

    Meant to be run in a background thread. Each pass lists ``folder``,
    removes every regular file whose modification time lies more than
    ``max_age`` minutes in the past, then sleeps for ``interval`` seconds.

    Args:
        folder: Directory to sweep. Sub-directories are left untouched.
        max_age: Age threshold in minutes.
        interval: Pause between two sweeps, in seconds (default: 10 minutes).
    """
    # NOTE(review): every old file in ``folder`` is removed, not only files
    # written by this app -- confirm that is intended for a shared /tmp.
    max_age_seconds = max_age * 60
    while True:
        now = time.time()
        try:
            names = os.listdir(folder)
        except OSError as e:
            # A missing/unreadable folder must not kill the cleanup thread.
            print(f"[Cleanup Error] {e}")
            names = []

        for name in names:
            path = os.path.join(folder, name)
            try:
                # The mtime lookup sits inside the try as well: a file can
                # disappear between listdir() and getmtime()/remove().
                if not os.path.isfile(path):
                    continue
                if now - os.path.getmtime(path) > max_age_seconds:
                    os.remove(path)
                    print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                print(f"[Cleanup Error] {e}")

        time.sleep(interval)
31
+
32
# Start the cleanup loop at import time. The thread is a daemon so it never
# keeps the process alive on shutdown.
threading.Thread(target=clean_old_files, daemon=True).start()
34
+
35
def _resolve_column(df, name):
    """Return the label in ``df.columns`` that matches ``name``, ignoring case.

    An exact match wins. Otherwise the first column whose string form equals
    ``name`` case-insensitively is returned. When nothing matches, ``name`` is
    returned unchanged so pandas raises its usual ``KeyError`` downstream.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if str(label).lower() == wanted:
            return label
    return name


def apply_instruction(df, instruction):
    """Apply the transformations described by a free-text instruction.

    The instruction is scanned for a fixed set of phrases (matched
    case-insensitively); every phrase that is present is applied, in this
    order: drop column, remove duplicates, drop missing / remove null,
    fill missing ... with V, sort by C [desc], rename column A to B,
    filter where C > N, group by G and sum S, add column N as A + B,
    normalize column C, standardize column C, split column C by comma,
    remove special characters from C.

    Column names taken from the instruction are resolved against the frame
    case-insensitively; new column names and fill values keep the casing the
    user typed.

    Args:
        df: The pandas DataFrame to transform. Column-assigning operations
            may modify the frame that was passed in.
        instruction: The user's instruction text.

    Returns:
        A ``(df, status)`` tuple. ``status`` is ``"success"`` or
        ``"Error: <message>"``; on error the frame is returned as it stood
        when the failing step raised.
    """
    # Lower-cased copy is used for keyword detection only; column names and
    # values are extracted from the original text so their casing survives.
    text = instruction.lower()

    def find(pattern):
        return re.search(pattern, instruction, flags=re.IGNORECASE)

    try:
        # Drop column
        match = find(r"drop column (\w+)")
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        # Remove duplicates
        if "remove duplicates" in text:
            df = df.drop_duplicates()

        # Drop missing values
        if "drop missing" in text or "remove null" in text:
            df = df.dropna()

        # Fill missing values (numeric when the value parses as a float)
        match = find(r"fill missing.*with ([\w\.]+)")
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                pass  # not a number: fill with the literal text
            df = df.fillna(val)

        # Sort
        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = match.group(2) is None
            df = df.sort_values(by=col, ascending=ascending)

        # Rename
        match = find(r"rename column (\w+) to (\w+)")
        if match:
            old, new = _resolve_column(df, match.group(1)), match.group(2)
            df = df.rename(columns={old: new})

        # Filter where col > val (accepts negative and decimal thresholds)
        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
        if match:
            col = _resolve_column(df, match.group(1))
            val = float(match.group(2))
            df = df[df[col] > val]

        # Group by and sum
        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            group_col = _resolve_column(df, match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()

        # Add column as sum
        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            col2 = _resolve_column(df, match.group(3))
            df[new_col] = df[col1] + df[col2]

        # Normalize column (min-max)
        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            lo = df[col].min()
            span = df[col].max() - lo
            shifted = df[col] - lo
            # A constant column has zero range; skip the 0/0 division.
            df[col] = shifted / span if span != 0 else shifted

        # Standardize column (z-score)
        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Zero spread would turn every value into NaN/inf.
            df[col] = centered / spread if spread != 0 else centered

        # Split column by comma
        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            # Split at the first comma only so there are never more than two
            # parts; reindex guarantees both parts exist when no cell has a
            # comma at all.
            parts = df[col].str.split(",", n=1, expand=True).reindex(columns=[0, 1])
            df[f"{col}_1"] = parts[0]
            df[f"{col}_2"] = parts[1]

        # Remove special characters
        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        # Boundary handler: any pandas failure is reported to the caller as
        # a status string rather than propagated.
        return df, f"Error: {e}"

    return df, "success"
121
+
122
@app.route("/process", methods=["POST"])
def process_file():
    """Handle ``POST /process``: transform an uploaded table and save it as CSV.

    Expects multipart form data with a ``file`` part (CSV, otherwise read as
    Excel) plus ``instruction`` and ``session_id`` form fields.

    Returns:
        400 with an ``error`` message when a field is missing, the
        ``session_id`` is unusable, or the file cannot be parsed. Otherwise a
        JSON object with a ``preview`` (first 10 rows), the ``download_url``
        of the saved CSV and the ``status`` reported by ``apply_instruction``.
    """
    if (
        "file" not in request.files
        or "instruction" not in request.form
        or "session_id" not in request.form
    ):
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id becomes part of a path on disk: refuse path separators so
    # the output can never be written outside the upload folder.
    if not session_id or "/" in session_id or "\\" in session_id:
        return jsonify({"error": "Invalid session_id"}), 400

    # The client controls the filename; keep only its last path component.
    upload_name = os.path.basename((file.filename or "").replace("\\", "/"))

    try:
        if upload_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # The result is always written as CSV, so name it accordingly, and keep
    # the name free of characters that would break the download URL.
    stem = os.path.splitext(upload_name)[0]
    safe_stem = re.sub(r"[^\w.\-]+", "_", stem) or "file"
    filename = f"cleaned_{session_id}_{safe_stem}.csv"
    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(output_path, index=False)

    # NaN is not valid JSON; emit missing cells as null instead.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })
148
+
149
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Handle ``GET /download/<filename>``: send a processed file back.

    The caller must pass the ``session_id`` query parameter that the file was
    created under.

    Returns:
        403 when ``session_id`` is missing or does not match the file name,
        404 when no such file exists in the upload folder, otherwise the file
        as an attachment.
    """
    session_id = request.args.get("session_id")
    # Output files are named "cleaned_<session_id>_<name>", so the session id
    # must match at that exact position -- a substring test would accept any
    # underscore-delimited fragment of the file name.
    # NOTE(review): ids containing "_" remain prefix-ambiguous with this
    # naming scheme.
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    # Only serve plain file names that resolve directly inside the folder.
    if os.path.basename(filename) != filename:
        return jsonify({"error": "File not found"}), 404

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
159
+
160
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, when run directly as a script.
    app.run(host="0.0.0.0", port=7860)