mike23415 committed on
Commit
2b74b13
·
verified ·
1 Parent(s): 3eb14e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -45
app.py CHANGED
@@ -14,101 +14,86 @@ SESSION_KEY_PREFIX = "data_tool_session_id"
14
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
15
  app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024 # 512 MB
16
 
17
- # Cleanup function runs every 10 mins and deletes files older than 60 mins
18
  def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
19
- while True:
20
- now = time.time()
21
- for f in os.listdir(folder):
22
- path = os.path.join(folder, f)
23
- if os.path.isfile(path):
24
- if now - os.path.getmtime(path) > max_age * 60:
25
- try:
26
- os.remove(path)
27
- print(f"[Cleanup] Deleted: {path}")
28
- except Exception as e:
29
- print(f"[Cleanup Error] {e}")
30
- time.sleep(600) # Run every 10 minutes
31
-
32
- # Start cleanup thread at launch
33
- threading.Thread(target=clean_old_files, daemon=True).start()
34
-
 
 
 
 
35
  def apply_instruction(df, instruction):
36
  instruction = instruction.lower()
37
 
38
  try:
39
- # Drop column
40
  match = re.search(r"drop column (\w+)", instruction)
41
  if match:
42
  df = df.drop(columns=[match.group(1)])
43
 
44
- # Remove duplicates
45
  if "remove duplicates" in instruction:
46
  df = df.drop_duplicates()
47
 
48
- # Drop missing values
49
  if "drop missing" in instruction or "remove null" in instruction:
50
  df = df.dropna()
51
 
52
- # Fill missing values
53
  match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
54
  if match:
55
  val = match.group(1)
56
- try:
57
- val = float(val)
58
- except:
59
- pass
60
  df = df.fillna(val)
61
 
62
- # Sort
63
  match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
64
  if match:
65
  col = match.group(1)
66
  ascending = not bool(match.group(2))
67
  df = df.sort_values(by=col, ascending=ascending)
68
 
69
- # Rename
70
  match = re.search(r"rename column (\w+) to (\w+)", instruction)
71
  if match:
72
- old, new = match.group(1), match.group(2)
73
- df = df.rename(columns={old: new})
74
 
75
- # Filter where col > val
76
  match = re.search(r"filter where (\w+) > (\d+)", instruction)
77
  if match:
78
- col, val = match.group(1), float(match.group(2))
79
- df = df[df[col] > val]
80
 
81
- # Group by and sum
82
  match = re.search(r"group by (\w+) and sum (\w+)", instruction)
83
  if match:
84
- group_col, sum_col = match.group(1), match.group(2)
85
- df = df.groupby(group_col)[sum_col].sum().reset_index()
86
 
87
- # Add column as sum
88
  match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
89
  if match:
90
- new_col, col1, col2 = match.group(1), match.group(2), match.group(3)
91
- df[new_col] = df[col1] + df[col2]
92
 
93
- # Normalize column
94
  match = re.search(r"normalize column (\w+)", instruction)
95
  if match:
96
  col = match.group(1)
97
  df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
98
 
99
- # Standardize column
100
  match = re.search(r"standardize column (\w+)", instruction)
101
  if match:
102
  col = match.group(1)
103
  df[col] = (df[col] - df[col].mean()) / df[col].std()
104
 
105
- # Split column by comma
106
  match = re.search(r"split column (\w+) by comma", instruction)
107
  if match:
108
  col = match.group(1)
109
  df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
110
 
111
- # Remove special characters
112
  match = re.search(r"remove special characters from (\w+)", instruction)
113
  if match:
114
  col = match.group(1)
@@ -119,6 +104,7 @@ def apply_instruction(df, instruction):
119
 
120
  return df, "success"
121
 
 
122
  @app.route("/process", methods=["POST"])
123
  def process_file():
124
  if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
@@ -131,13 +117,13 @@ def process_file():
131
  try:
132
  df = pd.read_csv(file) if file.filename.endswith(".csv") else pd.read_excel(file)
133
  except Exception as e:
134
- return jsonify({"error": f"Failed to read file: {str(e)}"}), 400
135
 
136
  df, status = apply_instruction(df, instruction)
137
 
138
  filename = f"cleaned_{session_id}_{file.filename}"
139
- output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
140
- df.to_csv(output_path, index=False)
141
 
142
  preview = df.head(10).to_dict(orient="records")
143
  return jsonify({
@@ -146,6 +132,7 @@ def process_file():
146
  "status": status
147
  })
148
 
 
149
  @app.route("/download/<filename>", methods=["GET"])
150
  def download_file(filename):
151
  session_id = request.args.get("session_id")
@@ -157,5 +144,6 @@ def download_file(filename):
157
  return send_file(path, as_attachment=True)
158
  return jsonify({"error": "File not found"}), 404
159
 
 
160
  if __name__ == "__main__":
161
  app.run(host="0.0.0.0", port=7860)
 
14
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
15
  app.config['MAX_CONTENT_LENGTH'] = 512 * 1024 * 1024 # 512 MB
16
 
17
+ # === Cleanup Thread: delete files older than 60 minutes ===
18
  def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
19
+ def cleanup_loop():
20
+ while True:
21
+ now = time.time()
22
+ for f in os.listdir(folder):
23
+ path = os.path.join(folder, f)
24
+ if os.path.isfile(path):
25
+ if now - os.path.getmtime(path) > max_age * 60:
26
+ try:
27
+ os.remove(path)
28
+ print(f"[Cleanup] Deleted: {path}")
29
+ except Exception as e:
30
+ print(f"[Cleanup Error] {e}")
31
+ time.sleep(600) # Every 10 minutes
32
+
33
+ threading.Thread(target=cleanup_loop, daemon=True).start()
34
+
35
+ # Start cleanup thread
36
+ clean_old_files()
37
+
38
+ # === Instruction Parser ===
39
  def apply_instruction(df, instruction):
40
  instruction = instruction.lower()
41
 
42
  try:
 
43
  match = re.search(r"drop column (\w+)", instruction)
44
  if match:
45
  df = df.drop(columns=[match.group(1)])
46
 
 
47
  if "remove duplicates" in instruction:
48
  df = df.drop_duplicates()
49
 
 
50
  if "drop missing" in instruction or "remove null" in instruction:
51
  df = df.dropna()
52
 
 
53
  match = re.search(r"fill missing.*with ([\w\.]+)", instruction)
54
  if match:
55
  val = match.group(1)
56
+ try: val = float(val)
57
+ except: pass
 
 
58
  df = df.fillna(val)
59
 
 
60
  match = re.search(r"sort by (\w+)( descending| desc)?", instruction)
61
  if match:
62
  col = match.group(1)
63
  ascending = not bool(match.group(2))
64
  df = df.sort_values(by=col, ascending=ascending)
65
 
 
66
  match = re.search(r"rename column (\w+) to (\w+)", instruction)
67
  if match:
68
+ df = df.rename(columns={match.group(1): match.group(2)})
 
69
 
 
70
  match = re.search(r"filter where (\w+) > (\d+)", instruction)
71
  if match:
72
+ df = df[df[match.group(1)] > float(match.group(2))]
 
73
 
 
74
  match = re.search(r"group by (\w+) and sum (\w+)", instruction)
75
  if match:
76
+ df = df.groupby(match.group(1))[match.group(2)].sum().reset_index()
 
77
 
 
78
  match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction)
79
  if match:
80
+ df[match.group(1)] = df[match.group(2)] + df[match.group(3)]
 
81
 
 
82
  match = re.search(r"normalize column (\w+)", instruction)
83
  if match:
84
  col = match.group(1)
85
  df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
86
 
 
87
  match = re.search(r"standardize column (\w+)", instruction)
88
  if match:
89
  col = match.group(1)
90
  df[col] = (df[col] - df[col].mean()) / df[col].std()
91
 
 
92
  match = re.search(r"split column (\w+) by comma", instruction)
93
  if match:
94
  col = match.group(1)
95
  df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
96
 
 
97
  match = re.search(r"remove special characters from (\w+)", instruction)
98
  if match:
99
  col = match.group(1)
 
104
 
105
  return df, "success"
106
 
107
+ # === File Processor Endpoint ===
108
  @app.route("/process", methods=["POST"])
109
  def process_file():
110
  if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
 
117
  try:
118
  df = pd.read_csv(file) if file.filename.endswith(".csv") else pd.read_excel(file)
119
  except Exception as e:
120
+ return jsonify({"error": f"File read error: {str(e)}"}), 400
121
 
122
  df, status = apply_instruction(df, instruction)
123
 
124
  filename = f"cleaned_{session_id}_{file.filename}"
125
+ filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
126
+ df.to_csv(filepath, index=False)
127
 
128
  preview = df.head(10).to_dict(orient="records")
129
  return jsonify({
 
132
  "status": status
133
  })
134
 
135
+ # === File Download with Session ID Verification ===
136
  @app.route("/download/<filename>", methods=["GET"])
137
  def download_file(filename):
138
  session_id = request.args.get("session_id")
 
144
  return send_file(path, as_attachment=True)
145
  return jsonify({"error": "File not found"}), 404
146
 
147
+ # === Run on Port 7860 for Hugging Face ===
148
  if __name__ == "__main__":
149
  app.run(host="0.0.0.0", port=7860)