Spaces:

mike23415
/

Data-analytics

Sleeping

App Files Files Community

mike23415 committed on Jun 21

Commit

77bf716

verified ·

1 Parent(s): 401329d

Create app.py

Browse files

Files changed (1) hide show

app.py +161 -0

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+from flask import Flask, request, jsonify, send_file
+from flask_cors import CORS
+import pandas as pd
+import os
+import threading
+import time
+import re
+# Flask application object; CORS is enabled on it with no extra options.
+app = Flask(__name__)
+CORS(app)
+# Directory that processed files are written to and served from.
+UPLOAD_FOLDER = "/tmp"
+SESSION_KEY_PREFIX = "data_tool_session_id"
+# Register the output folder and the request-size limit (512 MB) in one call.
+app.config.update(
+    UPLOAD_FOLDER=UPLOAD_FOLDER,
+    MAX_CONTENT_LENGTH=512 * 1024 * 1024,
+)
+# Cleanup function runs every 10 mins and deletes files older than 60 mins
+def clean_old_files(folder=UPLOAD_FOLDER, max_age=60):
+    """Periodically delete stale files from *folder*; never returns.
+
+    Loops forever: each pass lists the top level of ``folder`` (no
+    recursion), removes every regular file whose modification time is more
+    than ``max_age`` minutes in the past, then sleeps for 10 minutes.
+    NOTE(review): every old file in ``folder`` is removed, not only files
+    written by this app -- confirm that is intended for a shared /tmp.
+
+    Args:
+        folder: Directory to scan.
+        max_age: Age threshold in minutes.
+    """
+    max_age_seconds = max_age * 60
+    while True:
+        now = time.time()
+        try:
+            names = os.listdir(folder)
+        except OSError as e:
+            # A missing or unreadable folder must not end the loop (and with
+            # it the cleanup thread); report it and retry on the next pass.
+            print(f"[Cleanup Error] {e}")
+            names = []
+        for name in names:
+            path = os.path.join(folder, name)
+            try:
+                # The stat calls are inside the try as well: a file can
+                # disappear between listdir() and getmtime().
+                if not os.path.isfile(path):
+                    continue
+                if now - os.path.getmtime(path) > max_age_seconds:
+                    os.remove(path)
+                    print(f"[Cleanup] Deleted: {path}")
+            except OSError as e:
+                print(f"[Cleanup Error] {e}")
+        time.sleep(600)  # Run every 10 minutes
+# Launch the cleanup loop in the background as soon as the module is loaded.
+# It is a daemon thread, so it does not block interpreter shutdown.
+_cleanup_thread = threading.Thread(target=clean_old_files, daemon=True)
+_cleanup_thread.start()
+def _resolve_column(df, name):
+    """Return the label in ``df.columns`` that *name* refers to.
+
+    An exact match wins; otherwise the first column whose string form
+    equals *name* ignoring case is used. When nothing matches, *name* is
+    returned unchanged so the following pandas lookup raises its usual
+    ``KeyError``.
+    """
+    if name in df.columns:
+        return name
+    wanted = name.lower()
+    for label in df.columns:
+        if str(label).lower() == wanted:
+            return label
+    return name
+
+
+def apply_instruction(df, instruction):
+    """Apply the operations described by a plain-text instruction to *df*.
+
+    Keywords are matched case-insensitively; column names are matched
+    exactly first and case-insensitively as a fallback. Several operations
+    may appear in one instruction; they are applied in the fixed order
+    below, not in the order written:
+
+        drop column X / remove duplicates / drop missing | remove null /
+        fill missing ... with V / sort by X [desc|descending] /
+        rename column X to Y / filter where X > N /
+        group by X and sum Y / add column Z as X + Y /
+        normalize column X / standardize column X /
+        split column X by comma / remove special characters from X
+
+    Args:
+        df: Input ``pandas.DataFrame``. Column-assigning operations may
+            modify it in place; always use the returned frame.
+        instruction: Free-text instruction.
+
+    Returns:
+        ``(df, status)`` where status is ``"success"`` or ``"Error: ..."``.
+        On error the frame reflects the operations completed before the
+        failure.
+    """
+    flags = re.IGNORECASE
+    # Lower-cased copy for keyword tests only; the regexes run on the
+    # original text so column names, new names and fill values keep their
+    # case.
+    lowered = instruction.lower()
+    try:
+        # Drop column
+        match = re.search(r"drop column (\w+)", instruction, flags)
+        if match:
+            df = df.drop(columns=[_resolve_column(df, match.group(1))])
+        # Remove duplicates
+        if "remove duplicates" in lowered:
+            df = df.drop_duplicates()
+        # Drop missing values
+        if "drop missing" in lowered or "remove null" in lowered:
+            df = df.dropna()
+        # Fill missing values (numeric when the value parses as a float)
+        match = re.search(r"fill missing.*with (-?[\w.]+)", instruction, flags)
+        if match:
+            val = match.group(1)
+            try:
+                val = float(val)
+            except ValueError:
+                pass  # not a number: fill with the literal text
+            df = df.fillna(val)
+        # Sort
+        match = re.search(r"sort by (\w+)( descending| desc)?", instruction, flags)
+        if match:
+            col = _resolve_column(df, match.group(1))
+            df = df.sort_values(by=col, ascending=match.group(2) is None)
+        # Rename
+        match = re.search(r"rename column (\w+) to (\w+)", instruction, flags)
+        if match:
+            old, new = _resolve_column(df, match.group(1)), match.group(2)
+            df = df.rename(columns={old: new})
+        # Filter where col > val (integer or decimal threshold)
+        match = re.search(
+            r"filter where (\w+) > (-?\d+(?:\.\d+)?)", instruction, flags
+        )
+        if match:
+            col = _resolve_column(df, match.group(1))
+            df = df[df[col] > float(match.group(2))]
+        # Group by and sum
+        match = re.search(r"group by (\w+) and sum (\w+)", instruction, flags)
+        if match:
+            group_col = _resolve_column(df, match.group(1))
+            sum_col = _resolve_column(df, match.group(2))
+            df = df.groupby(group_col)[sum_col].sum().reset_index()
+        # Add column as sum
+        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction, flags)
+        if match:
+            new_col = match.group(1)
+            col1 = _resolve_column(df, match.group(2))
+            col2 = _resolve_column(df, match.group(3))
+            df[new_col] = df[col1] + df[col2]
+        # Normalize column (min-max)
+        match = re.search(r"normalize column (\w+)", instruction, flags)
+        if match:
+            col = _resolve_column(df, match.group(1))
+            low = df[col].min()
+            spread = df[col].max() - low
+            shifted = df[col] - low
+            # A constant column has zero spread; keep the zeros instead of
+            # dividing by zero.
+            df[col] = shifted / spread if spread != 0 else shifted
+        # Standardize column (z-score)
+        match = re.search(r"standardize column (\w+)", instruction, flags)
+        if match:
+            col = _resolve_column(df, match.group(1))
+            centered = df[col] - df[col].mean()
+            std = df[col].std()
+            # Same guard as above for a zero standard deviation.
+            df[col] = centered / std if std != 0 else centered
+        # Split column by comma
+        match = re.search(r"split column (\w+) by comma", instruction, flags)
+        if match:
+            col = _resolve_column(df, match.group(1))
+            parts = df[col].str.split(",", expand=True)
+            # One output column per piece (<col>_1, <col>_2, ...) rather
+            # than assuming exactly two pieces.
+            for i in range(parts.shape[1]):
+                df[f"{col}_{i + 1}"] = parts[i]
+        # Remove special characters
+        match = re.search(r"remove special characters from (\w+)", instruction, flags)
+        if match:
+            col = _resolve_column(df, match.group(1))
+            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
+    except Exception as e:
+        # Boundary handler: any pandas failure is reported in the status.
+        return df, f"Error: {e}"
+    return df, "success"
+@app.route("/process", methods=["POST"])
+def process_file():
+    """Read an uploaded table, apply the instruction, and save the result.
+
+    Expects a multipart form with a ``file`` part plus ``instruction`` and
+    ``session_id`` fields. Files whose name ends in ``.csv`` (any case) are
+    read with ``pd.read_csv``; everything else is passed to
+    ``pd.read_excel``. The result is always written as CSV into the
+    configured upload folder.
+
+    Returns:
+        JSON with ``preview`` (first 10 rows), ``download_url`` and
+        ``status`` (as returned by ``apply_instruction``), or a JSON error
+        with HTTP 400 when a field is missing or the file cannot be read.
+    """
+    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
+        return jsonify({"error": "Missing file, instruction, or session_id"}), 400
+    file = request.files["file"]
+    instruction = request.form["instruction"]
+    session_id = request.form["session_id"]
+    # The client may send no filename at all.
+    original_name = file.filename or ""
+    try:
+        if original_name.lower().endswith(".csv"):
+            df = pd.read_csv(file)
+        else:
+            df = pd.read_excel(file)
+    except Exception as e:
+        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400
+    df, status = apply_instruction(df, instruction)
+    # The output is CSV whatever was uploaded, so name it accordingly.
+    stem = os.path.splitext(original_name)[0] or "data"
+    # session_id and the upload name are client-controlled: replace path
+    # separators and any other unexpected character so the result cannot
+    # escape the upload folder. A session_id made of letters, digits, "_",
+    # "-" or "." is left untouched, which the download check relies on.
+    filename = re.sub(r"[^\w.\-]", "_", f"cleaned_{session_id}_{stem}.csv")
+    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+    df.to_csv(output_path, index=False)
+    # NaN is not valid JSON; emit null for missing cells in the preview.
+    head = df.head(10)
+    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
+    return jsonify({
+        "preview": preview,
+        "download_url": f"/download/{filename}",
+        "status": status
+    })
+@app.route("/download/<filename>", methods=["GET"])
+def download_file(filename):
+    """Send a previously processed file back to the session that made it.
+
+    The caller must pass its ``session_id`` as a query parameter, and the
+    requested name must start with ``cleaned_<session_id>_`` -- the prefix
+    given to every file written by the processing endpoint.
+
+    Returns:
+        The file as an attachment, a JSON error with HTTP 403 when the
+        session check fails, or a JSON error with HTTP 404 when no such
+        file exists.
+    """
+    session_id = request.args.get("session_id")
+    # Anchor the check to the prefix: a plain substring test would also
+    # accept a session_id that merely occurs somewhere in the file name.
+    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
+        return jsonify({"error": "Unauthorized download attempt"}), 403
+    # Defence in depth: never accept a name that carries a directory part.
+    if os.path.basename(filename) != filename:
+        return jsonify({"error": "Unauthorized download attempt"}), 403
+    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+    # isfile (not exists) so a directory is reported as not found.
+    if os.path.isfile(path):
+        return send_file(path, as_attachment=True)
+    return jsonify({"error": "File not found"}), 404
+if __name__ == "__main__":
+    # Run the built-in server on all interfaces, port 7860, when this file
+    # is executed directly.
+    app.run(port=7860, host="0.0.0.0")