File size: 5,549 Bytes
77bf716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd
import os
import threading
import time
import re

# Application object; CORS(app) is applied with its default options.
app = Flask(__name__)
CORS(app)

# Folder that receives processed files, and a session-key prefix constant.
UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"

app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)

def _remove_stale_files(folder, cutoff):
    """Delete regular files in *folder* last modified before *cutoff*.

    Filesystem errors are reported with ``print`` and never propagate, so a
    single unreadable or concurrently removed file cannot stop the caller.

    Args:
        folder: Directory to scan (not recursive).
        cutoff: Epoch timestamp; files with an older mtime are removed.
    """
    try:
        names = os.listdir(folder)
    except OSError as e:
        # Folder missing or unreadable right now; try again on the next pass.
        print(f"[Cleanup Error] {e}")
        return

    for name in names:
        path = os.path.join(folder, name)
        try:
            # getmtime sits inside the try: the file may disappear between
            # the directory listing and the stat call.
            if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                os.remove(path)
                print(f"[Cleanup] Deleted: {path}")
        except OSError as e:
            print(f"[Cleanup Error] {e}")


# Cleanup function runs every 10 mins and deletes files older than 60 mins
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Loop forever, periodically deleting old files from *folder*.

    Intended to be the target of a background thread; it never returns.

    NOTE(review): every regular file in *folder* is eligible for deletion,
    not only files written by this application.

    Args:
        folder: Directory to clean.
        max_age: Age threshold in minutes.
        interval: Seconds to sleep between passes.
    """
    while True:
        _remove_stale_files(folder, time.time() - max_age * 60)
        time.sleep(interval)

# Launch the cleanup loop at import time; as a daemon thread it does not
# keep the interpreter alive on shutdown.
_cleanup_thread = threading.Thread(target=clean_old_files, daemon=True)
_cleanup_thread.start()

def _resolve_column(df, name):
    """Return the column label of *df* that best matches *name*.

    An exact match wins. Otherwise the first column whose string form equals
    *name* ignoring case is returned. When nothing matches, *name* is
    returned unchanged so the following pandas call raises its usual
    ``KeyError``.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if str(label).lower() == wanted:
            return label
    return name


def apply_instruction(df, instruction):
    """Apply every transformation named in a free-text instruction to *df*.

    The instruction is scanned for a fixed set of phrases. Each phrase that
    is present is applied, in this order: drop column, remove duplicates,
    drop missing / remove null, fill missing, sort by, rename column,
    filter where, group by ... and sum, add column, normalize column,
    standardize column, split column by comma, remove special characters.

    Phrases are matched case-insensitively. Column names are matched exactly
    first and then ignoring case. New column names and fill values keep the
    case the caller typed.

    Args:
        df: pandas DataFrame to transform. Steps that add or overwrite a
            column may modify the passed frame in place.
        instruction: Free-text instruction such as
            ``"drop column Age and sort by Name desc"``.

    Returns:
        A ``(df, status)`` tuple. ``status`` is ``"success"`` or
        ``"Error: <message>"``. On error the returned frame reflects the
        steps that completed before the failure.
    """
    ci = re.IGNORECASE
    # Lower-cased copy used only for plain keyword checks; the regexes below
    # read the original text so names keep their case.
    phrases = instruction.lower()

    try:
        # Drop column
        match = re.search(r"drop column (\w+)", instruction, ci)
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        # Remove duplicates
        if "remove duplicates" in phrases:
            df = df.drop_duplicates()

        # Drop missing values
        if "drop missing" in phrases or "remove null" in phrases:
            df = df.dropna()

        # Fill missing values (numeric when the value parses as a number)
        match = re.search(r"fill missing.*with (-?[\w.]+)", instruction, ci)
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                # Not a number: fill with the text exactly as given.
                pass
            df = df.fillna(val)

        # Sort
        match = re.search(r"sort by (\w+)( descending| desc)?", instruction, ci)
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = not bool(match.group(2))
            df = df.sort_values(by=col, ascending=ascending)

        # Rename
        match = re.search(r"rename column (\w+) to (\w+)", instruction, ci)
        if match:
            old, new = _resolve_column(df, match.group(1)), match.group(2)
            df = df.rename(columns={old: new})

        # Filter where col > val (accepts negative and decimal thresholds)
        match = re.search(
            r"filter where (\w+) > (-?\d+(?:\.\d+)?)", instruction, ci
        )
        if match:
            col = _resolve_column(df, match.group(1))
            val = float(match.group(2))
            df = df[df[col] > val]

        # Group by and sum
        match = re.search(r"group by (\w+) and sum (\w+)", instruction, ci)
        if match:
            group_col = _resolve_column(df, match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()

        # Add column as sum
        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction, ci)
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            col2 = _resolve_column(df, match.group(3))
            df[new_col] = df[col1] + df[col2]

        # Normalize column (min-max scaling)
        match = re.search(r"normalize column (\w+)", instruction, ci)
        if match:
            col = _resolve_column(df, match.group(1))
            low = df[col].min()
            shifted = df[col] - low
            span = df[col].max() - low
            # A constant column has zero range; dividing would turn every
            # value into NaN, so map it to 0 instead.
            df[col] = shifted if span == 0 else shifted / span

        # Standardize column (z-score)
        match = re.search(r"standardize column (\w+)", instruction, ci)
        if match:
            col = _resolve_column(df, match.group(1))
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Same guard as above for a zero standard deviation.
            df[col] = centered if spread == 0 else centered / spread

        # Split column by comma into <col>_1, <col>_2, ... (one per piece)
        match = re.search(r"split column (\w+) by comma", instruction, ci)
        if match:
            col = _resolve_column(df, match.group(1))
            pieces = df[col].str.split(",", expand=True)
            for position in range(pieces.shape[1]):
                df[f"{col}_{position + 1}"] = pieces.iloc[:, position]

        # Remove special characters
        match = re.search(r"remove special characters from (\w+)", instruction, ci)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        # Any failure (unknown column, wrong dtype, ...) is reported through
        # the status string rather than raised.
        return df, f"Error: {e}"

    return df, "success"

def _output_filename(session_id, uploaded_name):
    """Build the stored file name ``cleaned_<session_id>_<name>.csv``.

    Only the last path component of *uploaded_name* is kept, so a crafted
    upload name cannot address a location outside the upload folder. A
    ``.csv`` suffix is appended when missing because the result is always
    written as CSV text.
    """
    base = os.path.basename((uploaded_name or "").replace("\\", "/")) or "upload"
    if not base.lower().endswith(".csv"):
        base += ".csv"
    return f"cleaned_{session_id}_{base}"


@app.route("/process", methods=["POST"])
def process_file():
    """Transform an uploaded table and store the result as CSV.

    Expects a multipart form with ``file`` (CSV, anything else is read as
    Excel), ``instruction`` (text passed to ``apply_instruction``) and
    ``session_id`` (embedded in the stored file name).

    Returns:
        JSON with ``preview`` (first 10 rows), ``download_url`` and
        ``status`` (the status string from ``apply_instruction``). Missing
        or invalid input yields a 400 JSON error; a failed write yields a
        500 JSON error.
    """
    required_form_fields = ("instruction", "session_id")
    if "file" not in request.files or any(
        field not in request.form for field in required_form_fields
    ):
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # The session id becomes part of a file name: it must be non-empty and
    # must not introduce directory components.
    if not session_id or "/" in session_id or "\\" in session_id:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    try:
        is_csv = (file.filename or "").lower().endswith(".csv")
        df = pd.read_csv(file) if is_csv else pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    filename = _output_filename(session_id, file.filename)
    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    try:
        df.to_csv(output_path, index=False)
    except OSError as e:
        return jsonify({"error": f"Failed to save result: {e}"}), 500

    # NaN is not valid JSON; convert missing cells to None so they are
    # emitted as null.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })

@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file back to the session that made it.

    The caller must pass its ``session_id`` as a query parameter, and the
    requested name must start with ``cleaned_<session_id>_``, the prefix
    under which ``/process`` stores its output.

    Returns:
        The file as an attachment, a 403 JSON error when the session check
        fails, or a 404 JSON error when no such file exists.
    """
    session_id = request.args.get("session_id")
    # Anchor the check to the start of the name: a substring match would let
    # a caller pick a session_id that happens to occur inside an unrelated
    # file name in the same folder.
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    # Refuse names carrying directory components (e.g. backslashes on
    # Windows) so the joined path stays inside the upload folder.
    if os.path.basename(filename.replace("\\", "/")) != filename:
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile rather than exists: a directory must not reach send_file.
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404

if __name__ == "__main__":
    # Direct execution: serve with Flask's built-in server on every
    # interface, port 7860.
    app.run(port=7860, host="0.0.0.0")