File size: 5,207 Bytes
77bf716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b74b13
77bf716
2b74b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77bf716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b74b13
 
77bf716
 
 
 
 
 
 
 
 
 
2b74b13
77bf716
 
 
2b74b13
77bf716
 
 
2b74b13
77bf716
 
 
2b74b13
77bf716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b74b13
77bf716
 
 
 
 
 
 
 
 
 
 
 
2b74b13
77bf716
 
 
 
2b74b13
 
77bf716
 
 
 
 
 
 
 
2b74b13
77bf716
 
 
 
 
 
 
 
 
 
 
2b74b13
77bf716
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd
import os
import threading
import time
import re

# Flask application object, wrapped with flask_cors' CORS helper
app = Flask(__name__)
CORS(app)

# Folder where processed files are written and later served from
UPLOAD_FOLDER = "/tmp"
# NOTE(review): not referenced anywhere in the visible code
SESSION_KEY_PREFIX = "data_tool_session_id"

app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)

# === Cleanup Thread: delete files older than 60 minutes ===
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Start a daemon thread that periodically deletes old files in *folder*.

    Only regular files directly inside *folder* are considered; a file is
    removed once its modification time is more than *max_age* minutes in
    the past. Filesystem errors are reported with ``print`` and never stop
    the background loop.

    Args:
        folder: Directory to sweep.
        max_age: Age threshold in minutes.
        interval: Seconds to sleep between sweeps (every 10 minutes by default).

    Returns:
        None. The thread is a daemon, so it does not block interpreter exit.
    """
    max_age_seconds = max_age * 60

    def sweep_once():
        now = time.time()
        try:
            names = os.listdir(folder)
        except OSError as e:
            # Folder missing or unreadable: report and retry on the next sweep
            print(f"[Cleanup Error] {e}")
            return

        for name in names:
            path = os.path.join(folder, name)
            try:
                # The file may disappear between the listing and these calls,
                # so the stat calls are guarded together with the removal.
                if not os.path.isfile(path):
                    continue
                if now - os.path.getmtime(path) > max_age_seconds:
                    os.remove(path)
                    print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                print(f"[Cleanup Error] {e}")

    def cleanup_loop():
        while True:
            sweep_once()
            time.sleep(interval)

    threading.Thread(target=cleanup_loop, daemon=True).start()

# Start the cleanup thread as soon as this module is imported, using the
# default folder and age limit of clean_old_files()
clean_old_files()

# === Instruction Parser ===
def _resolve_column(df, name):
    """Return the label in ``df.columns`` that the user-typed *name* refers to.

    An exact match wins; otherwise the first column whose name matches
    case-insensitively is used. If nothing matches, *name* is returned
    unchanged so that pandas raises its usual error for an unknown column.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if isinstance(label, str) and label.lower() == wanted:
            return label
    return name


def apply_instruction(df, instruction):
    """Apply the transformations described by a plain-text instruction.

    Keywords are matched case-insensitively, while column names, new column
    names and fill values keep the casing the user typed. Column names are
    resolved against the frame exactly first, then case-insensitively.
    Every recognised phrase in the instruction is applied, in this order:

    * ``drop column <col>``
    * ``remove duplicates``
    * ``drop missing`` / ``remove null``
    * ``fill missing ... with <value>`` (numeric values are used as floats)
    * ``sort by <col> [desc|descending]``
    * ``rename column <old> to <new>``
    * ``filter where <col> > <number>`` (integers or decimals, may be negative)
    * ``group by <col> and sum <col>``
    * ``add column <new> as <a> + <b>``
    * ``normalize column <col>`` (min-max; a constant column becomes NaN)
    * ``standardize column <col>`` (z-score)
    * ``split column <col> by comma`` (creates ``<col>_1``, ``<col>_2``, ...)
    * ``remove special characters from <col>``

    Args:
        df: The pandas DataFrame to transform. Column-assigning steps write
            into the frame they receive, so the caller's object may be
            modified in place.
        instruction: Free-form instruction text.

    Returns:
        A ``(df, status)`` tuple. ``status`` is ``"success"``, or
        ``"Error: <message>"`` if any step raised; in that case ``df``
        reflects the steps completed before the failure.
    """
    lowered = instruction.lower()
    flags = re.IGNORECASE

    try:
        match = re.search(r"drop column (\w+)", instruction, flags)
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        if "remove duplicates" in lowered:
            df = df.drop_duplicates()

        if "drop missing" in lowered or "remove null" in lowered:
            df = df.dropna()

        match = re.search(r"fill missing.*with (-?[\w\.]+)", instruction, flags)
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                # Not numeric: fill with the text exactly as typed
                pass
            df = df.fillna(val)

        match = re.search(r"sort by (\w+)( descending| desc)?", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = not match.group(2)
            df = df.sort_values(by=col, ascending=ascending)

        match = re.search(r"rename column (\w+) to (\w+)", instruction, flags)
        if match:
            old = _resolve_column(df, match.group(1))
            df = df.rename(columns={old: match.group(2)})

        # Accept decimals and negatives so "x > 3.5" is not cut down to "x > 3"
        match = re.search(r"filter where (\w+) > (-?\d+(?:\.\d+)?)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df = df[df[col] > float(match.group(2))]

        match = re.search(r"group by (\w+) and sum (\w+)", instruction, flags)
        if match:
            key = _resolve_column(df, match.group(1))
            value = _resolve_column(df, match.group(2))
            df = df.groupby(key)[value].sum().reset_index()

        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction, flags)
        if match:
            left = _resolve_column(df, match.group(2))
            right = _resolve_column(df, match.group(3))
            # Resolving the target overwrites an existing column of that name
            target = _resolve_column(df, match.group(1))
            df[target] = df[left] + df[right]

        match = re.search(r"normalize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            low = df[col].min()
            high = df[col].max()
            df[col] = (df[col] - low) / (high - low)

        match = re.search(r"standardize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = (df[col] - df[col].mean()) / df[col].std()

        match = re.search(r"split column (\w+) by comma", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            parts = df[col].str.split(",", expand=True)
            # One output column per piece, however many pieces the data has
            for position, part_label in enumerate(parts.columns, start=1):
                df[f"{col}_{position}"] = parts[part_label]

        match = re.search(r"remove special characters from (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        return df, f"Error: {e}"

    return df, "success"

# === File Processor Endpoint ===
@app.route("/process", methods=["POST"])
def process_file():
    """Apply a text instruction to an uploaded CSV/Excel file.

    Expects a multipart form with:
        file: the upload; names ending in ``.csv`` (any letter case) are read
            as CSV, everything else is handed to ``pd.read_excel``.
        instruction: text passed to ``apply_instruction``.
        session_id: client-chosen identifier embedded in the output filename.

    The transformed data is always written as CSV into the upload folder.

    Returns:
        JSON with ``preview`` (first 10 rows, missing values as ``null``),
        ``download_url`` and ``status`` (the status string returned by
        ``apply_instruction``). Responds 400 with an ``error`` message when
        a field is missing, the session id contains a path separator, or
        the file cannot be parsed.
    """
    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id becomes part of a path on disk, so it must not be able to
    # point outside the upload folder.
    if any(ch in session_id for ch in ("/", "\\", "\x00")):
        return jsonify({"error": "Invalid session_id"}), 400

    upload_name = file.filename or ""

    try:
        if upload_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"File read error: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # Keep only the final path component of the client-supplied name and
    # replace anything that is unsafe in a filename or URL.
    base = os.path.basename(upload_name.replace("\\", "/"))
    stem = re.sub(r"[^\w.-]", "_", os.path.splitext(base)[0]) or "upload"
    # The output is CSV regardless of the upload's format, so name it that way
    filename = f"cleaned_{session_id}_{stem}.csv"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(filepath, index=False)

    # NaN would be serialised as a bare NaN token, which is not valid JSON;
    # turn missing cells into None so they are emitted as null.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })

# === File Download with Session ID Verification ===
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file back to the session that created it.

    The ``session_id`` query parameter must be present, and *filename* must
    start with ``cleaned_<session_id>_``, the prefix ``process_file`` gives
    its output files. A prefix check is used rather than a substring check
    so that a session id cannot match a fragment in the middle of another
    file's name.

    Args:
        filename: Name of the file inside the upload folder (from the URL).

    Returns:
        The file as an attachment; a 403 JSON error when the session check
        fails; a 404 JSON error when no such regular file exists.
    """
    session_id = request.args.get("session_id")
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # isfile rather than exists: a directory of that name is not downloadable
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404

# === Run on Port 7860 for Hugging Face ===
# Only when executed as a script (not when imported): start Flask's built-in
# server on every network interface, port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)