Spaces:

mike23415
/

Data-analytics

Sleeping

File size: 5,207 Bytes

from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd
import os
import threading
import time
import re

# Flask application object, wrapped by flask_cors with its default options.
app = Flask(__name__)
CORS(app)

# Directory where processed files are written and later served from.
UPLOAD_FOLDER = "/tmp"
# NOTE(review): not referenced anywhere else in the visible code.
SESSION_KEY_PREFIX = "data_tool_session_id"

app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)

# === Cleanup Thread: delete files older than 60 minutes ===
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
    """Start a daemon thread that periodically deletes stale files in *folder*.

    On every sweep, each regular file directly inside *folder* whose
    modification time is more than *max_age* minutes in the past is removed.
    Subdirectories are skipped. The sweep looks at every file in the folder,
    not only files written by this application.

    Failures (a missing folder, a file that vanishes mid-sweep, a permission
    error) are printed and the loop keeps running.

    Args:
        folder: Directory to sweep.
        max_age: Age threshold in minutes.
        interval: Seconds to sleep between sweeps.
    """
    max_age_seconds = max_age * 60

    def sweep_once():
        now = time.time()
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            try:
                if not os.path.isfile(path):
                    continue
                # getmtime/remove can fail if another process deletes the
                # file between the listing and this point.
                if now - os.path.getmtime(path) <= max_age_seconds:
                    continue
                os.remove(path)
                print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                print(f"[Cleanup Error] {e}")

    def cleanup_loop():
        while True:
            try:
                sweep_once()
            except Exception as e:
                # Thread boundary: e.g. the folder is missing or unreadable.
                # Report it and try again on the next sweep.
                print(f"[Cleanup Error] {e}")
            time.sleep(interval)

    threading.Thread(target=cleanup_loop, daemon=True).start()

# Start the cleanup thread as soon as this module is imported, using the
# function's default arguments.
clean_old_files()

# === Instruction Parser ===
def _resolve_column(df, name):
    """Return the label in ``df.columns`` that *name* refers to.

    An exact match wins. Otherwise the first label whose string form equals
    *name* ignoring case is returned. If nothing matches, *name* is returned
    unchanged so the following pandas lookup raises its usual error.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    for label in df.columns:
        if str(label).lower() == wanted:
            return label
    return name


def apply_instruction(df, instruction):
    """Apply the operations named in a free-text *instruction* to *df*.

    Keywords are matched case-insensitively. Column names are matched exactly
    first and then case-insensitively; new column names and fill values keep
    the case the user typed. Every phrase found in the instruction is applied,
    in this fixed order:

    * ``drop column <col>``
    * ``remove duplicates``
    * ``drop missing`` / ``remove null``
    * ``fill missing ... with <value>`` (numeric if it parses as a float)
    * ``sort by <col> [desc|descending]``
    * ``rename column <old> to <new>``
    * ``filter where <col> > <number>`` (integer or decimal, optional sign)
    * ``group by <col> and sum <col>``
    * ``add column <new> as <a> + <b>``
    * ``normalize column <col>`` (min-max scaling)
    * ``standardize column <col>`` (subtract mean, divide by std)
    * ``split column <col> by comma`` (creates ``<col>_1``, ``<col>_2``, ...)
    * ``remove special characters from <col>``

    The column-assigning steps (add, normalize, standardize, split, remove
    special characters) assign onto the current frame object, which is the
    caller's frame when no earlier step has replaced it.

    Args:
        df: The pandas DataFrame to transform.
        instruction: Free-text instruction string.

    Returns:
        A ``(df, status)`` tuple. ``status`` is ``"success"``, or
        ``"Error: <message>"`` if a step raised; in that case ``df`` is the
        frame as it stood when the failing step was reached and later steps
        are not attempted.
    """
    lowered = instruction.lower()
    flags = re.IGNORECASE

    try:
        match = re.search(r"drop column (\w+)", instruction, flags)
        if match:
            df = df.drop(columns=[_resolve_column(df, match.group(1))])

        if "remove duplicates" in lowered:
            df = df.drop_duplicates()

        if "drop missing" in lowered or "remove null" in lowered:
            df = df.dropna()

        match = re.search(r"fill missing.*with (-?[\w\.]+)", instruction, flags)
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                # Not numeric: fill with the text as typed.
                pass
            df = df.fillna(val)

        match = re.search(r"sort by (\w+)( descending| desc)?", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            ascending = not bool(match.group(2))
            df = df.sort_values(by=col, ascending=ascending)

        match = re.search(r"rename column (\w+) to (\w+)", instruction, flags)
        if match:
            old = _resolve_column(df, match.group(1))
            df = df.rename(columns={old: match.group(2)})

        # Accept decimals and a leading minus; a bare \d+ would silently
        # truncate "2.5" to "2".
        match = re.search(
            r"filter where (\w+) > (-?\d+(?:\.\d+)?)", instruction, flags
        )
        if match:
            col = _resolve_column(df, match.group(1))
            df = df[df[col] > float(match.group(2))]

        match = re.search(r"group by (\w+) and sum (\w+)", instruction, flags)
        if match:
            key = _resolve_column(df, match.group(1))
            target = _resolve_column(df, match.group(2))
            df = df.groupby(key)[target].sum().reset_index()

        match = re.search(r"add column (\w+) as (\w+) \+ (\w+)", instruction, flags)
        if match:
            left = _resolve_column(df, match.group(2))
            right = _resolve_column(df, match.group(3))
            df[match.group(1)] = df[left] + df[right]

        match = re.search(r"normalize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

        match = re.search(r"standardize column (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = (df[col] - df[col].mean()) / df[col].std()

        match = re.search(r"split column (\w+) by comma", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            parts = df[col].str.split(",", expand=True)
            # One output column per part, however many parts the data has.
            for idx in parts.columns:
                df[f"{col}_{idx + 1}"] = parts[idx]

        match = re.search(r"remove special characters from (\w+)", instruction, flags)
        if match:
            col = _resolve_column(df, match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

    except Exception as e:
        return df, f"Error: {e}"

    return df, "success"

# === File Processor Endpoint ===
@app.route("/process", methods=["POST"])
def process_file():
    """Apply an instruction to an uploaded table and store the result as CSV.

    Expects a multipart POST with a ``file`` part plus ``instruction`` and
    ``session_id`` form fields. Files whose name ends in ``.csv`` (any case)
    are read with ``pd.read_csv``; everything else is read with
    ``pd.read_excel``. The transformed frame is written to the upload folder
    as ``cleaned_<session_id>_<original stem>.csv``.

    Returns:
        JSON with ``preview`` (first 10 rows as records, missing values as
        ``null``), ``download_url`` and ``status`` (the status string from
        ``apply_instruction``). Responds 400 when a field is missing, the
        session id contains a path separator, or the file cannot be read.
    """
    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

    file = request.files["file"]
    instruction = request.form["instruction"]
    session_id = request.form["session_id"]

    # session_id becomes part of a path on disk; a separator would let the
    # client write outside the upload folder.
    if not session_id or any(ch in session_id for ch in ("/", "\\", "\0")):
        return jsonify({"error": "Invalid session_id"}), 400

    # The client-supplied filename may be missing or contain directory parts.
    upload_name = (file.filename or "").replace("\0", "")
    base_name = os.path.basename(upload_name.replace("\\", "/"))

    try:
        if base_name.lower().endswith(".csv"):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
    except Exception as e:
        return jsonify({"error": f"File read error: {str(e)}"}), 400

    df, status = apply_instruction(df, instruction)

    # The output is always CSV, so name it accordingly even for Excel uploads.
    stem = os.path.splitext(base_name)[0] or "data"
    filename = f"cleaned_{session_id}_{stem}.csv"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    df.to_csv(filepath, index=False)

    # NaN/NaT would otherwise be emitted as bare NaN, which is not valid JSON.
    head = df.head(10)
    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
    return jsonify({
        "preview": preview,
        "download_url": f"/download/{filename}",
        "status": status
    })

# === File Download with Session ID Verification ===
@app.route("/download/<filename>", methods=["GET"])
def download_file(filename):
    """Send a previously processed file to the session that created it.

    The caller must pass ``session_id`` as a query parameter, and *filename*
    must start with ``cleaned_<session_id>_``, which is the naming scheme used
    when processed files are written. This keeps unrelated files in the
    upload folder from being served.

    Returns:
        The file as an attachment; 403 if the session id is missing or does
        not match the filename; 404 if the name contains a directory part or
        no such regular file exists.
    """
    session_id = request.args.get("session_id")
    # NOTE(review): a session id that is a prefix of another one followed by
    # "_" still matches; the filename format cannot distinguish them.
    if not session_id or not filename.startswith(f"cleaned_{session_id}_"):
        return jsonify({"error": "Unauthorized download attempt"}), 403

    # Refuse anything that is not a plain file name inside the upload folder.
    if filename != os.path.basename(filename) or "\\" in filename:
        return jsonify({"error": "File not found"}), 404

    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    if os.path.isfile(path):
        return send_file(path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404

# === Run on Port 7860 for Hugging Face ===
# Only when executed as a script (not on import): start Flask's built-in
# server bound to 0.0.0.0 on port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)