Spaces:

mike23415
/

Data-analytics

Sleeping

App Files Files Community

mike23415 committed on Jun 21

Commit

682de52

verified ·

1 Parent(s): 2b74b13

Update app.py

Browse files

Files changed (1) hide show

app.py +272 -30

app.py CHANGED Viewed

@@ -82,68 +82,310 @@ def apply_instruction(df, instruction):
         match = re.search(r"normalize column (\w+)", instruction)
         if match:
             col = match.group(1)
             df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
         match = re.search(r"standardize column (\w+)", instruction)
         if match:
             col = match.group(1)
             df[col] = (df[col] - df[col].mean()) / df[col].std()
         match = re.search(r"split column (\w+) by comma", instruction)
         if match:
             col = match.group(1)
             df[[f"{col}_1", f"{col}_2"]] = df[col].str.split(",", expand=True)
         match = re.search(r"remove special characters from (\w+)", instruction)
         if match:
             col = match.group(1)
             df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
-    except Exception as e:
-        return df, f"Error: {e}"
-    return df, "success"
 # === File Processor Endpoint ===
 @app.route("/process", methods=["POST"])
 def process_file():
-    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
-        return jsonify({"error": "Missing file, instruction, or session_id"}), 400
-    file = request.files["file"]
-    instruction = request.form["instruction"]
-    session_id = request.form["session_id"]
-    try:
-        df = pd.read_csv(file) if file.filename.endswith(".csv") else pd.read_excel(file)
-    except Exception as e:
-        return jsonify({"error": f"File read error: {str(e)}"}), 400
-    df, status = apply_instruction(df, instruction)
-    filename = f"cleaned_{session_id}_{file.filename}"
-    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-    df.to_csv(filepath, index=False)
-    preview = df.head(10).to_dict(orient="records")
-    return jsonify({
-        "preview": preview,
-        "download_url": f"/download/{filename}",
-        "status": status
-    })
 # === File Download with Session ID Verification ===
 @app.route("/download/<filename>", methods=["GET"])
 def download_file(filename):
-    session_id = request.args.get("session_id")
-    if not session_id or f"_{session_id}_" not in filename:
-        return jsonify({"error": "Unauthorized download attempt"}), 403
-    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-    if os.path.exists(path):
-        return send_file(path, as_attachment=True)
-    return jsonify({"error": "File not found"}), 404
 # === Run on Port 7860 for Hugging Face ===
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)

         match = re.search(r"normalize column (\w+)", instruction)
         if match:
             col = match.group(1)
+from flask import Flask, request, jsonify, send_file
+from flask_cors import CORS
+import pandas as pd
+import os
+import threading
+import time
+import re
# Module-level constants first, then the Flask application configured from them.
UPLOAD_FOLDER = "/tmp"
SESSION_KEY_PREFIX = "data_tool_session_id"

app = Flask(__name__)
CORS(app)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
)
+# === Root Route (Required for Hugging Face) ===
@app.route("/", methods=["GET"])
def root():
    """Return a JSON description of the service and the endpoints it exposes."""
    endpoint_help = {
        "POST /process": "Upload and process CSV/Excel files",
        "GET /download/<filename>": "Download processed file with session_id parameter",
        "GET /health": "Health check",
    }
    payload = {
        "message": "Data Processing API is running",
        "status": "healthy",
        "endpoints": endpoint_help,
        "version": "1.0",
    }
    return jsonify(payload)
+# === Health Check Route ===
@app.route("/health", methods=["GET"])
def health_check():
    """Return a JSON health marker together with the current Unix timestamp."""
    current_time = time.time()
    report = {"status": "healthy", "timestamp": current_time}
    return jsonify(report)
+# === Cleanup Thread: delete files older than 60 minutes ===
def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, prefixes=("processed_", "cleaned_")):
    """Start a daemon thread that periodically deletes stale output files.

    Args:
        folder: Directory to sweep.
        max_age: Age threshold in minutes, compared against each file's
            modification time.
        prefixes: Only files whose names start with one of these prefixes are
            eligible for deletion. ``processed_`` is the prefix ``/process``
            gives its output files; ``cleaned_`` is the prefix used by the
            older naming scheme, kept so such leftovers still expire. The
            folder is a shared temp directory, so files this service did not
            create must never be removed.

    The sweep runs every 10 minutes for the lifetime of the process. Failures
    are printed and never propagate out of the thread.
    """
    max_age_seconds = max_age * 60
    name_prefixes = tuple(prefixes)

    def sweep_once():
        now = time.time()
        if not os.path.exists(folder):
            return
        for name in os.listdir(folder):
            if not name.startswith(name_prefixes):
                continue
            path = os.path.join(folder, name)
            # Handle errors per file: a file that vanishes between listdir()
            # and getmtime()/remove() must not abort the rest of the sweep.
            try:
                if not os.path.isfile(path):
                    continue
                if now - os.path.getmtime(path) <= max_age_seconds:
                    continue
                os.remove(path)
                print(f"[Cleanup] Deleted: {path}")
            except OSError as e:
                print(f"[Cleanup Error] {e}")

    def cleanup_loop():
        while True:
            try:
                sweep_once()
            except Exception as e:
                # Last-resort guard so the cleanup thread never dies.
                print(f"[Cleanup Error] {e}")
            time.sleep(600)  # Every 10 minutes

    threading.Thread(target=cleanup_loop, daemon=True).start()


# Start cleanup thread
clean_old_files()
+# === Instruction Parser ===
def _resolve_column(df, name):
    """Return the label in ``df.columns`` that *name* refers to, or ``None``.

    An exact match wins; otherwise a single case-insensitive match is accepted.
    Several labels differing only by case are ambiguous and yield ``None``.
    """
    if name in df.columns:
        return name
    wanted = name.lower()
    candidates = [label for label in df.columns if str(label).lower() == wanted]
    return candidates[0] if len(candidates) == 1 else None


def apply_instruction(df, instruction):
    """Apply one plain-English data-cleaning instruction to a DataFrame.

    The first matching rule wins, tried in this order: drop column, remove
    duplicates, drop missing / remove null, fill missing ... with VALUE,
    sort by COL [desc|descending], rename column A to B, filter where COL > N,
    group by A and sum B, add column C as A + B, normalize column, standardize
    column, split column COL by comma, remove special characters from COL.

    Keywords are matched case-insensitively. Existing columns are also looked
    up case-insensitively, while user-supplied literals (fill value, new column
    names) keep the case they were typed in.

    Args:
        df: The pandas DataFrame to transform. Rules that assign a column (add,
            normalize, standardize, split, remove special characters) modify
            this object in place.
        instruction: The instruction text.

    Returns:
        A ``(DataFrame, status)`` tuple. On any failure the DataFrame is
        returned as it stood and the status starts with ``"Error:"``. This
        function does not raise.
    """
    text = instruction.strip()
    lowered = text.lower()
    if not text:
        return df, "No instruction provided"

    def find(pattern):
        return re.search(pattern, text, re.IGNORECASE)

    def missing(name):
        return f"Error: Column '{name}' not found"

    try:
        # Drop column
        match = find(r"drop column (\w+)")
        if match:
            requested = match.group(1)
            col_name = _resolve_column(df, requested)
            if col_name is None:
                return df, missing(requested)
            df = df.drop(columns=[col_name])
            return df, f"Dropped column '{col_name}'"

        # Remove duplicates
        if "remove duplicates" in lowered:
            original_count = len(df)
            df = df.drop_duplicates()
            return df, f"Removed {original_count - len(df)} duplicate rows"

        # Drop missing values
        if "drop missing" in lowered or "remove null" in lowered:
            original_count = len(df)
            df = df.dropna()
            return df, f"Removed {original_count - len(df)} rows with missing values"

        # Fill missing values
        match = find(r"fill missing.*with ([\w\.]+)")
        if match:
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                pass  # not a number: fill with the literal text instead
            missing_count = df.isnull().sum().sum()
            df = df.fillna(val)
            return df, f"Filled {missing_count} missing values with '{val}'"

        # Sort by column
        match = find(r"sort by (\w+)( descending| desc)?")
        if match:
            requested = match.group(1)
            col = _resolve_column(df, requested)
            if col is None:
                return df, missing(requested)
            ascending = not bool(match.group(2))
            df = df.sort_values(by=col, ascending=ascending)
            order = "ascending" if ascending else "descending"
            return df, f"Sorted by '{col}' in {order} order"

        # Rename column
        match = find(r"rename column (\w+) to (\w+)")
        if match:
            requested, new_name = match.group(1), match.group(2)
            old_name = _resolve_column(df, requested)
            if old_name is None:
                return df, missing(requested)
            df = df.rename(columns={old_name: new_name})
            return df, f"Renamed column '{old_name}' to '{new_name}'"

        # Filter rows; the threshold may be negative and/or have a decimal part
        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
        if match:
            requested, val = match.group(1), float(match.group(2))
            col = _resolve_column(df, requested)
            if col is None:
                return df, missing(requested)
            df = df[df[col] > val]
            return df, f"Filtered data: kept {len(df)} rows where {col} > {val}"

        # Group by and sum
        match = find(r"group by (\w+) and sum (\w+)")
        if match:
            group_col = _resolve_column(df, match.group(1))
            if group_col is None:
                return df, missing(match.group(1))
            sum_col = _resolve_column(df, match.group(2))
            if sum_col is None:
                return df, missing(match.group(2))
            df = df.groupby(group_col)[sum_col].sum().reset_index()
            return df, f"Grouped by '{group_col}' and summed '{sum_col}'"

        # Add column (sum of two columns)
        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
        if match:
            new_col = match.group(1)
            col1 = _resolve_column(df, match.group(2))
            if col1 is None:
                return df, missing(match.group(2))
            col2 = _resolve_column(df, match.group(3))
            if col2 is None:
                return df, missing(match.group(3))
            df[new_col] = df[col1] + df[col2]
            return df, f"Added column '{new_col}' as sum of '{col1}' and '{col2}'"

        # Normalize column
        match = find(r"normalize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, missing(match.group(1))
            if not pd.api.types.is_numeric_dtype(df[col]):
                return df, f"Error: Column '{col}' is not numeric"
            low = df[col].min()
            span = df[col].max() - low
            shifted = df[col] - low
            # A constant column has zero range; dividing by it would turn every
            # value into NaN, so map such a column to 0.0 instead.
            df[col] = shifted * 0.0 if span == 0 else shifted / span
            return df, f"Normalized column '{col}' using min-max scaling"

        # Standardize column
        match = find(r"standardize column (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, missing(match.group(1))
            if not pd.api.types.is_numeric_dtype(df[col]):
                return df, f"Error: Column '{col}' is not numeric"
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # Same guard as min-max scaling: zero spread would give 0/0 = NaN.
            df[col] = centered * 0.0 if spread == 0 else centered / spread
            return df, f"Standardized column '{col}' using z-score"

        # Split column by comma
        match = find(r"split column (\w+) by comma")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, missing(match.group(1))
            # Split on the first comma only and force exactly two result
            # columns, so rows with zero or several commas do not make the
            # assignment fail on a shape mismatch.
            parts = df[col].str.split(",", n=1, expand=True).reindex(columns=[0, 1])
            df[f"{col}_1"] = parts[0]
            df[f"{col}_2"] = parts[1]
            return df, f"Split column '{col}' by comma into '{col}_1' and '{col}_2'"

        # Remove special characters
        match = find(r"remove special characters from (\w+)")
        if match:
            col = _resolve_column(df, match.group(1))
            if col is None:
                return df, missing(match.group(1))
            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)
            return df, f"Removed special characters from column '{col}'"

        # If no instruction matched
        return df, f"Instruction '{lowered}' not recognized"
    except Exception as e:
        return df, f"Error: {str(e)}"
 # === File Processor Endpoint ===
 @app.route("/process", methods=["POST"])
 def process_file():
+    try:
+        # Validate request
+        if "file" not in request.files:
+            return jsonify({"error": "No file provided"}), 400
+        if "instruction" not in request.form:
+            return jsonify({"error": "No instruction provided"}), 400
+        if "session_id" not in request.form:
+            return jsonify({"error": "No session_id provided"}), 400
+        file = request.files["file"]
+        instruction = request.form["instruction"]
+        session_id = request.form["session_id"]
+        if file.filename == '':
+            return jsonify({"error": "No file selected"}), 400
+        # Read file
+        try:
+            if file.filename.lower().endswith('.csv'):
+                df = pd.read_csv(file)
+            elif file.filename.lower().endswith(('.xlsx', '.xls')):
+                df = pd.read_excel(file)
+            else:
+                return jsonify({"error": "Unsupported file format. Use CSV or Excel files."}), 400
+        except Exception as e:
+            return jsonify({"error": f"File reading error: {str(e)}"}), 400
+        # Apply instruction
+        df_processed, status = apply_instruction(df, instruction)
+        # Save processed file
+        original_name = file.filename.rsplit('.', 1)[0]  # Remove extension
+        filename = f"processed_{session_id}_{original_name}.csv"
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        try:
+            df_processed.to_csv(filepath, index=False)
+        except Exception as e:
+            return jsonify({"error": f"File saving error: {str(e)}"}), 500
+        # Generate preview (first 5 rows)
+        preview = df_processed.head(5).to_dict(orient="records")
+        return jsonify({
+            "success": True,
+            "message": status,
+            "preview": preview,
+            "download_url": f"/download/{filename}",
+            "original_rows": len(df),
+            "processed_rows": len(df_processed),
+            "columns": list(df_processed.columns),
+            "filename": filename
+        })
+    except Exception as e:
+        return jsonify({"error": f"Processing error: {str(e)}"}), 500
 # === File Download with Session ID Verification ===
 @app.route("/download/<filename>", methods=["GET"])
 def download_file(filename):
+    try:
+        session_id = request.args.get("session_id")
+        # Validate session
+        if not session_id:
+            return jsonify({"error": "session_id parameter required"}), 400
+        if f"_{session_id}_" not in filename:
+            return jsonify({"error": "Invalid session or unauthorized access"}), 403
+        # Check file exists
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        if not os.path.exists(filepath):
+            return jsonify({"error": "File not found or expired"}), 404
+        return send_file(filepath, as_attachment=True, download_name=filename)
+    except Exception as e:
+        return jsonify({"error": f"Download error: {str(e)}"}), 500
+# === Error Handlers ===
@app.errorhandler(404)
def not_found(error):
    """Return a JSON error body with status 404."""
    body = jsonify({"error": "Endpoint not found"})
    return body, 404
@app.errorhandler(413)
def too_large(error):
    """Return a JSON error body with status 413."""
    body = jsonify({"error": "File too large (max 512MB)"})
    return body, 413
@app.errorhandler(500)
def internal_error(error):
    """Return a JSON error body with status 500."""
    body = jsonify({"error": "Internal server error"})
    return body, 500
 # === Run on Port 7860 for Hugging Face ===
 if __name__ == "__main__":
+    print("🚀 Starting Data Processing API on port 7860...")
+    print("📊 API Endpoints:")
+    print("   POST /process - Process files")
+    print("   GET /download/<filename> - Download processed files")
+    print("   GET /health - Health check")
+    app.run(host="0.0.0.0", port=7860, debug=False)