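"""DataForge: a Gradio app in which a CodeAct agent writes and executes custom
analysis code against an uploaded file inside a Deno-backed Pyodide sandbox."""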
import os
import gradio as gr
import asyncio
import tempfile
import subprocess
import shutil
from dotenv import find_dotenv, load_dotenv
from langchain.chat_models import init_chat_model

# Auto-install Deno if not found (for Hugging Face Spaces)
def ensure_deno_installed():
    """Install Deno if not already installed (for Hugging Face Spaces compatibility)"""
    try:
        # Check if Deno is already installed
        result = subprocess.run(['deno', '--version'], capture_output=True, text=True)
        if result.returncode == 0:
            print(f"✅ Deno already installed: {result.stdout.split()[1]}")
            return True
    except FileNotFoundError:
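        # Deno binary is not on PATH; fall through and attempt installation below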
        pass
    
    print("🔧 Deno not found. Installing Deno for PyodideSandbox...")
    
    try:
        # Install Deno using the official installer
        install_cmd = "curl -fsSL https://deno.land/install.sh | sh"
        result = subprocess.run(install_cmd, shell=True, capture_output=True, text=True)
        
        if result.returncode == 0:
            # Add Deno to PATH
            deno_path = os.path.expanduser("~/.deno/bin")
            if deno_path not in os.environ.get("PATH", ""):
                os.environ["PATH"] = f"{deno_path}:{os.environ.get('PATH', '')}"
            
            print("✅ Deno installed successfully!")
            return True
        else:
            print(f"❌ Deno installation failed: {result.stderr}")
            return False
            
    except Exception as e:
        print(f"❌ Error installing Deno: {e}")
        return False

# Install Deno before importing sandbox dependencies
print("🔍 Checking Deno installation...")
deno_available = ensure_deno_installed()
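# The analysis handlers below check deno_available before running sandboxed code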

# Import the CodeAct agent functionality
from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact

# Import the new guided analysis functionality
from graph import analyze_file_with_guidance_sync, guided_analysis_graph
from graph_streaming import streaming_analyze_file_with_guidance

# Load environment variables
load_dotenv(find_dotenv())

# Initialize model for file analysis
codeact_model = init_chat_model("o3-2025-04-16", model_provider="openai")

# Store uploaded file path globally
uploaded_file_path = None

def handle_file_upload(file):
    """Handle file upload and store the path globally"""
    global uploaded_file_path
    try:
        if file is not None:
            # With type="filepath", Gradio returns the file path as a string
            uploaded_file_path = file
            filename = os.path.basename(file)
            return f"✅ File uploaded successfully: {filename}"
        else:
            uploaded_file_path = None
            return "❌ No file uploaded"
    except Exception as e:
        uploaded_file_path = None
        return f"❌ Upload error: {str(e)}"

def streaming_analyze_file_with_question(user_question):
    """
    Streaming version that yields progress updates in real-time
    """
    global uploaded_file_path, deno_available
    
    try:
        if not uploaded_file_path or not os.path.exists(uploaded_file_path):
            yield "❌ No file uploaded or file not found. Please upload a file first."
            return
        
        if not user_question or user_question.strip() == "":
            user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
        
        # Check if Deno is available for sandbox operations
        if not deno_available:
            yield """❌ Deno runtime not available. This is required for code execution in the sandbox.
            
📋 Troubleshooting:
1. This usually happens on deployment platforms that don't have Deno pre-installed
2. The app attempted to install Deno automatically but failed
3. Try restarting the space or contact support

🔄 Alternative: You can still upload files, but advanced code analysis may be limited."""
            return
        
        # Use the streaming guided analysis approach
        for chunk in streaming_analyze_file_with_guidance(uploaded_file_path, user_question):
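            # Forward each progress chunk to Gradio so the output textbox updates live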
            yield chunk
        
    except Exception as e:
        error_msg = str(e)
        if "Deno" in error_msg or "deno" in error_msg:
            yield f"""❌ Deno-related error in analysis: {error_msg}

🔧 This appears to be a Deno runtime issue. The sandbox requires Deno for code execution.
Try restarting the application or contact support if this persists."""
        else:
            yield f"❌ Error in guided analysis: {error_msg}"

def analyze_file_with_question(user_question):
    """
    Non-streaming version for backward compatibility
    """
    global uploaded_file_path, deno_available
    
    try:
        if not uploaded_file_path or not os.path.exists(uploaded_file_path):
            return "❌ No file uploaded or file not found. Please upload a file first."
        
        if not user_question or user_question.strip() == "":
            user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
        
        # Check if Deno is available for sandbox operations
        if not deno_available:
            return """❌ Deno runtime not available. This is required for code execution in the sandbox.
            
📋 Troubleshooting:
1. This usually happens on deployment platforms that don't have Deno pre-installed
2. The app attempted to install Deno automatically but failed
3. Try restarting the space or contact support

🔄 Alternative: You can still upload files, but advanced code analysis may be limited."""
        
        # Use the new guided analysis approach
        result = analyze_file_with_guidance_sync(uploaded_file_path, user_question)
        return result
        
    except Exception as e:
        error_msg = str(e)
        if "Deno" in error_msg or "deno" in error_msg:
            return f"""❌ Deno-related error in analysis: {error_msg}

🔧 This appears to be a Deno runtime issue. The sandbox requires Deno for code execution.
Try restarting the application or contact support if this persists."""
        else:
            return f"❌ Error in guided analysis: {error_msg}"

async def analyze_uploaded_file():
    """Legacy function - kept for backward compatibility"""
    return analyze_file_with_question("Provide a comprehensive analysis of this file.")

def run_file_analysis():
    """Wrapper to run async file analysis in sync context"""
    return asyncio.run(analyze_uploaded_file())

# Create the Gradio interface
with gr.Blocks(title="DataForge - AI CodeAct Agent") as demo:
    gr.Markdown("# 🤖 DataForge - AI CodeAct Agent")
    
    # Demo Video Section
    gr.Markdown("""
    ## 🎥 **Demo Video - See DataForge in Action!**
    
    **📺 [Watch the full demo on YouTube](https://www.youtube.com/watch?v=f5jp2i3engM)** - Learn how to use DataForge in just a few minutes!
    
    ---
    """)
    
    gr.Markdown("""
    ## 🔑 **AI Writes Code to Analyze Your Data Locally**
    
    **Why DataForge handles massive files when other AI tools fail:**
    
    ❌ **Other AI Tools**: Upload data to LLM → Hit limits → Fail on large files  
    ✅ **DataForge**: AI writes code → Code processes data locally → No limits!
    
    ### 💪 **Key Benefits:**
    - **♾️ No Size Limits** - Process GB+ files locally
    - **🛡️ Complete Privacy** - Data never leaves your machine  
    - **⚡ Lightning Fast** - No uploads, pure local processing
    - **🎯 Custom Analysis** - Code written for your specific question
    """)
    
    # Supported File Types - Simple Version
    gr.Markdown("## 📋 **Supported Files**")
    gr.Markdown("""
    **📊 Data:** CSV, JSON, XML, TSV  
    **📝 Logs:** Application, access, error, audit logs  
    **🗂️ Text:** Any text file, code files, configs  
    **💾 Size:** No limits - handles multi-GB files locally
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # File Upload Section
            gr.Markdown("### 📤 File Upload")
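            # type="filepath" makes Gradio pass the upload to the handler as a path string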
            file_upload = gr.File(
                label="Upload File for Analysis",
                type="filepath"
            )
            upload_status = gr.Textbox(
                label="Upload Status",
                value="No file uploaded",
                interactive=False
            )
            
            
            # Question Section
            gr.Markdown("### ❓ Ask Your Question")
            user_question = gr.Textbox(
                label="Your Question about the File",
                placeholder="What would you like to know about this file? (e.g., 'Find security threats', 'Show performance issues', 'What errors are present?')",
                lines=4,
                value=""
            )
            
            analyze_btn = gr.Button("🤖 Activate CodeAct Agent", variant="primary", size="lg")
            
            # How it works
            gr.Markdown("### 🔬 **How It Works**")
            gr.Markdown("""
            1. **🔍 AI samples** your file structure  
            2. **⚡ AI writes** custom analysis code  
            3. **🚀 Code processes** your entire file locally  
            4. **📊 Results** delivered to you
            
            **Your data never leaves your machine!**
            """)
        
        with gr.Column(scale=2):
            analysis_output = gr.Textbox(
                label="🤖 CodeAct Agent Analysis Results",
                lines=25,
                max_lines=35,
                placeholder="Upload a file, ask your question, and click 'Activate CodeAct Agent' to watch the AI write and execute custom analysis code in real-time...",
                interactive=False
            )
    
    # Event handlers
    file_upload.change(
        fn=handle_file_upload,
        inputs=[file_upload],
        outputs=[upload_status]
    )
    
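    # The click handler is a generator, so Gradio streams partial results into the output box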
    analyze_btn.click(
        fn=streaming_analyze_file_with_question,
        inputs=[user_question],
        outputs=[analysis_output]
    )
    
    gr.Markdown("---")
    gr.Markdown("## 💡 **Real CodeAct Use Cases - When LLMs Fail, Code Succeeds**")
    gr.Markdown("""
    **🔥 Real Problem:** You have a 500MB server log file and want to ask: *"Which IP addresses made the most requests yesterday?"*
    
    ❌ **Traditional LLM:** "File too large, please upload smaller chunks"  
    ✅ **DataForge CodeAct:** AI writes Python code with regex + Counter to process entire file → Gets exact answer
    
    **🔥 Real Problem:** You have a 2GB CSV with sales data and ask: *"How many orders were placed in each month of 2024?"*
    
    ❌ **Traditional LLM:** Crashes on upload or hits token limits  
    ✅ **DataForge CodeAct:** AI writes pandas code to parse dates and count by month → Simple monthly breakdown
    
    **🔥 Real Problem:** You have 1GB of JSON API logs and ask: *"Find all 500 errors and group them by endpoint and time of day"*
    
    ❌ **Traditional LLM:** "Cannot process this much data"  
    ✅ **DataForge CodeAct:** AI writes JSON parsing + datetime analysis code → Complete error breakdown
    
    **🎯 The Key:** Instead of sending your data to an LLM, the LLM writes code that processes your data locally!
    """)

if __name__ == "__main__":
    print("🤖 Starting DataForge CodeAct Agent Application...")
    print("🚀 Initializing advanced AI-powered file analysis capabilities...")
    demo.launch()