Spaces:

Agents-MCP-Hackathon
/

DataForge

Runtime error

App Files Community

ai-puppy committed on Jun 7

Commit

b8d6d6e

2 Parent(s): 5268062 c7ebfd3

Merge branch 'main' of https://huggingface.co/spaces/Agents-MCP-Hackathon/DataForge

Browse files

Files changed (2) hide show

app.py +128 -262
graph.py +73 -8

app.py CHANGED Viewed

@@ -4,9 +4,7 @@ import asyncio
 import tempfile
 from dotenv import find_dotenv, load_dotenv
 from langchain.chat_models import init_chat_model
-from langchain.schema import HumanMessage, SystemMessage
-from langgraph.prebuilt import create_react_agent
-from langsmith import traceable
 # Import the CodeAct agent functionality
 from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
@@ -17,77 +15,29 @@ from graph import analyze_file_with_guidance_sync, guided_analysis_graph
 # Load environment variables
 load_dotenv(find_dotenv())
-# Initialize OpenAI model
-openai_model = init_chat_model(
-    model="gpt-4.1-nano-2025-04-14",
-    api_key=os.getenv("OPENAI_API_KEY"),
-)
-# Create the basic chat agent
-chat_agent = create_react_agent(openai_model, tools=[])
-# Initialize CodeAct model for file analysis
 codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
 # Store uploaded file path globally
 uploaded_file_path = None
-@traceable
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    """
-    Main chat function that processes user input and returns AI response
-    """
-    try:
-        # Convert history to LangChain message format
-        messages = [SystemMessage(content=system_message)]
-        # Add conversation history
-        for user_msg, assistant_msg in history:
-            if user_msg:
-                messages.append(HumanMessage(content=user_msg))
-            if assistant_msg:
-                messages.append(SystemMessage(content=assistant_msg))
-        # Add current user message
-        messages.append(HumanMessage(content=message))
-        # Prepare input for the agent
-        input_data = {"messages": messages}
-        # Stream the response
-        response_text = ""
-        for chunk in chat_agent.stream(input_data, stream_mode="values"):
-            if "messages" in chunk and chunk["messages"]:
-                latest_message = chunk["messages"][-1]
-                if hasattr(latest_message, 'content'):
-                    current_content = latest_message.content
-                    if current_content and len(current_content) > len(response_text):
-                        response_text = current_content
-                        yield response_text
-        # Ensure we return something even if streaming doesn't work
-        if not response_text:
-            yield "I'm sorry, I couldn't process your message. Please check your OpenAI API key."
-    except Exception as e:
-        yield f"Error: {str(e)}. Please make sure your OpenAI API key is set correctly."
 def handle_file_upload(file):
     """Handle file upload and store the path globally"""
     global uploaded_file_path
-    if file is not None:
-        uploaded_file_path = file.name
-        return f"✅ File uploaded successfully: {os.path.basename(file.name)}"
-    else:
         uploaded_file_path = None
-        return "❌ No file uploaded"
 def analyze_file_with_question(user_question):
     """
@@ -95,13 +45,13 @@ def analyze_file_with_question(user_question):
     """
     global uploaded_file_path
-    if not uploaded_file_path or not os.path.exists(uploaded_file_path):
-        return "❌ No file uploaded or file not found. Please upload a file first."
-    if not user_question or user_question.strip() == "":
-        user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
     try:
         # Use the new guided analysis approach
         result = analyze_file_with_guidance_sync(uploaded_file_path, user_question)
         return result
@@ -109,45 +59,6 @@ def analyze_file_with_question(user_question):
     except Exception as e:
         return f"❌ Error in guided analysis: {str(e)}"
-def get_question_suggestions(file_path):
-    """
-    Generate suggested questions based on file type and structure
-    """
-    if not file_path or not os.path.exists(file_path):
-        return []
-    file_ext = os.path.splitext(file_path)[1].lower()
-    base_suggestions = [
-        "What are the main patterns in this file?",
-        "Are there any security issues or anomalies?",
-        "Provide a statistical summary of the data",
-        "What insights can you extract from this file?"
-    ]
-    if file_ext in ['.log', '.txt']:
-        return [
-            "Find any security threats or failed login attempts",
-            "Identify performance bottlenecks and slow operations",
-            "What errors or warnings are present?",
-            "Show me time-based trends in the data",
-            "Are there any suspicious IP addresses or user activities?"
-        ] + base_suggestions
-    elif file_ext == '.csv':
-        return [
-            "Analyze the data distribution and statistics",
-            "Find correlations between columns",
-            "Identify outliers or anomalies in the data",
-            "What are the key insights from this dataset?"
-        ] + base_suggestions
-    elif file_ext == '.json':
-        return [
-            "Parse and analyze the JSON structure",
-            "What are the key data fields and their values?",
-            "Find any nested patterns or relationships"
-        ] + base_suggestions
-    else:
-        return base_suggestions
 async def analyze_uploaded_file():
     """Legacy function - kept for backward compatibility"""
     return analyze_file_with_question("Provide a comprehensive analysis of this file.")
@@ -156,167 +67,122 @@ def run_file_analysis():
     """Wrapper to run async file analysis in sync context"""
     return asyncio.run(analyze_uploaded_file())
-def update_question_suggestions():
-    """Update question suggestions based on uploaded file"""
-    global uploaded_file_path
-    suggestions = get_question_suggestions(uploaded_file_path)
-    return gr.Dropdown.update(choices=suggestions, value=suggestions[0] if suggestions else "")
 # Create the Gradio interface
-with gr.Blocks(title="DataForge - AI Assistant with Advanced File Analysis") as demo:
-    gr.Markdown("# 🔍 DataForge - AI Assistant with Advanced File Analysis")
-    gr.Markdown("Upload files and ask specific questions for AI-powered guided analysis using LangGraph.")
-    with gr.Tab("💬 Chat Assistant"):
-        chat_interface = gr.ChatInterface(
-            respond,
-            additional_inputs=[
-                gr.Textbox(
-                    value="You are a helpful AI assistant. Be friendly, informative, and concise in your responses.",
-                    label="System message"
-                ),
-                gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-                gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.95,
-                    step=0.05,
-                    label="Top-p (nucleus sampling)",
-                ),
-            ],
-            title="Chat with AI Assistant",
-            description="Ask questions or get help with any topic."
-        )
-    with gr.Tab("📁 Advanced File Analysis"):
-        gr.Markdown("## 🚀 Guided File Analysis with LangGraph")
-        gr.Markdown("""
-        Upload files and ask specific questions for targeted AI analysis. Our guided approach:
-        1. 📋 **Examines** your file structure and patterns
-        2. 🎯 **Generates** specific code guidance based on your question
-        3. 🚀 **Executes** enhanced analysis with improved accuracy
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                # File Upload Section
-                gr.Markdown("### 📤 File Upload")
-                file_upload = gr.File(
-                    label="Upload File for Analysis",
-                    file_types=[".txt", ".log", ".csv", ".json", ".xml", ".py", ".js", ".html", ".md"],
-                    type="filepath"
-                )
-                upload_status = gr.Textbox(
-                    label="Upload Status",
-                    value="No file uploaded",
-                    interactive=False
-                )
-                # Question Section
-                gr.Markdown("### ❓ Ask Your Question")
-                question_suggestions = gr.Dropdown(
-                    label="Question Suggestions (select or type your own)",
-                    choices=[],
-                    allow_custom_value=True,
-                    value=""
-                )
-                user_question = gr.Textbox(
-                    label="Your Question about the File",
-                    placeholder="What would you like to know about this file?",
-                    lines=3
-                )
-                analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
-                # Analysis Info
-                gr.Markdown("### ℹ️ Analysis Method")
-                gr.Markdown("""
-                **Guided Analysis Features:**
-                - 🎯 Question-aware code generation
-                - 📋 File structure examination
-                - 🚀 Dynamic prompt optimization
-                - ✅ Higher accuracy than generic analysis
-                """)
-            with gr.Column(scale=2):
-                analysis_output = gr.Textbox(
-                    label="📊 Guided Analysis Results",
-                    lines=25,
-                    max_lines=35,
-                    placeholder="Upload a file, ask a question, and click 'Run Guided Analysis' to see detailed results here...",
-                    interactive=False
-                )
-        # Event handlers
-        file_upload.change(
-            fn=handle_file_upload,
-            inputs=[file_upload],
-            outputs=[upload_status]
-        ).then(
-            fn=update_question_suggestions,
-            inputs=[],
-            outputs=[question_suggestions]
-        )
-        question_suggestions.change(
-            fn=lambda x: x,
-            inputs=[question_suggestions],
-            outputs=[user_question]
-        )
-        analyze_btn.click(
-            fn=analyze_file_with_question,
-            inputs=[user_question],
-            outputs=[analysis_output]
-        )
-    with gr.Tab("📊 Analysis Examples"):
-        gr.Markdown("## 💡 Example Questions by File Type")
-        with gr.Accordion("🔐 Security Analysis Questions", open=False):
-            gr.Markdown("""
-            **For Log Files:**
-            - "Find any failed login attempts and suspicious IP addresses"
-            - "Identify potential security threats or anomalies"
-            - "Show me authentication errors and user access patterns"
-            - "Are there any brute force attacks or repeated failures?"
-            **For Access Logs:**
-            - "Detect unusual access patterns or potential intrusions"
-            - "Find requests with suspicious user agents or payloads"
-            - "Identify high-frequency requests from single IPs"
-            """)
-        with gr.Accordion("⚡ Performance Analysis Questions", open=False):
-            gr.Markdown("""
-            **For Application Logs:**
-            - "Which API endpoints are slowest and why?"
-            - "Find performance bottlenecks and response time issues"
-            - "Show me timeout errors and failed requests"
-            - "What are the peak usage times and load patterns?"
-            **For System Logs:**
-            - "Identify resource usage spikes and memory issues"
-            - "Find database query performance problems"
-            - "Show me error rates and system health indicators"
-            """)
-        with gr.Accordion("📈 Data Analysis Questions", open=False):
             gr.Markdown("""
-            **For CSV/Data Files:**
-            - "Analyze data distribution and find statistical insights"
-            - "Identify outliers and anomalies in the dataset"
-            - "What correlations exist between different columns?"
-            - "Generate a comprehensive data quality report"
-            **For JSON Files:**
-            - "Parse the structure and extract key information"
-            - "Find patterns in nested data and relationships"
-            - "Summarize the main data points and values"
             """)
 if __name__ == "__main__":
     demo.launch()

 import tempfile
 from dotenv import find_dotenv, load_dotenv
 from langchain.chat_models import init_chat_model
+# Simplified imports - focusing on file analysis
 # Import the CodeAct agent functionality
 from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
 # Load environment variables
 load_dotenv(find_dotenv())
+# Initialize model for file analysis
 codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
 # Store uploaded file path globally
 uploaded_file_path = None
+# Chat functionality removed - focusing on file analysis
 def handle_file_upload(file):
     """Handle file upload and store the path globally"""
     global uploaded_file_path
+    try:
+        if file is not None:
+            # With type="filepath", Gradio returns the file path as a string
+            uploaded_file_path = file
+            filename = os.path.basename(file)
+            return f"✅ File uploaded successfully: {filename}"
+        else:
+            uploaded_file_path = None
+            return "❌ No file uploaded"
+    except Exception as e:
         uploaded_file_path = None
+        return f"❌ Upload error: {str(e)}"
 def analyze_file_with_question(user_question):
     """
     """
     global uploaded_file_path
     try:
+        if not uploaded_file_path or not os.path.exists(uploaded_file_path):
+            return "❌ No file uploaded or file not found. Please upload a file first."
+        if not user_question or user_question.strip() == "":
+            user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
         # Use the new guided analysis approach
         result = analyze_file_with_guidance_sync(uploaded_file_path, user_question)
         return result
     except Exception as e:
         return f"❌ Error in guided analysis: {str(e)}"
 async def analyze_uploaded_file():
     """Legacy function - kept for backward compatibility"""
     return analyze_file_with_question("Provide a comprehensive analysis of this file.")
     """Wrapper to run async file analysis in sync context"""
     return asyncio.run(analyze_uploaded_file())
 # Create the Gradio interface
+with gr.Blocks(title="DataForge - AI-Powered File Analysis") as demo:
+    gr.Markdown("# 🔍 DataForge - AI-Powered File Analysis")
+    gr.Markdown("""
+    Upload any file and ask specific questions for targeted AI analysis. Our guided approach:
+    1. 📋 **Examines** your file structure and patterns automatically
+    2. 🎯 **Generates** specific code guidance based on your question
+    3. 🚀 **Executes** enhanced analysis with improved accuracy
+    **Simply upload a file and ask any question you want!**
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # File Upload Section
+            gr.Markdown("### 📤 File Upload")
+            file_upload = gr.File(
+                label="Upload File for Analysis",
+                type="filepath"
+            )
+            upload_status = gr.Textbox(
+                label="Upload Status",
+                value="No file uploaded",
+                interactive=False
+            )
+            # Question Section
+            gr.Markdown("### ❓ Ask Your Question")
+            user_question = gr.Textbox(
+                label="Your Question about the File",
+                placeholder="What would you like to know about this file? (e.g., 'Find security threats', 'Show performance issues', 'What errors are present?')",
+                lines=4,
+                value=""
+            )
+            analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
+            # Analysis Info
+            gr.Markdown("### ℹ️ How It Works")
             gr.Markdown("""
+            **Guided Analysis Process:**
+            - 🎯 **Question-aware**: Code generation tailored to your specific question
+            - 📋 **Smart examination**: Automatically detects file structure and patterns
+            - 🚀 **Dynamic optimization**: Creates targeted analysis approach
+            - ✅ **Higher accuracy**: Prevents common code generation errors
+            - 🔧 **Quality control**: Built-in validation to avoid syntax issues
             """)
+        with gr.Column(scale=2):
+            analysis_output = gr.Textbox(
+                label="📊 Guided Analysis Results",
+                lines=25,
+                max_lines=35,
+                placeholder="Upload a file, type your question, and click 'Run Guided Analysis' to see detailed results here...",
+                interactive=False
+            )
+    # Event handlers
+    file_upload.change(
+        fn=handle_file_upload,
+        inputs=[file_upload],
+        outputs=[upload_status]
+    )
+    analyze_btn.click(
+        fn=analyze_file_with_question,
+        inputs=[user_question],
+        outputs=[analysis_output]
+    )
+    gr.Markdown("---")
+    gr.Markdown("## 💡 Example Questions by File Type")
+    with gr.Accordion("🔐 Security Analysis Questions", open=False):
+        gr.Markdown("""
+        **For Log Files:**
+        - "Find any failed login attempts and suspicious IP addresses"
+        - "Identify potential security threats or anomalies"
+        - "Show me authentication errors and user access patterns"
+        - "Are there any brute force attacks or repeated failures?"
+        **For Access Logs:**
+        - "Detect unusual access patterns or potential intrusions"
+        - "Find requests with suspicious user agents or payloads"
+        - "Identify high-frequency requests from single IPs"
+        """)
+    with gr.Accordion("⚡ Performance Analysis Questions", open=False):
+        gr.Markdown("""
+        **For Application Logs:**
+        - "Which API endpoints are slowest and why?"
+        - "Find performance bottlenecks and response time issues"
+        - "Show me timeout errors and failed requests"
+        - "What are the peak usage times and load patterns?"
+        **For System Logs:**
+        - "Identify resource usage spikes and memory issues"
+        - "Find database query performance problems"
+        - "Show me error rates and system health indicators"
+        """)
+    with gr.Accordion("📈 Data Analysis Questions", open=False):
+        gr.Markdown("""
+        **For CSV/Data Files:**
+        - "Analyze data distribution and find statistical insights"
+        - "Identify outliers and anomalies in the dataset"
+        - "What correlations exist between different columns?"
+        - "Generate a comprehensive data quality report"
+        **For JSON Files:**
+        - "Parse the structure and extract key information"
+        - "Find patterns in nested data and relationships"
+        - "Summarize the main data points and values"
+        """)
 if __name__ == "__main__":
+    print("Starting DataForge application...")
     demo.launch()

graph.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import asyncio
 import os
 import re
 from typing import Annotated, Dict, List, Optional
@@ -58,6 +59,37 @@ class CodeAnalysisState(TypedDict):
     final_analysis: Optional[str]
 def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
     """
     Node 1: Examine the file structure by reading the first several lines
@@ -244,8 +276,7 @@ def generate_code_guidance(state: CodeAnalysisState) -> CodeAnalysisState:
 def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
     """
-    Node 3: Execute the file analysis using the generated guidance.
-    This replaces the original agent with guided code generation.
     """
     file_path = state["file_path"]
     file_examination = state["file_examination"]
@@ -259,9 +290,8 @@ def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
         }
     try:
-        # Create the guided analysis query
-        guided_query = f"""
-Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
 FILE CONTEXT:
 - File Type: {file_examination.file_type}
@@ -284,6 +314,41 @@ SAMPLE FILE STRUCTURE (first few lines):
 USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
 INSTRUCTIONS:
 1. Follow the specified analysis approach exactly
 2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
@@ -291,11 +356,12 @@ INSTRUCTIONS:
 4. Structure your code following: {code_guidance.code_structure}
 5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
 6. Implement proper error handling: {code_guidance.error_handling}
 Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
 The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
-Write Python code that leverages this specific knowledge for optimal analysis.
 """
         print(f"🚀 Executing guided analysis...")
@@ -326,8 +392,7 @@ Write Python code that leverages this specific knowledge for optimal analysis.
         execution_result = asyncio.run(run_guided_analysis())
         # Create final analysis summary
-        final_analysis = f"""
-=== GUIDED FILE ANALYSIS RESULTS ===
 File: {file_path}
 Type: {file_examination.file_type} ({file_examination.data_format})

 import asyncio
+import ast
 import os
 import re
 from typing import Annotated, Dict, List, Optional
     final_analysis: Optional[str]
+def validate_python_code(code: str) -> tuple[bool, str]:
+    """
+    Validate Python code for syntax errors and potential issues.
+    Returns (is_valid, error_message)
+    """
+    try:
+        # Try to parse the code as AST
+        ast.parse(code)
+        # Check for common problematic patterns
+        lines = code.split('\n')
+        for i, line in enumerate(lines, 1):
+            line_stripped = line.strip()
+            # Check for unterminated strings
+            if line_stripped.startswith('print(') and not line_stripped.endswith(')'):
+                if line_stripped.count('"') % 2 != 0 or line_stripped.count("'") % 2 != 0:
+                    return False, f"Line {i}: Potentially unterminated string in print statement"
+            # Check for very long lines that might get truncated
+            if len(line) > 100:
+                return False, f"Line {i}: Line too long ({len(line)} chars) - may cause truncation"
+        return True, "Code validation passed"
+    except SyntaxError as e:
+        return False, f"Syntax error: {e.msg} at line {e.lineno}"
+    except Exception as e:
+        return False, f"Validation error: {str(e)}"
 def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
     """
     Node 1: Examine the file structure by reading the first several lines
 def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
     """
+    Node 3: Execute the file analysis using the generated guidance with code quality validation.
     """
     file_path = state["file_path"]
     file_examination = state["file_examination"]
         }
     try:
+        # Create the guided analysis query with strict code quality requirements
+        guided_query = f"""Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
 FILE CONTEXT:
 - File Type: {file_examination.file_type}
 USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
+CRITICAL CODE QUALITY REQUIREMENTS:
+1. ALL print statements MUST be on single lines with properly closed quotes
+2. NO multi-line strings or f-strings that span multiple lines
+3. NO print statements longer than 80 characters - break into multiple prints instead
+4. ALL strings must be properly terminated with matching quotes
+5. Use short variable names and concise output formatting
+6. If you need to print long text, use multiple short print() calls
+7. Always close parentheses, brackets, and quotes on the same line they open
+8. Use simple string concatenation instead of complex f-strings for long output
+9. NEVER use triple quotes for multi-line strings in limited execution environments
+10. Test each print statement individually to ensure it executes without truncation
+EXAMPLE OF SAFE CODING PRACTICES:
+```python
+# GOOD - Short, single-line prints
+print("=== Results ===")
+print(f"Count: {{count}}")
+print(f"User: {{user}}")
+# BAD - Long print that could be truncated
+print(f"This is a very long print statement that could get truncated...")
+# GOOD - Break long output into multiple prints
+print("Analysis complete:")
+print(f"Found {{count}} items")
+print(f"Top user: {{user}}")
+```
+MANDATORY CODE GENERATION PROCESS:
+1. Generate your analysis code following the above requirements
+2. Before presenting the code, internally validate each line for potential issues
+3. Ensure ALL print statements are under 80 characters
+4. Verify all quotes and parentheses are properly closed
+5. If any line might cause issues, rewrite it using multiple shorter statements
 INSTRUCTIONS:
 1. Follow the specified analysis approach exactly
 2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
 4. Structure your code following: {code_guidance.code_structure}
 5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
 6. Implement proper error handling: {code_guidance.error_handling}
+7. ENSURE ALL CODE FOLLOWS THE QUALITY REQUIREMENTS ABOVE
 Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
 The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
+Write Python code that leverages this specific knowledge for optimal analysis and follows strict code quality standards.
 """
         print(f"🚀 Executing guided analysis...")
         execution_result = asyncio.run(run_guided_analysis())
         # Create final analysis summary
+        final_analysis = f"""=== GUIDED FILE ANALYSIS RESULTS ===
 File: {file_path}
 Type: {file_examination.file_type} ({file_examination.data_format})