# Source: Hugging Face Spaces - Agents-MCP-Hackathon / data_science_agent
# Space status: Paused
# File size: 16,361 Bytes

import gradio as gr
import pandas as pd
from utils.google_genai_llm import get_response, generate_with_gemini
from prompts.requirements_gathering import requirements_gathering_system_prompt
from prompts.planning import hf_query_gen_prompt

from PIL import Image
import os
import tempfile
import traceback
import hashlib

# Import Marker for document processing.
# Marker is an optional dependency: when any of these imports fails,
# MARKER_AVAILABLE is set to False and a warning is printed instead of the
# module failing to load. extract_text_with_marker checks this flag before
# touching any Marker name.
try:
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered
    MARKER_AVAILABLE = True
except ImportError:
    MARKER_AVAILABLE = False
    print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")

def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the MD5 hex digest of a file's contents, for use as a cache key.

    The file is read in fixed-size chunks so that a large upload is never
    loaded into memory in one piece. MD5 serves only as a content
    fingerprint here, not as a security measure.

    Args:
        file_path: Path of the file to fingerprint.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The hexadecimal MD5 digest as a string, or None when the file cannot
        be opened or read (or the path value is not usable at all).
    """
    try:
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            # iter() with a sentinel stops at the first empty read, i.e. EOF
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
    except (OSError, TypeError, ValueError):
        # Best effort: callers treat None as "no hash available"
        return None
    return digest.hexdigest()

def extract_text_with_marker(file_path):
    """Convert a document with Marker and return ``(status_message, text)``.

    The first element is a human-readable status: a word/character summary
    on success, an error description when conversion raises, or a notice
    that Marker is not installed. The second element is the extracted text,
    or an empty string whenever extraction did not succeed.
    """
    if MARKER_AVAILABLE:
        try:
            # The converter is built together with the model artifacts it needs
            convert = PdfConverter(artifact_dict=create_model_dict())

            # Only the text element of the rendered output is used here
            extracted, _, _ = text_from_rendered(convert(file_path))

            n_words = len(extracted.split())
            n_chars = len(extracted)
            return f"Extracted text ({n_words} words, {n_chars} characters)", extracted
        except Exception as exc:
            # Any conversion failure is reported as a status string, not raised
            return f"Error processing document: {str(exc)}", ""

    return "Marker library not available for document processing.", ""

def process_user_input(message, history, uploaded_files, file_cache):
    """Build the requirements-gathering prompt and return the model's reply.

    The prompt context is the chat transcript followed by an
    ``[UPLOADED_FILES]`` section holding a short description of every
    uploaded file. PDF, PowerPoint and Word files are additionally passed
    through ``extract_text_with_marker``; the result is stored in
    ``file_cache`` under ``"<file name>_<content hash>"`` so an unchanged
    file is converted only once.

    Args:
        message: The user's current message.
        history: Iterable of ``(user_msg, ai_msg)`` pairs, or a falsy value.
        uploaded_files: Iterable of file paths, or a falsy value.
        file_cache: Dict of cached extraction results, updated in place.
            ``None`` is treated as an empty cache.

    Returns:
        Tuple ``(ai_response, file_cache)`` where ``file_cache`` is the
        (possibly updated) cache dict.
    """
    # Upper bound on the amount of extracted document text put into the prompt
    max_preview_chars = 200000

    # A missing cache previously crashed on the final update; start fresh instead
    if file_cache is None:
        file_cache = {}

    # Build conversation history from chat history
    transcript = []
    for user_msg, ai_msg in history or []:
        transcript.append(f"User: {user_msg}\n")
        if ai_msg:
            transcript.append(f"Assistant: {ai_msg}\n")
    conversation_history = "".join(transcript)

    # Add file information to conversation if files are uploaded
    if uploaded_files:
        file_info = "\n[UPLOADED_FILES]\n"

        for file_path in uploaded_files:
            try:
                file_name = os.path.basename(file_path)
                file_extension = os.path.splitext(file_name)[1].lower()

                # Handle tabular files (CSV and Excel): report shape and columns
                if file_extension in ('.csv', '.xlsx', '.xls'):
                    if file_extension == '.csv':
                        kind, df = "CSV", pd.read_csv(file_path)
                    else:
                        kind, df = "Excel", pd.read_excel(file_path)
                    file_info += f"- {file_name}: {kind} file with {len(df)} rows and {len(df.columns)} columns\n"
                    # Column labels are not always strings (e.g. numeric headers)
                    file_info += f"  Columns: {', '.join(map(str, df.columns))}\n"

                # Handle document files with Marker (PDF, PPT, DOCX)
                elif file_extension in ('.pdf', '.ppt', '.pptx', '.doc', '.docx'):
                    file_size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2)

                    # Only documents are cached, so only they need a content hash
                    cache_key = f"{file_name}_{get_file_hash(file_path)}"
                    cached = file_cache.get(cache_key)

                    if cached is not None:
                        # Use cached text
                        extraction_stats = cached['stats']
                        extracted_text = cached['text']
                        status = "(cached)"
                    else:
                        # Process new file with Marker and cache the results
                        extraction_stats, extracted_text = extract_text_with_marker(file_path)
                        file_cache[cache_key] = {
                            'stats': extraction_stats,
                            'text': extracted_text,
                            'file_name': file_name,
                            'file_path': file_path,
                        }
                        status = "(newly processed)"

                    # Determine document type
                    if file_extension == '.pdf':
                        doc_type = "PDF document"
                    elif file_extension in ('.ppt', '.pptx'):
                        doc_type = "PowerPoint presentation"
                    else:
                        doc_type = "Word document"

                    file_info += f"- {file_name}: {doc_type}, Size: {file_size_mb} MB {status}\n"
                    file_info += f"  Content: {extraction_stats}\n"

                    # Include extracted text in conversation context for better AI understanding
                    if extracted_text and extracted_text.strip():
                        # Truncate very long texts to max_preview_chars characters
                        if len(extracted_text) > max_preview_chars:
                            text_preview = extracted_text[:max_preview_chars] + "..."
                        else:
                            text_preview = extracted_text
                        file_info += f"  Text Preview: {text_preview}\n"

                # Handle image files
                elif file_extension in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp'):
                    with Image.open(file_path) as img:
                        width, height = img.size
                        mode = img.mode
                    file_size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2)
                    file_info += f"- {file_name}: {file_extension.upper()[1:]} image file\n"
                    file_info += f"  Dimensions: {width}x{height} pixels, Mode: {mode}, Size: {file_size_mb} MB\n"

                # Handle JSON files
                elif file_extension == '.json':
                    file_size_kb = round(os.path.getsize(file_path) / 1024, 2)
                    file_info += f"- {file_name}: JSON file, Size: {file_size_kb} KB\n"

                # Handle text files
                elif file_extension == '.txt':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        # Count lines lazily instead of materialising them all
                        lines = sum(1 for _ in f)
                    file_size_kb = round(os.path.getsize(file_path) / 1024, 2)
                    file_info += f"- {file_name}: Text file with {lines} lines, Size: {file_size_kb} KB\n"

                # Handle other files
                else:
                    file_size_kb = round(os.path.getsize(file_path) / 1024, 2)
                    file_info += f"- {file_name}: File uploaded, Size: {file_size_kb} KB\n"

            except Exception as e:
                # One unreadable file must not abort the whole turn; str() keeps
                # this handler itself from raising on a non-string path value.
                file_info += f"- {os.path.basename(str(file_path))}: File uploaded (unable to preview: {str(e)})\n"
                print(f"Error processing file {file_path}: {traceback.format_exc()}")

        conversation_history += file_info

    # Format the prompt with conversation history and current query
    formatted_prompt = requirements_gathering_system_prompt.format(
        conversation_history=conversation_history,
        query=message
    )

    # Get AI response
    ai_response = get_response(formatted_prompt)

    return ai_response, file_cache

def chat_interface(message, history, uploaded_files, file_cache):
    """Run one chat turn and record it in the conversation history.

    Delegates to ``process_user_input`` for the reply, appends the
    ``(message, reply)`` pair to ``history`` in place, and returns
    ``(history, history, "", cache)``: the history twice, an empty string,
    and the cache returned by ``process_user_input``.
    """
    reply, cache = process_user_input(message, history, uploaded_files, file_cache)

    # The caller's list is extended in place, so both returned references agree
    turn = (message, reply)
    history.append(turn)

    return history, history, "", cache

def clear_chat():
    """Return fresh empty values: two separate empty lists and an empty dict."""
    empty_display = []
    empty_history = []
    empty_cache = {}
    return empty_display, empty_history, empty_cache

def upload_file_handler(files):
    """Return ``files`` unchanged when it is truthy, otherwise a new empty list."""
    return files or []

def generate_plan(history, file_cache):
    """Ask Gemini for a plan based on the conversation so far.

    The chat history is flattened into ``User:`` / ``Assistant:`` lines and
    appended to ``hf_query_gen_prompt`` after a blank line; the result of
    ``generate_with_gemini`` is returned as-is. ``file_cache`` is accepted
    but not read by this function.
    """
    # Flatten the (user, assistant) pairs into transcript lines
    pieces = []
    for user_msg, ai_msg in history or []:
        pieces.append(f"User: {user_msg}\n")
        if ai_msg:
            pieces.append(f"Assistant: {ai_msg}\n")
    transcript = "".join(pieces)

    # Planning instructions first, then the transcript
    formatted_prompt = "\n\n".join((hf_query_gen_prompt, transcript))

    return generate_with_gemini(formatted_prompt, "Planning with gemini")

# Custom CSS for a sleek design.
# Passed to gr.Blocks(css=...) below. The .chat-container, .upload-area,
# .btn-primary and .btn-secondary rules are attached to components through
# elem_classes, and .title / .subtitle style the HTML header. The .message,
# .user-message and .bot-message rules are not referenced by any elem_classes
# in this file.
custom_css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto !important;
}

.chat-container {
    height: 600px !important;
}

#component-0 {
    height: 100vh;
}

.message {
    padding: 15px !important;
    margin: 10px 0 !important;
    border-radius: 15px !important;
}

.user-message {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    margin-left: 20% !important;
}

.bot-message {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
    color: white !important;
    margin-right: 20% !important;
}

.upload-area {
    border: 2px dashed #4f46e5 !important;
    border-radius: 10px !important;
    padding: 20px !important;
    text-align: center !important;
    background: linear-gradient(135deg, #f0f4ff 0%, #e0e7ff 100%) !important;
}

.btn-primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
}

.btn-secondary {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
    color: #2d3436 !important;
}

.title {
    text-align: center !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    font-size: 2.5em !important;
    font-weight: bold !important;
    margin-bottom: 20px !important;
}

.subtitle {
    text-align: center !important;
    color: #6c757d !important;
    font-size: 1.2em !important;
    margin-bottom: 30px !important;
}
"""

# Create the Gradio interface.
# Layout: a header, then one row with a wide column (chat, plan output,
# message box and buttons) and a narrow column (file upload and instructions).
# The event wiring at the bottom connects the components to the handlers.
with gr.Blocks(css=custom_css, title="Data Science Requirements Gathering Agent") as app:
    
    # Header
    gr.HTML("""
        <div class="title">🔬 Data Science Consultant</div>
        <div class="subtitle">
            Transform your vague ideas into reality
        </div>
    """)
    
    with gr.Row():
        with gr.Column(scale=3):
            # Chat interface
            chatbot = gr.Chatbot(
                label="Requirements Gathering Conversation",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                elem_classes=["chat-container"]
            )

            # Read-only box that receives the output of generate_plan
            plan_output = gr.Textbox(
                            label="Generated Plan",
                            interactive=False,
                            visible=True,
                            lines=10,
                            max_lines=20
                        )
            
            with gr.Row():
                with gr.Column(scale=4):
                    msg = gr.Textbox(
                        placeholder="Describe your data science project or ask a question...",
                        label="Your Message",
                        lines=2,
                        max_lines=5
                    )
                with gr.Column(scale=1):
                    send_btn = gr.Button("Send 📤", variant="primary", elem_classes=["btn-primary"])

                with gr.Column(scale=1):
                    plan_btn = gr.Button("Generate Plan 📋", variant="secondary", elem_classes=["btn-secondary"])

            
            with gr.Row():
                clear_btn = gr.Button("Clear Chat 🗑️", variant="secondary", elem_classes=["btn-secondary"])
        
        with gr.Column(scale=1):
            # File upload section
            gr.HTML("<h3 style='text-align: center; color: #4f46e5;'>📁 Upload Data Files</h3>")
            
            file_upload = gr.File(
                label="Upload your files (CSV, Excel, PDF, PPT, DOCX, Images, etc.)",
                file_count="multiple",
                file_types=[".csv", ".xlsx", ".xls", ".json", ".txt", ".pdf", ".ppt", ".pptx", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"],
                elem_classes=["upload-area"]
            )
            
            # Non-interactive mirror of the uploads; this component (not
            # file_upload) is what the send handlers read the files from.
            uploaded_files_display = gr.File(
                label="Uploaded Files",
                file_count="multiple",
                interactive=False,
                visible=True
            )
            
            # Instructions
            gr.HTML("""
                <div style="padding: 15px; background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%); 
                           border-radius: 10px; margin-top: 20px;">
                    <h4 style="color: #4f46e5; margin-bottom: 10px;">💡 How it works:</h4>
                    <ol style="color: #555; font-size: 14px; line-height: 1.6;">
                        <li>Describe your data science project</li>
                        <li>Upload your files (data, documents, images)</li>
                        <li>Answer clarifying questions</li>
                        <li>Get a complete task specification</li>
                    </ol>
                    <p style="color: #666; font-size: 12px; margin-top: 10px;">
                        📄 Supports: CSV, Excel, PDF, PowerPoint, Word docs, Images, JSON, Text files
                    </p>
                </div>
            """)
    
    # State for conversation history and file cache
    chat_history = gr.State([])
    file_cache = gr.State({})
    
    # Event handlers
    def handle_send(message, history, files, cache):
        """Forward a non-blank message to chat_interface; otherwise change nothing.

        Returns values for (chatbot, chat_history, msg, file_cache). For a
        blank message the current history, the unchanged message text and
        the current cache are returned.
        """
        # NOTE(review): message.strip() assumes message is a string — confirm
        # the textbox never delivers None here.
        if message.strip():
            new_history, updated_history, cleared_input, updated_cache = chat_interface(message, history, files, cache)
            return new_history, updated_history, cleared_input, updated_cache
        return history, history, message, cache
    
    # Wire up the interface
    # Clicking "Send" and pressing Enter in the message box run the same handler.
    send_btn.click(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )
    
    msg.submit(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )
    
    # Resets the chat display, the history state and the file cache; the plan
    # output and the uploaded files are not among the outputs.
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, chat_history, file_cache]
    )

    plan_btn.click(
        generate_plan,
        inputs=[chat_history, file_cache],
        outputs=[plan_output]
    )
    
    # Copy whatever was uploaded into the read-only display component.
    # NOTE(review): upload_file_handler is defined in this file but this
    # lambda is used instead — confirm which one is intended.
    file_upload.change(
        lambda files: files,
        inputs=[file_upload],
        outputs=[uploaded_files_display]
    )
    
    # Welcome message
    # Written to the chatbot component only; chat_history is not an output
    # here, so the greeting is not part of the stored history.
    app.load(
        lambda: [(None, "👋 Hello! I'm your Data Science Project Agent. I'll help you transform your project ideas into reality  .\n\n🚀 **Let's get started!** Tell me about your data science project or what you're trying to achieve.")],
        outputs=[chatbot]
    )

# Start the app only when this file is executed as a script.
# NOTE(review): share=True and show_error=True are Gradio launch options;
# per Gradio's launch documentation share=True requests a public share link —
# confirm that this is intended for the deployment target.
if __name__ == "__main__":
    app.launch(share=True, show_error=True)