Master-warrier committed on
Commit
685adc8
·
1 Parent(s): 87c1568

Add requirements gathering flow

Browse files
app.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from utils.google_genai_llm import get_response
4
+ from prompts.requirements_gathering import requirements_gathering_system_prompt
5
+ from PIL import Image
6
+ import os
7
+ import tempfile
8
+ import traceback
9
+ import hashlib
10
+
11
+ # Import Marker for document processing
12
+ try:
13
+ from marker.converters.pdf import PdfConverter
14
+ from marker.models import create_model_dict
15
+ from marker.output import text_from_rendered
16
+ MARKER_AVAILABLE = True
17
+ except ImportError:
18
+ MARKER_AVAILABLE = False
19
+ print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")
20
+
21
def get_file_hash(file_path):
    """Return the MD5 hex digest of the file at *file_path* (used as a cache key).

    Reads the file in fixed-size chunks so large uploads do not have to be
    loaded into memory at once.  Returns None when the file cannot be read
    (missing path, permission error, ...), so callers can fall back to
    treating the file as uncached.
    """
    try:
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            # 1 MiB chunks keep memory bounded even for multi-GB uploads.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception:
        # Hashing is best-effort: a None hash simply disables caching.
        return None
29
+
30
def extract_text_with_marker(file_path):
    """Run the Marker pipeline over a PDF/PPT/DOCX file.

    Returns a (stats, text) pair: a short human-readable summary string and
    the extracted plain text.  When Marker is unavailable or conversion
    fails, the summary describes the problem and the text is empty.
    """
    if not MARKER_AVAILABLE:
        return "Marker library not available for document processing.", ""

    try:
        # Build a converter backed by Marker's pre-trained model artifacts.
        pdf_converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert the document, then pull plain text out of the rendering.
        document = pdf_converter(file_path)
        extracted, _, _ = text_from_rendered(document)

        summary = (
            f"Extracted text ({len(extracted.split())} words, "
            f"{len(extracted)} characters)"
        )
        return summary, extracted

    except Exception as e:
        return f"Error processing document: {str(e)}", ""
58
+
59
def process_user_input(message, history, uploaded_files, file_cache):
    """Build the requirements-gathering prompt and return the AI reply.

    Parameters:
        message: the user's latest chat message.
        history: list of (user_msg, ai_msg) tuples from the Gradio chatbot.
        uploaded_files: list of file paths uploaded in the session (may be empty).
        file_cache: dict keyed by "name_hash" holding previously extracted
            document text, so expensive Marker runs are not repeated.

    Returns:
        (ai_response, file_cache) — the model's reply and the (possibly
        updated) cache dict.
    """

    # Flatten the chat history into a plain-text transcript for the prompt.
    conversation_history = ""
    if history:
        for user_msg, ai_msg in history:
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    # Describe every uploaded file so the model knows what data is available.
    if uploaded_files:
        file_info = "\n[UPLOADED_FILES]\n"
        new_file_cache = file_cache.copy() if file_cache else {}

        for file_path in uploaded_files:
            try:
                # basename() is portable, unlike split('/') (breaks on Windows paths).
                file_name = os.path.basename(file_path)
                file_extension = os.path.splitext(file_name)[1].lower()
                file_hash = get_file_hash(file_path)
                cache_key = f"{file_name}_{file_hash}"

                # Tabular data: report shape and column names.
                if file_extension == '.csv':
                    df = pd.read_csv(file_path)
                    file_info += f"- {file_name}: CSV file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                elif file_extension in ['.xlsx', '.xls']:
                    df = pd.read_excel(file_path)
                    file_info += f"- {file_name}: Excel file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Documents: extract text with Marker, cached by content hash so
                # re-sending the same file does not re-run the model pipeline.
                elif file_extension in ['.pdf', '.ppt', '.pptx', '.doc', '.docx']:
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)

                    if cache_key in new_file_cache:
                        # Use cached extraction results.
                        extraction_stats = new_file_cache[cache_key]['stats']
                        extracted_text = new_file_cache[cache_key]['text']
                        status = "(cached)"
                    else:
                        # Process new file with Marker and cache the results.
                        extraction_stats, extracted_text = extract_text_with_marker(file_path)
                        new_file_cache[cache_key] = {
                            'stats': extraction_stats,
                            'text': extracted_text,
                            'file_name': file_name,
                            'file_path': file_path,
                        }
                        status = "(newly processed)"

                    # Human-readable document type for the prompt.
                    if file_extension == '.pdf':
                        doc_type = "PDF document"
                    elif file_extension in ['.ppt', '.pptx']:
                        doc_type = "PowerPoint presentation"
                    else:
                        doc_type = "Word document"

                    file_info += f"- {file_name}: {doc_type}, Size: {file_size_mb} MB {status}\n"
                    file_info += f"  Content: {extraction_stats}\n"

                    # Include extracted text in the context for the model.
                    if extracted_text and len(extracted_text.strip()) > 0:
                        # Truncate very long texts (keep first 200,000 chars).
                        text_preview = extracted_text[:200000] + "..." if len(extracted_text) > 200000 else extracted_text
                        file_info += f"  Text Preview: {text_preview}\n"

                # Images: report dimensions and size (no content extraction).
                elif file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']:
                    with Image.open(file_path) as img:
                        width, height = img.size
                        mode = img.mode
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)
                    file_info += f"- {file_name}: {file_extension.upper()[1:]} image file\n"
                    file_info += f"  Dimensions: {width}x{height} pixels, Mode: {mode}, Size: {file_size_mb} MB\n"

                elif file_extension == '.json':
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: JSON file, Size: {file_size_kb} KB\n"

                elif file_extension == '.txt':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = len(f.readlines())
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: Text file with {lines} lines, Size: {file_size_kb} KB\n"

                # Anything else: just acknowledge the upload.
                else:
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: File uploaded, Size: {file_size_kb} KB\n"

            except Exception as e:
                # Never let one bad file break the chat turn; note it and move on.
                file_info += f"- {os.path.basename(file_path)}: File uploaded (unable to preview: {str(e)})\n"
                print(f"Error processing file {file_path}: {traceback.format_exc()}")

        conversation_history += file_info

        # Persist newly extracted documents back into the shared cache.
        # Guarded so a None cache (e.g. fresh session state) cannot crash.
        if file_cache is not None:
            file_cache.update(new_file_cache)
        else:
            file_cache = new_file_cache

    # Fill the system prompt template and query the LLM.
    formatted_prompt = requirements_gathering_system_prompt.format(
        conversation_history=conversation_history,
        query=message,
    )
    ai_response = get_response(formatted_prompt)

    return ai_response, file_cache
183
+
184
def chat_interface(message, history, uploaded_files, file_cache):
    """Handle one chat turn: query the model and record the exchange.

    Returns (chatbot_value, state_history, cleared_textbox, file_cache),
    matching the Gradio output wiring.
    """
    # Delegate the heavy lifting (file summaries + LLM call).
    reply, cache = process_user_input(message, history, uploaded_files, file_cache)

    # Append the turn; the same list backs both the Chatbot and the State.
    history.append((message, reply))

    return history, history, "", cache
194
+
195
def clear_chat():
    """Reset the chatbot display, the stored history, and the file cache."""
    # Fresh containers for: chatbot value, chat_history state, file_cache state.
    return list(), list(), dict()
198
+
199
def upload_file_handler(files):
    """Pass uploaded files through, normalising a falsy value (None/empty) to []."""
    return files if files else []
204
+
205
# Custom CSS for a sleek design.
# NOTE(review): class hooks (.chat-container, .upload-area, .btn-primary,
# .btn-secondary) are attached via elem_classes on the Gradio components below;
# .title/.subtitle are used by the raw gr.HTML header.
custom_css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto !important;
}

.chat-container {
    height: 600px !important;
}

#component-0 {
    height: 100vh;
}

.message {
    padding: 15px !important;
    margin: 10px 0 !important;
    border-radius: 15px !important;
}

.user-message {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    margin-left: 20% !important;
}

.bot-message {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
    color: white !important;
    margin-right: 20% !important;
}

.upload-area {
    border: 2px dashed #4f46e5 !important;
    border-radius: 10px !important;
    padding: 20px !important;
    text-align: center !important;
    background: linear-gradient(135deg, #f0f4ff 0%, #e0e7ff 100%) !important;
}

.btn-primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
}

.btn-secondary {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
    color: #2d3436 !important;
}

.title {
    text-align: center !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    font-size: 2.5em !important;
    font-weight: bold !important;
    margin-bottom: 20px !important;
}

.subtitle {
    text-align: center !important;
    color: #6c757d !important;
    font-size: 1.2em !important;
    margin-bottom: 30px !important;
}
"""
280
+
281
# Create the Gradio interface.
# Layout: left column = chatbot + message box + send/clear buttons;
# right column = file upload widgets + usage instructions.
with gr.Blocks(css=custom_css, title="Data Science Requirements Gathering Agent") as app:

    # Header
    gr.HTML("""
    <div class="title">🔬 Data Science Requirements Agent</div>
    <div class="subtitle">
        Transform your vague ideas into fully specified, actionable data science tasks
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=3):
            # Chat interface
            chatbot = gr.Chatbot(
                label="Requirements Gathering Conversation",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                elem_classes=["chat-container"]
            )

            with gr.Row():
                with gr.Column(scale=4):
                    msg = gr.Textbox(
                        placeholder="Describe your data science project or ask a question...",
                        label="Your Message",
                        lines=2,
                        max_lines=5
                    )
                with gr.Column(scale=1):
                    send_btn = gr.Button("Send 📤", variant="primary", elem_classes=["btn-primary"])

            with gr.Row():
                clear_btn = gr.Button("Clear Chat 🗑️", variant="secondary", elem_classes=["btn-secondary"])

        with gr.Column(scale=1):
            # File upload section
            gr.HTML("<h3 style='text-align: center; color: #4f46e5;'>📁 Upload Data Files</h3>")

            file_upload = gr.File(
                label="Upload your files (CSV, Excel, PDF, PPT, DOCX, Images, etc.)",
                file_count="multiple",
                file_types=[".csv", ".xlsx", ".xls", ".json", ".txt", ".pdf", ".ppt", ".pptx", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"],
                elem_classes=["upload-area"]
            )

            # Read-only mirror of the uploads; this (not file_upload) is what
            # the send handlers read, populated by file_upload.change below.
            uploaded_files_display = gr.File(
                label="Uploaded Files",
                file_count="multiple",
                interactive=False,
                visible=True
            )

            # Instructions
            gr.HTML("""
            <div style="padding: 15px; background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
                        border-radius: 10px; margin-top: 20px;">
                <h4 style="color: #4f46e5; margin-bottom: 10px;">💡 How it works:</h4>
                <ol style="color: #555; font-size: 14px; line-height: 1.6;">
                    <li>Describe your data science project</li>
                    <li>Upload your files (data, documents, images)</li>
                    <li>Answer clarifying questions</li>
                    <li>Get a complete task specification</li>
                </ol>
                <p style="color: #666; font-size: 12px; margin-top: 10px;">
                    📄 Supports: CSV, Excel, PDF, PowerPoint, Word docs, Images, JSON, Text files
                </p>
            </div>
            """)

    # State for conversation history and the per-session document-text cache.
    chat_history = gr.State([])
    file_cache = gr.State({})

    # Event handlers
    def handle_send(message, history, files, cache):
        # Ignore blank messages; otherwise delegate one turn to chat_interface.
        if message.strip():
            new_history, updated_history, cleared_input, updated_cache = chat_interface(message, history, files, cache)
            return new_history, updated_history, cleared_input, updated_cache
        return history, history, message, cache

    # Wire up the interface: button click and textbox submit share one handler.
    send_btn.click(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )

    msg.submit(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, chat_history, file_cache]
    )

    # Mirror new uploads into the read-only display component.
    file_upload.change(
        lambda files: files,
        inputs=[file_upload],
        outputs=[uploaded_files_display]
    )

    # Welcome message shown when the page first loads.
    app.load(
        lambda: [(None, "👋 Hello! I'm your Data Science Project Agent. I'll help you transform your project ideas into reality .\n\n🚀 **Let's get started!** Tell me about your data science project or what you're trying to achieve.")],
        outputs=[chatbot]
    )

if __name__ == "__main__":
    # share=True exposes a public Gradio link; show_error surfaces tracebacks in the UI.
    app.launch(share=True, show_error=True)
configs.py ADDED
File without changes
constants.py ADDED
File without changes
prompts/__pycache__/requirements_gathering.cpython-310.pyc ADDED
Binary file (3.25 kB). View file
 
prompts/requirements_gathering.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompt template for the scoping agent.  Placeholders filled via
# .format(): {conversation_history} (flattened transcript + file summaries)
# and {query} (the user's original request).
# Fix: section label was misspelled "[ORGINAL_USER_QUERY]".
requirements_gathering_system_prompt = """[SYSTEM_ROLE]
You are an expert Data Science Scoping Agent. Your single purpose is to interact with a user to transform their vague request into a fully specified, actionable data science task. You are methodical, precise, and never make assumptions. Your job includes collecting the required data file from the user.

[PRIMARY_DIRECTIVE]
Your goal is to gather all necessary information and the data file by asking targeted, clarifying questions and prompts. You must continue this process until the task is completely defined. Do not attempt to answer the user's request or perform the task yourself; your only job is to define it and collect the necessary data.

[AREAS_OF_INQUIRY_CHECKLIST]
You must ensure you have clear answers and all necessary materials for the following areas before you conclude the clarification process:
1. **Project Objective:** What is the primary business or research goal? (e.g., "predict employee churn," "classify customer feedback," "forecast next quarter's sales").
2. **Data Source:** Has the user attached the data file? (After understanding the objective, your next step should be to ask for the file).
3. **Target Variable:** Exactly which column in the provided data is to be predicted or is the focus of the analysis?
4. **Input Features:** Exactly which columns from the data should be used as inputs to influence the outcome?
5. **Evaluation Metric:** How will the success of the final model or analysis be measured? (e.g., Accuracy, Precision, Recall for classification; RMSE, MAE for regression; or a business KPI like "reduction in churn rate").
6. **Deliverable:** What is the desired final output? (e.g., a summary report, a visualization, a trained model file, a prediction API).

[OPERATING_PROCEDURE]
You must follow these steps in every interaction:
1. Analyze the complete `[CONVERSATION_HISTORY]`.
2. Compare the user's answers and provided files against the `[AREAS_OF_INQUIRY_CHECKLIST]`.
3. **If details or files are missing:**
    * Identify the single most critical piece of missing information or the required file.
    * Ask ONE clear, concise question or make a single request (e.g., to attach the data).
    * Do NOT ask multiple questions at once. Acknowledge the user's last answer briefly before asking the new question.
4. **If ALL checklist items are answered and files received:**
    * Do NOT ask any more questions.
    * State that you have all the necessary information.
    * Provide a final, structured summary of the task specification under the heading "### Final Task Specification".

----------------------------------------------------
[CONVERSATION_HISTORY]
{conversation_history}
----------------------------------------------------
[ORIGINAL_USER_QUERY]
{query}
----------------------------------------------------
[CURRENT_TASK]
Based on the `[OPERATING_PROCEDURE]` and the provided `[CONVERSATION_HISTORY]`, perform your next action: either ask your next clarifying question, request a file, or provide the final task summary.
"""
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ google-genai
2
+ gradio
3
+ pandas
4
+ python-dotenv==1.0.1
5
+ openpyxl
6
+ Pillow
7
+ marker-pdf[full]
utils/__pycache__/google_genai_llm.cpython-310.pyc ADDED
Binary file (601 Bytes). View file
 
utils/google_genai_llm.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google import genai
2
+ from dotenv import load_dotenv
3
+ import os
4
+ load_dotenv()
5
+
6
+ client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
7
+
8
def get_response(prompt: str) -> str:
    """Send *prompt* to the Gemini model and return the text of the reply.

    Uses the module-level `client`, which is configured from the
    GOOGLE_API_KEY environment variable at import time.
    """
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        contents=prompt,
    )
    # google-genai responses expose the concatenated text parts via `.text`.
    return response.text