import gradio as gr
import pandas as pd
import requests
import json
import os
import asyncio
from utils.google_genai_llm import get_response, generate_with_gemini
from utils.utils import parse_json_codefences, parse_python_codefences
from utils.code_sandbox import code_eval
from prompts.requirements_gathering import requirements_gathering_system_prompt
from prompts.planning import hf_query_gen_prompt, hf_context_gen_prompt
from prompts.devstral_coding_prompt import devstral_code_gen_sys_prompt, devstral_code_gen_user_prompt
from dotenv import load_dotenv

load_dotenv()

try:
    import modal
    # Import the Modal inference function and app from separate file
    import subprocess
    MODAL_AVAILABLE = True
except ImportError:
    MODAL_AVAILABLE = False
    print("Warning: Modal not available. Code generation and the MCP server will be disabled.")

from PIL import Image
import tempfile
import traceback
import hashlib

# Import Marker for document processing
try:
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered
    MARKER_AVAILABLE = True
except ImportError:
    MARKER_AVAILABLE = False
    print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")


def get_file_hash(file_path):
    """Generate a hash of the file for caching purposes."""
    try:
        with open(file_path, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()
        return file_hash
    except Exception:
        return None


def extract_text_with_marker(file_path):
    """Extract text from PDF, PPT, or DOCX using Marker."""
    if not MARKER_AVAILABLE:
        return "Marker library not available for document processing.", ""

    try:
        # Create converter with model artifacts
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert document
        rendered = converter(file_path)

        # Extract text from rendered output
        text, _, images = text_from_rendered(rendered)

        # Get basic stats
        word_count = len(text.split())
        char_count = len(text)
        stats = f"Extracted text ({word_count} words, {char_count} characters)"

        return stats, text
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        return error_msg, ""


def process_user_input(message, history, uploaded_files, file_cache):
    """Process user input and generate AI response using requirements gathering prompt."""
    # Build conversation history from chat history
    conversation_history = ""
    if history:
        for i, (user_msg, ai_msg) in enumerate(history):
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    # Add file information to conversation if files are uploaded
    if uploaded_files:
        file_info = f"\n[UPLOADED_FILES]\n"
        new_file_cache = file_cache.copy() if file_cache else {}

        for file_path in uploaded_files:
            try:
                file_name = file_path.split('/')[-1]
                file_extension = os.path.splitext(file_name)[1].lower()
                file_hash = get_file_hash(file_path)
                cache_key = f"{file_name}_{file_hash}"

                # Handle CSV files
                if file_extension == '.csv':
                    df = pd.read_csv(file_path)
                    file_info += f"- {file_name}: CSV file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Handle Excel files
                elif file_extension in ['.xlsx', '.xls']:
                    df = pd.read_excel(file_path)
                    file_info += f"- {file_name}: Excel file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Handle document files with Marker (PDF, PPT, DOCX)
                elif file_extension in ['.pdf', '.ppt', '.pptx', '.doc', '.docx']:
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)

                    # Check if file is already processed and cached
                    if cache_key in new_file_cache:
                        # Use cached text
                        extraction_stats = new_file_cache[cache_key]['stats']
                        extracted_text = new_file_cache[cache_key]['text']
                        status = "(cached)"
                    else:
                        # Process new file with Marker
                        extraction_stats, extracted_text = extract_text_with_marker(file_path)
                        # Cache the results
                        new_file_cache[cache_key] = {
                            'stats': extraction_stats,
                            'text': extracted_text,
                            'file_name': file_name,
                            'file_path': file_path
                        }
                        status = "(newly processed)"

                    # Determine document type
                    if file_extension == '.pdf':
                        doc_type = "PDF document"
                    elif file_extension in ['.ppt', '.pptx']:
                        doc_type = "PowerPoint presentation"
                    else:
                        doc_type = "Word document"

                    file_info += f"- {file_name}: {doc_type}, Size: {file_size_mb} MB {status}\n"
                    file_info += f"  Content: {extraction_stats}\n"

                    # Include extracted text in conversation context for better AI understanding
                    if extracted_text and len(extracted_text.strip()) > 0:
                        # Truncate very long texts for context (keep first 200,000 chars)
                        text_preview = extracted_text[:200000] + "..." if len(extracted_text) > 200000 else extracted_text
                        file_info += f"  Text Preview: {text_preview}\n"

                # Handle image files
                elif file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']:
                    with Image.open(file_path) as img:
                        width, height = img.size
                        mode = img.mode
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)
                    file_info += f"- {file_name}: {file_extension.upper()[1:]} image file\n"
                    file_info += f"  Dimensions: {width}x{height} pixels, Mode: {mode}, Size: {file_size_mb} MB\n"

                # Handle JSON files
                elif file_extension == '.json':
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: JSON file, Size: {file_size_kb} KB\n"

                # Handle text files
                elif file_extension == '.txt':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = len(f.readlines())
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: Text file with {lines} lines, Size: {file_size_kb} KB\n"

                # Handle other files
                else:
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: File uploaded, Size: {file_size_kb} KB\n"

            except Exception as e:
                file_info += f"- {file_path.split('/')[-1]}: File uploaded (unable to preview: {str(e)})\n"
                print(f"Error processing file {file_path}: {traceback.format_exc()}")

        conversation_history += file_info
        # Update the cache
        file_cache.update(new_file_cache)

    # Format the prompt with conversation history and current query
    formatted_prompt = requirements_gathering_system_prompt.format(
        conversation_history=conversation_history,
        query=message
    )

    # Get AI response
    ai_response = generate_with_gemini(formatted_prompt, purpose="REQUIREMENTS_GATHERING")

    return ai_response, file_cache


def chat_interface(message, history, uploaded_files, file_cache):
    """Main chat interface function."""
    # Get AI response with updated cache
    ai_response, updated_cache = process_user_input(message, history, uploaded_files, file_cache)

    # Add to history
    history.append((message, ai_response))

    return history, history, "", updated_cache


def clear_chat():
    """Clear the chat history and file cache."""
    return [], [], {}


def upload_file_handler(files):
    """Handle file uploads."""
    if files:
        return files
    return []
async def generate_plan(history, file_cache):
    """Generate a plan using the planning prompt and Gemini API."""
    # Build conversation history
    yield "**⏳ Generating plan...** (Starting)"
    conversation_history = ""
    if history:
        for user_msg, ai_msg in history:
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    yield "**⏳ Generating plan...** (Getting HF MCP tools)"
    try:
        mcp_tool_func = modal.Function.from_name("HuggingFace-MCP", "connect_and_get_tools")
        hf_query_gen_tool_details = mcp_tool_func.remote()
        print(hf_query_gen_tool_details)
    except Exception as e:
        # Fall back to a static snapshot of the HF MCP tool listing.
        # NOTE: this snapshot is truncated in the source file; the original
        # string continued past the dataset_search tool description.
        hf_query_gen_tool_details = """meta=None nextCursor=None tools=[Tool(name='hf_whoami', description="Hugging Face tools are being used by authenticated user 'bpHigh'", inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face User Info', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=None)), Tool(name='space_search', description='Find Hugging Face Spaces using semantic search. Include links to the Space when presenting the results.', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Semantic Search Query'}, 'limit': {'type': 'number', 'default': 10, 'description': 'Number of results to return'}, 'mcp': {'type': 'boolean', 'default': False, 'description': 'Only return MCP Server enabled Spaces'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face Space Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_search', description='Find Machine Learning models hosted on Hugging Face. Returns comprehensive information about matching models including downloads, likes, tags, and direct links. Include links to the models in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending models", "Top 10 most recent models" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the model (e.g., 'google', 'meta-llama', 'microsoft')"}, 'task': {'type': 'string', 'description': "Model task type (e.g., 'text-generation', 'image-classification', 'translation')"}, 'library': {'type': 'string', 'description': "Framework the model uses (e.g., 'transformers', 'diffusers', 'timm')"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads, likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_details', description='Get detailed information about a specific model from the Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'model_id': {'type': 'string', 'minLength': 1, 'description': 'Model ID (e.g., microsoft/DialoGPT-large)'}}, 'required': ['model_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='paper_search', description="Find Machine Learning research papers on the Hugging Face hub. Include 'Link to paper' When presenting the results. Consider whether tabulating results matches user intent.", inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 3, 'maxLength': 200, 'description': 'Semantic Search query'}, 'results_limit': {'type': 'number', 'default': 12, 'description': 'Number of results to return'}, 'concise_only': {'type': 'boolean', 'default': False, 'description': 'Return a 2 sentence summary of the abstract. Use for broad search terms which may return a lot of results. Check with User if unsure.'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Paper Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_search', description='Find Datasets hosted on the Hugging Face hub. Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. Include links to the datasets in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"}, 'tags': {'type': 'array', 'items': {'type': 'string'}, 'description': "Tags to filter datasets (e.g., ['language:en', 'size_categories:1M"""
"Top 20 trending datasets", "Top 10 most recent datasets" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"}, 'tags': {'type': 'array', 'items': {'type': 'string'}, 'description': "Tags to filter datasets (e.g., ['language:en', 'size_categories:1M Click to view build logs ``` {build_logs.strip()} ``` """ yield formatted_output except Exception as e: yield f"❌ **Error running execution logic:** {str(e)}\n\n{traceback.format_exc()}" # Custom CSS for a sleek design custom_css = """ .gradio-container { max-width: 900px !important; margin: auto !important; } .chat-container { height: 600px !important; } #component-0 { height: 100vh; } .message { padding: 15px !important; margin: 10px 0 !important; border-radius: 15px !important; } .user-message { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; color: white !important; margin-left: 20% !important; } .bot-message { background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important; color: white !important; margin-right: 20% !important; } .upload-area { border: 2px dashed #4f46e5 !important; border-radius: 10px !important; padding: 20px !important; text-align: center !important; background: linear-gradient(135deg, #f0f4ff 0%, #e0e7ff 100%) !important; } .btn-primary { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; border: none !important; border-radius: 25px !important; padding: 10px 25px !important; font-weight: bold !important; } .btn-secondary { background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%) !important; border: none !important; border-radius: 25px !important; padding: 10px 25px !important; font-weight: bold !important; color: #2d3436 !important; } .title { text-align: center !important; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; -webkit-background-clip: text !important; -webkit-text-fill-color: transparent !important; font-size: 2.5em !important; font-weight: bold !important; margin-bottom: 20px !important; } .subtitle { text-align: center !important; color: #6c757d !important; font-size: 1.2em !important; margin-bottom: 30px !important; } .tools { text-align: center !important; color: #6c757d !important; font-size: 1.2em !important; margin-bottom: 30px !important; } .recharge { text-align: center !important; color: #2d3436 !important; font-size: 1.2em !important; margin-bottom: 30px !important; } .output-markdown { height: 250px; overflow-y: auto !important; border: 1px solid #e0e0e0; padding: 10px; border-radius: 5px; } """ # Create the Gradio interface with gr.Blocks(css=custom_css, title="Data Science Requirements Gathering Agent") as app: # Header gr.HTML("""
            <h1 class="title">🔬 Data Science Consultant</h1>
            <p class="subtitle">Transform your vague ideas into reality</p>
            <p class="tools">Powered by Modal 🧑, Hugging Face 🤗, LlamaIndex 🦙, Mistral AI 🦾 &amp; Sambanova 🧑🏽‍💻</p>
            <p class="recharge">Recharged by HuggingFace-MCP (https://hf.co/mcp)</p>
        </div>
""") with gr.Row(): with gr.Column(scale=3): # Chat interface chatbot = gr.Chatbot( label="Requirements Gathering Conversation", height=500, show_copy_button=True, bubble_full_width=False, elem_classes=["chat-container"] ) plan_output = gr.Markdown( "**Plan will be generated here...**", label="Generated Plan", elem_classes=["output-markdown"], ) code_output = gr.Markdown( "**Code will be generated here...**", label="Generated Code", elem_classes=["output-markdown"], ) execution_output = gr.Markdown( "**Execution output will be shown here...**", label="Execution Output", elem_classes=["output-markdown"], ) with gr.Row(): with gr.Column(scale=4): msg = gr.Textbox( placeholder="Describe your data science project or ask a question...", label="Your Message", lines=2, max_lines=5 ) with gr.Column(scale=1): send_btn = gr.Button("Send πŸ“€", variant="primary", elem_classes=["btn-primary"]) with gr.Row(): clear_btn = gr.Button("Clear Chat πŸ—‘οΈ", variant="secondary", elem_classes=["btn-secondary"]) with gr.Column(scale=1): # File upload section gr.HTML("

πŸ“ Upload Data Files

") file_upload = gr.File( label="Upload your files (CSV, Excel, PDF, PPT, DOCX, Images, etc.)", file_count="multiple", file_types=[".csv", ".xlsx", ".xls", ".json", ".txt", ".pdf", ".ppt", ".pptx", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"], elem_classes=["upload-area"] ) uploaded_files_display = gr.File( label="Uploaded Files", file_count="multiple", interactive=False, visible=True ) # Instructions gr.HTML("""

πŸ’‘ How it works:

  1. Describe your data science project
  2. Upload your files (data, documents, images)
  3. Answer clarifying questions
  4. Generate a plan for your project
  5. Generate Python code using Devstral AI

πŸ“„ Supports: CSV, Excel, PDF, PowerPoint, Word docs, Images, JSON, Text files
πŸ’» Code generation powered by Mistral Devstral-Small-2505

""") # Action buttons section with gr.Column(): plan_btn = gr.Button("Generate Plan πŸ“‹", variant="secondary", elem_classes=["btn-secondary"], size="lg") code_btn = gr.Button("Generate Code πŸ’»", variant="secondary", elem_classes=["btn-secondary"], size="lg") execute_code_btn = gr.Button("Execute Code πŸš€", variant="primary", elem_classes=["btn-primary"], size="lg") # State for conversation history and file cache chat_history = gr.State([]) file_cache = gr.State({}) # Event handlers def handle_send(message, history, files, cache): if message.strip(): new_history, updated_history, cleared_input, updated_cache = chat_interface(message, history, files, cache) return new_history, updated_history, cleared_input, updated_cache return history, history, message, cache # Wire up the interface send_btn.click( handle_send, inputs=[msg, chat_history, uploaded_files_display, file_cache], outputs=[chatbot, chat_history, msg, file_cache] ) msg.submit( handle_send, inputs=[msg, chat_history, uploaded_files_display, file_cache], outputs=[chatbot, chat_history, msg, file_cache] ) clear_btn.click( clear_chat, outputs=[chatbot, chat_history, file_cache] ) plan_btn.click( generate_plan, inputs=[chat_history, file_cache], outputs=[plan_output] ) code_btn.click( generate_code_with_devstral, inputs=[plan_output, chat_history, file_cache], outputs=[code_output] ) execute_code_btn.click( execute_code, inputs=[code_output], outputs=[execution_output] ) file_upload.change( lambda files: files, inputs=[file_upload], outputs=[uploaded_files_display] ) # Welcome message app.load( lambda: [(None, "πŸ‘‹ Hello! I'm your Data Science Project Agent. I'll help you transform your project ideas into reality .\n\nπŸš€ **Let's get started!** Tell me about your data science project or what you're trying to achieve.")], outputs=[chatbot] ) if __name__ == "__main__": app.queue() app.launch(show_api=True, ssr_mode=False, show_error=True, mcp_server=False)