Spaces:

arbnori45
/

ai_agents_final

Sleeping

App Files Files Community

Arbnor Tefiki commited on Jun 30

Commit

5d9aa5e

1 Parent(s): c594a60

Test the agent in HF

Browse files

Files changed (15) hide show

.gitignore +68 -0
Dockerfile +23 -0
README.md +55 -11
agent/__init__.py +6 -0
agent/agent.py +287 -0
agent/tools/__init__.py +3 -0
agent/tools/file_handlers.py +561 -0
agent/utils/__init__.py +3 -0
agent/utils/data_processor.py +936 -0
agent/utils/question_analyzer.py +195 -0
app.py +217 -0
index.html +0 -19
requirements.txt +16 -0
style.css +0 -28
test_agent.py +78 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,68 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+ENV/
+env/
+# Jupyter Notebook
+.ipynb_checkpoints
+# VS Code
+.vscode/
+*.code-workspace
+# PyCharm
+.idea/
+# Logs
+logs/
+*.log
+# Local configuration
+.env
+# Cache
+.cache/
+.pytest_cache/
+# Mac OS
+.DS_Store

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+FROM python:3.9-slim
+WORKDIR /app
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libffi-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+# Command to run when the container starts
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,11 +1,55 @@
----
-title: Ai Agents Final
-emoji: 📊
-colorFrom: green
-colorTo: purple
-sdk: static
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Multi-Modal AI Agent for Hugging Face Agent Course Unit 4
+This project implements a multi-modal AI agent that can process various file types and answer questions about their content for the Hugging Face Agent Course Unit 4 final assessment.
+## Features
+- Processes and answers questions about different file types:
+  - Excel/CSV files (.xlsx, .csv)
+  - Text files (.txt)
+  - PDFs (.pdf)
+  - Images (.jpg, .png)
+  - Python files (.py)
+  - Microsoft Office files (.docx, .pptx)
+  - JSON files (.jsonld)
+  - Archive files (.zip)
+  - Other specialized formats (.pdb)
+- Analyzes questions to understand what's being asked
+- Identifies and loads relevant resource files
+- Applies appropriate processing techniques based on file type
+- Formulates accurate answers based on file content
+- Includes error handling and logging
+## Project Structure
+- `app.py`: Main application file with Gradio interface
+- `agent/`: Package containing agent components
+  - `agent.py`: Multi-modal agent implementation
+  - `tools/`: File handlers and other tools
+  - `utils/`: Utility functions for question analysis and data processing
+## Installation
+1. Clone the repository
+2. Install dependencies:
+   ```
+   pip install -r requirements.txt
+   ```
+## Usage
+Run the application:
+```
+python app.py
+```
+## Dependencies
+- pandas: For data processing
+- gradio: For the user interface
+- PyPDF2: For PDF processing
+- python-docx: For Word document processing
+- python-pptx: For PowerPoint presentations
+- Pillow: For image processing
+- And more (see requirements.txt)

agent/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""
+Agent package initialization file.
+"""
+from agent.agent import MultiModalAgent
+__all__ = ["MultiModalAgent"]

agent/agent.py ADDED Viewed

	@@ -0,0 +1,287 @@

+"""
+Multi-modal agent for processing different file types and answering questions.
+"""
+import os
+import json
+import logging
+from typing import Dict, Any, List, Optional, Tuple
+from agent.tools.file_handlers import extract_file_content
+from agent.utils.question_analyzer import QuestionAnalyzer
+from agent.utils.data_processor import DataProcessor
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger('MultiModalAgent')
+class MultiModalAgent:
+    """
+    Agent for processing different file types and answering questions.
+    """
+    def __init__(self, resource_dir: str = 'resource'):
+        """
+        Initialize the agent.
+        Args:
+            resource_dir: Directory containing resource files
+        """
+        logger.info("Initializing MultiModalAgent")
+        self.resource_dir = resource_dir
+        self.question_analyzer = QuestionAnalyzer(resource_dir)
+        self.data_processor = DataProcessor()
+        # Cache for file content to avoid re-processing
+        self.file_content_cache = {}
+        # Cache for answers
+        self.answer_cache = {}
+    def __call__(self, question: str) -> str:
+        """
+        Process a question and return an answer.
+        Args:
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        logger.info(f"Processing question: {question[:100]}...")
+        # Check answer cache
+        if question in self.answer_cache:
+            logger.info("Answer found in cache")
+            return self.answer_cache[question]
+        try:
+            # Analyze the question
+            analysis = self.question_analyzer.analyze_question(question)
+            logger.info(f"Question analysis: {analysis}")
+            # Handle general questions that don't require file processing
+            if not analysis.get('file_path'):
+                logger.info("No file reference found in question, trying to answer directly")
+                direct_answer = self._answer_without_file(question)
+                if direct_answer:
+                    self.answer_cache[question] = direct_answer
+                    return direct_answer
+                # If direct answering failed, try to find a file in the resource directory
+                logger.info("Direct answering failed, looking for relevant files")
+                analysis['file_path'] = self._find_most_relevant_file(question)
+                if not analysis['file_path']:
+                    logger.warning("No relevant file found for the question")
+                    return "I couldn't find a relevant file to answer this question."
+            # Extract content from the file
+            file_path = analysis['file_path']
+            if file_path in self.file_content_cache:
+                content, handler = self.file_content_cache[file_path]
+            else:
+                content, handler = extract_file_content(file_path, self.resource_dir)
+                if content is not None:
+                    self.file_content_cache[file_path] = (content, handler)
+            if content is None:
+                logger.error(f"Failed to extract content from file: {file_path}")
+                return "I couldn't extract content from the specified file."
+            # Process the content based on file type
+            answer = self._process_content(content, handler, question)
+            # Cache the answer
+            self.answer_cache[question] = answer
+            return answer
+        except Exception as e:
+            logger.exception(f"Error processing question: {e}")
+            return f"An error occurred while processing your question: {e}"
+    def _answer_without_file(self, question: str) -> Optional[str]:
+        """
+        Try to answer the question without using a file.
+        Args:
+            question: The question to answer
+        Returns:
+            Answer to the question, or None if the question can't be answered directly
+        """
+        # This is a simple implementation that can be expanded based on your needs
+        # Check if the question is asking for metadata about the resource directory
+        if 'how many files' in question.lower() or 'number of files' in question.lower():
+            try:
+                file_count = len(os.listdir(self.resource_dir))
+                return f"There are {file_count} files in the resource directory."
+            except Exception as e:
+                logger.error(f"Error counting files: {e}")
+                return None
+        # Check if the question is asking about file types
+        file_types_patterns = [
+            'what file types', 'which file types', 'what kinds of files',
+            'which kinds of files', 'what formats', 'which formats'
+        ]
+        if any(pattern in question.lower() for pattern in file_types_patterns):
+            try:
+                files = os.listdir(self.resource_dir)
+                extensions = set()
+                for file in files:
+                    _, ext = os.path.splitext(file)
+                    if ext:  # Skip files without extension
+                        extensions.add(ext)
+                if extensions:
+                    extensions_list = sorted(list(extensions))
+                    return f"The resource directory contains files with the following extensions: {', '.join(extensions_list)}"
+                else:
+                    return "The resource directory doesn't contain any files with extensions."
+            except Exception as e:
+                logger.error(f"Error analyzing file types: {e}")
+                return None
+        return None
+    def _find_most_relevant_file(self, question: str) -> Optional[str]:
+        """
+        Find the most relevant file for a question.
+        Args:
+            question: The question to answer
+        Returns:
+            Path to the most relevant file, or None if no relevant file is found
+        """
+        try:
+            # Get all files in the resource directory
+            files = [
+                os.path.join(self.resource_dir, f)
+                for f in os.listdir(self.resource_dir)
+                if os.path.isfile(os.path.join(self.resource_dir, f))
+            ]
+            if not files:
+                logger.warning("No files found in the resource directory")
+                return None
+            # Extract keywords from the question
+            keywords = set(self.question_analyzer._extract_keywords(question))
+            # Calculate relevance scores for each file
+            scores = []
+            for file_path in files:
+                score = 0
+                file_name = os.path.basename(file_path)
+                # Score based on file name
+                for keyword in keywords:
+                    if keyword.lower() in file_name.lower():
+                        score += 2  # Higher weight for filename matches
+                # Score based on file extension
+                _, ext = os.path.splitext(file_path)
+                ext = ext.lower()
+                # Check if the question mentions the file type
+                if 'excel' in question.lower() or 'spreadsheet' in question.lower() or 'xlsx' in question.lower():
+                    if ext in ['.xlsx', '.xls']:
+                        score += 3
+                elif 'csv' in question.lower():
+                    if ext == '.csv':
+                        score += 3
+                elif 'text' in question.lower() or 'txt' in question.lower():
+                    if ext == '.txt':
+                        score += 3
+                elif 'pdf' in question.lower():
+                    if ext == '.pdf':
+                        score += 3
+                elif 'image' in question.lower() or 'picture' in question.lower() or 'photo' in question.lower():
+                    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
+                        score += 3
+                elif 'word' in question.lower() or 'document' in question.lower() or 'docx' in question.lower():
+                    if ext == '.docx':
+                        score += 3
+                elif 'powerpoint' in question.lower() or 'presentation' in question.lower() or 'slides' in question.lower() or 'pptx' in question.lower():
+                    if ext == '.pptx':
+                        score += 3
+                elif 'json' in question.lower():
+                    if ext in ['.json', '.jsonld']:
+                        score += 3
+                elif 'zip' in question.lower() or 'archive' in question.lower():
+                    if ext == '.zip':
+                        score += 3
+                elif 'python' in question.lower() or 'py' in question.lower() or 'code' in question.lower() or 'script' in question.lower():
+                    if ext == '.py':
+                        score += 3
+                elif 'pdb' in question.lower() or 'protein' in question.lower():
+                    if ext == '.pdb':
+                        score += 3
+                scores.append((file_path, score))
+            # Sort by score in descending order
+            scores.sort(key=lambda x: x[1], reverse=True)
+            # Return the most relevant file if it has a non-zero score
+            if scores and scores[0][1] > 0:
+                return scores[0][0]
+            # If no relevant file is found based on the question, return None
+            return None
+        except Exception as e:
+            logger.error(f"Error finding relevant file: {e}")
+            return None
+    def _process_content(self, content: Any, handler: Any, question: str) -> str:
+        """
+        Process the content based on file type.
+        Args:
+            content: Extracted content from the file
+            handler: File handler used to extract the content
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        try:
+            handler_type = type(handler).__name__
+            if handler_type == 'ExcelHandler':
+                return self.data_processor.process_excel_data(content, question)
+            elif handler_type == 'CSVHandler':
+                return self.data_processor.process_csv_data(content, question)
+            elif handler_type == 'TextHandler':
+                return self.data_processor.process_text_data(content, question)
+            elif handler_type == 'PDFHandler':
+                return self.data_processor.process_pdf_data(content, question)
+            elif handler_type == 'ImageHandler':
+                return self.data_processor.process_image_metadata(content, question)
+            elif handler_type == 'DocxHandler':
+                return self.data_processor.process_docx_data(content, question)
+            elif handler_type == 'PptxHandler':
+                return self.data_processor.process_pptx_data(content, question)
+            elif handler_type == 'JsonHandler':
+                return self.data_processor.process_json_data(content, question)
+            elif handler_type == 'ZipHandler':
+                return self.data_processor.process_zip_data(content, question)
+            elif handler_type == 'PdbHandler':
+                return self.data_processor.process_pdb_data(content, question)
+            elif handler_type == 'PythonHandler':
+                return self.data_processor.process_python_data(content, question)
+            elif handler_type == 'JsonlHandler':
+                return self.data_processor.process_jsonl_data(content, question)
+            else:
+                logger.warning(f"Unknown handler type: {handler_type}")
+                return f"I don't know how to process content from a {handler_type}."
+        except Exception as e:
+            logger.exception(f"Error processing content: {e}")
+            return f"An error occurred while processing the file content: {e}"

agent/tools/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Tools initialization file.
+"""

agent/tools/file_handlers.py ADDED Viewed

	@@ -0,0 +1,561 @@

+"""
+File handlers for processing different file types.
+"""
+import os
+import json
+import csv
+import zipfile
+import io
+import re
+from typing import Dict, Any, List, Optional, Tuple
+import pandas as pd
+from PIL import Image
+import PyPDF2
+import docx
+from pptx import Presentation
+class FileHandler:
+    """Base class for file handlers."""
+    def __init__(self, resource_dir: str):
+        """
+        Initialize the file handler.
+        Args:
+            resource_dir: Directory containing resource files
+        """
+        self.resource_dir = resource_dir
+    def get_file_path(self, file_name: str) -> str:
+        """
+        Get the full path to a file.
+        Args:
+            file_name: Name of the file
+        Returns:
+            Full path to the file
+        """
+        return os.path.join(self.resource_dir, file_name)
+    def can_handle(self, file_path: str) -> bool:
+        """
+        Check if the handler can process the given file.
+        Args:
+            file_path: Path to the file
+        Returns:
+            True if the handler can process the file, False otherwise
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+    def extract_content(self, file_path: str) -> Any:
+        """
+        Extract content from the file.
+        Args:
+            file_path: Path to the file
+        Returns:
+            Extracted content
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+    def get_file_type(self, file_path: str) -> str:
+        """
+        Get the file type based on extension.
+        Args:
+            file_path: Path to the file
+        Returns:
+            File type (extension)
+        """
+        _, ext = os.path.splitext(file_path)
+        return ext.lower()
+class ExcelHandler(FileHandler):
+    """Handler for Excel files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is an Excel file."""
+        return self.get_file_type(file_path) in ['.xlsx', '.xls']
+    def extract_content(self, file_path: str) -> Dict[str, pd.DataFrame]:
+        """
+        Extract content from an Excel file.
+        Returns:
+            Dictionary mapping sheet names to DataFrames
+        """
+        try:
+            # Read all sheets
+            excel_file = pd.ExcelFile(file_path)
+            sheets = {}
+            for sheet_name in excel_file.sheet_names:
+                sheets[sheet_name] = pd.read_excel(excel_file, sheet_name)
+            return sheets
+        except Exception as e:
+            print(f"Error extracting content from Excel file {file_path}: {e}")
+            return {}
+class CSVHandler(FileHandler):
+    """Handler for CSV files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a CSV file."""
+        return self.get_file_type(file_path) == '.csv'
+    def extract_content(self, file_path: str) -> pd.DataFrame:
+        """
+        Extract content from a CSV file.
+        Returns:
+            DataFrame containing the CSV data
+        """
+        try:
+            # Try different encodings and delimiters
+            try:
+                return pd.read_csv(file_path)
+            except:
+                # Try with different delimiter
+                with open(file_path, 'r', newline='') as csvfile:
+                    dialect = csv.Sniffer().sniff(csvfile.read(1024))
+                    csvfile.seek(0)
+                    return pd.read_csv(file_path, delimiter=dialect.delimiter)
+        except Exception as e:
+            print(f"Error extracting content from CSV file {file_path}: {e}")
+            return pd.DataFrame()
+class TextHandler(FileHandler):
+    """Handler for text files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a text file."""
+        return self.get_file_type(file_path) == '.txt'
+    def extract_content(self, file_path: str) -> str:
+        """
+        Extract content from a text file.
+        Returns:
+            Text content of the file
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except UnicodeDecodeError:
+            # Try a different encoding
+            try:
+                with open(file_path, 'r', encoding='latin-1') as f:
+                    return f.read()
+            except Exception as e:
+                print(f"Error extracting content from text file {file_path}: {e}")
+                return ""
+        except Exception as e:
+            print(f"Error extracting content from text file {file_path}: {e}")
+            return ""
+class PDFHandler(FileHandler):
+    """Handler for PDF files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a PDF file."""
+        return self.get_file_type(file_path) == '.pdf'
+    def extract_content(self, file_path: str) -> Dict[int, str]:
+        """
+        Extract content from a PDF file.
+        Returns:
+            Dictionary mapping page numbers to text content
+        """
+        try:
+            result = {}
+            with open(file_path, 'rb') as f:
+                pdf_reader = PyPDF2.PdfReader(f)
+                for i in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[i]
+                    result[i + 1] = page.extract_text()
+            return result
+        except Exception as e:
+            print(f"Error extracting content from PDF file {file_path}: {e}")
+            return {}
+class ImageHandler(FileHandler):
+    """Handler for image files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is an image file."""
+        return self.get_file_type(file_path) in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
+    def extract_content(self, file_path: str) -> Dict[str, Any]:
+        """
+        Extract metadata from an image file.
+        Returns:
+            Dictionary containing image metadata
+        """
+        try:
+            with Image.open(file_path) as img:
+                metadata = {
+                    'format': img.format,
+                    'mode': img.mode,
+                    'size': img.size,
+                    'width': img.width,
+                    'height': img.height,
+                }
+                # Extract EXIF data if available
+                if hasattr(img, '_getexif') and img._getexif():
+                    exif = {
+                        PyPDF2.ExifTags.TAGS.get(k, k): v
+                        for k, v in img._getexif().items()
+                        if k in PyPDF2.ExifTags.TAGS
+                    }
+                    metadata['exif'] = exif
+                return metadata
+        except Exception as e:
+            print(f"Error extracting content from image file {file_path}: {e}")
+            return {}
+class DocxHandler(FileHandler):
+    """Handler for Word documents."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a Word document."""
+        return self.get_file_type(file_path) == '.docx'
+    def extract_content(self, file_path: str) -> str:
+        """
+        Extract text content from a Word document.
+        Returns:
+            Text content of the document
+        """
+        try:
+            doc = docx.Document(file_path)
+            full_text = []
+            # Extract text from paragraphs
+            for para in doc.paragraphs:
+                full_text.append(para.text)
+            # Extract text from tables
+            for table in doc.tables:
+                for row in table.rows:
+                    for cell in row.cells:
+                        full_text.append(cell.text)
+            return '\n'.join(full_text)
+        except Exception as e:
+            print(f"Error extracting content from Word document {file_path}: {e}")
+            return ""
+class PptxHandler(FileHandler):
+    """Handler for PowerPoint presentations."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a PowerPoint presentation."""
+        return self.get_file_type(file_path) == '.pptx'
+    def extract_content(self, file_path: str) -> Dict[int, str]:
+        """
+        Extract text content from a PowerPoint presentation.
+        Returns:
+            Dictionary mapping slide numbers to text content
+        """
+        try:
+            prs = Presentation(file_path)
+            slides_text = {}
+            for i, slide in enumerate(prs.slides):
+                texts = []
+                for shape in slide.shapes:
+                    if hasattr(shape, "text"):
+                        texts.append(shape.text)
+                slides_text[i + 1] = '\n'.join(texts)
+            return slides_text
+        except Exception as e:
+            print(f"Error extracting content from PowerPoint presentation {file_path}: {e}")
+            return {}
+class JsonHandler(FileHandler):
+    """Handler for JSON files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a JSON file."""
+        return self.get_file_type(file_path) in ['.json', '.jsonld']
+    def extract_content(self, file_path: str) -> Dict[str, Any]:
+        """
+        Extract content from a JSON file.
+        Returns:
+            Parsed JSON content
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except Exception as e:
+            print(f"Error extracting content from JSON file {file_path}: {e}")
+            return {}
+class ZipHandler(FileHandler):
+    """Handler for ZIP archives."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a ZIP archive."""
+        return self.get_file_type(file_path) == '.zip'
+    def extract_content(self, file_path: str) -> Dict[str, Any]:
+        """
+        Extract information about the contents of a ZIP archive.
+        Returns:
+            Dictionary containing information about the archived files
+        """
+        try:
+            result = {'files': []}
+            with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                # Get information about each file in the archive
+                for file_info in zip_ref.infolist():
+                    result['files'].append({
+                        'filename': file_info.filename,
+                        'size': file_info.file_size,
+                        'compressed_size': file_info.compress_size,
+                        'date_time': file_info.date_time,
+                    })
+                # Try to extract and process common file types
+                for file_info in zip_ref.infolist():
+                    if file_info.filename.endswith('.txt'):
+                        with zip_ref.open(file_info.filename) as f:
+                            content = f.read().decode('utf-8', errors='ignore')
+                            result[file_info.filename] = content
+            return result
+        except Exception as e:
+            print(f"Error extracting content from ZIP archive {file_path}: {e}")
+            return {}
+class PdbHandler(FileHandler):
+    """Handler for PDB (Protein Data Bank) files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a PDB file."""
+        return self.get_file_type(file_path) == '.pdb'
+    def extract_content(self, file_path: str) -> Dict[str, Any]:
+        """
+        Extract basic information from a PDB file.
+        Returns:
+            Dictionary containing basic information about the PDB file
+        """
+        try:
+            result = {
+                'header': '',
+                'title': '',
+                'compounds': [],
+                'authors': [],
+                'atoms_count': 0,
+            }
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                for line in f:
+                    if line.startswith('HEADER'):
+                        result['header'] = line[10:].strip()
+                    elif line.startswith('TITLE'):
+                        result['title'] += line[10:].strip()
+                    elif line.startswith('COMPND'):
+                        result['compounds'].append(line[10:].strip())
+                    elif line.startswith('AUTHOR'):
+                        result['authors'].append(line[10:].strip())
+                    elif line.startswith('ATOM') or line.startswith('HETATM'):
+                        result['atoms_count'] += 1
+            return result
+        except Exception as e:
+            print(f"Error extracting content from PDB file {file_path}: {e}")
+            return {}
+class PythonHandler(FileHandler):
+    """Handler for Python files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a Python file."""
+        return self.get_file_type(file_path) == '.py'
+    def extract_content(self, file_path: str) -> Dict[str, Any]:
+        """
+        Extract content and structure from a Python file.
+        Returns:
+            Dictionary containing the file content and structure information
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            result = {
+                'content': content,
+                'classes': [],
+                'functions': [],
+                'imports': [],
+            }
+            # Extract class definitions
+            class_pattern = r'class\s+(\w+)(?:\(([^)]*)\))?:'
+            for match in re.finditer(class_pattern, content):
+                class_name = match.group(1)
+                parent_class = match.group(2) if match.group(2) else None
+                result['classes'].append({
+                    'name': class_name,
+                    'parent': parent_class,
+                })
+            # Extract function definitions
+            func_pattern = r'def\s+(\w+)\s*\(([^)]*)\):'
+            for match in re.finditer(func_pattern, content):
+                func_name = match.group(1)
+                params = match.group(2)
+                result['functions'].append({
+                    'name': func_name,
+                    'params': params.strip(),
+                })
+            # Extract imports
+            import_pattern = r'(?:from\s+(\w+(?:\.\w+)*)\s+)?import\s+(.+?)(?:\s+as\s+(\w+))?$'
+            for line in content.split('\n'):
+                line = line.strip()
+                if line.startswith('import ') or line.startswith('from '):
+                    match = re.match(import_pattern, line)
+                    if match:
+                        from_module = match.group(1)
+                        imported = match.group(2)
+                        as_name = match.group(3)
+                        result['imports'].append({
+                            'from': from_module,
+                            'import': imported,
+                            'as': as_name,
+                        })
+            return result
+        except Exception as e:
+            print(f"Error extracting content from Python file {file_path}: {e}")
+            return {}
+class JsonlHandler(FileHandler):
+    """Handler for JSONL (JSON Lines) files."""
+    def can_handle(self, file_path: str) -> bool:
+        """Check if the file is a JSONL file."""
+        return self.get_file_type(file_path) == '.jsonl'
+    def extract_content(self, file_path: str) -> List[Dict[str, Any]]:
+        """
+        Extract content from a JSONL file.
+        Returns:
+            List of parsed JSON objects
+        """
+        try:
+            result = []
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if line:  # Skip empty lines
+                        result.append(json.loads(line))
+            return result
+        except Exception as e:
+            print(f"Error extracting content from JSONL file {file_path}: {e}")
+            return []
+def get_all_handlers(resource_dir: str) -> List[FileHandler]:
+    """
+    Get a list of all file handlers.
+    Args:
+        resource_dir: Directory containing resource files
+    Returns:
+        List of file handlers
+    """
+    return [
+        ExcelHandler(resource_dir),
+        CSVHandler(resource_dir),
+        TextHandler(resource_dir),
+        PDFHandler(resource_dir),
+        ImageHandler(resource_dir),
+        DocxHandler(resource_dir),
+        PptxHandler(resource_dir),
+        JsonHandler(resource_dir),
+        ZipHandler(resource_dir),
+        PdbHandler(resource_dir),
+        PythonHandler(resource_dir),
+        JsonlHandler(resource_dir),
+    ]
+def get_handler_for_file(file_path: str, resource_dir: str) -> Optional[FileHandler]:
+    """
+    Get the appropriate handler for a file.
+    Args:
+        file_path: Path to the file
+        resource_dir: Directory containing resource files
+    Returns:
+        Appropriate file handler, or None if no handler can process the file
+    """
+    handlers = get_all_handlers(resource_dir)
+    for handler in handlers:
+        if handler.can_handle(file_path):
+            return handler
+    return None
+def extract_file_content(file_path: str, resource_dir: str) -> Tuple[Any, Optional[FileHandler]]:
+    """
+    Extract content from a file using the appropriate handler.
+    Args:
+        file_path: Path to the file
+        resource_dir: Directory containing resource files
+    Returns:
+        Tuple of (extracted content, handler used)
+    """
+    handler = get_handler_for_file(file_path, resource_dir)
+    if handler:
+        content = handler.extract_content(file_path)
+        return content, handler
+    return None, None

agent/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Utils initialization file.
+"""

agent/utils/data_processor.py ADDED Viewed

	@@ -0,0 +1,936 @@

+"""
+Data processor for processing extracted data.
+"""
+import re
+import os
+import json
+from typing import Dict, Any, List, Optional, Tuple, Union
+import pandas as pd
+class DataProcessor:
+    """
+    Class for processing extracted data.
+    """
+    def __init__(self):
+        """Initialize the data processor."""
+        pass
+    def process_excel_data(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """
+        Process data extracted from an Excel file.
+        Args:
+            data: Dictionary mapping sheet names to DataFrames
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        # Convert question to lowercase for easier matching
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'oldest' in question_lower:
+            return self._find_oldest_item(data, question_lower)
+        elif 'count' in question_lower or 'how many' in question_lower:
+            return self._count_items(data, question_lower)
+        elif 'average' in question_lower or 'mean' in question_lower:
+            return self._calculate_average(data, question_lower)
+        elif 'total' in question_lower or 'sum' in question_lower:
+            return self._calculate_total(data, question_lower)
+        elif 'maximum' in question_lower or 'highest' in question_lower:
+            return self._find_maximum(data, question_lower)
+        elif 'minimum' in question_lower or 'lowest' in question_lower:
+            return self._find_minimum(data, question_lower)
+        else:
+            # Try to extract specific information
+            return self._extract_specific_info(data, question_lower)
+    def _find_oldest_item(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Find the oldest item in the data."""
+        # Look for mentions of specific columns or items
+        year_columns = ['year', 'date', 'time', 'created', 'modified', 'release']
+        item_type = None
+        # Try to extract the type of item we're looking for
+        item_types = [
+            'movie', 'film', 'book', 'song', 'album', 'game', 'video game',
+            'dvd', 'cd', 'blu-ray', 'blu ray', 'record', 'cassette', 'vhs'
+        ]
+        for item in item_types:
+            if item in question:
+                item_type = item
+                break
+        # Iterate through sheets and find the oldest item
+        oldest_year = float('inf')
+        oldest_item = None
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Try to find year/date columns
+            year_col = None
+            for col in df.columns:
+                if any(year_term in col.lower() for year_term in year_columns):
+                    year_col = col
+                    break
+            if year_col is None:
+                # If no obvious year column, look for columns with numeric values
+                for col in df.columns:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        try:
+                            # Check if values might be years (between 1900 and current year)
+                            if df[col].min() >= 1900 and df[col].max() <= 2025:
+                                year_col = col
+                                break
+                        except:
+                            continue
+            if year_col is not None:
+                # Find title/name column
+                title_col = None
+                title_columns = ['title', 'name', 'item', 'product', 'description']
+                for col in df.columns:
+                    if any(title_term in col.lower() for title_term in title_columns):
+                        title_col = col
+                        break
+                if title_col is None and len(df.columns) > 1:
+                    # If no obvious title column, use the first non-year column
+                    for col in df.columns:
+                        if col != year_col:
+                            title_col = col
+                            break
+                # Filter by item type if specified
+                if item_type:
+                    filtered_df = df
+                    # Look for a column that might contain item types
+                    type_col = None
+                    type_columns = ['type', 'category', 'format', 'medium', 'platform']
+                    for col in df.columns:
+                        if any(type_term in col.lower() for type_term in type_columns):
+                            type_col = col
+                            break
+                    if type_col:
+                        # Filter by item type
+                        filtered_df = df[df[type_col].astype(str).str.lower().str.contains(item_type.lower())]
+                else:
+                    filtered_df = df
+                if not filtered_df.empty and title_col:
+                    try:
+                        # Find the row with the minimum year
+                        min_year_idx = filtered_df[year_col].astype(float).idxmin()
+                        min_year = filtered_df.loc[min_year_idx, year_col]
+                        if min_year < oldest_year:
+                            oldest_year = min_year
+                            oldest_item = filtered_df.loc[min_year_idx, title_col]
+                    except:
+                        continue
+        if oldest_item:
+            return str(oldest_item)
+        else:
+            return "Could not determine the oldest item from the data."
+    def _count_items(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Count items matching specific criteria."""
+        # Extract conditions from the question
+        conditions = self._extract_conditions(question)
+        total_count = 0
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Apply conditions to filter the DataFrame
+            filtered_df = df
+            for condition in conditions:
+                col = condition.get('column')
+                value = condition.get('value')
+                operator = condition.get('operator', '=')
+                if col and value is not None:
+                    # Find the best matching column
+                    best_col = self._find_best_matching_column(df, col)
+                    if best_col:
+                        try:
+                            if operator == '=':
+                                filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower() == str(value).lower()]
+                            elif operator == '>':
+                                filtered_df = filtered_df[filtered_df[best_col] > value]
+                            elif operator == '<':
+                                filtered_df = filtered_df[filtered_df[best_col] < value]
+                            elif operator == '>=':
+                                filtered_df = filtered_df[filtered_df[best_col] >= value]
+                            elif operator == '<=':
+                                filtered_df = filtered_df[filtered_df[best_col] <= value]
+                            elif operator == 'contains':
+                                filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower().str.contains(str(value).lower())]
+                            elif operator == 'between':
+                                if isinstance(value, list) and len(value) == 2:
+                                    filtered_df = filtered_df[(filtered_df[best_col] >= value[0]) & (filtered_df[best_col] <= value[1])]
+                        except:
+                            continue
+            # Add the count from this sheet
+            total_count += len(filtered_df)
+        return str(total_count)
+    def _calculate_average(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Calculate the average of a column."""
+        # Extract column name from the question
+        column_name = self._extract_column_name(question)
+        if not column_name:
+            return "Could not determine which column to calculate the average for."
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Find the best matching column
+            best_col = self._find_best_matching_column(df, column_name)
+            if best_col and pd.api.types.is_numeric_dtype(df[best_col]):
+                try:
+                    avg_value = df[best_col].mean()
+                    return str(avg_value)
+                except:
+                    continue
+        return "Could not calculate the average from the data."
+    def _calculate_total(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Calculate the total of a column."""
+        # Extract column name from the question
+        column_name = self._extract_column_name(question)
+        if not column_name:
+            return "Could not determine which column to calculate the total for."
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Find the best matching column
+            best_col = self._find_best_matching_column(df, column_name)
+            if best_col and pd.api.types.is_numeric_dtype(df[best_col]):
+                try:
+                    total_value = df[best_col].sum()
+                    return str(total_value)
+                except:
+                    continue
+        return "Could not calculate the total from the data."
+    def _find_maximum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Find the maximum value in a column."""
+        # Extract column name from the question
+        column_name = self._extract_column_name(question)
+        if not column_name:
+            return "Could not determine which column to find the maximum for."
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Find the best matching column
+            best_col = self._find_best_matching_column(df, column_name)
+            if best_col:
+                try:
+                    max_value = df[best_col].max()
+                    return str(max_value)
+                except:
+                    continue
+        return "Could not find the maximum value from the data."
+    def _find_minimum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Find the minimum value in a column."""
+        # Extract column name from the question
+        column_name = self._extract_column_name(question)
+        if not column_name:
+            return "Could not determine which column to find the minimum for."
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Find the best matching column
+            best_col = self._find_best_matching_column(df, column_name)
+            if best_col:
+                try:
+                    min_value = df[best_col].min()
+                    return str(min_value)
+                except:
+                    continue
+        return "Could not find the minimum value from the data."
+    def _extract_specific_info(self, data: Dict[str, pd.DataFrame], question: str) -> str:
+        """Extract specific information from the data."""
+        # Try to identify what we're looking for
+        looking_for = self._extract_looking_for(question)
+        conditions = self._extract_conditions(question)
+        for sheet_name, df in data.items():
+            # Skip empty sheets
+            if df.empty:
+                continue
+            # Apply conditions to filter the DataFrame
+            filtered_df = df
+            for condition in conditions:
+                col = condition.get('column')
+                value = condition.get('value')
+                operator = condition.get('operator', '=')
+                if col and value is not None:
+                    # Find the best matching column
+                    best_col = self._find_best_matching_column(df, col)
+                    if best_col:
+                        try:
+                            if operator == '=':
+                                filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower() == str(value).lower()]
+                            elif operator == '>':
+                                filtered_df = filtered_df[filtered_df[best_col] > value]
+                            elif operator == '<':
+                                filtered_df = filtered_df[filtered_df[best_col] < value]
+                            elif operator == '>=':
+                                filtered_df = filtered_df[filtered_df[best_col] >= value]
+                            elif operator == '<=':
+                                filtered_df = filtered_df[filtered_df[best_col] <= value]
+                            elif operator == 'contains':
+                                filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower().str.contains(str(value).lower())]
+                            elif operator == 'between':
+                                if isinstance(value, list) and len(value) == 2:
+                                    filtered_df = filtered_df[(filtered_df[best_col] >= value[0]) & (filtered_df[best_col] <= value[1])]
+                        except:
+                            continue
+            # If we found matching rows and know what to look for
+            if not filtered_df.empty and looking_for:
+                # Find the best matching column for what we're looking for
+                best_col = self._find_best_matching_column(df, looking_for)
+                if best_col:
+                    try:
+                        # Return the first value
+                        return str(filtered_df.iloc[0][best_col])
+                    except:
+                        continue
+        # If we couldn't extract specific information, return a more general response
+        if data:
+            # Return basic info about the first non-empty sheet
+            for sheet_name, df in data.items():
+                if not df.empty:
+                    return f"The sheet contains {len(df)} rows and {len(df.columns)} columns."
+        return "Could not extract the requested information from the data."
+    def _extract_conditions(self, question: str) -> List[Dict[str, Any]]:
+        """Extract conditions from the question."""
+        conditions = []
+        # Check for "between" conditions
+        between_pattern = r'(\w+) between (\d+) and (\d+)'
+        for match in re.finditer(between_pattern, question):
+            column = match.group(1)
+            start = int(match.group(2))
+            end = int(match.group(3))
+            conditions.append({
+                'column': column,
+                'operator': 'between',
+                'value': [start, end],
+            })
+        # Check for comparison conditions
+        comparison_pattern = r'(\w+) (>|<|>=|<=|=|equals|equal to|contains) (\w+)'
+        for match in re.finditer(comparison_pattern, question):
+            column = match.group(1)
+            op = match.group(2)
+            value = match.group(3)
+            # Convert operator text to symbols
+            if op == 'equals' or op == 'equal to':
+                op = '='
+            elif op == 'contains':
+                op = 'contains'
+            # Try to convert value to number
+            try:
+                value = float(value)
+            except:
+                pass
+            conditions.append({
+                'column': column,
+                'operator': op,
+                'value': value,
+            })
+        # Check for simple equality conditions
+        equality_pattern = r'(?:with|where) (\w+) (?:is|=) (\w+)'
+        for match in re.finditer(equality_pattern, question):
+            column = match.group(1)
+            value = match.group(2)
+            # Try to convert value to number
+            try:
+                value = float(value)
+            except:
+                pass
+            conditions.append({
+                'column': column,
+                'operator': '=',
+                'value': value,
+            })
+        return conditions
+    def _extract_column_name(self, question: str) -> Optional[str]:
+        """Extract column name from the question."""
+        # Check for direct mentions of columns
+        column_pattern = r'(?:column|field) (?:named|called) ["\']?(\w+)["\']?'
+        match = re.search(column_pattern, question)
+        if match:
+            return match.group(1)
+        # Look for common column references
+        common_columns = [
+            'year', 'date', 'time', 'name', 'title', 'price', 'cost',
+            'amount', 'quantity', 'total', 'value', 'age', 'rating',
+            'score', 'grade', 'salary', 'income', 'revenue', 'profit',
+            'loss', 'height', 'weight', 'length', 'width', 'depth',
+            'area', 'volume'
+        ]
+        for col in common_columns:
+            if col in question:
+                return col
+        return None
+    def _extract_looking_for(self, question: str) -> Optional[str]:
+        """Extract what we're looking for from the question."""
+        # Check for direct mentions of what we're looking for
+        looking_for_pattern = r'(?:what is|what are|find|get|return) the (\w+)'
+        match = re.search(looking_for_pattern, question)
+        if match:
+            return match.group(1)
+        # Look for common things we might be looking for
+        common_items = [
+            'name', 'title', 'price', 'cost', 'amount', 'quantity',
+            'total', 'value', 'age', 'rating', 'score', 'grade',
+            'salary', 'income', 'revenue', 'profit', 'loss',
+            'height', 'weight', 'length', 'width', 'depth',
+            'area', 'volume', 'year', 'date', 'time'
+        ]
+        for item in common_items:
+            if item in question:
+                return item
+        return None
+    def _find_best_matching_column(self, df: pd.DataFrame, column_name: str) -> Optional[str]:
+        """Find the best matching column in a DataFrame."""
+        # Check for exact match
+        if column_name in df.columns:
+            return column_name
+        # Check for case-insensitive match
+        for col in df.columns:
+            if col.lower() == column_name.lower():
+                return col
+        # Check for partial match
+        for col in df.columns:
+            if column_name.lower() in col.lower():
+                return col
+        return None
+    def process_csv_data(self, data: pd.DataFrame, question: str) -> str:
+        """
+        Process data extracted from a CSV file.
+        Args:
+            data: DataFrame containing the CSV data
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        # Wrap in a dictionary to reuse Excel processing logic
+        return self.process_excel_data({'Sheet1': data}, question)
+    def process_text_data(self, data: str, question: str) -> str:
+        """
+        Process data extracted from a text file.
+        Args:
+            data: Text content of the file
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'count' in question_lower or 'how many' in question_lower:
+            # Count occurrences of a word or phrase
+            count_pattern = r'(?:count|how many) (?:occurrences of|instances of|times) ["\']?([^"\']+)["\']?'
+            match = re.search(count_pattern, question_lower)
+            if match:
+                term = match.group(1)
+                count = data.lower().count(term.lower())
+                return str(count)
+        # Check if the question is asking for a specific line
+        line_pattern = r'(?:what is|what does|what are|show|return) (?:the|on) (?:line|lines) (\d+)(?:\s*(?:to|-)\s*(\d+))?'
+        match = re.search(line_pattern, question_lower)
+        if match:
+            start_line = int(match.group(1))
+            end_line = int(match.group(2)) if match.group(2) else start_line
+            lines = data.split('\n')
+            if start_line <= len(lines) and end_line <= len(lines):
+                return '\n'.join(lines[start_line-1:end_line])
+        # Check if the question is asking for a specific paragraph
+        para_pattern = r'(?:what is|what does|what are|show|return) (?:the|in) paragraph (\d+)(?:\s*(?:to|-)\s*(\d+))?'
+        match = re.search(para_pattern, question_lower)
+        if match:
+            start_para = int(match.group(1))
+            end_para = int(match.group(2)) if match.group(2) else start_para
+            paragraphs = re.split(r'\n\s*\n', data)
+            if start_para <= len(paragraphs) and end_para <= len(paragraphs):
+                return '\n\n'.join(paragraphs[start_para-1:end_para])
+        # Check for specific information requests
+        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
+        match = re.search(info_pattern, question_lower)
+        if match:
+            info = match.group(1).strip()
+            # Look for this information in the text
+            sentences = re.split(r'(?<=[.!?])\s+', data)
+            for sentence in sentences:
+                if info.lower() in sentence.lower():
+                    return sentence.strip()
+        # If nothing specific was found, return a generic summary
+        words = data.split()
+        return f"The text contains {len(words)} words and {len(data.split('. '))} sentences."
+    def process_pdf_data(self, data: Dict[int, str], question: str) -> str:
+        """
+        Process data extracted from a PDF file.
+        Args:
+            data: Dictionary mapping page numbers to text content
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Check if the question is asking for a specific page
+        page_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) page (\d+)'
+        match = re.search(page_pattern, question_lower)
+        if match:
+            page_num = int(match.group(1))
+            if page_num in data:
+                return data[page_num]
+            else:
+                return f"Page {page_num} not found in the PDF."
+        # Check if the question is asking for a specific information across all pages
+        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
+        match = re.search(info_pattern, question_lower)
+        if match:
+            info = match.group(1).strip()
+            # Look for this information in all pages
+            for page_num, content in data.items():
+                sentences = re.split(r'(?<=[.!?])\s+', content)
+                for sentence in sentences:
+                    if info.lower() in sentence.lower():
+                        return sentence.strip()
+        # If nothing specific was found, combine all text and return a summary
+        all_text = ' '.join(data.values())
+        words = all_text.split()
+        return f"The PDF contains {len(data)} pages and approximately {len(words)} words."
+    def process_image_metadata(self, metadata: Dict[str, Any], question: str) -> str:
+        """
+        Process metadata extracted from an image file.
+        Args:
+            metadata: Dictionary containing image metadata
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'format' in question_lower or 'type' in question_lower:
+            return metadata.get('format', 'Unknown format')
+        elif 'size' in question_lower or 'resolution' in question_lower:
+            width = metadata.get('width', 0)
+            height = metadata.get('height', 0)
+            return f"{width}x{height}"
+        elif 'width' in question_lower:
+            return str(metadata.get('width', 0))
+        elif 'height' in question_lower:
+            return str(metadata.get('height', 0))
+        elif 'mode' in question_lower or 'color' in question_lower:
+            return metadata.get('mode', 'Unknown mode')
+        elif 'exif' in question_lower:
+            exif = metadata.get('exif', {})
+            if exif:
+                return str(exif)
+            else:
+                return "No EXIF data found."
+        # If nothing specific was found, return basic information
+        return f"Image format: {metadata.get('format', 'Unknown')}, Size: {metadata.get('width', 0)}x{metadata.get('height', 0)}, Mode: {metadata.get('mode', 'Unknown')}"
+    def process_docx_data(self, data: str, question: str) -> str:
+        """
+        Process data extracted from a Word document.
+        Args:
+            data: Text content of the document
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        # Similar to text processing
+        return self.process_text_data(data, question)
+    def process_pptx_data(self, data: Dict[int, str], question: str) -> str:
+        """
+        Process data extracted from a PowerPoint presentation.
+        Args:
+            data: Dictionary mapping slide numbers to text content
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Check if the question is asking for a specific slide
+        slide_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) slide (\d+)'
+        match = re.search(slide_pattern, question_lower)
+        if match:
+            slide_num = int(match.group(1))
+            if slide_num in data:
+                return data[slide_num]
+            else:
+                return f"Slide {slide_num} not found in the presentation."
+        # Check if the question is asking for a specific information across all slides
+        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
+        match = re.search(info_pattern, question_lower)
+        if match:
+            info = match.group(1).strip()
+            # Look for this information in all slides
+            for slide_num, content in data.items():
+                if info.lower() in content.lower():
+                    return content.strip()
+        # If nothing specific was found, return a summary
+        return f"The presentation contains {len(data)} slides."
+    def process_json_data(self, data: Dict[str, Any], question: str) -> str:
+        """
+        Process data extracted from a JSON file.
+        Args:
+            data: Parsed JSON content
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Check if the question is asking for a specific key
+        key_pattern = r'(?:what is|what are|show|return) (?:the|in) ["\']?(\w+)["\']?'
+        match = re.search(key_pattern, question_lower)
+        if match:
+            key = match.group(1)
+            # Look for this key in the JSON
+            if key in data:
+                return str(data[key])
+            # Look for nested keys
+            for k, v in data.items():
+                if isinstance(v, dict) and key in v:
+                    return str(v[key])
+        # If nothing specific was found, return a summary
+        return f"The JSON contains {len(data)} top-level keys: {', '.join(data.keys())}"
+    def process_zip_data(self, data: Dict[str, Any], question: str) -> str:
+        """
+        Process data extracted from a ZIP archive.
+        Args:
+            data: Dictionary containing information about the archive
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'how many' in question_lower or 'count' in question_lower:
+            if 'files' in question_lower:
+                return str(len(data.get('files', [])))
+        # Check if the question is asking for a specific file
+        file_pattern = r'(?:does it contain|is there) (?:a file named|a file called) ["\']?([^"\']+)["\']?'
+        match = re.search(file_pattern, question_lower)
+        if match:
+            filename = match.group(1)
+            # Check if the file exists in the archive
+            for file_info in data.get('files', []):
+                if filename.lower() in file_info.get('filename', '').lower():
+                    return f"Yes, the archive contains {file_info['filename']} ({file_info['size']} bytes)"
+            return f"No, the archive does not contain a file named {filename}."
+        # If nothing specific was found, return a summary
+        return f"The ZIP archive contains {len(data.get('files', []))} files."
+    def process_pdb_data(self, data: Dict[str, Any], question: str) -> str:
+        """
+        Process data extracted from a PDB file.
+        Args:
+            data: Dictionary containing information about the PDB file
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'title' in question_lower:
+            return data.get('title', 'No title found.')
+        elif 'header' in question_lower:
+            return data.get('header', 'No header found.')
+        elif 'compound' in question_lower or 'compounds' in question_lower:
+            compounds = data.get('compounds', [])
+            if compounds:
+                return '\n'.join(compounds)
+            else:
+                return 'No compounds found.'
+        elif 'author' in question_lower or 'authors' in question_lower:
+            authors = data.get('authors', [])
+            if authors:
+                return '\n'.join(authors)
+            else:
+                return 'No authors found.'
+        elif 'atoms' in question_lower or 'atom count' in question_lower:
+            return str(data.get('atoms_count', 0))
+        # If nothing specific was found, return a summary
+        return f"PDB file with title: {data.get('title', 'No title')}, containing {data.get('atoms_count', 0)} atoms."
+    def process_python_data(self, data: Dict[str, Any], question: str) -> str:
+        """
+        Process data extracted from a Python file.
+        Args:
+            data: Dictionary containing information about the Python file
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'class' in question_lower or 'classes' in question_lower:
+            classes = data.get('classes', [])
+            if classes:
+                class_names = [c['name'] for c in classes]
+                return ', '.join(class_names)
+            else:
+                return 'No classes found in the file.'
+        elif 'function' in question_lower or 'functions' in question_lower:
+            functions = data.get('functions', [])
+            if functions:
+                func_names = [f['name'] for f in functions]
+                return ', '.join(func_names)
+            else:
+                return 'No functions found in the file.'
+        elif 'import' in question_lower or 'imports' in question_lower:
+            imports = data.get('imports', [])
+            if imports:
+                import_strs = []
+                for imp in imports:
+                    if imp.get('from'):
+                        import_strs.append(f"from {imp['from']} import {imp['import']}")
+                    else:
+                        import_strs.append(f"import {imp['import']}")
+                return '\n'.join(import_strs)
+            else:
+                return 'No imports found in the file.'
+        # Check if the question is asking for a specific class or function
+        class_pattern = r'(?:what is|what does) (?:the class|class) ["\']?(\w+)["\']?'
+        match = re.search(class_pattern, question_lower)
+        if match:
+            class_name = match.group(1)
+            # Look for this class in the data
+            for cls in data.get('classes', []):
+                if cls['name'].lower() == class_name.lower():
+                    parent = f", inherits from {cls['parent']}" if cls['parent'] else ""
+                    return f"Class {cls['name']}{parent}"
+        func_pattern = r'(?:what is|what does) (?:the function|function) ["\']?(\w+)["\']?'
+        match = re.search(func_pattern, question_lower)
+        if match:
+            func_name = match.group(1)
+            # Look for this function in the data
+            for func in data.get('functions', []):
+                if func['name'].lower() == func_name.lower():
+                    return f"Function {func['name']}({func['params']})"
+        # If nothing specific was found, look for the code of a specific function or class
+        code_pattern = r'(?:show|return) (?:the code for|code of) (?:the )?(?:function|class) ["\']?(\w+)["\']?'
+        match = re.search(code_pattern, question_lower)
+        if match:
+            entity_name = match.group(1)
+            content = data.get('content', '')
+            # Look for the code of this entity
+            lines = content.split('\n')
+            entity_lines = []
+            in_entity = False
+            indent = 0
+            for i, line in enumerate(lines):
+                # Check for class or function definition
+                if re.match(rf'(class|def)\s+{re.escape(entity_name)}\s*\(', line):
+                    in_entity = True
+                    entity_lines.append(line)
+                    indent = len(line) - len(line.lstrip())
+                    continue
+                if in_entity:
+                    # Check if we're still in the entity based on indentation
+                    if line.strip() and len(line) - len(line.lstrip()) <= indent:
+                        in_entity = False
+                    else:
+                        entity_lines.append(line)
+            if entity_lines:
+                return '\n'.join(entity_lines)
+        # If nothing specific was found, return a summary
+        return f"Python file with {len(data.get('classes', []))} classes and {len(data.get('functions', []))} functions."
+    def process_jsonl_data(self, data: List[Dict[str, Any]], question: str) -> str:
+        """
+        Process data extracted from a JSONL file.
+        Args:
+            data: List of parsed JSON objects
+            question: The question to answer
+        Returns:
+            Answer to the question
+        """
+        question_lower = question.lower()
+        # Handle specific question types
+        if 'how many' in question_lower or 'count' in question_lower:
+            return str(len(data))
+        # Check if the question is asking for a specific entry
+        entry_pattern = r'(?:what is|what are|show|return) (?:the|in) entry (\d+)'
+        match = re.search(entry_pattern, question_lower)
+        if match:
+            entry_num = int(match.group(1))
+            if 0 <= entry_num < len(data):
+                return str(data[entry_num])
+            else:
+                return f"Entry {entry_num} not found in the data."
+        # Check if the question is asking for entries with a specific key-value pair
+        kv_pattern = r'(?:entries|items) where ["\']?(\w+)["\']? (?:is|=|equals|contains) ["\']?([^"\']+)["\']?'
+        match = re.search(kv_pattern, question_lower)
+        if match:
+            key = match.group(1)
+            value = match.group(2)
+            # Find entries matching the criteria
+            matching_entries = []
+            for entry in data:
+                if key in entry and str(entry[key]).lower() == value.lower():
+                    matching_entries.append(entry)
+            if matching_entries:
+                return str(matching_entries)
+            else:
+                return f"No entries found where {key} = {value}."
+        # If nothing specific was found, return a summary
+        if data and isinstance(data[0], dict):
+            keys = list(data[0].keys())
+            return f"The data contains {len(data)} entries with keys: {', '.join(keys)}"
+        else:
+            return f"The data contains {len(data)} entries."

agent/utils/question_analyzer.py ADDED Viewed

	@@ -0,0 +1,195 @@

+"""
+Utilities for analyzing and understanding questions.
+"""
+import re
+import json
+import os
+from typing import Dict, Any, List, Optional, Tuple, Set
+class QuestionAnalyzer:
+    """
+    Class for analyzing and understanding questions.
+    """
+    def __init__(self, resource_dir: str, metadata_path: Optional[str] = None):
+        """
+        Initialize the question analyzer.
+        Args:
+            resource_dir: Directory containing resource files
+            metadata_path: Path to the metadata file (optional)
+        """
+        self.resource_dir = resource_dir
+        self.metadata_path = metadata_path or os.path.join(resource_dir, 'metadata.jsonl')
+        self.metadata = self._load_metadata()
+    def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Load metadata from the metadata file.
+        Returns:
+            Dictionary mapping task IDs to metadata
+        """
+        metadata = {}
+        if os.path.exists(self.metadata_path):
+            try:
+                with open(self.metadata_path, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        entry = json.loads(line.strip())
+                        task_id = entry.get('task_id')
+                        if task_id:
+                            metadata[task_id] = entry
+            except Exception as e:
+                print(f"Error loading metadata: {e}")
+        return metadata
+    def extract_file_mention(self, question: str) -> Optional[str]:
+        """
+        Extract mentioned file name from the question.
+        Args:
+            question: The question to analyze
+        Returns:
+            Mentioned file name, or None if no file is mentioned
+        """
+        # Look for "attached file" or "attached spreadsheet" patterns
+        attached_pattern = r'attached (?:file|spreadsheet|document|image|picture|pdf|excel|csv|text file|zip|archive) (?:named |called |")?([\w\.-]+)'
+        match = re.search(attached_pattern, question, re.IGNORECASE)
+        if match:
+            return match.group(1)
+        # Look for file extensions
+        extensions = [
+            '.xlsx', '.xls', '.csv', '.txt', '.pdf', '.jpg', '.jpeg',
+            '.png', '.docx', '.pptx', '.json', '.jsonld', '.zip', '.pdb', '.py'
+        ]
+        for ext in extensions:
+            pattern = r'(\w+(?:-\w+)*' + re.escape(ext) + r')'
+            match = re.search(pattern, question, re.IGNORECASE)
+            if match:
+                return match.group(1)
+        return None
+    def find_relevant_file(self, question: str, task_id: Optional[str] = None) -> Optional[str]:
+        """
+        Find the relevant file for a question.
+        Args:
+            question: The question to analyze
+            task_id: The task ID (optional)
+        Returns:
+            Path to the relevant file, or None if no file is found
+        """
+        # Check if task_id is in metadata and has a file_name
+        if task_id and task_id in self.metadata:
+            file_name = self.metadata[task_id].get('file_name')
+            if file_name:
+                file_path = os.path.join(self.resource_dir, file_name)
+                if os.path.exists(file_path):
+                    return file_path
+        # Extract file mention from question
+        file_mention = self.extract_file_mention(question)
+        if file_mention:
+            # Check if the mentioned file exists
+            file_path = os.path.join(self.resource_dir, file_mention)
+            if os.path.exists(file_path):
+                return file_path
+            # Check if there's a file with a similar name
+            for file_name in os.listdir(self.resource_dir):
+                if file_mention.lower() in file_name.lower():
+                    return os.path.join(self.resource_dir, file_name)
+        # If no file is found, try to find a file mentioned in the metadata
+        if task_id and task_id in self.metadata:
+            # Extract keywords from the question
+            keywords = self._extract_keywords(question)
+            # Check all files in the resource directory
+            best_match = None
+            best_score = 0
+            for file_name in os.listdir(self.resource_dir):
+                # Skip metadata file
+                if file_name == 'metadata.jsonl':
+                    continue
+                # Calculate score based on keyword matches
+                score = 0
+                for keyword in keywords:
+                    if keyword.lower() in file_name.lower():
+                        score += 1
+                if score > best_score:
+                    best_score = score
+                    best_match = file_name
+            if best_match:
+                return os.path.join(self.resource_dir, best_match)
+        return None
+    def _extract_keywords(self, text: str) -> Set[str]:
+        """
+        Extract keywords from text.
+        Args:
+            text: The text to analyze
+        Returns:
+            Set of keywords
+        """
+        # Remove common stop words
+        stop_words = {
+            'a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when',
+            'at', 'from', 'by', 'for', 'with', 'about', 'against', 'between',
+            'into', 'through', 'during', 'before', 'after', 'above', 'below',
+            'to', 'of', 'in', 'on', 'is', 'are', 'was', 'were', 'be', 'been',
+            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
+            'doing', 'would', 'should', 'could', 'might', 'will', 'shall',
+            'can', 'may', 'must', 'ought'
+        }
+        # Extract words
+        words = re.findall(r'\b\w+\b', text.lower())
+        # Filter out stop words and short words
+        keywords = {word for word in words if word not in stop_words and len(word) > 2}
+        return keywords
+    def analyze_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Analyze a question to understand what it's asking.
+        Args:
+            question: The question to analyze
+            task_id: The task ID (optional)
+        Returns:
+            Dictionary containing analysis results
+        """
+        result = {
+            'question': question,
+            'task_id': task_id,
+            'file_path': None,
+            'keywords': list(self._extract_keywords(question)),
+            'expected_answer': None,
+        }
+        # Find relevant file
+        file_path = self.find_relevant_file(question, task_id)
+        if file_path:
+            result['file_path'] = file_path
+        # Get expected answer if available
+        if task_id and task_id in self.metadata:
+            result['expected_answer'] = self.metadata[task_id].get('Final answer')
+        return result

app.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import os
+import gradio as gr
+import requests
+import inspect
+import pandas as pd
+import logging
+import sys
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger('app')
+# Add the current directory to sys.path to import local modules
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+# Import the MultiModalAgent
+from agent import MultiModalAgent
+# (Keep Constants as is)
+# --- Constants ---
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# --- Agent Definition ---
+# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
+class BasicAgent:
+    def __init__(self):
+        print("BasicAgent initialized.")
+    def __call__(self, question: str) -> str:
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = "This is a default answer."
+        print(f"Agent returning fixed answer: {fixed_answer}")
+        return fixed_answer
+def run_and_submit_all( profile: gr.OAuthProfile | None):
+    """
+    Fetches all questions, runs the BasicAgent on them, submits all answers,
+    and displays the results.
+    """
+    # --- Determine HF Space Runtime URL and Repo URL ---
+    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
+    if profile:
+        username= f"{profile.username}"
+        print(f"User logged in: {username}")
+    else:
+        print("User not logged in.")
+        return "Please Login to Hugging Face with the button.", None
+    api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
+    # 1. Instantiate Agent ( modify this part to create your agent)
+    try:
+        logger.info("Creating MultiModalAgent instance...")
+        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
+        agent = MultiModalAgent(resource_dir=resource_dir)
+        logger.info("MultiModalAgent initialized successfully")
+    except Exception as e:
+        logger.error(f"Error instantiating agent: {e}", exc_info=True)
+        return f"Error initializing agent: {e}", None
+    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(agent_code)
+    # 2. Fetch Questions
+    print(f"Fetching questions from: {questions_url}")
+    try:
+        response = requests.get(questions_url, timeout=15)
+        response.raise_for_status()
+        questions_data = response.json()
+        if not questions_data:
+             print("Fetched questions list is empty.")
+             return "Fetched questions list is empty or invalid format.", None
+        print(f"Fetched {len(questions_data)} questions.")
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching questions: {e}")
+        return f"Error fetching questions: {e}", None
+    except requests.exceptions.JSONDecodeError as e:
+         print(f"Error decoding JSON response from questions endpoint: {e}")
+         print(f"Response text: {response.text[:500]}")
+         return f"Error decoding server response for questions: {e}", None
+    except Exception as e:
+        print(f"An unexpected error occurred fetching questions: {e}")
+        return f"An unexpected error occurred fetching questions: {e}", None
+    # 3. Run your Agent
+    results_log = []
+    answers_payload = []
+    print(f"Running agent on {len(questions_data)} questions...")
+    for item in questions_data:
+        task_id = item.get("task_id")
+        question_text = item.get("question")
+        if not task_id or question_text is None:
+            print(f"Skipping item with missing task_id or question: {item}")
+            continue
+        try:
+            submitted_answer = agent(question_text)
+            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+        except Exception as e:
+             print(f"Error running agent on task {task_id}: {e}")
+             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+    if not answers_payload:
+        print("Agent did not produce any answers to submit.")
+        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+    # 4. Prepare Submission
+    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+    print(status_update)
+    # 5. Submit
+    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+    try:
+        response = requests.post(submit_url, json=submission_data, timeout=60)
+        response.raise_for_status()
+        result_data = response.json()
+        final_status = (
+            f"Submission Successful!\n"
+            f"User: {result_data.get('username')}\n"
+            f"Overall Score: {result_data.get('score', 'N/A')}% "
+            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+            f"Message: {result_data.get('message', 'No message received.')}"
+        )
+        print("Submission successful.")
+        results_df = pd.DataFrame(results_log)
+        return final_status, results_df
+    except requests.exceptions.HTTPError as e:
+        error_detail = f"Server responded with status {e.response.status_code}."
+        try:
+            error_json = e.response.json()
+            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+        except requests.exceptions.JSONDecodeError:
+            error_detail += f" Response: {e.response.text[:500]}"
+        status_message = f"Submission Failed: {error_detail}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.Timeout:
+        status_message = "Submission Failed: The request timed out."
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.RequestException as e:
+        status_message = f"Submission Failed: Network error - {e}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except Exception as e:
+        status_message = f"An unexpected error occurred during submission: {e}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+# --- Build Gradio Interface using Blocks ---
+with gr.Blocks() as demo:
+    gr.Markdown("# Basic Agent Evaluation Runner")
+    gr.Markdown(
+        """
+        **Instructions:**
+        1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
+        2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+        3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
+        ---
+        **Disclaimers:**
+        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
+        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
+        """
+    )
+    gr.LoginButton()
+    run_button = gr.Button("Run Evaluation & Submit All Answers")
+    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    # Removed max_rows=10 from DataFrame constructor
+    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+    run_button.click(
+        fn=run_and_submit_all,
+        outputs=[status_output, results_table]
+    )
+if __name__ == "__main__":
+    print("\n" + "-"*30 + " App Starting " + "-"*30)
+    # Check for SPACE_HOST and SPACE_ID at startup for information
+    space_host_startup = os.getenv("SPACE_HOST")
+    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
+    if space_host_startup:
+        print(f"✅ SPACE_HOST found: {space_host_startup}")
+        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
+    else:
+        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
+    if space_id_startup: # Print repo URLs if SPACE_ID is found
+        print(f"✅ SPACE_ID found: {space_id_startup}")
+        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
+    else:
+        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
+    print("-"*(60 + len(" App Starting ")) + "\n")
+    print("Launching Gradio Interface for Basic Agent Evaluation...")
+    demo.launch(debug=True, share=False)

index.html DELETED Viewed

@@ -1,19 +0,0 @@
-<!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
-</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+pandas>=1.3.0
+gradio>=3.0.0
+requests>=2.25.0
+openpyxl>=3.0.9
+PyPDF2>=2.0.0
+python-docx>=0.8.11
+python-pptx>=0.6.19
+Pillow>=8.0.0
+jsonschema>=4.0.0
+zipfile36>=0.1.3
+scikit-learn>=1.0.0
+nltk>=3.6.0
+python-dotenv>=0.19.0
+pytest>=6.0.0
+PyYAML>=6.0
+biopython>=1.79

style.css DELETED Viewed

@@ -1,28 +0,0 @@
-body {
-	padding: 2rem;
-	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-h1 {
-	font-size: 16px;
-	margin-top: 0;
-}
-p {
-	color: rgb(107, 114, 128);
-	font-size: 15px;
-	margin-bottom: 10px;
-	margin-top: 5px;
-}
-.card {
-	max-width: 620px;
-	margin: 0 auto;
-	padding: 16px;
-	border: 1px solid lightgray;
-	border-radius: 16px;
-}
-.card p:last-child {
-	margin-bottom: 0;
-}

test_agent.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""
+Test the MultiModalAgent.
+"""
+import os
+import sys
+import logging
+import json
+# Add the current directory to sys.path to import local modules
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+# Import the MultiModalAgent
+from agent import MultiModalAgent
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger('test_agent')
+def main():
+    """Test the MultiModalAgent with some sample questions."""
+    # Initialize the agent
+    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
+    agent = MultiModalAgent(resource_dir=resource_dir)
+    # Load test questions from metadata.jsonl
+    metadata_path = os.path.join(resource_dir, 'metadata.jsonl')
+    test_questions = []
+    with open(metadata_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            entry = json.loads(line.strip())
+            if 'Question' in entry and 'file_name' in entry and entry['file_name']:
+                test_questions.append({
+                    'task_id': entry.get('task_id'),
+                    'question': entry['Question'],
+                    'file_name': entry['file_name'],
+                    'expected_answer': entry.get('Final answer')
+                })
+                if len(test_questions) >= 5:  # Limit to 5 questions
+                    break
+    # If no questions with files were found, use some generic questions
+    if not test_questions:
+        test_questions = [
+            {
+                'question': "What's the oldest Blu-Ray in the inventory spreadsheet?",
+                'file_name': None,
+                'expected_answer': None
+            },
+            {
+                'question': "How many files are in the resource directory?",
+                'file_name': None,
+                'expected_answer': None
+            }
+        ]
+    # Test the agent with each question
+    for i, q in enumerate(test_questions):
+        question = q['question']
+        logger.info(f"Testing question {i+1}: {question}")
+        answer = agent(question)
+        logger.info(f"Answer: {answer}")
+        if q['expected_answer']:
+            logger.info(f"Expected answer: {q['expected_answer']}")
+            if answer.strip() == q['expected_answer'].strip():
+                logger.info("Correct answer!")
+            else:
+                logger.warning("Incorrect answer.")
+        logger.info("-" * 80)
+if __name__ == "__main__":
+    main()