Arbnor Tefiki committed on
Commit
5d9aa5e
·
1 Parent(s): c594a60

Test the agent in HF

Browse files
.gitignore ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ env/
26
+ build/
27
+ develop-eggs/
28
+ dist/
29
+ downloads/
30
+ eggs/
31
+ .eggs/
32
+ lib/
33
+ lib64/
34
+ parts/
35
+ sdist/
36
+ var/
37
+ *.egg-info/
38
+ .installed.cfg
39
+ *.egg
40
+
41
+ # Virtual Environment
42
+ venv/
43
+ ENV/
44
+ env/
45
+
46
+ # Jupyter Notebook
47
+ .ipynb_checkpoints
48
+
49
+ # VS Code
50
+ .vscode/
51
+ *.code-workspace
52
+
53
+ # PyCharm
54
+ .idea/
55
+
56
+ # Logs
57
+ logs/
58
+ *.log
59
+
60
+ # Local configuration
61
+ .env
62
+
63
+ # Cache
64
+ .cache/
65
+ .pytest_cache/
66
+
67
+ # Mac OS
68
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies.
# --no-install-recommends keeps the image small by skipping optional packages;
# the apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libffi-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Unbuffered output so log lines appear immediately in the container logs
ENV PYTHONUNBUFFERED=1

# Command to run when the container starts
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,55 @@
1
- ---
2
- title: Ai Agents Final
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: static
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multi-Modal AI Agent for Hugging Face Agent Course Unit 4
2
+
3
+ This project implements a multi-modal AI agent that can process various file types and answer questions about their content for the Hugging Face Agent Course Unit 4 final assessment.
4
+
5
+ ## Features
6
+
7
+ - Processes and answers questions about different file types:
8
+ - Excel/CSV files (.xlsx, .csv)
9
+ - Text files (.txt)
10
+ - PDFs (.pdf)
11
+ - Images (.jpg, .png)
12
+ - Python files (.py)
13
+ - Microsoft Office files (.docx, .pptx)
14
+ - JSON files (.json, .jsonld, .jsonl)
15
+ - Archive files (.zip)
16
+ - Other specialized formats (.pdb)
17
+
18
+ - Analyzes questions to understand what's being asked
19
+ - Identifies and loads relevant resource files
20
+ - Applies appropriate processing techniques based on file type
21
+ - Formulates accurate answers based on file content
22
+ - Includes error handling and logging
23
+
24
+ ## Project Structure
25
+
26
+ - `app.py`: Main application file with Gradio interface
27
+ - `agent/`: Package containing agent components
28
+ - `agent.py`: Multi-modal agent implementation
29
+ - `tools/`: File handlers and other tools
30
+ - `utils/`: Utility functions for question analysis and data processing
31
+
32
+ ## Installation
33
+
34
+ 1. Clone the repository
35
+ 2. Install dependencies:
36
+ ```
37
+ pip install -r requirements.txt
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ Run the application:
43
+ ```
44
+ python app.py
45
+ ```
46
+
47
+ ## Dependencies
48
+
49
+ - pandas: For data processing
50
+ - gradio: For the user interface
51
+ - PyPDF2: For PDF processing
52
+ - python-docx: For Word document processing
53
+ - python-pptx: For PowerPoint presentations
54
+ - Pillow: For image processing
55
+ - And more (see requirements.txt)
agent/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Agent package initialization file.
3
+ """
4
+ from agent.agent import MultiModalAgent
5
+
6
+ __all__ = ["MultiModalAgent"]
agent/agent.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-modal agent for processing different file types and answering questions.
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ from typing import Dict, Any, List, Optional, Tuple
8
+
9
+ from agent.tools.file_handlers import extract_file_content
10
+ from agent.utils.question_analyzer import QuestionAnalyzer
11
+ from agent.utils.data_processor import DataProcessor
12
+
13
+ # Configure logging
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
17
+ )
18
+ logger = logging.getLogger('MultiModalAgent')
19
+
20
class MultiModalAgent:
    """
    Agent for processing different file types and answering questions.

    A question is first analysed for an explicit file reference. Without one,
    the agent tries to answer from directory metadata and then falls back to
    picking the best-matching file in the resource directory. Extracted file
    content and final answers are cached on the instance.
    """

    # Ordered (keywords, extensions) hints. The first row whose keyword occurs
    # in the question as a whole word decides which extensions get a bonus.
    _FILE_TYPE_HINTS = (
        (('excel', 'spreadsheet', 'xlsx'), ('.xlsx', '.xls')),
        (('csv',), ('.csv',)),
        (('text', 'txt'), ('.txt',)),
        (('pdf',), ('.pdf',)),
        (('image', 'picture', 'photo', 'photograph'),
         ('.jpg', '.jpeg', '.png', '.gif', '.bmp')),
        (('word', 'document', 'docx'), ('.docx',)),
        (('powerpoint', 'presentation', 'slides', 'pptx'), ('.pptx',)),
        (('json', 'jsonld', 'jsonl'), ('.json', '.jsonld', '.jsonl')),
        (('zip', 'archive'), ('.zip',)),
        (('python', 'py', 'code', 'script'), ('.py',)),
        (('pdb', 'protein'), ('.pdb',)),
    )

    # Handler class name -> name of the DataProcessor method that consumes its output.
    _PROCESSOR_METHODS = {
        'ExcelHandler': 'process_excel_data',
        'CSVHandler': 'process_csv_data',
        'TextHandler': 'process_text_data',
        'PDFHandler': 'process_pdf_data',
        'ImageHandler': 'process_image_metadata',
        'DocxHandler': 'process_docx_data',
        'PptxHandler': 'process_pptx_data',
        'JsonHandler': 'process_json_data',
        'ZipHandler': 'process_zip_data',
        'PdbHandler': 'process_pdb_data',
        'PythonHandler': 'process_python_data',
        'JsonlHandler': 'process_jsonl_data',
    }

    def __init__(self, resource_dir: str = 'resource'):
        """
        Initialize the agent.

        Args:
            resource_dir: Directory containing resource files
        """
        logger.info("Initializing MultiModalAgent")
        self.resource_dir = resource_dir
        self.question_analyzer = QuestionAnalyzer(resource_dir)
        self.data_processor = DataProcessor()

        # Cache for file content to avoid re-processing: path -> (content, handler)
        self.file_content_cache = {}

        # Cache for answers: question text -> answer text
        self.answer_cache = {}

    def __call__(self, question: str) -> str:
        """
        Process a question and return an answer.

        Args:
            question: The question to answer

        Returns:
            Answer to the question. Failures are reported as a human-readable
            message rather than raised.
        """
        logger.info("Processing question: %s...", question[:100])

        # Check answer cache
        if question in self.answer_cache:
            logger.info("Answer found in cache")
            return self.answer_cache[question]

        try:
            # Analyze the question
            analysis = self.question_analyzer.analyze_question(question)
            logger.info("Question analysis: %s", analysis)

            # Handle general questions that don't require file processing
            if not analysis.get('file_path'):
                logger.info("No file reference found in question, trying to answer directly")
                direct_answer = self._answer_without_file(question)
                if direct_answer:
                    self.answer_cache[question] = direct_answer
                    return direct_answer

                # If direct answering failed, try to find a file in the resource directory
                logger.info("Direct answering failed, looking for relevant files")
                analysis['file_path'] = self._find_most_relevant_file(question)
                if not analysis['file_path']:
                    logger.warning("No relevant file found for the question")
                    return "I couldn't find a relevant file to answer this question."

            # Extract content from the file, reusing a previous extraction if present
            file_path = analysis['file_path']

            if file_path in self.file_content_cache:
                content, handler = self.file_content_cache[file_path]
            else:
                content, handler = extract_file_content(file_path, self.resource_dir)
                if content is not None:
                    self.file_content_cache[file_path] = (content, handler)

            if content is None:
                logger.error("Failed to extract content from file: %s", file_path)
                return "I couldn't extract content from the specified file."

            # Process the content based on file type
            answer = self._process_content(content, handler, question)

            # Cache the answer
            self.answer_cache[question] = answer

            return answer
        except Exception as e:
            logger.exception("Error processing question: %s", e)
            return f"An error occurred while processing your question: {e}"

    def _list_resource_files(self) -> List[str]:
        """Return the sorted names of the regular files directly inside the resource directory."""
        return [
            name
            for name in sorted(os.listdir(self.resource_dir))
            if os.path.isfile(os.path.join(self.resource_dir, name))
        ]

    def _hinted_extensions(self, question: str) -> Tuple[str, ...]:
        """Return the extensions suggested by the first file-type keyword found in the question."""
        # Local import: this module does not import ``re`` at file level.
        import re

        lowered = question.lower()
        for keywords, extensions in self._FILE_TYPE_HINTS:
            alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
            # Letter boundaries stop 'py' matching "copy" or 'text' matching
            # "context", while still accepting "report.py", "python3" and plurals.
            if re.search(rf'(?<![a-z])(?:{alternatives})s?(?![a-z])', lowered):
                return extensions
        return ()

    def _answer_without_file(self, question: str) -> Optional[str]:
        """
        Try to answer the question without using a file.

        Only metadata questions about the resource directory are handled:
        how many files it holds and which file extensions it contains.

        Args:
            question: The question to answer

        Returns:
            Answer to the question, or None if the question can't be answered directly
        """
        lowered = question.lower()

        # Check if the question is asking for metadata about the resource directory
        if 'how many files' in lowered or 'number of files' in lowered:
            try:
                # Count regular files only; sub-directories are not files
                file_count = len(self._list_resource_files())
                return f"There are {file_count} files in the resource directory."
            except Exception as e:
                logger.error("Error counting files: %s", e)
                return None

        # Check if the question is asking about file types
        file_types_patterns = [
            'what file types', 'which file types', 'what kinds of files',
            'which kinds of files', 'what formats', 'which formats'
        ]
        if any(pattern in lowered for pattern in file_types_patterns):
            try:
                extensions = {
                    os.path.splitext(name)[1]
                    for name in self._list_resource_files()
                }
                extensions.discard('')  # Skip files without extension

                if extensions:
                    return (
                        "The resource directory contains files with the following "
                        f"extensions: {', '.join(sorted(extensions))}"
                    )
                return "The resource directory doesn't contain any files with extensions."
            except Exception as e:
                logger.error("Error analyzing file types: %s", e)
                return None

        return None

    def _find_most_relevant_file(self, question: str) -> Optional[str]:
        """
        Find the most relevant file for a question.

        Each file scores 2 points per question keyword found in its name and
        3 points when its extension matches the file type the question mentions.
        Ties go to the alphabetically first file name.

        Args:
            question: The question to answer

        Returns:
            Path to the most relevant file, or None if no file scores above zero
        """
        try:
            # Get all files in the resource directory
            files = [
                os.path.join(self.resource_dir, name)
                for name in self._list_resource_files()
            ]

            if not files:
                logger.warning("No files found in the resource directory")
                return None

            # Extract keywords from the question
            keywords = set(self.question_analyzer._extract_keywords(question))

            # The file-type hint depends only on the question, so compute it once
            hinted = self._hinted_extensions(question)

            best_path, best_score = None, 0
            for file_path in files:
                file_name = os.path.basename(file_path).lower()

                # Higher weight for filename matches
                score = 2 * sum(1 for keyword in keywords if keyword.lower() in file_name)

                # Bonus when the question mentions this file's type
                if os.path.splitext(file_path)[1].lower() in hinted:
                    score += 3

                # Strict comparison keeps the first best file and ignores zero scores
                if score > best_score:
                    best_path, best_score = file_path, score

            return best_path
        except Exception as e:
            logger.error("Error finding relevant file: %s", e)
            return None

    def _process_content(self, content: Any, handler: Any, question: str) -> str:
        """
        Process the content based on file type.

        Args:
            content: Extracted content from the file
            handler: File handler used to extract the content
            question: The question to answer

        Returns:
            Answer to the question, or an explanatory message when the handler
            type is unknown or processing fails
        """
        try:
            handler_type = type(handler).__name__
            method_name = self._PROCESSOR_METHODS.get(handler_type)

            if method_name is None:
                logger.warning("Unknown handler type: %s", handler_type)
                return f"I don't know how to process content from a {handler_type}."

            return getattr(self.data_processor, method_name)(content, question)
        except Exception as e:
            logger.exception("Error processing content: %s", e)
            return f"An error occurred while processing the file content: {e}"
agent/tools/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Tools initialization file.
3
+ """
agent/tools/file_handlers.py ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File handlers for processing different file types.
3
+ """
4
+ import os
5
+ import json
6
+ import csv
7
+ import zipfile
8
+ import io
9
+ import re
10
+ from typing import Dict, Any, List, Optional, Tuple
11
+
12
+ import pandas as pd
13
+ from PIL import Image
14
+ import PyPDF2
15
+ import docx
16
+ from pptx import Presentation
17
+
18
class FileHandler:
    """Common interface implemented by every file-type handler."""

    def __init__(self, resource_dir: str):
        """
        Store the directory that resource files are resolved against.

        Args:
            resource_dir: Directory containing resource files
        """
        self.resource_dir = resource_dir

    def get_file_path(self, file_name: str) -> str:
        """
        Build the path of a file inside the resource directory.

        Args:
            file_name: Name of the file

        Returns:
            ``resource_dir`` joined with ``file_name``
        """
        base_dir = self.resource_dir
        return os.path.join(base_dir, file_name)

    def can_handle(self, file_path: str) -> bool:
        """
        Tell whether this handler is able to process the given file.

        Args:
            file_path: Path to the file

        Returns:
            True if the handler can process the file, False otherwise

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def extract_content(self, file_path: str) -> Any:
        """
        Read the file and return its content in a handler-specific shape.

        Args:
            file_path: Path to the file

        Returns:
            Extracted content

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_file_type(self, file_path: str) -> str:
        """
        Return the lower-cased extension of a path, including the leading dot.

        Args:
            file_path: Path to the file

        Returns:
            File type (extension); an empty string when the path has none
        """
        extension = os.path.splitext(file_path)[1]
        return extension.lower()
78
+
79
+
80
class ExcelHandler(FileHandler):
    """Handler for Excel files."""

    def can_handle(self, file_path: str) -> bool:
        """Check if the file is an Excel file."""
        return self.get_file_type(file_path) in ['.xlsx', '.xls']

    def extract_content(self, file_path: str) -> Dict[str, pd.DataFrame]:
        """
        Extract content from an Excel file.

        Args:
            file_path: Path to the workbook

        Returns:
            Dictionary mapping sheet names to DataFrames; an empty dictionary
            if the workbook cannot be read
        """
        try:
            # The context manager closes the workbook handle even when a sheet
            # fails to parse, so the file is never left open.
            with pd.ExcelFile(file_path) as excel_file:
                return {
                    sheet_name: pd.read_excel(excel_file, sheet_name=sheet_name)
                    for sheet_name in excel_file.sheet_names
                }
        except Exception as e:
            print(f"Error extracting content from Excel file {file_path}: {e}")
            return {}
106
+
107
+
108
class CSVHandler(FileHandler):
    """Handler for CSV files."""

    def can_handle(self, file_path: str) -> bool:
        """Check if the file is a CSV file."""
        return self.get_file_type(file_path) == '.csv'

    def extract_content(self, file_path: str) -> pd.DataFrame:
        """
        Extract content from a CSV file.

        The file is first parsed with pandas' defaults. If that fails, the
        delimiter is sniffed from the first 1024 characters and the parse is
        retried with it.

        Args:
            file_path: Path to the CSV file

        Returns:
            DataFrame containing the CSV data; an empty DataFrame if both
            attempts fail
        """
        try:
            try:
                return pd.read_csv(file_path)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                with open(file_path, 'r', newline='') as csvfile:
                    dialect = csv.Sniffer().sniff(csvfile.read(1024))
                # Re-read only after the sniffing handle has been closed
                return pd.read_csv(file_path, delimiter=dialect.delimiter)
        except Exception as e:
            print(f"Error extracting content from CSV file {file_path}: {e}")
            return pd.DataFrame()
135
+
136
+
137
class TextHandler(FileHandler):
    """Handler for plain-text (.txt) files."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.txt`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.txt'

    @staticmethod
    def _read_as(file_path: str, encoding: str) -> str:
        """Read the whole file as text using the given encoding."""
        with open(file_path, 'r', encoding=encoding) as stream:
            return stream.read()

    def extract_content(self, file_path: str) -> str:
        """
        Read a text file, decoding as UTF-8 first and Latin-1 second.

        Returns:
            Text content of the file, or an empty string if it cannot be read
        """
        # First attempt: UTF-8. Only a decoding failure triggers the fallback.
        try:
            return self._read_as(file_path, 'utf-8')
        except UnicodeDecodeError:
            pass
        except Exception as e:
            print(f"Error extracting content from text file {file_path}: {e}")
            return ""

        # Second attempt: Latin-1
        try:
            return self._read_as(file_path, 'latin-1')
        except Exception as e:
            print(f"Error extracting content from text file {file_path}: {e}")
            return ""
165
+
166
+
167
class PDFHandler(FileHandler):
    """Handler for PDF documents."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.pdf`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.pdf'

    def extract_content(self, file_path: str) -> Dict[int, str]:
        """
        Pull the text of every page out of a PDF file.

        Returns:
            Dictionary mapping 1-based page numbers to the text extracted from
            that page; an empty dictionary if the file cannot be read
        """
        try:
            with open(file_path, 'rb') as stream:
                reader = PyPDF2.PdfReader(stream)
                # Pages are numbered from 1 in the result
                return {
                    page_number: page.extract_text()
                    for page_number, page in enumerate(reader.pages, start=1)
                }
        except Exception as e:
            print(f"Error extracting content from PDF file {file_path}: {e}")
            return {}
192
+
193
+
194
class ImageHandler(FileHandler):
    """Handler for image files."""

    def can_handle(self, file_path: str) -> bool:
        """Check if the file is an image file."""
        return self.get_file_type(file_path) in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']

    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Extract metadata from an image file.

        Args:
            file_path: Path to the image

        Returns:
            Dictionary with the keys ``format``, ``mode``, ``size``, ``width``
            and ``height``, plus ``exif`` (tag name -> value) when the image
            carries EXIF data; an empty dictionary if the image cannot be read
        """
        try:
            # EXIF tag names live in Pillow, not PyPDF2. Imported locally
            # because the module only imports ``PIL.Image`` at file level.
            from PIL import ExifTags

            with Image.open(file_path) as img:
                metadata = {
                    'format': img.format,
                    'mode': img.mode,
                    'size': img.size,
                    'width': img.width,
                    'height': img.height,
                }

                # Extract EXIF data if available; not every image type
                # exposes ``_getexif``, so look it up defensively and call once.
                read_exif = getattr(img, '_getexif', None)
                raw_exif = read_exif() if read_exif else None
                if raw_exif:
                    metadata['exif'] = {
                        ExifTags.TAGS[tag_id]: value
                        for tag_id, value in raw_exif.items()
                        if tag_id in ExifTags.TAGS
                    }

                return metadata
        except Exception as e:
            print(f"Error extracting content from image file {file_path}: {e}")
            return {}
231
+
232
+
233
class DocxHandler(FileHandler):
    """Handler for Word (.docx) documents."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.docx`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.docx'

    def extract_content(self, file_path: str) -> str:
        """
        Collect the text of a Word document.

        Paragraph text comes first, followed by the text of every table cell,
        one entry per line.

        Returns:
            Text content of the document, or an empty string on failure
        """
        try:
            document = docx.Document(file_path)

            # Body paragraphs, in document order
            paragraph_lines = [paragraph.text for paragraph in document.paragraphs]

            # Table cells, walked table by table and row by row
            cell_lines = [
                cell.text
                for table in document.tables
                for row in table.rows
                for cell in row.cells
            ]

            return '\n'.join(paragraph_lines + cell_lines)
        except Exception as e:
            print(f"Error extracting content from Word document {file_path}: {e}")
            return ""
265
+
266
+
267
class PptxHandler(FileHandler):
    """Handler for PowerPoint (.pptx) presentations."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.pptx`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.pptx'

    def extract_content(self, file_path: str) -> Dict[int, str]:
        """
        Collect the text of every slide in a presentation.

        Returns:
            Dictionary mapping 1-based slide numbers to the newline-joined text
            of the shapes on that slide; an empty dictionary on failure
        """
        try:
            presentation = Presentation(file_path)
            # Only shapes exposing a ``text`` attribute contribute to a slide
            return {
                slide_number: '\n'.join(
                    shape.text for shape in slide.shapes if hasattr(shape, "text")
                )
                for slide_number, slide in enumerate(presentation.slides, start=1)
            }
        except Exception as e:
            print(f"Error extracting content from PowerPoint presentation {file_path}: {e}")
            return {}
298
+
299
+
300
class JsonHandler(FileHandler):
    """Handler for JSON and JSON-LD files."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path ends in ``.json`` or ``.jsonld``."""
        extension = self.get_file_type(file_path)
        return extension in ('.json', '.jsonld')

    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Parse a UTF-8 encoded JSON file.

        Returns:
            Parsed JSON content, or an empty dictionary if reading or parsing fails
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as stream:
                parsed = json.load(stream)
        except Exception as e:
            print(f"Error extracting content from JSON file {file_path}: {e}")
            return {}
        return parsed
320
+
321
+
322
class ZipHandler(FileHandler):
    """Handler for ZIP archives."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.zip`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.zip'

    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Describe the members of a ZIP archive.

        Returns:
            Dictionary whose ``files`` entry lists every member (filename, size,
            compressed size, timestamp). Each ``.txt`` member additionally gets
            an entry keyed by its filename holding its decoded text. An empty
            dictionary is returned if the archive cannot be read.
        """
        try:
            listing = []
            text_members = {}

            with zipfile.ZipFile(file_path, 'r') as archive:
                for member in archive.infolist():
                    listing.append({
                        'filename': member.filename,
                        'size': member.file_size,
                        'compressed_size': member.compress_size,
                        'date_time': member.date_time,
                    })

                    # Text members are small enough to inline; undecodable
                    # bytes are dropped rather than failing the whole archive
                    if member.filename.endswith('.txt'):
                        with archive.open(member.filename) as stream:
                            text_members[member.filename] = stream.read().decode(
                                'utf-8', errors='ignore'
                            )

            # 'files' goes first, then the text members in archive order
            result = {'files': listing}
            result.update(text_members)
            return result
        except Exception as e:
            print(f"Error extracting content from ZIP archive {file_path}: {e}")
            return {}
360
+
361
+
362
class PdbHandler(FileHandler):
    """Handler for PDB (Protein Data Bank) files."""

    def can_handle(self, file_path: str) -> bool:
        """Check if the file is a PDB file."""
        return self.get_file_type(file_path) == '.pdb'

    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Extract basic information from a PDB file.

        Args:
            file_path: Path to the PDB file

        Returns:
            Dictionary with the keys ``header``, ``title``, ``compounds``,
            ``authors`` and ``atoms_count`` (number of ATOM and HETATM
            records); an empty dictionary if the file cannot be read
        """
        try:
            header = ''
            title_parts = []
            compounds = []
            authors = []
            atoms_count = 0

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    # Record payload starts after the first 10 characters
                    payload = line[10:].strip()
                    if line.startswith('HEADER'):
                        header = payload
                    elif line.startswith('TITLE'):
                        title_parts.append(payload)
                    elif line.startswith('COMPND'):
                        compounds.append(payload)
                    elif line.startswith('AUTHOR'):
                        authors.append(payload)
                    elif line.startswith(('ATOM', 'HETATM')):
                        atoms_count += 1

            return {
                'header': header,
                # A title may span several TITLE records; join them with a
                # space so words at the line breaks do not run together.
                'title': ' '.join(part for part in title_parts if part),
                'compounds': compounds,
                'authors': authors,
                'atoms_count': atoms_count,
            }
        except Exception as e:
            print(f"Error extracting content from PDB file {file_path}: {e}")
            return {}
402
+
403
+
404
class PythonHandler(FileHandler):
    """Handler for Python files."""

    def can_handle(self, file_path: str) -> bool:
        """Check if the file is a Python file."""
        return self.get_file_type(file_path) == '.py'

    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Extract content and structure from a Python file.

        The structure is found with regular expressions, not a real parser, so
        definitions appearing inside strings or comments may also be reported.

        Args:
            file_path: Path to the Python source file

        Returns:
            Dictionary with ``content`` (the source text), ``classes``
            (``name``/``parent``), ``functions`` (``name``/``params``) and
            ``imports`` (``from``/``import``/``as``); an empty dictionary if the
            file cannot be read
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            result = {
                'content': content,
                'classes': [],
                'functions': [],
                'imports': [],
            }

            # Extract class definitions; \b stops matches inside words such as "subclass"
            class_pattern = r'\bclass\s+(\w+)\s*(?:\(([^)]*)\))?\s*:'
            for match in re.finditer(class_pattern, content):
                result['classes'].append({
                    'name': match.group(1),
                    'parent': match.group(2) if match.group(2) else None,
                })

            # Extract function definitions. The optional "-> annotation" part is
            # required to recognise functions that declare a return type.
            func_pattern = r'\bdef\s+(\w+)\s*\(([^)]*)\)\s*(?:->\s*[^:\n]+)?:'
            for match in re.finditer(func_pattern, content):
                result['functions'].append({
                    'name': match.group(1),
                    'params': match.group(2).strip(),
                })

            # Extract imports, one statement per line
            import_pattern = r'(?:from\s+(\w+(?:\.\w+)*)\s+)?import\s+(.+?)(?:\s+as\s+(\w+))?$'
            for line in content.split('\n'):
                line = line.strip()
                if line.startswith(('import ', 'from ')):
                    match = re.match(import_pattern, line)
                    if match:
                        result['imports'].append({
                            'from': match.group(1),
                            'import': match.group(2),
                            'as': match.group(3),
                        })

            return result
        except Exception as e:
            print(f"Error extracting content from Python file {file_path}: {e}")
            return {}
469
+
470
+
471
class JsonlHandler(FileHandler):
    """Handler for JSON Lines (.jsonl) files."""

    def can_handle(self, file_path: str) -> bool:
        """Return True when the path has a ``.jsonl`` extension."""
        extension = self.get_file_type(file_path)
        return extension == '.jsonl'

    def extract_content(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Parse a UTF-8 JSON Lines file, one JSON value per non-blank line.

        Returns:
            List of parsed JSON objects; an empty list if reading fails or any
            line is not valid JSON
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as stream:
                stripped_lines = (raw_line.strip() for raw_line in stream)
                # Blank lines carry no record and are skipped
                return [json.loads(entry) for entry in stripped_lines if entry]
        except Exception as e:
            print(f"Error extracting content from JSONL file {file_path}: {e}")
            return []
496
+
497
+
498
def get_all_handlers(resource_dir: str) -> List[FileHandler]:
    """
    Instantiate one handler of every supported kind.

    Args:
        resource_dir: Directory containing resource files

    Returns:
        List of file handlers, in the order they are tried for a file
    """
    handler_classes = (
        ExcelHandler,
        CSVHandler,
        TextHandler,
        PDFHandler,
        ImageHandler,
        DocxHandler,
        PptxHandler,
        JsonHandler,
        ZipHandler,
        PdbHandler,
        PythonHandler,
        JsonlHandler,
    )
    return [handler_class(resource_dir) for handler_class in handler_classes]
522
+
523
+
524
def get_handler_for_file(file_path: str, resource_dir: str) -> Optional[FileHandler]:
    """
    Pick the first handler that accepts the given file.

    Args:
        file_path: Path to the file
        resource_dir: Directory containing resource files

    Returns:
        Appropriate file handler, or None if no handler can process the file
    """
    candidates = get_all_handlers(resource_dir)
    accepting = (candidate for candidate in candidates if candidate.can_handle(file_path))
    return next(accepting, None)
542
+
543
+
544
def extract_file_content(file_path: str, resource_dir: str) -> Tuple[Any, Optional[FileHandler]]:
    """
    Run the matching handler over a file and return what it extracted.

    Args:
        file_path: Path to the file
        resource_dir: Directory containing resource files

    Returns:
        Tuple of (extracted content, handler used); ``(None, None)`` when no
        handler accepts the file
    """
    handler = get_handler_for_file(file_path, resource_dir)

    # Guard clause: nothing knows how to read this file
    if not handler:
        return None, None

    return handler.extract_content(file_path), handler
agent/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Utils initialization file.
3
+ """
agent/utils/data_processor.py ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data processor for processing extracted data.
3
+ """
4
+ import re
5
+ import os
6
+ import json
7
+ from typing import Dict, Any, List, Optional, Tuple, Union
8
+ import pandas as pd
9
+
10
class DataProcessor:
    """
    Class for processing extracted data.

    Every ``process_*`` method takes content extracted from a file plus a
    natural-language question and returns a plain-text answer. Questions are
    interpreted with keyword and regular-expression heuristics only, so each
    method falls back to a short summary when nothing more specific matches.
    """

    def __init__(self):
        """Initialize the data processor."""
        pass

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_number(text: str) -> Any:
        """Return ``text`` as a float when it is a finite number, else unchanged."""
        try:
            number = float(text)
        except (TypeError, ValueError):
            return text
        # float() also accepts words such as "nan" and "inf"; keep those as text.
        if number != number or number in (float('inf'), float('-inf')):
            return text
        return number

    @staticmethod
    def _clean_term(text: str) -> str:
        """Strip surrounding whitespace and trailing question punctuation."""
        return text.strip().rstrip('?!').strip()

    @staticmethod
    def _first_mentioned(candidates: List[str], question: str) -> Optional[str]:
        """
        Return the first candidate that occurs in the question as a whole word.

        A simple plural ("prices", "losses") also counts. Whole-word matching
        prevents e.g. "age" from being found inside "average".
        """
        for word in candidates:
            if re.search(rf'\b{re.escape(word)}(?:e?s)?\b', question):
                return word
        return None

    @staticmethod
    def _match_key(mapping: Dict[Any, Any], key: str) -> Any:
        """
        Return the key of ``mapping`` equal to ``key``, ignoring case.

        Questions are lower-cased before parsing, so a key taken from a
        question has to be compared case-insensitively. Returns None when no
        key matches.
        """
        if key in mapping:
            return key
        lowered = key.lower()
        for candidate in mapping:
            if str(candidate).lower() == lowered:
                return candidate
        return None

    def _apply_conditions(self, df: pd.DataFrame, conditions: List[Dict[str, Any]]) -> pd.DataFrame:
        """
        Filter a DataFrame by the conditions produced by _extract_conditions.

        A condition that names an unknown column, uses an unknown operator, or
        cannot be evaluated against the column's values is skipped, so the
        filter is best-effort.
        """
        filtered_df = df

        for condition in conditions:
            col = condition.get('column')
            value = condition.get('value')
            operator = condition.get('operator', '=')

            if not col or value is None:
                continue

            best_col = self._find_best_matching_column(df, col)
            if best_col is None:
                continue

            try:
                series = filtered_df[best_col]
                if operator == '=':
                    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
                    if is_number and pd.api.types.is_numeric_dtype(series):
                        # Compare numerically: "30.0" would never equal "30" as text.
                        mask = series == value
                    else:
                        mask = series.astype(str).str.lower() == str(value).lower()
                elif operator == '>':
                    mask = series > value
                elif operator == '<':
                    mask = series < value
                elif operator == '>=':
                    mask = series >= value
                elif operator == '<=':
                    mask = series <= value
                elif operator == 'contains':
                    mask = series.astype(str).str.lower().str.contains(str(value).lower(), regex=False)
                elif operator == 'between':
                    if not (isinstance(value, list) and len(value) == 2):
                        continue
                    mask = (series >= value[0]) & (series <= value[1])
                else:
                    continue
                filtered_df = filtered_df[mask]
            except (TypeError, ValueError):
                # e.g. ordering comparison between text values and a number
                continue

        return filtered_df

    def _reduce_column(self, data: Dict[str, pd.DataFrame], column_name: str,
                       reducer: Any, numeric_only: bool) -> Optional[str]:
        """
        Apply ``reducer`` to the first sheet column matching ``column_name``.

        Args:
            data: Dictionary mapping sheet names to DataFrames
            column_name: Column name (or fragment) to look for
            reducer: Callable taking a Series and returning a scalar
            numeric_only: Skip matching columns that are not numeric

        Returns:
            The reduced value as text, or None if no sheet produced a value
        """
        for df in data.values():
            if df.empty:
                continue

            best_col = self._find_best_matching_column(df, column_name)
            if best_col is None:
                continue
            if numeric_only and not pd.api.types.is_numeric_dtype(df[best_col]):
                continue

            try:
                return str(reducer(df[best_col]))
            except (TypeError, ValueError):
                continue

        return None

    # ------------------------------------------------------------------
    # Tabular data (Excel / CSV)
    # ------------------------------------------------------------------

    def process_excel_data(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """
        Process data extracted from an Excel file.

        Args:
            data: Dictionary mapping sheet names to DataFrames
            question: The question to answer

        Returns:
            Answer to the question
        """
        # Convert question to lowercase for easier matching
        question_lower = question.lower()

        # Dispatch on the first keyword found; the order is significant.
        if 'oldest' in question_lower:
            return self._find_oldest_item(data, question_lower)
        elif 'count' in question_lower or 'how many' in question_lower:
            return self._count_items(data, question_lower)
        elif 'average' in question_lower or 'mean' in question_lower:
            return self._calculate_average(data, question_lower)
        elif 'total' in question_lower or 'sum' in question_lower:
            return self._calculate_total(data, question_lower)
        elif 'maximum' in question_lower or 'highest' in question_lower:
            return self._find_maximum(data, question_lower)
        elif 'minimum' in question_lower or 'lowest' in question_lower:
            return self._find_minimum(data, question_lower)
        else:
            # Try to extract specific information
            return self._extract_specific_info(data, question_lower)

    def _find_oldest_item(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """
        Find the oldest item in the data.

        For every sheet a year column and a title column are guessed from the
        column names; the title belonging to the smallest year across all
        sheets is returned. If the question names an item type (e.g. "dvd")
        and the sheet has a type-like column, rows are filtered by that type
        first.
        """
        year_columns = ['year', 'date', 'time', 'created', 'modified', 'release']
        title_columns = ['title', 'name', 'item', 'product', 'description']
        type_columns = ['type', 'category', 'format', 'medium', 'platform']
        item_types = [
            'movie', 'film', 'book', 'song', 'album', 'game', 'video game',
            'dvd', 'cd', 'blu-ray', 'blu ray', 'record', 'cassette', 'vhs'
        ]

        def find_column(df: pd.DataFrame, terms: List[str]) -> Any:
            # First column whose label contains one of the terms (labels may be non-strings).
            for col in df.columns:
                label = str(col).lower()
                if any(term in label for term in terms):
                    return col
            return None

        item_type = next((item for item in item_types if item in question), None)

        # Upper bound for the "these numbers look like years" heuristic.
        current_year = pd.Timestamp.now().year

        oldest_year = float('inf')
        oldest_item = None

        for df in data.values():
            # Skip empty sheets
            if df.empty:
                continue

            year_col = find_column(df, year_columns)
            if year_col is None:
                # If no obvious year column, look for a numeric column of plausible years
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        try:
                            if df[col].min() >= 1900 and df[col].max() <= current_year:
                                year_col = col
                                break
                        except (TypeError, ValueError):
                            continue
            if year_col is None:
                continue

            title_col = find_column(df, title_columns)
            if title_col is None and len(df.columns) > 1:
                # If no obvious title column, use the first non-year column
                title_col = next((col for col in df.columns if col != year_col), None)
            if title_col is None:
                continue

            # Filter by item type if the question named one and the sheet has a type column
            filtered_df = df
            if item_type:
                type_col = find_column(df, type_columns)
                if type_col is not None:
                    type_mask = df[type_col].astype(str).str.lower().str.contains(item_type, regex=False)
                    filtered_df = df[type_mask]

            if filtered_df.empty:
                continue

            try:
                # Coerce so that years stored as text still compare numerically;
                # resetting the index makes idxmin() a row position.
                years = pd.to_numeric(filtered_df[year_col], errors='coerce').reset_index(drop=True)
                if years.isna().all():
                    continue
                position = int(years.idxmin())
                min_year = float(years.iloc[position])
                if min_year < oldest_year:
                    oldest_year = min_year
                    oldest_item = filtered_df[title_col].iloc[position]
            except (TypeError, ValueError):
                continue

        if oldest_item is not None:
            return str(oldest_item)
        return "Could not determine the oldest item from the data."

    def _count_items(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """Count rows matching the question's conditions, summed over all sheets."""
        conditions = self._extract_conditions(question)

        total_count = 0
        for df in data.values():
            # Skip empty sheets
            if df.empty:
                continue
            total_count += len(self._apply_conditions(df, conditions))

        return str(total_count)

    def _calculate_average(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """Calculate the average of the numeric column named in the question."""
        column_name = self._extract_column_name(question)
        if not column_name:
            return "Could not determine which column to calculate the average for."

        result = self._reduce_column(data, column_name, pd.Series.mean, numeric_only=True)
        if result is not None:
            return result
        return "Could not calculate the average from the data."

    def _calculate_total(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """Calculate the total of the numeric column named in the question."""
        column_name = self._extract_column_name(question)
        if not column_name:
            return "Could not determine which column to calculate the total for."

        result = self._reduce_column(data, column_name, pd.Series.sum, numeric_only=True)
        if result is not None:
            return result
        return "Could not calculate the total from the data."

    def _find_maximum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """Find the maximum value in the column named in the question."""
        column_name = self._extract_column_name(question)
        if not column_name:
            return "Could not determine which column to find the maximum for."

        result = self._reduce_column(data, column_name, pd.Series.max, numeric_only=False)
        if result is not None:
            return result
        return "Could not find the maximum value from the data."

    def _find_minimum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """Find the minimum value in the column named in the question."""
        column_name = self._extract_column_name(question)
        if not column_name:
            return "Could not determine which column to find the minimum for."

        result = self._reduce_column(data, column_name, pd.Series.min, numeric_only=False)
        if result is not None:
            return result
        return "Could not find the minimum value from the data."

    def _extract_specific_info(self, data: Dict[str, pd.DataFrame], question: str) -> str:
        """
        Extract specific information from the data.

        Returns the requested column's value from the first row that satisfies
        the question's conditions, or a size summary of the first non-empty
        sheet when that is not possible.
        """
        looking_for = self._extract_looking_for(question)
        conditions = self._extract_conditions(question)

        for df in data.values():
            # Skip empty sheets
            if df.empty:
                continue

            filtered_df = self._apply_conditions(df, conditions)

            # If we found matching rows and know what to look for
            if not filtered_df.empty and looking_for:
                best_col = self._find_best_matching_column(df, looking_for)
                if best_col is not None:
                    # Return the first value
                    return str(filtered_df[best_col].iloc[0])

        # If we couldn't extract specific information, return a more general response
        for df in data.values():
            if not df.empty:
                return f"The sheet contains {len(df)} rows and {len(df.columns)} columns."

        return "Could not extract the requested information from the data."

    def _extract_conditions(self, question: str) -> List[Dict[str, Any]]:
        """
        Extract conditions from the question.

        Returns:
            List of dicts with the keys 'column', 'operator' and 'value'.
            Numeric values are converted to float, except for 'contains',
            which is a text comparison, and 'between', whose value is a list
            of two ints.
        """
        conditions = []

        # Check for "between" conditions
        between_pattern = r'(\w+) between (\d+) and (\d+)'
        for match in re.finditer(between_pattern, question):
            conditions.append({
                'column': match.group(1),
                'operator': 'between',
                'value': [int(match.group(2)), int(match.group(3))],
            })

        # Check for comparison conditions
        comparison_pattern = r'(\w+) (>|<|>=|<=|=|equals|equal to|contains) (\w+)'
        for match in re.finditer(comparison_pattern, question):
            column, op, value = match.groups()

            # Convert operator text to symbols
            if op in ('equals', 'equal to'):
                op = '='

            # "contains" is a substring test, so its value stays text
            if op != 'contains':
                value = self._to_number(value)

            conditions.append({
                'column': column,
                'operator': op,
                'value': value,
            })

        # Check for simple equality conditions
        equality_pattern = r'(?:with|where) (\w+) (?:is|=) (\w+)'
        for match in re.finditer(equality_pattern, question):
            conditions.append({
                'column': match.group(1),
                'operator': '=',
                'value': self._to_number(match.group(2)),
            })

        return conditions

    def _extract_column_name(self, question: str) -> Optional[str]:
        """Extract column name from the question."""
        # Check for direct mentions of columns
        column_pattern = r'(?:column|field) (?:named|called) ["\']?(\w+)["\']?'
        match = re.search(column_pattern, question)
        if match:
            return match.group(1)

        # Look for common column references
        common_columns = [
            'year', 'date', 'time', 'name', 'title', 'price', 'cost',
            'amount', 'quantity', 'total', 'value', 'age', 'rating',
            'score', 'grade', 'salary', 'income', 'revenue', 'profit',
            'loss', 'height', 'weight', 'length', 'width', 'depth',
            'area', 'volume'
        ]
        return self._first_mentioned(common_columns, question)

    def _extract_looking_for(self, question: str) -> Optional[str]:
        """Extract what we're looking for from the question."""
        # Check for direct mentions of what we're looking for
        looking_for_pattern = r'(?:what is|what are|find|get|return) the (\w+)'
        match = re.search(looking_for_pattern, question)
        if match:
            return match.group(1)

        # Look for common things we might be looking for
        common_items = [
            'name', 'title', 'price', 'cost', 'amount', 'quantity',
            'total', 'value', 'age', 'rating', 'score', 'grade',
            'salary', 'income', 'revenue', 'profit', 'loss',
            'height', 'weight', 'length', 'width', 'depth',
            'area', 'volume', 'year', 'date', 'time'
        ]
        return self._first_mentioned(common_items, question)

    def _find_best_matching_column(self, df: pd.DataFrame, column_name: str) -> Optional[str]:
        """
        Find the best matching column in a DataFrame.

        Tries an exact match, then a case-insensitive match, then the first
        column whose label contains ``column_name``. Returns None if nothing
        matches.
        """
        # Check for exact match
        if column_name in df.columns:
            return column_name

        wanted = column_name.lower()

        # Check for case-insensitive match (labels may be non-strings)
        for col in df.columns:
            if str(col).lower() == wanted:
                return col

        # Check for partial match
        for col in df.columns:
            if wanted in str(col).lower():
                return col

        return None

    def process_csv_data(self, data: pd.DataFrame, question: str) -> str:
        """
        Process data extracted from a CSV file.

        Args:
            data: DataFrame containing the CSV data
            question: The question to answer

        Returns:
            Answer to the question
        """
        # Wrap in a dictionary to reuse Excel processing logic
        return self.process_excel_data({'Sheet1': data}, question)

    # ------------------------------------------------------------------
    # Text-like data
    # ------------------------------------------------------------------

    def process_text_data(self, data: str, question: str) -> str:
        """
        Process data extracted from a text file.

        Args:
            data: Text content of the file
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()
        sentence_split = r'(?<=[.!?])\s+'

        # Handle specific question types
        if 'count' in question_lower or 'how many' in question_lower:
            # Count occurrences of a word or phrase (case-insensitive)
            count_pattern = r'(?:count|how many) (?:occurrences of|instances of|times) ["\']?([^"\']+)["\']?'
            match = re.search(count_pattern, question_lower)
            if match:
                term = self._clean_term(match.group(1))
                if term:
                    return str(data.lower().count(term))

        # Check if the question is asking for a specific line (1-based, inclusive range)
        line_pattern = r'(?:what is|what does|what are|show|return) (?:the|on) (?:line|lines) (\d+)(?:\s*(?:to|-)\s*(\d+))?'
        match = re.search(line_pattern, question_lower)
        if match:
            start_line = int(match.group(1))
            end_line = int(match.group(2)) if match.group(2) else start_line

            lines = data.split('\n')
            if 1 <= start_line <= end_line <= len(lines):
                return '\n'.join(lines[start_line - 1:end_line])

        # Check if the question is asking for a specific paragraph (1-based, inclusive range)
        para_pattern = r'(?:what is|what does|what are|show|return) (?:the|in) paragraph (\d+)(?:\s*(?:to|-)\s*(\d+))?'
        match = re.search(para_pattern, question_lower)
        if match:
            start_para = int(match.group(1))
            end_para = int(match.group(2)) if match.group(2) else start_para

            paragraphs = re.split(r'\n\s*\n', data)
            if 1 <= start_para <= end_para <= len(paragraphs):
                return '\n\n'.join(paragraphs[start_para - 1:end_para])

        # Check for specific information requests
        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
        match = re.search(info_pattern, question_lower)
        if match:
            info = match.group(1).strip()

            # Return the first sentence that contains the requested phrase
            for sentence in re.split(sentence_split, data):
                if info in sentence.lower():
                    return sentence.strip()

        # If nothing specific was found, return a generic summary
        words = data.split()
        sentence_count = sum(1 for sentence in re.split(sentence_split, data) if sentence.strip())
        return f"The text contains {len(words)} words and {sentence_count} sentences."

    def process_pdf_data(self, data: Dict[int, str], question: str) -> str:
        """
        Process data extracted from a PDF file.

        Args:
            data: Dictionary mapping page numbers to text content
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Check if the question is asking for a specific page
        page_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) page (\d+)'
        match = re.search(page_pattern, question_lower)
        if match:
            page_num = int(match.group(1))
            if page_num in data:
                return data[page_num]
            return f"Page {page_num} not found in the PDF."

        # Check if the question is asking for a specific information across all pages
        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
        match = re.search(info_pattern, question_lower)
        if match:
            info = match.group(1).strip()

            # Return the first sentence on any page that contains the phrase
            for content in data.values():
                if not content:
                    continue
                for sentence in re.split(r'(?<=[.!?])\s+', content):
                    if info in sentence.lower():
                        return sentence.strip()

        # If nothing specific was found, combine all text and return a summary
        all_text = ' '.join(str(content) for content in data.values() if content)
        words = all_text.split()
        return f"The PDF contains {len(data)} pages and approximately {len(words)} words."

    def process_image_metadata(self, metadata: Dict[str, Any], question: str) -> str:
        """
        Process metadata extracted from an image file.

        Args:
            metadata: Dictionary containing image metadata
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Handle specific question types; str() keeps the declared return type
        # even when a metadata value is not text.
        if 'format' in question_lower or 'type' in question_lower:
            return str(metadata.get('format', 'Unknown format'))
        elif 'size' in question_lower or 'resolution' in question_lower:
            width = metadata.get('width', 0)
            height = metadata.get('height', 0)
            return f"{width}x{height}"
        elif 'width' in question_lower:
            return str(metadata.get('width', 0))
        elif 'height' in question_lower:
            return str(metadata.get('height', 0))
        elif 'mode' in question_lower or 'color' in question_lower:
            return str(metadata.get('mode', 'Unknown mode'))
        elif 'exif' in question_lower:
            exif = metadata.get('exif', {})
            if exif:
                return str(exif)
            return "No EXIF data found."

        # If nothing specific was found, return basic information
        return f"Image format: {metadata.get('format', 'Unknown')}, Size: {metadata.get('width', 0)}x{metadata.get('height', 0)}, Mode: {metadata.get('mode', 'Unknown')}"

    def process_docx_data(self, data: str, question: str) -> str:
        """
        Process data extracted from a Word document.

        Args:
            data: Text content of the document
            question: The question to answer

        Returns:
            Answer to the question
        """
        # Similar to text processing
        return self.process_text_data(data, question)

    def process_pptx_data(self, data: Dict[int, str], question: str) -> str:
        """
        Process data extracted from a PowerPoint presentation.

        Args:
            data: Dictionary mapping slide numbers to text content
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Check if the question is asking for a specific slide
        slide_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) slide (\d+)'
        match = re.search(slide_pattern, question_lower)
        if match:
            slide_num = int(match.group(1))
            if slide_num in data:
                return data[slide_num]
            return f"Slide {slide_num} not found in the presentation."

        # Check if the question is asking for a specific information across all slides
        info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
        match = re.search(info_pattern, question_lower)
        if match:
            info = match.group(1).strip()

            # Return the whole text of the first slide that contains the phrase
            for content in data.values():
                if content and info in content.lower():
                    return content.strip()

        # If nothing specific was found, return a summary
        return f"The presentation contains {len(data)} slides."

    # ------------------------------------------------------------------
    # Structured data
    # ------------------------------------------------------------------

    def process_json_data(self, data: Dict[str, Any], question: str) -> str:
        """
        Process data extracted from a JSON file.

        Args:
            data: Parsed JSON content
            question: The question to answer

        Returns:
            Answer to the question
        """
        # A JSON document is not necessarily an object; summarize other shapes.
        if isinstance(data, list):
            return f"The JSON contains a top-level list with {len(data)} items."
        if not isinstance(data, dict):
            return f"The JSON contains a single top-level value: {data}"

        question_lower = question.lower()

        # Check if the question is asking for a specific key
        key_pattern = r'(?:what is|what are|show|return) (?:the|in) ["\']?(\w+)["\']?'
        match = re.search(key_pattern, question_lower)
        if match:
            key = match.group(1)

            # Look for this key at the top level (ignoring case)
            found = self._match_key(data, key)
            if found is not None:
                return str(data[found])

            # Look for the key one level down
            for value in data.values():
                if isinstance(value, dict):
                    found = self._match_key(value, key)
                    if found is not None:
                        return str(value[found])

        # If nothing specific was found, return a summary
        return f"The JSON contains {len(data)} top-level keys: {', '.join(str(k) for k in data)}"

    def process_zip_data(self, data: Dict[str, Any], question: str) -> str:
        """
        Process data extracted from a ZIP archive.

        Args:
            data: Dictionary containing information about the archive
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()
        files = data.get('files', [])

        # Handle specific question types
        if 'how many' in question_lower or 'count' in question_lower:
            if 'files' in question_lower:
                return str(len(files))

        # Check if the question is asking for a specific file
        file_pattern = r'(?:does it contain|is there) (?:a file named|a file called) ["\']?([^"\']+)["\']?'
        match = re.search(file_pattern, question_lower)
        if match:
            filename = self._clean_term(match.group(1))

            # Check if the file exists in the archive (case-insensitive substring)
            for file_info in files:
                stored_name = file_info.get('filename', '')
                if filename and filename in stored_name.lower():
                    return f"Yes, the archive contains {stored_name} ({file_info.get('size', 'unknown')} bytes)"

            return f"No, the archive does not contain a file named {filename}."

        # If nothing specific was found, return a summary
        return f"The ZIP archive contains {len(files)} files."

    def process_pdb_data(self, data: Dict[str, Any], question: str) -> str:
        """
        Process data extracted from a PDB file.

        Args:
            data: Dictionary containing information about the PDB file
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Handle specific question types
        if 'title' in question_lower:
            return str(data.get('title', 'No title found.'))
        elif 'header' in question_lower:
            return str(data.get('header', 'No header found.'))
        elif 'compound' in question_lower:
            compounds = data.get('compounds', [])
            if compounds:
                return '\n'.join(str(compound) for compound in compounds)
            return 'No compounds found.'
        elif 'author' in question_lower:
            authors = data.get('authors', [])
            if authors:
                return '\n'.join(str(author) for author in authors)
            return 'No authors found.'
        elif 'atoms' in question_lower or 'atom count' in question_lower:
            return str(data.get('atoms_count', 0))

        # If nothing specific was found, return a summary
        return f"PDB file with title: {data.get('title', 'No title')}, containing {data.get('atoms_count', 0)} atoms."

    @staticmethod
    def _extract_entity_code(content: str, entity_name: str) -> str:
        """
        Return the source lines of the named class or function, or ''.

        The block starts at the matching ``class``/``def`` line and ends at
        the next non-blank line that is not indented deeper than it. The name
        is matched case-insensitively because questions are lower-cased.
        """
        definition = re.compile(
            rf'\s*(?:async\s+def|def|class)\s+{re.escape(entity_name)}\s*[(:]',
            re.IGNORECASE,
        )

        entity_lines = []
        indent = 0
        for line in content.split('\n'):
            if not entity_lines:
                if definition.match(line):
                    entity_lines.append(line)
                    indent = len(line) - len(line.lstrip())
                continue

            # A non-blank line at or left of the definition's indent ends the block
            if line.strip() and len(line) - len(line.lstrip()) <= indent:
                break
            entity_lines.append(line)

        # Blank lines that only separate the block from what follows are not part of it
        while entity_lines and not entity_lines[-1].strip():
            entity_lines.pop()

        return '\n'.join(entity_lines)

    def process_python_data(self, data: Dict[str, Any], question: str) -> str:
        """
        Process data extracted from a Python file.

        Args:
            data: Dictionary containing information about the Python file
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Questions about one named class or function are checked first: they
        # always contain the word "class"/"function", so the generic listings
        # below would otherwise answer them.
        class_pattern = r'(?:what is|what does) (?:the class|class) ["\']?(\w+)["\']?'
        match = re.search(class_pattern, question_lower)
        if match:
            class_name = match.group(1)
            for cls in data.get('classes', []):
                if cls['name'].lower() == class_name:
                    parent = f", inherits from {cls['parent']}" if cls.get('parent') else ""
                    return f"Class {cls['name']}{parent}"

        func_pattern = r'(?:what is|what does) (?:the function|function) ["\']?(\w+)["\']?'
        match = re.search(func_pattern, question_lower)
        if match:
            func_name = match.group(1)
            for func in data.get('functions', []):
                if func['name'].lower() == func_name:
                    return f"Function {func['name']}({func.get('params', '')})"

        # Look for the code of a specific function or class
        code_pattern = r'(?:show|return) (?:the code for|code of) (?:the )?(?:function|class) ["\']?(\w+)["\']?'
        match = re.search(code_pattern, question_lower)
        if match:
            code = self._extract_entity_code(data.get('content', ''), match.group(1))
            if code:
                return code

        # Generic listings
        if 'class' in question_lower:
            classes = data.get('classes', [])
            if classes:
                return ', '.join(c['name'] for c in classes)
            return 'No classes found in the file.'
        elif 'function' in question_lower:
            functions = data.get('functions', [])
            if functions:
                return ', '.join(f['name'] for f in functions)
            return 'No functions found in the file.'
        elif 'import' in question_lower:
            imports = data.get('imports', [])
            if imports:
                import_strs = []
                for imp in imports:
                    if imp.get('from'):
                        import_strs.append(f"from {imp['from']} import {imp['import']}")
                    else:
                        import_strs.append(f"import {imp['import']}")
                return '\n'.join(import_strs)
            return 'No imports found in the file.'

        # If nothing specific was found, return a summary
        return f"Python file with {len(data.get('classes', []))} classes and {len(data.get('functions', []))} functions."

    def process_jsonl_data(self, data: List[Dict[str, Any]], question: str) -> str:
        """
        Process data extracted from a JSONL file.

        Args:
            data: List of parsed JSON objects
            question: The question to answer

        Returns:
            Answer to the question
        """
        question_lower = question.lower()

        # Handle specific question types
        if 'how many' in question_lower or 'count' in question_lower:
            return str(len(data))

        # Check if the question is asking for a specific entry (0-based index)
        entry_pattern = r'(?:what is|what are|show|return) (?:the|in) entry (\d+)'
        match = re.search(entry_pattern, question_lower)
        if match:
            entry_num = int(match.group(1))
            if 0 <= entry_num < len(data):
                return str(data[entry_num])
            return f"Entry {entry_num} not found in the data."

        # Check if the question is asking for entries with a specific key-value pair
        kv_pattern = r'(?:entries|items) where ["\']?(\w+)["\']? (is|=|equals|contains) ["\']?([^"\']+)["\']?'
        match = re.search(kv_pattern, question_lower)
        if match:
            key = match.group(1)
            operator = match.group(2)
            value = self._clean_term(match.group(3))

            # Find entries matching the criteria; keys and values compare case-insensitively
            matching_entries = []
            for entry in data:
                if not isinstance(entry, dict):
                    continue
                found = self._match_key(entry, key)
                if found is None:
                    continue
                actual = str(entry[found]).lower()
                matched = value in actual if operator == 'contains' else actual == value
                if matched:
                    matching_entries.append(entry)

            if matching_entries:
                return str(matching_entries)
            return f"No entries found where {key} = {value}."

        # If nothing specific was found, return a summary
        if data and isinstance(data[0], dict):
            keys = [str(k) for k in data[0]]
            return f"The data contains {len(data)} entries with keys: {', '.join(keys)}"
        return f"The data contains {len(data)} entries."
agent/utils/question_analyzer.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for analyzing and understanding questions.
3
+ """
4
+ import re
5
+ import json
6
+ import os
7
+ from typing import Dict, Any, List, Optional, Tuple, Set
8
+
9
class QuestionAnalyzer:
    """
    Class for analyzing and understanding questions.

    Metadata is read once, at construction time, from a JSON Lines file and
    indexed by ``task_id``. The analyzer uses it, together with simple text
    heuristics, to work out which file in ``resource_dir`` a question is about.
    """

    # Extensions recognised when scanning a question for a file name. Longer
    # extensions precede their prefixes ('.xlsx' before '.xls', '.jsonld'
    # before '.json') so the shorter one cannot truncate the match.
    _FILE_EXTENSIONS = (
        '.xlsx', '.xls', '.csv', '.txt', '.pdf', '.jpg', '.jpeg',
        '.png', '.docx', '.pptx', '.jsonld', '.json', '.zip', '.pdb', '.py',
    )

    # Words ignored when extracting keywords.
    _STOP_WORDS = frozenset({
        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when',
        'at', 'from', 'by', 'for', 'with', 'about', 'against', 'between',
        'into', 'through', 'during', 'before', 'after', 'above', 'below',
        'to', 'of', 'in', 'on', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
        'doing', 'would', 'should', 'could', 'might', 'will', 'shall',
        'can', 'may', 'must', 'ought',
    })

    def __init__(self, resource_dir: str, metadata_path: Optional[str] = None):
        """
        Initialize the question analyzer.

        Args:
            resource_dir: Directory containing resource files
            metadata_path: Path to the metadata file (optional); defaults to
                ``metadata.jsonl`` inside ``resource_dir``
        """
        self.resource_dir = resource_dir
        self.metadata_path = metadata_path or os.path.join(resource_dir, 'metadata.jsonl')
        self.metadata = self._load_metadata()

    def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
        """
        Load metadata from the metadata file.

        Loading is best-effort: a missing file yields an empty mapping, and
        blank or malformed lines are skipped so one bad line does not discard
        the entries that follow it.

        Returns:
            Dictionary mapping task IDs to metadata
        """
        metadata: Dict[str, Dict[str, Any]] = {}

        if not os.path.exists(self.metadata_path):
            return metadata

        try:
            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Error loading metadata: line {line_number}: {e}")
                        continue
                    if not isinstance(entry, dict):
                        continue
                    task_id = entry.get('task_id')
                    if task_id:
                        metadata[task_id] = entry
        except (OSError, ValueError, TypeError) as e:
            # Unreadable file, undecodable bytes or an unhashable task_id:
            # report it and keep whatever was loaded so far.
            print(f"Error loading metadata: {e}")

        return metadata

    def _list_resource_files(self) -> List[str]:
        """
        List the entries of the resource directory in sorted order.

        Returns:
            Sorted entry names, or an empty list if the directory cannot be read
        """
        try:
            return sorted(os.listdir(self.resource_dir))
        except OSError:
            return []

    def extract_file_mention(self, question: str) -> Optional[str]:
        """
        Extract mentioned file name from the question.

        Args:
            question: The question to analyze

        Returns:
            Mentioned file name, or None if no file is mentioned
        """
        # Look for "attached file" or "attached spreadsheet" patterns
        attached_pattern = (
            r'attached (?:file|spreadsheet|document|image|picture|pdf|excel|csv|text file|zip|archive) '
            r'(named |called |")?([\w\.-]+)'
        )
        match = re.search(attached_pattern, question, re.IGNORECASE)
        if match:
            explicitly_named = match.group(1) is not None
            # Drop a sentence-ending period that the character class swallowed.
            candidate = match.group(2).rstrip('.')
            # Without "named"/"called"/a quote, the next word is only a file
            # name if it looks like one; otherwise "attached spreadsheet shows"
            # would report "shows" as the file.
            if candidate and (explicitly_named or '.' in candidate):
                return candidate

        # Look for file extensions
        for ext in self._FILE_EXTENSIONS:
            pattern = r'(\w+(?:-\w+)*' + re.escape(ext) + r')'
            match = re.search(pattern, question, re.IGNORECASE)
            if match:
                return match.group(1)

        return None

    def find_relevant_file(self, question: str, task_id: Optional[str] = None) -> Optional[str]:
        """
        Find the relevant file for a question.

        Tries, in order: the ``file_name`` recorded in the metadata for
        ``task_id``; a file name mentioned in the question (exact, then
        case-insensitive substring match); and, for known task IDs only, the
        resource file whose name shares the most keywords with the question.

        Args:
            question: The question to analyze
            task_id: The task ID (optional)

        Returns:
            Path to the relevant file, or None if no file is found
        """
        task_known = bool(task_id) and task_id in self.metadata

        # Check if task_id is in metadata and has a file_name
        if task_known:
            file_name = self.metadata[task_id].get('file_name')
            if file_name:
                file_path = os.path.join(self.resource_dir, file_name)
                if os.path.exists(file_path):
                    return file_path

        # Listed lazily, at most once, and tolerant of a missing directory.
        resource_files: Optional[List[str]] = None

        # Extract file mention from question
        file_mention = self.extract_file_mention(question)
        if file_mention:
            # Check if the mentioned file exists
            file_path = os.path.join(self.resource_dir, file_mention)
            if os.path.exists(file_path):
                return file_path

            # Check if there's a file with a similar name
            resource_files = self._list_resource_files()
            mention_lower = file_mention.lower()
            for file_name in resource_files:
                if mention_lower in file_name.lower():
                    return os.path.join(self.resource_dir, file_name)

        # If no file is found, fall back to keyword matching on file names
        if task_known:
            keywords = self._extract_keywords(question)
            if resource_files is None:
                resource_files = self._list_resource_files()

            best_match = None
            best_score = 0

            for file_name in resource_files:
                # Skip metadata file
                if file_name == 'metadata.jsonl':
                    continue

                # Score = number of keywords (already lower-case) in the name
                name_lower = file_name.lower()
                score = sum(1 for keyword in keywords if keyword in name_lower)

                if score > best_score:
                    best_score = score
                    best_match = file_name

            if best_match:
                return os.path.join(self.resource_dir, best_match)

        return None

    def _extract_keywords(self, text: str) -> Set[str]:
        """
        Extract keywords from text.

        Args:
            text: The text to analyze

        Returns:
            Set of lower-cased words longer than two characters that are not
            stop words
        """
        words = re.findall(r'\b\w+\b', text.lower())
        return {word for word in words if word not in self._STOP_WORDS and len(word) > 2}

    def analyze_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze a question to understand what it's asking.

        Args:
            question: The question to analyze
            task_id: The task ID (optional)

        Returns:
            Dictionary with the keys 'question', 'task_id', 'file_path',
            'keywords' and 'expected_answer'
        """
        result = {
            'question': question,
            'task_id': task_id,
            'file_path': None,
            # Sorted so the list is stable from run to run.
            'keywords': sorted(self._extract_keywords(question)),
            'expected_answer': None,
        }

        # Find relevant file
        file_path = self.find_relevant_file(question, task_id)
        if file_path:
            result['file_path'] = file_path

        # Get expected answer if available
        if task_id and task_id in self.metadata:
            result['expected_answer'] = self.metadata[task_id].get('Final answer')

        return result
app.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import pandas as pd
6
+ import logging
7
+ import sys
8
+
9
# Configure logging: INFO and above goes to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
# Module-level logger for this app.
logger = logging.getLogger('app')

# Add the directory containing this file to sys.path so the local `agent`
# package can be imported.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Import the MultiModalAgent (must come after the sys.path change above)
from agent import MultiModalAgent

# (Keep Constants as is)
# --- Constants ---
# Base URL of the scoring service; the /questions and /submit endpoints are
# derived from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
28
+
29
+ # --- Agent Definition ---
30
+ # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
31
class BasicAgent:
    """Placeholder agent that replies to every question with one fixed string."""

    def __init__(self):
        """Announce construction on stdout; the agent holds no state."""
        print("BasicAgent initialized.")

    def __call__(self, question: str) -> str:
        """
        Return the canned answer for ``question``.

        Args:
            question: The question text; only its first 50 characters are echoed

        Returns:
            The fixed default answer
        """
        preview = question[:50]
        print(f"Agent received question (first 50 chars): {preview}...")
        reply = "This is a default answer."
        print(f"Agent returning fixed answer: {reply}")
        return reply
39
+
40
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetch all questions, run the MultiModalAgent on them, submit all answers,
    and display the results.

    Args:
        profile: OAuth profile of the logged-in Hugging Face user, or None
            when nobody is logged in

    Returns:
        A ``(status message, results)`` tuple. ``results`` is a DataFrame with
        one row per processed question, or None when the run stopped before
        any question was processed.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent (modify this part to create your agent)
    try:
        logger.info("Creating MultiModalAgent instance...")
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
        agent = MultiModalAgent(resource_dir=resource_dir)
        logger.info("MultiModalAgent initialized successfully")
    except Exception as e:
        logger.error(f"Error instantiating agent: {e}", exc_info=True)
        return f"Error initializing agent: {e}", None
    # When the app runs as a Hugging Face Space, this link points to your
    # codebase (useful for others, so please keep it public).
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        try:
            questions_data = response.json()
        except ValueError as e:
            # Handled here, next to the call, because the JSON decode error
            # raised by requests is also a RequestException: a handler placed
            # after `except RequestException` below would never run.
            # ValueError is the common base on every requests version.
            print(f"Error decoding JSON response from questions endpoint: {e}")
            print(f"Response text: {response.text[:500]}")
            return f"Error decoding server response for questions: {e}", None
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # A failing question is shown in the results table but not submitted.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        status_message = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except ValueError:
            # The error body is not JSON; show the start of the raw text.
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)

    # Every outcome of the submission reports the same per-question table.
    return status_message, pd.DataFrame(results_log)
162
+
163
+
164
+ # --- Build Gradio Interface using Blocks ---
165
# Top-level UI definition: `demo` is the Blocks app launched by the
# `__main__` guard at the bottom of this file.
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**

        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

        ---
        **Disclaimers:**
        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
        """
    )

    # Login button; run_and_submit_all refuses to run without a logged-in profile.
    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    # Read-only status text and the per-question results table.
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No `inputs` are wired: the handler's only parameter is the OAuth
    # profile. Its two return values fill the status box and the table.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
194
+
195
# Script entry point: print environment diagnostics, then start the Gradio app.
if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        # NOTE(review): ".hf.space" is appended here; confirm SPACE_HOST does
        # not already include that suffix.
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup: # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    # Closing rule, the same width as the opening banner line.
    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=1.3.0
2
+ gradio>=3.0.0
3
+ requests>=2.25.0
4
+ openpyxl>=3.0.9
5
+ PyPDF2>=2.0.0
6
+ python-docx>=0.8.11
7
+ python-pptx>=0.6.19
8
+ Pillow>=8.0.0
9
+ jsonschema>=4.0.0
10
+ zipfile36>=0.1.3
11
+ scikit-learn>=1.0.0
12
+ nltk>=3.6.0
13
+ python-dotenv>=0.19.0
14
+ pytest>=6.0.0
15
+ PyYAML>=6.0
16
+ biopython>=1.79
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_agent.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test the MultiModalAgent.
3
+ """
4
+ import os
5
+ import sys
6
+ import logging
7
+ import json
8
+
9
# Add the directory containing this file to sys.path so the local `agent`
# package can be imported.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Import the MultiModalAgent (must come after the sys.path change above)
from agent import MultiModalAgent

# Configure logging at INFO level with timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger for this script.
logger = logging.getLogger('test_agent')
21
+
22
def main():
    """
    Test the MultiModalAgent with some sample questions.

    Up to five questions that reference a file are read from
    ``resource/metadata.jsonl``. If that file is missing or contains no such
    question, two generic questions are used instead. Each answer is logged
    and, when an expected answer is known, compared with it.
    """
    # Initialize the agent
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
    agent = MultiModalAgent(resource_dir=resource_dir)

    # Load test questions from metadata.jsonl
    metadata_path = os.path.join(resource_dir, 'metadata.jsonl')
    test_questions = []

    # A missing metadata file must not abort the run: the generic fallback
    # questions below exist for exactly that situation.
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed metadata line: {e}")
                    continue
                if not isinstance(entry, dict):
                    continue
                if 'Question' in entry and 'file_name' in entry and entry['file_name']:
                    test_questions.append({
                        'task_id': entry.get('task_id'),
                        'question': entry['Question'],
                        'file_name': entry['file_name'],
                        'expected_answer': entry.get('Final answer')
                    })
                if len(test_questions) >= 5:  # Limit to 5 questions
                    break
    else:
        logger.warning(f"Metadata file not found: {metadata_path}")

    # If no questions with files were found, use some generic questions
    if not test_questions:
        test_questions = [
            {
                'question': "What's the oldest Blu-Ray in the inventory spreadsheet?",
                'file_name': None,
                'expected_answer': None
            },
            {
                'question': "How many files are in the resource directory?",
                'file_name': None,
                'expected_answer': None
            }
        ]

    # Test the agent with each question
    for i, q in enumerate(test_questions):
        question = q['question']
        logger.info(f"Testing question {i+1}: {question}")

        answer = agent(question)
        logger.info(f"Answer: {answer}")

        if q['expected_answer']:
            logger.info(f"Expected answer: {q['expected_answer']}")
            # Compare as text so a non-string answer or expectation cannot
            # raise on .strip().
            if str(answer).strip() == str(q['expected_answer']).strip():
                logger.info("Correct answer!")
            else:
                logger.warning("Incorrect answer.")

        logger.info("-" * 80)


if __name__ == "__main__":
    main()