Spaces:
Sleeping
Sleeping
Arbnor Tefiki
commited on
Commit
·
5d9aa5e
1
Parent(s):
c594a60
Test the agent in HF
Browse files- .gitignore +68 -0
- Dockerfile +23 -0
- README.md +55 -11
- agent/__init__.py +6 -0
- agent/agent.py +287 -0
- agent/tools/__init__.py +3 -0
- agent/tools/file_handlers.py +561 -0
- agent/utils/__init__.py +3 -0
- agent/utils/data_processor.py +936 -0
- agent/utils/question_analyzer.py +195 -0
- app.py +217 -0
- index.html +0 -19
- requirements.txt +16 -0
- style.css +0 -28
- test_agent.py +78 -0
.gitignore
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
*.so
|
6 |
+
.Python
|
7 |
+
build/
|
8 |
+
develop-eggs/
|
9 |
+
dist/
|
10 |
+
downloads/
|
11 |
+
eggs/
|
12 |
+
.eggs/
|
13 |
+
lib/
|
14 |
+
lib64/
|
15 |
+
parts/
|
16 |
+
sdist/
|
17 |
+
var/
|
18 |
+
wheels/
|
19 |
+
*.egg-info/
|
20 |
+
.installed.cfg
|
21 |
+
*.egg
|
22 |
+
|
23 |
+
# Distribution / packaging
|
24 |
+
.Python
|
25 |
+
env/
|
26 |
+
build/
|
27 |
+
develop-eggs/
|
28 |
+
dist/
|
29 |
+
downloads/
|
30 |
+
eggs/
|
31 |
+
.eggs/
|
32 |
+
lib/
|
33 |
+
lib64/
|
34 |
+
parts/
|
35 |
+
sdist/
|
36 |
+
var/
|
37 |
+
*.egg-info/
|
38 |
+
.installed.cfg
|
39 |
+
*.egg
|
40 |
+
|
41 |
+
# Virtual Environment
|
42 |
+
venv/
|
43 |
+
ENV/
|
44 |
+
env/
|
45 |
+
|
46 |
+
# Jupyter Notebook
|
47 |
+
.ipynb_checkpoints
|
48 |
+
|
49 |
+
# VS Code
|
50 |
+
.vscode/
|
51 |
+
*.code-workspace
|
52 |
+
|
53 |
+
# PyCharm
|
54 |
+
.idea/
|
55 |
+
|
56 |
+
# Logs
|
57 |
+
logs/
|
58 |
+
*.log
|
59 |
+
|
60 |
+
# Local configuration
|
61 |
+
.env
|
62 |
+
|
63 |
+
# Cache
|
64 |
+
.cache/
|
65 |
+
.pytest_cache/
|
66 |
+
|
67 |
+
# Mac OS
|
68 |
+
.DS_Store
|
Dockerfile
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9-slim
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
# Install system dependencies
|
6 |
+
RUN apt-get update && apt-get install -y \
|
7 |
+
build-essential \
|
8 |
+
libffi-dev \
|
9 |
+
&& apt-get clean \
|
10 |
+
&& rm -rf /var/lib/apt/lists/*
|
11 |
+
|
12 |
+
# Copy requirements first to leverage Docker cache
|
13 |
+
COPY requirements.txt .
|
14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
15 |
+
|
16 |
+
# Copy application code
|
17 |
+
COPY . .
|
18 |
+
|
19 |
+
# Set environment variables
|
20 |
+
ENV PYTHONUNBUFFERED=1
|
21 |
+
|
22 |
+
# Command to run when the container starts
|
23 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,11 +1,55 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Multi-Modal AI Agent for Hugging Face Agent Course Unit 4
|
2 |
+
|
3 |
+
This project implements a multi-modal AI agent that can process various file types and answer questions about their content for the Hugging Face Agent Course Unit 4 final assessment.
|
4 |
+
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- Processes and answers questions about different file types:
|
8 |
+
- Excel/CSV files (.xlsx, .csv)
|
9 |
+
- Text files (.txt)
|
10 |
+
- PDFs (.pdf)
|
11 |
+
- Images (.jpg, .png)
|
12 |
+
- Python files (.py)
|
13 |
+
- Microsoft Office files (.docx, .pptx)
|
14 |
+
- JSON files (.jsonld)
|
15 |
+
- Archive files (.zip)
|
16 |
+
- Other specialized formats (.pdb)
|
17 |
+
|
18 |
+
- Analyzes questions to understand what's being asked
|
19 |
+
- Identifies and loads relevant resource files
|
20 |
+
- Applies appropriate processing techniques based on file type
|
21 |
+
- Formulates accurate answers based on file content
|
22 |
+
- Includes error handling and logging
|
23 |
+
|
24 |
+
## Project Structure
|
25 |
+
|
26 |
+
- `app.py`: Main application file with Gradio interface
|
27 |
+
- `agent/`: Package containing agent components
|
28 |
+
- `agent.py`: Multi-modal agent implementation
|
29 |
+
- `tools/`: File handlers and other tools
|
30 |
+
- `utils/`: Utility functions for question analysis and data processing
|
31 |
+
|
32 |
+
## Installation
|
33 |
+
|
34 |
+
1. Clone the repository
|
35 |
+
2. Install dependencies:
|
36 |
+
```
|
37 |
+
pip install -r requirements.txt
|
38 |
+
```
|
39 |
+
|
40 |
+
## Usage
|
41 |
+
|
42 |
+
Run the application:
|
43 |
+
```
|
44 |
+
python app.py
|
45 |
+
```
|
46 |
+
|
47 |
+
## Dependencies
|
48 |
+
|
49 |
+
- pandas: For data processing
|
50 |
+
- gradio: For the user interface
|
51 |
+
- PyPDF2: For PDF processing
|
52 |
+
- python-docx: For Word document processing
|
53 |
+
- python-pptx: For PowerPoint presentations
|
54 |
+
- Pillow: For image processing
|
55 |
+
- And more (see requirements.txt)
|
agent/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Agent package initialization file.
|
3 |
+
"""
|
4 |
+
from agent.agent import MultiModalAgent
|
5 |
+
|
6 |
+
__all__ = ["MultiModalAgent"]
|
agent/agent.py
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Multi-modal agent for processing different file types and answering questions.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
from typing import Dict, Any, List, Optional, Tuple
|
8 |
+
|
9 |
+
from agent.tools.file_handlers import extract_file_content
|
10 |
+
from agent.utils.question_analyzer import QuestionAnalyzer
|
11 |
+
from agent.utils.data_processor import DataProcessor
|
12 |
+
|
13 |
+
# Configure logging
|
14 |
+
logging.basicConfig(
|
15 |
+
level=logging.INFO,
|
16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
17 |
+
)
|
18 |
+
logger = logging.getLogger('MultiModalAgent')
|
19 |
+
|
20 |
+
class MultiModalAgent:
|
21 |
+
"""
|
22 |
+
Agent for processing different file types and answering questions.
|
23 |
+
"""
|
24 |
+
|
25 |
+
def __init__(self, resource_dir: str = 'resource'):
|
26 |
+
"""
|
27 |
+
Initialize the agent.
|
28 |
+
|
29 |
+
Args:
|
30 |
+
resource_dir: Directory containing resource files
|
31 |
+
"""
|
32 |
+
logger.info("Initializing MultiModalAgent")
|
33 |
+
self.resource_dir = resource_dir
|
34 |
+
self.question_analyzer = QuestionAnalyzer(resource_dir)
|
35 |
+
self.data_processor = DataProcessor()
|
36 |
+
|
37 |
+
# Cache for file content to avoid re-processing
|
38 |
+
self.file_content_cache = {}
|
39 |
+
|
40 |
+
# Cache for answers
|
41 |
+
self.answer_cache = {}
|
42 |
+
|
43 |
+
def __call__(self, question: str) -> str:
|
44 |
+
"""
|
45 |
+
Process a question and return an answer.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
question: The question to answer
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
Answer to the question
|
52 |
+
"""
|
53 |
+
logger.info(f"Processing question: {question[:100]}...")
|
54 |
+
|
55 |
+
# Check answer cache
|
56 |
+
if question in self.answer_cache:
|
57 |
+
logger.info("Answer found in cache")
|
58 |
+
return self.answer_cache[question]
|
59 |
+
|
60 |
+
try:
|
61 |
+
# Analyze the question
|
62 |
+
analysis = self.question_analyzer.analyze_question(question)
|
63 |
+
logger.info(f"Question analysis: {analysis}")
|
64 |
+
|
65 |
+
# Handle general questions that don't require file processing
|
66 |
+
if not analysis.get('file_path'):
|
67 |
+
logger.info("No file reference found in question, trying to answer directly")
|
68 |
+
direct_answer = self._answer_without_file(question)
|
69 |
+
if direct_answer:
|
70 |
+
self.answer_cache[question] = direct_answer
|
71 |
+
return direct_answer
|
72 |
+
|
73 |
+
# If direct answering failed, try to find a file in the resource directory
|
74 |
+
logger.info("Direct answering failed, looking for relevant files")
|
75 |
+
analysis['file_path'] = self._find_most_relevant_file(question)
|
76 |
+
if not analysis['file_path']:
|
77 |
+
logger.warning("No relevant file found for the question")
|
78 |
+
return "I couldn't find a relevant file to answer this question."
|
79 |
+
|
80 |
+
# Extract content from the file
|
81 |
+
file_path = analysis['file_path']
|
82 |
+
|
83 |
+
if file_path in self.file_content_cache:
|
84 |
+
content, handler = self.file_content_cache[file_path]
|
85 |
+
else:
|
86 |
+
content, handler = extract_file_content(file_path, self.resource_dir)
|
87 |
+
if content is not None:
|
88 |
+
self.file_content_cache[file_path] = (content, handler)
|
89 |
+
|
90 |
+
if content is None:
|
91 |
+
logger.error(f"Failed to extract content from file: {file_path}")
|
92 |
+
return "I couldn't extract content from the specified file."
|
93 |
+
|
94 |
+
# Process the content based on file type
|
95 |
+
answer = self._process_content(content, handler, question)
|
96 |
+
|
97 |
+
# Cache the answer
|
98 |
+
self.answer_cache[question] = answer
|
99 |
+
|
100 |
+
return answer
|
101 |
+
except Exception as e:
|
102 |
+
logger.exception(f"Error processing question: {e}")
|
103 |
+
return f"An error occurred while processing your question: {e}"
|
104 |
+
|
105 |
+
def _answer_without_file(self, question: str) -> Optional[str]:
|
106 |
+
"""
|
107 |
+
Try to answer the question without using a file.
|
108 |
+
|
109 |
+
Args:
|
110 |
+
question: The question to answer
|
111 |
+
|
112 |
+
Returns:
|
113 |
+
Answer to the question, or None if the question can't be answered directly
|
114 |
+
"""
|
115 |
+
# This is a simple implementation that can be expanded based on your needs
|
116 |
+
|
117 |
+
# Check if the question is asking for metadata about the resource directory
|
118 |
+
if 'how many files' in question.lower() or 'number of files' in question.lower():
|
119 |
+
try:
|
120 |
+
file_count = len(os.listdir(self.resource_dir))
|
121 |
+
return f"There are {file_count} files in the resource directory."
|
122 |
+
except Exception as e:
|
123 |
+
logger.error(f"Error counting files: {e}")
|
124 |
+
return None
|
125 |
+
|
126 |
+
# Check if the question is asking about file types
|
127 |
+
file_types_patterns = [
|
128 |
+
'what file types', 'which file types', 'what kinds of files',
|
129 |
+
'which kinds of files', 'what formats', 'which formats'
|
130 |
+
]
|
131 |
+
if any(pattern in question.lower() for pattern in file_types_patterns):
|
132 |
+
try:
|
133 |
+
files = os.listdir(self.resource_dir)
|
134 |
+
extensions = set()
|
135 |
+
|
136 |
+
for file in files:
|
137 |
+
_, ext = os.path.splitext(file)
|
138 |
+
if ext: # Skip files without extension
|
139 |
+
extensions.add(ext)
|
140 |
+
|
141 |
+
if extensions:
|
142 |
+
extensions_list = sorted(list(extensions))
|
143 |
+
return f"The resource directory contains files with the following extensions: {', '.join(extensions_list)}"
|
144 |
+
else:
|
145 |
+
return "The resource directory doesn't contain any files with extensions."
|
146 |
+
except Exception as e:
|
147 |
+
logger.error(f"Error analyzing file types: {e}")
|
148 |
+
return None
|
149 |
+
|
150 |
+
return None
|
151 |
+
|
152 |
+
def _find_most_relevant_file(self, question: str) -> Optional[str]:
|
153 |
+
"""
|
154 |
+
Find the most relevant file for a question.
|
155 |
+
|
156 |
+
Args:
|
157 |
+
question: The question to answer
|
158 |
+
|
159 |
+
Returns:
|
160 |
+
Path to the most relevant file, or None if no relevant file is found
|
161 |
+
"""
|
162 |
+
try:
|
163 |
+
# Get all files in the resource directory
|
164 |
+
files = [
|
165 |
+
os.path.join(self.resource_dir, f)
|
166 |
+
for f in os.listdir(self.resource_dir)
|
167 |
+
if os.path.isfile(os.path.join(self.resource_dir, f))
|
168 |
+
]
|
169 |
+
|
170 |
+
if not files:
|
171 |
+
logger.warning("No files found in the resource directory")
|
172 |
+
return None
|
173 |
+
|
174 |
+
# Extract keywords from the question
|
175 |
+
keywords = set(self.question_analyzer._extract_keywords(question))
|
176 |
+
|
177 |
+
# Calculate relevance scores for each file
|
178 |
+
scores = []
|
179 |
+
|
180 |
+
for file_path in files:
|
181 |
+
score = 0
|
182 |
+
file_name = os.path.basename(file_path)
|
183 |
+
|
184 |
+
# Score based on file name
|
185 |
+
for keyword in keywords:
|
186 |
+
if keyword.lower() in file_name.lower():
|
187 |
+
score += 2 # Higher weight for filename matches
|
188 |
+
|
189 |
+
# Score based on file extension
|
190 |
+
_, ext = os.path.splitext(file_path)
|
191 |
+
ext = ext.lower()
|
192 |
+
|
193 |
+
# Check if the question mentions the file type
|
194 |
+
if 'excel' in question.lower() or 'spreadsheet' in question.lower() or 'xlsx' in question.lower():
|
195 |
+
if ext in ['.xlsx', '.xls']:
|
196 |
+
score += 3
|
197 |
+
elif 'csv' in question.lower():
|
198 |
+
if ext == '.csv':
|
199 |
+
score += 3
|
200 |
+
elif 'text' in question.lower() or 'txt' in question.lower():
|
201 |
+
if ext == '.txt':
|
202 |
+
score += 3
|
203 |
+
elif 'pdf' in question.lower():
|
204 |
+
if ext == '.pdf':
|
205 |
+
score += 3
|
206 |
+
elif 'image' in question.lower() or 'picture' in question.lower() or 'photo' in question.lower():
|
207 |
+
if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
|
208 |
+
score += 3
|
209 |
+
elif 'word' in question.lower() or 'document' in question.lower() or 'docx' in question.lower():
|
210 |
+
if ext == '.docx':
|
211 |
+
score += 3
|
212 |
+
elif 'powerpoint' in question.lower() or 'presentation' in question.lower() or 'slides' in question.lower() or 'pptx' in question.lower():
|
213 |
+
if ext == '.pptx':
|
214 |
+
score += 3
|
215 |
+
elif 'json' in question.lower():
|
216 |
+
if ext in ['.json', '.jsonld']:
|
217 |
+
score += 3
|
218 |
+
elif 'zip' in question.lower() or 'archive' in question.lower():
|
219 |
+
if ext == '.zip':
|
220 |
+
score += 3
|
221 |
+
elif 'python' in question.lower() or 'py' in question.lower() or 'code' in question.lower() or 'script' in question.lower():
|
222 |
+
if ext == '.py':
|
223 |
+
score += 3
|
224 |
+
elif 'pdb' in question.lower() or 'protein' in question.lower():
|
225 |
+
if ext == '.pdb':
|
226 |
+
score += 3
|
227 |
+
|
228 |
+
scores.append((file_path, score))
|
229 |
+
|
230 |
+
# Sort by score in descending order
|
231 |
+
scores.sort(key=lambda x: x[1], reverse=True)
|
232 |
+
|
233 |
+
# Return the most relevant file if it has a non-zero score
|
234 |
+
if scores and scores[0][1] > 0:
|
235 |
+
return scores[0][0]
|
236 |
+
|
237 |
+
# If no relevant file is found based on the question, return None
|
238 |
+
return None
|
239 |
+
except Exception as e:
|
240 |
+
logger.error(f"Error finding relevant file: {e}")
|
241 |
+
return None
|
242 |
+
|
243 |
+
def _process_content(self, content: Any, handler: Any, question: str) -> str:
|
244 |
+
"""
|
245 |
+
Process the content based on file type.
|
246 |
+
|
247 |
+
Args:
|
248 |
+
content: Extracted content from the file
|
249 |
+
handler: File handler used to extract the content
|
250 |
+
question: The question to answer
|
251 |
+
|
252 |
+
Returns:
|
253 |
+
Answer to the question
|
254 |
+
"""
|
255 |
+
try:
|
256 |
+
handler_type = type(handler).__name__
|
257 |
+
|
258 |
+
if handler_type == 'ExcelHandler':
|
259 |
+
return self.data_processor.process_excel_data(content, question)
|
260 |
+
elif handler_type == 'CSVHandler':
|
261 |
+
return self.data_processor.process_csv_data(content, question)
|
262 |
+
elif handler_type == 'TextHandler':
|
263 |
+
return self.data_processor.process_text_data(content, question)
|
264 |
+
elif handler_type == 'PDFHandler':
|
265 |
+
return self.data_processor.process_pdf_data(content, question)
|
266 |
+
elif handler_type == 'ImageHandler':
|
267 |
+
return self.data_processor.process_image_metadata(content, question)
|
268 |
+
elif handler_type == 'DocxHandler':
|
269 |
+
return self.data_processor.process_docx_data(content, question)
|
270 |
+
elif handler_type == 'PptxHandler':
|
271 |
+
return self.data_processor.process_pptx_data(content, question)
|
272 |
+
elif handler_type == 'JsonHandler':
|
273 |
+
return self.data_processor.process_json_data(content, question)
|
274 |
+
elif handler_type == 'ZipHandler':
|
275 |
+
return self.data_processor.process_zip_data(content, question)
|
276 |
+
elif handler_type == 'PdbHandler':
|
277 |
+
return self.data_processor.process_pdb_data(content, question)
|
278 |
+
elif handler_type == 'PythonHandler':
|
279 |
+
return self.data_processor.process_python_data(content, question)
|
280 |
+
elif handler_type == 'JsonlHandler':
|
281 |
+
return self.data_processor.process_jsonl_data(content, question)
|
282 |
+
else:
|
283 |
+
logger.warning(f"Unknown handler type: {handler_type}")
|
284 |
+
return f"I don't know how to process content from a {handler_type}."
|
285 |
+
except Exception as e:
|
286 |
+
logger.exception(f"Error processing content: {e}")
|
287 |
+
return f"An error occurred while processing the file content: {e}"
|
agent/tools/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Tools initialization file.
|
3 |
+
"""
|
agent/tools/file_handlers.py
ADDED
@@ -0,0 +1,561 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
File handlers for processing different file types.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
import csv
|
7 |
+
import zipfile
|
8 |
+
import io
|
9 |
+
import re
|
10 |
+
from typing import Dict, Any, List, Optional, Tuple
|
11 |
+
|
12 |
+
import pandas as pd
|
13 |
+
from PIL import Image
|
14 |
+
import PyPDF2
|
15 |
+
import docx
|
16 |
+
from pptx import Presentation
|
17 |
+
|
18 |
+
class FileHandler:
|
19 |
+
"""Base class for file handlers."""
|
20 |
+
|
21 |
+
def __init__(self, resource_dir: str):
|
22 |
+
"""
|
23 |
+
Initialize the file handler.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
resource_dir: Directory containing resource files
|
27 |
+
"""
|
28 |
+
self.resource_dir = resource_dir
|
29 |
+
|
30 |
+
def get_file_path(self, file_name: str) -> str:
|
31 |
+
"""
|
32 |
+
Get the full path to a file.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
file_name: Name of the file
|
36 |
+
|
37 |
+
Returns:
|
38 |
+
Full path to the file
|
39 |
+
"""
|
40 |
+
return os.path.join(self.resource_dir, file_name)
|
41 |
+
|
42 |
+
def can_handle(self, file_path: str) -> bool:
|
43 |
+
"""
|
44 |
+
Check if the handler can process the given file.
|
45 |
+
|
46 |
+
Args:
|
47 |
+
file_path: Path to the file
|
48 |
+
|
49 |
+
Returns:
|
50 |
+
True if the handler can process the file, False otherwise
|
51 |
+
"""
|
52 |
+
raise NotImplementedError("Subclasses must implement this method")
|
53 |
+
|
54 |
+
def extract_content(self, file_path: str) -> Any:
|
55 |
+
"""
|
56 |
+
Extract content from the file.
|
57 |
+
|
58 |
+
Args:
|
59 |
+
file_path: Path to the file
|
60 |
+
|
61 |
+
Returns:
|
62 |
+
Extracted content
|
63 |
+
"""
|
64 |
+
raise NotImplementedError("Subclasses must implement this method")
|
65 |
+
|
66 |
+
def get_file_type(self, file_path: str) -> str:
|
67 |
+
"""
|
68 |
+
Get the file type based on extension.
|
69 |
+
|
70 |
+
Args:
|
71 |
+
file_path: Path to the file
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
File type (extension)
|
75 |
+
"""
|
76 |
+
_, ext = os.path.splitext(file_path)
|
77 |
+
return ext.lower()
|
78 |
+
|
79 |
+
|
80 |
+
class ExcelHandler(FileHandler):
|
81 |
+
"""Handler for Excel files."""
|
82 |
+
|
83 |
+
def can_handle(self, file_path: str) -> bool:
|
84 |
+
"""Check if the file is an Excel file."""
|
85 |
+
return self.get_file_type(file_path) in ['.xlsx', '.xls']
|
86 |
+
|
87 |
+
def extract_content(self, file_path: str) -> Dict[str, pd.DataFrame]:
|
88 |
+
"""
|
89 |
+
Extract content from an Excel file.
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
Dictionary mapping sheet names to DataFrames
|
93 |
+
"""
|
94 |
+
try:
|
95 |
+
# Read all sheets
|
96 |
+
excel_file = pd.ExcelFile(file_path)
|
97 |
+
sheets = {}
|
98 |
+
|
99 |
+
for sheet_name in excel_file.sheet_names:
|
100 |
+
sheets[sheet_name] = pd.read_excel(excel_file, sheet_name)
|
101 |
+
|
102 |
+
return sheets
|
103 |
+
except Exception as e:
|
104 |
+
print(f"Error extracting content from Excel file {file_path}: {e}")
|
105 |
+
return {}
|
106 |
+
|
107 |
+
|
108 |
+
class CSVHandler(FileHandler):
|
109 |
+
"""Handler for CSV files."""
|
110 |
+
|
111 |
+
def can_handle(self, file_path: str) -> bool:
|
112 |
+
"""Check if the file is a CSV file."""
|
113 |
+
return self.get_file_type(file_path) == '.csv'
|
114 |
+
|
115 |
+
def extract_content(self, file_path: str) -> pd.DataFrame:
|
116 |
+
"""
|
117 |
+
Extract content from a CSV file.
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
DataFrame containing the CSV data
|
121 |
+
"""
|
122 |
+
try:
|
123 |
+
# Try different encodings and delimiters
|
124 |
+
try:
|
125 |
+
return pd.read_csv(file_path)
|
126 |
+
except:
|
127 |
+
# Try with different delimiter
|
128 |
+
with open(file_path, 'r', newline='') as csvfile:
|
129 |
+
dialect = csv.Sniffer().sniff(csvfile.read(1024))
|
130 |
+
csvfile.seek(0)
|
131 |
+
return pd.read_csv(file_path, delimiter=dialect.delimiter)
|
132 |
+
except Exception as e:
|
133 |
+
print(f"Error extracting content from CSV file {file_path}: {e}")
|
134 |
+
return pd.DataFrame()
|
135 |
+
|
136 |
+
|
137 |
+
class TextHandler(FileHandler):
|
138 |
+
"""Handler for text files."""
|
139 |
+
|
140 |
+
def can_handle(self, file_path: str) -> bool:
|
141 |
+
"""Check if the file is a text file."""
|
142 |
+
return self.get_file_type(file_path) == '.txt'
|
143 |
+
|
144 |
+
def extract_content(self, file_path: str) -> str:
|
145 |
+
"""
|
146 |
+
Extract content from a text file.
|
147 |
+
|
148 |
+
Returns:
|
149 |
+
Text content of the file
|
150 |
+
"""
|
151 |
+
try:
|
152 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
153 |
+
return f.read()
|
154 |
+
except UnicodeDecodeError:
|
155 |
+
# Try a different encoding
|
156 |
+
try:
|
157 |
+
with open(file_path, 'r', encoding='latin-1') as f:
|
158 |
+
return f.read()
|
159 |
+
except Exception as e:
|
160 |
+
print(f"Error extracting content from text file {file_path}: {e}")
|
161 |
+
return ""
|
162 |
+
except Exception as e:
|
163 |
+
print(f"Error extracting content from text file {file_path}: {e}")
|
164 |
+
return ""
|
165 |
+
|
166 |
+
|
167 |
+
class PDFHandler(FileHandler):
|
168 |
+
"""Handler for PDF files."""
|
169 |
+
|
170 |
+
def can_handle(self, file_path: str) -> bool:
|
171 |
+
"""Check if the file is a PDF file."""
|
172 |
+
return self.get_file_type(file_path) == '.pdf'
|
173 |
+
|
174 |
+
def extract_content(self, file_path: str) -> Dict[int, str]:
|
175 |
+
"""
|
176 |
+
Extract content from a PDF file.
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
Dictionary mapping page numbers to text content
|
180 |
+
"""
|
181 |
+
try:
|
182 |
+
result = {}
|
183 |
+
with open(file_path, 'rb') as f:
|
184 |
+
pdf_reader = PyPDF2.PdfReader(f)
|
185 |
+
for i in range(len(pdf_reader.pages)):
|
186 |
+
page = pdf_reader.pages[i]
|
187 |
+
result[i + 1] = page.extract_text()
|
188 |
+
return result
|
189 |
+
except Exception as e:
|
190 |
+
print(f"Error extracting content from PDF file {file_path}: {e}")
|
191 |
+
return {}
|
192 |
+
|
193 |
+
|
194 |
+
class ImageHandler(FileHandler):
|
195 |
+
"""Handler for image files."""
|
196 |
+
|
197 |
+
def can_handle(self, file_path: str) -> bool:
|
198 |
+
"""Check if the file is an image file."""
|
199 |
+
return self.get_file_type(file_path) in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
|
200 |
+
|
201 |
+
def extract_content(self, file_path: str) -> Dict[str, Any]:
|
202 |
+
"""
|
203 |
+
Extract metadata from an image file.
|
204 |
+
|
205 |
+
Returns:
|
206 |
+
Dictionary containing image metadata
|
207 |
+
"""
|
208 |
+
try:
|
209 |
+
with Image.open(file_path) as img:
|
210 |
+
metadata = {
|
211 |
+
'format': img.format,
|
212 |
+
'mode': img.mode,
|
213 |
+
'size': img.size,
|
214 |
+
'width': img.width,
|
215 |
+
'height': img.height,
|
216 |
+
}
|
217 |
+
|
218 |
+
# Extract EXIF data if available
|
219 |
+
if hasattr(img, '_getexif') and img._getexif():
|
220 |
+
exif = {
|
221 |
+
PyPDF2.ExifTags.TAGS.get(k, k): v
|
222 |
+
for k, v in img._getexif().items()
|
223 |
+
if k in PyPDF2.ExifTags.TAGS
|
224 |
+
}
|
225 |
+
metadata['exif'] = exif
|
226 |
+
|
227 |
+
return metadata
|
228 |
+
except Exception as e:
|
229 |
+
print(f"Error extracting content from image file {file_path}: {e}")
|
230 |
+
return {}
|
231 |
+
|
232 |
+
|
233 |
+
class DocxHandler(FileHandler):
|
234 |
+
"""Handler for Word documents."""
|
235 |
+
|
236 |
+
def can_handle(self, file_path: str) -> bool:
|
237 |
+
"""Check if the file is a Word document."""
|
238 |
+
return self.get_file_type(file_path) == '.docx'
|
239 |
+
|
240 |
+
def extract_content(self, file_path: str) -> str:
|
241 |
+
"""
|
242 |
+
Extract text content from a Word document.
|
243 |
+
|
244 |
+
Returns:
|
245 |
+
Text content of the document
|
246 |
+
"""
|
247 |
+
try:
|
248 |
+
doc = docx.Document(file_path)
|
249 |
+
full_text = []
|
250 |
+
|
251 |
+
# Extract text from paragraphs
|
252 |
+
for para in doc.paragraphs:
|
253 |
+
full_text.append(para.text)
|
254 |
+
|
255 |
+
# Extract text from tables
|
256 |
+
for table in doc.tables:
|
257 |
+
for row in table.rows:
|
258 |
+
for cell in row.cells:
|
259 |
+
full_text.append(cell.text)
|
260 |
+
|
261 |
+
return '\n'.join(full_text)
|
262 |
+
except Exception as e:
|
263 |
+
print(f"Error extracting content from Word document {file_path}: {e}")
|
264 |
+
return ""
|
265 |
+
|
266 |
+
|
267 |
+
class PptxHandler(FileHandler):
|
268 |
+
"""Handler for PowerPoint presentations."""
|
269 |
+
|
270 |
+
def can_handle(self, file_path: str) -> bool:
|
271 |
+
"""Check if the file is a PowerPoint presentation."""
|
272 |
+
return self.get_file_type(file_path) == '.pptx'
|
273 |
+
|
274 |
+
def extract_content(self, file_path: str) -> Dict[int, str]:
|
275 |
+
"""
|
276 |
+
Extract text content from a PowerPoint presentation.
|
277 |
+
|
278 |
+
Returns:
|
279 |
+
Dictionary mapping slide numbers to text content
|
280 |
+
"""
|
281 |
+
try:
|
282 |
+
prs = Presentation(file_path)
|
283 |
+
slides_text = {}
|
284 |
+
|
285 |
+
for i, slide in enumerate(prs.slides):
|
286 |
+
texts = []
|
287 |
+
|
288 |
+
for shape in slide.shapes:
|
289 |
+
if hasattr(shape, "text"):
|
290 |
+
texts.append(shape.text)
|
291 |
+
|
292 |
+
slides_text[i + 1] = '\n'.join(texts)
|
293 |
+
|
294 |
+
return slides_text
|
295 |
+
except Exception as e:
|
296 |
+
print(f"Error extracting content from PowerPoint presentation {file_path}: {e}")
|
297 |
+
return {}
|
298 |
+
|
299 |
+
|
300 |
+
class JsonHandler(FileHandler):
|
301 |
+
"""Handler for JSON files."""
|
302 |
+
|
303 |
+
def can_handle(self, file_path: str) -> bool:
|
304 |
+
"""Check if the file is a JSON file."""
|
305 |
+
return self.get_file_type(file_path) in ['.json', '.jsonld']
|
306 |
+
|
307 |
+
def extract_content(self, file_path: str) -> Dict[str, Any]:
|
308 |
+
"""
|
309 |
+
Extract content from a JSON file.
|
310 |
+
|
311 |
+
Returns:
|
312 |
+
Parsed JSON content
|
313 |
+
"""
|
314 |
+
try:
|
315 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
316 |
+
return json.load(f)
|
317 |
+
except Exception as e:
|
318 |
+
print(f"Error extracting content from JSON file {file_path}: {e}")
|
319 |
+
return {}
|
320 |
+
|
321 |
+
|
322 |
+
class ZipHandler(FileHandler):
|
323 |
+
"""Handler for ZIP archives."""
|
324 |
+
|
325 |
+
def can_handle(self, file_path: str) -> bool:
|
326 |
+
"""Check if the file is a ZIP archive."""
|
327 |
+
return self.get_file_type(file_path) == '.zip'
|
328 |
+
|
329 |
+
def extract_content(self, file_path: str) -> Dict[str, Any]:
|
330 |
+
"""
|
331 |
+
Extract information about the contents of a ZIP archive.
|
332 |
+
|
333 |
+
Returns:
|
334 |
+
Dictionary containing information about the archived files
|
335 |
+
"""
|
336 |
+
try:
|
337 |
+
result = {'files': []}
|
338 |
+
|
339 |
+
with zipfile.ZipFile(file_path, 'r') as zip_ref:
|
340 |
+
# Get information about each file in the archive
|
341 |
+
for file_info in zip_ref.infolist():
|
342 |
+
result['files'].append({
|
343 |
+
'filename': file_info.filename,
|
344 |
+
'size': file_info.file_size,
|
345 |
+
'compressed_size': file_info.compress_size,
|
346 |
+
'date_time': file_info.date_time,
|
347 |
+
})
|
348 |
+
|
349 |
+
# Try to extract and process common file types
|
350 |
+
for file_info in zip_ref.infolist():
|
351 |
+
if file_info.filename.endswith('.txt'):
|
352 |
+
with zip_ref.open(file_info.filename) as f:
|
353 |
+
content = f.read().decode('utf-8', errors='ignore')
|
354 |
+
result[file_info.filename] = content
|
355 |
+
|
356 |
+
return result
|
357 |
+
except Exception as e:
|
358 |
+
print(f"Error extracting content from ZIP archive {file_path}: {e}")
|
359 |
+
return {}
|
360 |
+
|
361 |
+
|
362 |
+
class PdbHandler(FileHandler):
|
363 |
+
"""Handler for PDB (Protein Data Bank) files."""
|
364 |
+
|
365 |
+
def can_handle(self, file_path: str) -> bool:
|
366 |
+
"""Check if the file is a PDB file."""
|
367 |
+
return self.get_file_type(file_path) == '.pdb'
|
368 |
+
|
369 |
+
def extract_content(self, file_path: str) -> Dict[str, Any]:
|
370 |
+
"""
|
371 |
+
Extract basic information from a PDB file.
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
Dictionary containing basic information about the PDB file
|
375 |
+
"""
|
376 |
+
try:
|
377 |
+
result = {
|
378 |
+
'header': '',
|
379 |
+
'title': '',
|
380 |
+
'compounds': [],
|
381 |
+
'authors': [],
|
382 |
+
'atoms_count': 0,
|
383 |
+
}
|
384 |
+
|
385 |
+
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
386 |
+
for line in f:
|
387 |
+
if line.startswith('HEADER'):
|
388 |
+
result['header'] = line[10:].strip()
|
389 |
+
elif line.startswith('TITLE'):
|
390 |
+
result['title'] += line[10:].strip()
|
391 |
+
elif line.startswith('COMPND'):
|
392 |
+
result['compounds'].append(line[10:].strip())
|
393 |
+
elif line.startswith('AUTHOR'):
|
394 |
+
result['authors'].append(line[10:].strip())
|
395 |
+
elif line.startswith('ATOM') or line.startswith('HETATM'):
|
396 |
+
result['atoms_count'] += 1
|
397 |
+
|
398 |
+
return result
|
399 |
+
except Exception as e:
|
400 |
+
print(f"Error extracting content from PDB file {file_path}: {e}")
|
401 |
+
return {}
|
402 |
+
|
403 |
+
|
404 |
+
class PythonHandler(FileHandler):
|
405 |
+
"""Handler for Python files."""
|
406 |
+
|
407 |
+
def can_handle(self, file_path: str) -> bool:
|
408 |
+
"""Check if the file is a Python file."""
|
409 |
+
return self.get_file_type(file_path) == '.py'
|
410 |
+
|
411 |
+
def extract_content(self, file_path: str) -> Dict[str, Any]:
|
412 |
+
"""
|
413 |
+
Extract content and structure from a Python file.
|
414 |
+
|
415 |
+
Returns:
|
416 |
+
Dictionary containing the file content and structure information
|
417 |
+
"""
|
418 |
+
try:
|
419 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
420 |
+
content = f.read()
|
421 |
+
|
422 |
+
result = {
|
423 |
+
'content': content,
|
424 |
+
'classes': [],
|
425 |
+
'functions': [],
|
426 |
+
'imports': [],
|
427 |
+
}
|
428 |
+
|
429 |
+
# Extract class definitions
|
430 |
+
class_pattern = r'class\s+(\w+)(?:\(([^)]*)\))?:'
|
431 |
+
for match in re.finditer(class_pattern, content):
|
432 |
+
class_name = match.group(1)
|
433 |
+
parent_class = match.group(2) if match.group(2) else None
|
434 |
+
result['classes'].append({
|
435 |
+
'name': class_name,
|
436 |
+
'parent': parent_class,
|
437 |
+
})
|
438 |
+
|
439 |
+
# Extract function definitions
|
440 |
+
func_pattern = r'def\s+(\w+)\s*\(([^)]*)\):'
|
441 |
+
for match in re.finditer(func_pattern, content):
|
442 |
+
func_name = match.group(1)
|
443 |
+
params = match.group(2)
|
444 |
+
result['functions'].append({
|
445 |
+
'name': func_name,
|
446 |
+
'params': params.strip(),
|
447 |
+
})
|
448 |
+
|
449 |
+
# Extract imports
|
450 |
+
import_pattern = r'(?:from\s+(\w+(?:\.\w+)*)\s+)?import\s+(.+?)(?:\s+as\s+(\w+))?$'
|
451 |
+
for line in content.split('\n'):
|
452 |
+
line = line.strip()
|
453 |
+
if line.startswith('import ') or line.startswith('from '):
|
454 |
+
match = re.match(import_pattern, line)
|
455 |
+
if match:
|
456 |
+
from_module = match.group(1)
|
457 |
+
imported = match.group(2)
|
458 |
+
as_name = match.group(3)
|
459 |
+
result['imports'].append({
|
460 |
+
'from': from_module,
|
461 |
+
'import': imported,
|
462 |
+
'as': as_name,
|
463 |
+
})
|
464 |
+
|
465 |
+
return result
|
466 |
+
except Exception as e:
|
467 |
+
print(f"Error extracting content from Python file {file_path}: {e}")
|
468 |
+
return {}
|
469 |
+
|
470 |
+
|
471 |
+
class JsonlHandler(FileHandler):
|
472 |
+
"""Handler for JSONL (JSON Lines) files."""
|
473 |
+
|
474 |
+
def can_handle(self, file_path: str) -> bool:
|
475 |
+
"""Check if the file is a JSONL file."""
|
476 |
+
return self.get_file_type(file_path) == '.jsonl'
|
477 |
+
|
478 |
+
def extract_content(self, file_path: str) -> List[Dict[str, Any]]:
|
479 |
+
"""
|
480 |
+
Extract content from a JSONL file.
|
481 |
+
|
482 |
+
Returns:
|
483 |
+
List of parsed JSON objects
|
484 |
+
"""
|
485 |
+
try:
|
486 |
+
result = []
|
487 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
488 |
+
for line in f:
|
489 |
+
line = line.strip()
|
490 |
+
if line: # Skip empty lines
|
491 |
+
result.append(json.loads(line))
|
492 |
+
return result
|
493 |
+
except Exception as e:
|
494 |
+
print(f"Error extracting content from JSONL file {file_path}: {e}")
|
495 |
+
return []
|
496 |
+
|
497 |
+
|
498 |
+
def get_all_handlers(resource_dir: str) -> List[FileHandler]:
|
499 |
+
"""
|
500 |
+
Get a list of all file handlers.
|
501 |
+
|
502 |
+
Args:
|
503 |
+
resource_dir: Directory containing resource files
|
504 |
+
|
505 |
+
Returns:
|
506 |
+
List of file handlers
|
507 |
+
"""
|
508 |
+
return [
|
509 |
+
ExcelHandler(resource_dir),
|
510 |
+
CSVHandler(resource_dir),
|
511 |
+
TextHandler(resource_dir),
|
512 |
+
PDFHandler(resource_dir),
|
513 |
+
ImageHandler(resource_dir),
|
514 |
+
DocxHandler(resource_dir),
|
515 |
+
PptxHandler(resource_dir),
|
516 |
+
JsonHandler(resource_dir),
|
517 |
+
ZipHandler(resource_dir),
|
518 |
+
PdbHandler(resource_dir),
|
519 |
+
PythonHandler(resource_dir),
|
520 |
+
JsonlHandler(resource_dir),
|
521 |
+
]
|
522 |
+
|
523 |
+
|
524 |
+
def get_handler_for_file(file_path: str, resource_dir: str) -> Optional[FileHandler]:
|
525 |
+
"""
|
526 |
+
Get the appropriate handler for a file.
|
527 |
+
|
528 |
+
Args:
|
529 |
+
file_path: Path to the file
|
530 |
+
resource_dir: Directory containing resource files
|
531 |
+
|
532 |
+
Returns:
|
533 |
+
Appropriate file handler, or None if no handler can process the file
|
534 |
+
"""
|
535 |
+
handlers = get_all_handlers(resource_dir)
|
536 |
+
|
537 |
+
for handler in handlers:
|
538 |
+
if handler.can_handle(file_path):
|
539 |
+
return handler
|
540 |
+
|
541 |
+
return None
|
542 |
+
|
543 |
+
|
544 |
+
def extract_file_content(file_path: str, resource_dir: str) -> Tuple[Any, Optional[FileHandler]]:
|
545 |
+
"""
|
546 |
+
Extract content from a file using the appropriate handler.
|
547 |
+
|
548 |
+
Args:
|
549 |
+
file_path: Path to the file
|
550 |
+
resource_dir: Directory containing resource files
|
551 |
+
|
552 |
+
Returns:
|
553 |
+
Tuple of (extracted content, handler used)
|
554 |
+
"""
|
555 |
+
handler = get_handler_for_file(file_path, resource_dir)
|
556 |
+
|
557 |
+
if handler:
|
558 |
+
content = handler.extract_content(file_path)
|
559 |
+
return content, handler
|
560 |
+
|
561 |
+
return None, None
|
agent/utils/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utils initialization file.
|
3 |
+
"""
|
agent/utils/data_processor.py
ADDED
@@ -0,0 +1,936 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Data processor for processing extracted data.
|
3 |
+
"""
|
4 |
+
import re
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
from typing import Dict, Any, List, Optional, Tuple, Union
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
class DataProcessor:
|
11 |
+
"""
|
12 |
+
Class for processing extracted data.
|
13 |
+
"""
|
14 |
+
|
15 |
+
def __init__(self):
|
16 |
+
"""Initialize the data processor."""
|
17 |
+
pass
|
18 |
+
|
19 |
+
def process_excel_data(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
20 |
+
"""
|
21 |
+
Process data extracted from an Excel file.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
data: Dictionary mapping sheet names to DataFrames
|
25 |
+
question: The question to answer
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
Answer to the question
|
29 |
+
"""
|
30 |
+
# Convert question to lowercase for easier matching
|
31 |
+
question_lower = question.lower()
|
32 |
+
|
33 |
+
# Handle specific question types
|
34 |
+
if 'oldest' in question_lower:
|
35 |
+
return self._find_oldest_item(data, question_lower)
|
36 |
+
elif 'count' in question_lower or 'how many' in question_lower:
|
37 |
+
return self._count_items(data, question_lower)
|
38 |
+
elif 'average' in question_lower or 'mean' in question_lower:
|
39 |
+
return self._calculate_average(data, question_lower)
|
40 |
+
elif 'total' in question_lower or 'sum' in question_lower:
|
41 |
+
return self._calculate_total(data, question_lower)
|
42 |
+
elif 'maximum' in question_lower or 'highest' in question_lower:
|
43 |
+
return self._find_maximum(data, question_lower)
|
44 |
+
elif 'minimum' in question_lower or 'lowest' in question_lower:
|
45 |
+
return self._find_minimum(data, question_lower)
|
46 |
+
else:
|
47 |
+
# Try to extract specific information
|
48 |
+
return self._extract_specific_info(data, question_lower)
|
49 |
+
|
50 |
+
def _find_oldest_item(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
51 |
+
"""Find the oldest item in the data."""
|
52 |
+
# Look for mentions of specific columns or items
|
53 |
+
year_columns = ['year', 'date', 'time', 'created', 'modified', 'release']
|
54 |
+
item_type = None
|
55 |
+
|
56 |
+
# Try to extract the type of item we're looking for
|
57 |
+
item_types = [
|
58 |
+
'movie', 'film', 'book', 'song', 'album', 'game', 'video game',
|
59 |
+
'dvd', 'cd', 'blu-ray', 'blu ray', 'record', 'cassette', 'vhs'
|
60 |
+
]
|
61 |
+
for item in item_types:
|
62 |
+
if item in question:
|
63 |
+
item_type = item
|
64 |
+
break
|
65 |
+
|
66 |
+
# Iterate through sheets and find the oldest item
|
67 |
+
oldest_year = float('inf')
|
68 |
+
oldest_item = None
|
69 |
+
|
70 |
+
for sheet_name, df in data.items():
|
71 |
+
# Skip empty sheets
|
72 |
+
if df.empty:
|
73 |
+
continue
|
74 |
+
|
75 |
+
# Try to find year/date columns
|
76 |
+
year_col = None
|
77 |
+
for col in df.columns:
|
78 |
+
if any(year_term in col.lower() for year_term in year_columns):
|
79 |
+
year_col = col
|
80 |
+
break
|
81 |
+
|
82 |
+
if year_col is None:
|
83 |
+
# If no obvious year column, look for columns with numeric values
|
84 |
+
for col in df.columns:
|
85 |
+
if pd.api.types.is_numeric_dtype(df[col]):
|
86 |
+
try:
|
87 |
+
# Check if values might be years (between 1900 and current year)
|
88 |
+
if df[col].min() >= 1900 and df[col].max() <= 2025:
|
89 |
+
year_col = col
|
90 |
+
break
|
91 |
+
except:
|
92 |
+
continue
|
93 |
+
|
94 |
+
if year_col is not None:
|
95 |
+
# Find title/name column
|
96 |
+
title_col = None
|
97 |
+
title_columns = ['title', 'name', 'item', 'product', 'description']
|
98 |
+
|
99 |
+
for col in df.columns:
|
100 |
+
if any(title_term in col.lower() for title_term in title_columns):
|
101 |
+
title_col = col
|
102 |
+
break
|
103 |
+
|
104 |
+
if title_col is None and len(df.columns) > 1:
|
105 |
+
# If no obvious title column, use the first non-year column
|
106 |
+
for col in df.columns:
|
107 |
+
if col != year_col:
|
108 |
+
title_col = col
|
109 |
+
break
|
110 |
+
|
111 |
+
# Filter by item type if specified
|
112 |
+
if item_type:
|
113 |
+
filtered_df = df
|
114 |
+
|
115 |
+
# Look for a column that might contain item types
|
116 |
+
type_col = None
|
117 |
+
type_columns = ['type', 'category', 'format', 'medium', 'platform']
|
118 |
+
|
119 |
+
for col in df.columns:
|
120 |
+
if any(type_term in col.lower() for type_term in type_columns):
|
121 |
+
type_col = col
|
122 |
+
break
|
123 |
+
|
124 |
+
if type_col:
|
125 |
+
# Filter by item type
|
126 |
+
filtered_df = df[df[type_col].astype(str).str.lower().str.contains(item_type.lower())]
|
127 |
+
else:
|
128 |
+
filtered_df = df
|
129 |
+
|
130 |
+
if not filtered_df.empty and title_col:
|
131 |
+
try:
|
132 |
+
# Find the row with the minimum year
|
133 |
+
min_year_idx = filtered_df[year_col].astype(float).idxmin()
|
134 |
+
min_year = filtered_df.loc[min_year_idx, year_col]
|
135 |
+
|
136 |
+
if min_year < oldest_year:
|
137 |
+
oldest_year = min_year
|
138 |
+
oldest_item = filtered_df.loc[min_year_idx, title_col]
|
139 |
+
except:
|
140 |
+
continue
|
141 |
+
|
142 |
+
if oldest_item:
|
143 |
+
return str(oldest_item)
|
144 |
+
else:
|
145 |
+
return "Could not determine the oldest item from the data."
|
146 |
+
|
147 |
+
def _count_items(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
148 |
+
"""Count items matching specific criteria."""
|
149 |
+
# Extract conditions from the question
|
150 |
+
conditions = self._extract_conditions(question)
|
151 |
+
|
152 |
+
total_count = 0
|
153 |
+
|
154 |
+
for sheet_name, df in data.items():
|
155 |
+
# Skip empty sheets
|
156 |
+
if df.empty:
|
157 |
+
continue
|
158 |
+
|
159 |
+
# Apply conditions to filter the DataFrame
|
160 |
+
filtered_df = df
|
161 |
+
|
162 |
+
for condition in conditions:
|
163 |
+
col = condition.get('column')
|
164 |
+
value = condition.get('value')
|
165 |
+
operator = condition.get('operator', '=')
|
166 |
+
|
167 |
+
if col and value is not None:
|
168 |
+
# Find the best matching column
|
169 |
+
best_col = self._find_best_matching_column(df, col)
|
170 |
+
|
171 |
+
if best_col:
|
172 |
+
try:
|
173 |
+
if operator == '=':
|
174 |
+
filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower() == str(value).lower()]
|
175 |
+
elif operator == '>':
|
176 |
+
filtered_df = filtered_df[filtered_df[best_col] > value]
|
177 |
+
elif operator == '<':
|
178 |
+
filtered_df = filtered_df[filtered_df[best_col] < value]
|
179 |
+
elif operator == '>=':
|
180 |
+
filtered_df = filtered_df[filtered_df[best_col] >= value]
|
181 |
+
elif operator == '<=':
|
182 |
+
filtered_df = filtered_df[filtered_df[best_col] <= value]
|
183 |
+
elif operator == 'contains':
|
184 |
+
filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower().str.contains(str(value).lower())]
|
185 |
+
elif operator == 'between':
|
186 |
+
if isinstance(value, list) and len(value) == 2:
|
187 |
+
filtered_df = filtered_df[(filtered_df[best_col] >= value[0]) & (filtered_df[best_col] <= value[1])]
|
188 |
+
except:
|
189 |
+
continue
|
190 |
+
|
191 |
+
# Add the count from this sheet
|
192 |
+
total_count += len(filtered_df)
|
193 |
+
|
194 |
+
return str(total_count)
|
195 |
+
|
196 |
+
def _calculate_average(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
197 |
+
"""Calculate the average of a column."""
|
198 |
+
# Extract column name from the question
|
199 |
+
column_name = self._extract_column_name(question)
|
200 |
+
|
201 |
+
if not column_name:
|
202 |
+
return "Could not determine which column to calculate the average for."
|
203 |
+
|
204 |
+
for sheet_name, df in data.items():
|
205 |
+
# Skip empty sheets
|
206 |
+
if df.empty:
|
207 |
+
continue
|
208 |
+
|
209 |
+
# Find the best matching column
|
210 |
+
best_col = self._find_best_matching_column(df, column_name)
|
211 |
+
|
212 |
+
if best_col and pd.api.types.is_numeric_dtype(df[best_col]):
|
213 |
+
try:
|
214 |
+
avg_value = df[best_col].mean()
|
215 |
+
return str(avg_value)
|
216 |
+
except:
|
217 |
+
continue
|
218 |
+
|
219 |
+
return "Could not calculate the average from the data."
|
220 |
+
|
221 |
+
def _calculate_total(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
222 |
+
"""Calculate the total of a column."""
|
223 |
+
# Extract column name from the question
|
224 |
+
column_name = self._extract_column_name(question)
|
225 |
+
|
226 |
+
if not column_name:
|
227 |
+
return "Could not determine which column to calculate the total for."
|
228 |
+
|
229 |
+
for sheet_name, df in data.items():
|
230 |
+
# Skip empty sheets
|
231 |
+
if df.empty:
|
232 |
+
continue
|
233 |
+
|
234 |
+
# Find the best matching column
|
235 |
+
best_col = self._find_best_matching_column(df, column_name)
|
236 |
+
|
237 |
+
if best_col and pd.api.types.is_numeric_dtype(df[best_col]):
|
238 |
+
try:
|
239 |
+
total_value = df[best_col].sum()
|
240 |
+
return str(total_value)
|
241 |
+
except:
|
242 |
+
continue
|
243 |
+
|
244 |
+
return "Could not calculate the total from the data."
|
245 |
+
|
246 |
+
def _find_maximum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
247 |
+
"""Find the maximum value in a column."""
|
248 |
+
# Extract column name from the question
|
249 |
+
column_name = self._extract_column_name(question)
|
250 |
+
|
251 |
+
if not column_name:
|
252 |
+
return "Could not determine which column to find the maximum for."
|
253 |
+
|
254 |
+
for sheet_name, df in data.items():
|
255 |
+
# Skip empty sheets
|
256 |
+
if df.empty:
|
257 |
+
continue
|
258 |
+
|
259 |
+
# Find the best matching column
|
260 |
+
best_col = self._find_best_matching_column(df, column_name)
|
261 |
+
|
262 |
+
if best_col:
|
263 |
+
try:
|
264 |
+
max_value = df[best_col].max()
|
265 |
+
return str(max_value)
|
266 |
+
except:
|
267 |
+
continue
|
268 |
+
|
269 |
+
return "Could not find the maximum value from the data."
|
270 |
+
|
271 |
+
def _find_minimum(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
272 |
+
"""Find the minimum value in a column."""
|
273 |
+
# Extract column name from the question
|
274 |
+
column_name = self._extract_column_name(question)
|
275 |
+
|
276 |
+
if not column_name:
|
277 |
+
return "Could not determine which column to find the minimum for."
|
278 |
+
|
279 |
+
for sheet_name, df in data.items():
|
280 |
+
# Skip empty sheets
|
281 |
+
if df.empty:
|
282 |
+
continue
|
283 |
+
|
284 |
+
# Find the best matching column
|
285 |
+
best_col = self._find_best_matching_column(df, column_name)
|
286 |
+
|
287 |
+
if best_col:
|
288 |
+
try:
|
289 |
+
min_value = df[best_col].min()
|
290 |
+
return str(min_value)
|
291 |
+
except:
|
292 |
+
continue
|
293 |
+
|
294 |
+
return "Could not find the minimum value from the data."
|
295 |
+
|
296 |
+
def _extract_specific_info(self, data: Dict[str, pd.DataFrame], question: str) -> str:
|
297 |
+
"""Extract specific information from the data."""
|
298 |
+
# Try to identify what we're looking for
|
299 |
+
looking_for = self._extract_looking_for(question)
|
300 |
+
conditions = self._extract_conditions(question)
|
301 |
+
|
302 |
+
for sheet_name, df in data.items():
|
303 |
+
# Skip empty sheets
|
304 |
+
if df.empty:
|
305 |
+
continue
|
306 |
+
|
307 |
+
# Apply conditions to filter the DataFrame
|
308 |
+
filtered_df = df
|
309 |
+
|
310 |
+
for condition in conditions:
|
311 |
+
col = condition.get('column')
|
312 |
+
value = condition.get('value')
|
313 |
+
operator = condition.get('operator', '=')
|
314 |
+
|
315 |
+
if col and value is not None:
|
316 |
+
# Find the best matching column
|
317 |
+
best_col = self._find_best_matching_column(df, col)
|
318 |
+
|
319 |
+
if best_col:
|
320 |
+
try:
|
321 |
+
if operator == '=':
|
322 |
+
filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower() == str(value).lower()]
|
323 |
+
elif operator == '>':
|
324 |
+
filtered_df = filtered_df[filtered_df[best_col] > value]
|
325 |
+
elif operator == '<':
|
326 |
+
filtered_df = filtered_df[filtered_df[best_col] < value]
|
327 |
+
elif operator == '>=':
|
328 |
+
filtered_df = filtered_df[filtered_df[best_col] >= value]
|
329 |
+
elif operator == '<=':
|
330 |
+
filtered_df = filtered_df[filtered_df[best_col] <= value]
|
331 |
+
elif operator == 'contains':
|
332 |
+
filtered_df = filtered_df[filtered_df[best_col].astype(str).str.lower().str.contains(str(value).lower())]
|
333 |
+
elif operator == 'between':
|
334 |
+
if isinstance(value, list) and len(value) == 2:
|
335 |
+
filtered_df = filtered_df[(filtered_df[best_col] >= value[0]) & (filtered_df[best_col] <= value[1])]
|
336 |
+
except:
|
337 |
+
continue
|
338 |
+
|
339 |
+
# If we found matching rows and know what to look for
|
340 |
+
if not filtered_df.empty and looking_for:
|
341 |
+
# Find the best matching column for what we're looking for
|
342 |
+
best_col = self._find_best_matching_column(df, looking_for)
|
343 |
+
|
344 |
+
if best_col:
|
345 |
+
try:
|
346 |
+
# Return the first value
|
347 |
+
return str(filtered_df.iloc[0][best_col])
|
348 |
+
except:
|
349 |
+
continue
|
350 |
+
|
351 |
+
# If we couldn't extract specific information, return a more general response
|
352 |
+
if data:
|
353 |
+
# Return basic info about the first non-empty sheet
|
354 |
+
for sheet_name, df in data.items():
|
355 |
+
if not df.empty:
|
356 |
+
return f"The sheet contains {len(df)} rows and {len(df.columns)} columns."
|
357 |
+
|
358 |
+
return "Could not extract the requested information from the data."
|
359 |
+
|
360 |
+
def _extract_conditions(self, question: str) -> List[Dict[str, Any]]:
|
361 |
+
"""Extract conditions from the question."""
|
362 |
+
conditions = []
|
363 |
+
|
364 |
+
# Check for "between" conditions
|
365 |
+
between_pattern = r'(\w+) between (\d+) and (\d+)'
|
366 |
+
for match in re.finditer(between_pattern, question):
|
367 |
+
column = match.group(1)
|
368 |
+
start = int(match.group(2))
|
369 |
+
end = int(match.group(3))
|
370 |
+
conditions.append({
|
371 |
+
'column': column,
|
372 |
+
'operator': 'between',
|
373 |
+
'value': [start, end],
|
374 |
+
})
|
375 |
+
|
376 |
+
# Check for comparison conditions
|
377 |
+
comparison_pattern = r'(\w+) (>|<|>=|<=|=|equals|equal to|contains) (\w+)'
|
378 |
+
for match in re.finditer(comparison_pattern, question):
|
379 |
+
column = match.group(1)
|
380 |
+
op = match.group(2)
|
381 |
+
value = match.group(3)
|
382 |
+
|
383 |
+
# Convert operator text to symbols
|
384 |
+
if op == 'equals' or op == 'equal to':
|
385 |
+
op = '='
|
386 |
+
elif op == 'contains':
|
387 |
+
op = 'contains'
|
388 |
+
|
389 |
+
# Try to convert value to number
|
390 |
+
try:
|
391 |
+
value = float(value)
|
392 |
+
except:
|
393 |
+
pass
|
394 |
+
|
395 |
+
conditions.append({
|
396 |
+
'column': column,
|
397 |
+
'operator': op,
|
398 |
+
'value': value,
|
399 |
+
})
|
400 |
+
|
401 |
+
# Check for simple equality conditions
|
402 |
+
equality_pattern = r'(?:with|where) (\w+) (?:is|=) (\w+)'
|
403 |
+
for match in re.finditer(equality_pattern, question):
|
404 |
+
column = match.group(1)
|
405 |
+
value = match.group(2)
|
406 |
+
|
407 |
+
# Try to convert value to number
|
408 |
+
try:
|
409 |
+
value = float(value)
|
410 |
+
except:
|
411 |
+
pass
|
412 |
+
|
413 |
+
conditions.append({
|
414 |
+
'column': column,
|
415 |
+
'operator': '=',
|
416 |
+
'value': value,
|
417 |
+
})
|
418 |
+
|
419 |
+
return conditions
|
420 |
+
|
421 |
+
def _extract_column_name(self, question: str) -> Optional[str]:
|
422 |
+
"""Extract column name from the question."""
|
423 |
+
# Check for direct mentions of columns
|
424 |
+
column_pattern = r'(?:column|field) (?:named|called) ["\']?(\w+)["\']?'
|
425 |
+
match = re.search(column_pattern, question)
|
426 |
+
if match:
|
427 |
+
return match.group(1)
|
428 |
+
|
429 |
+
# Look for common column references
|
430 |
+
common_columns = [
|
431 |
+
'year', 'date', 'time', 'name', 'title', 'price', 'cost',
|
432 |
+
'amount', 'quantity', 'total', 'value', 'age', 'rating',
|
433 |
+
'score', 'grade', 'salary', 'income', 'revenue', 'profit',
|
434 |
+
'loss', 'height', 'weight', 'length', 'width', 'depth',
|
435 |
+
'area', 'volume'
|
436 |
+
]
|
437 |
+
|
438 |
+
for col in common_columns:
|
439 |
+
if col in question:
|
440 |
+
return col
|
441 |
+
|
442 |
+
return None
|
443 |
+
|
444 |
+
def _extract_looking_for(self, question: str) -> Optional[str]:
|
445 |
+
"""Extract what we're looking for from the question."""
|
446 |
+
# Check for direct mentions of what we're looking for
|
447 |
+
looking_for_pattern = r'(?:what is|what are|find|get|return) the (\w+)'
|
448 |
+
match = re.search(looking_for_pattern, question)
|
449 |
+
if match:
|
450 |
+
return match.group(1)
|
451 |
+
|
452 |
+
# Look for common things we might be looking for
|
453 |
+
common_items = [
|
454 |
+
'name', 'title', 'price', 'cost', 'amount', 'quantity',
|
455 |
+
'total', 'value', 'age', 'rating', 'score', 'grade',
|
456 |
+
'salary', 'income', 'revenue', 'profit', 'loss',
|
457 |
+
'height', 'weight', 'length', 'width', 'depth',
|
458 |
+
'area', 'volume', 'year', 'date', 'time'
|
459 |
+
]
|
460 |
+
|
461 |
+
for item in common_items:
|
462 |
+
if item in question:
|
463 |
+
return item
|
464 |
+
|
465 |
+
return None
|
466 |
+
|
467 |
+
def _find_best_matching_column(self, df: pd.DataFrame, column_name: str) -> Optional[str]:
|
468 |
+
"""Find the best matching column in a DataFrame."""
|
469 |
+
# Check for exact match
|
470 |
+
if column_name in df.columns:
|
471 |
+
return column_name
|
472 |
+
|
473 |
+
# Check for case-insensitive match
|
474 |
+
for col in df.columns:
|
475 |
+
if col.lower() == column_name.lower():
|
476 |
+
return col
|
477 |
+
|
478 |
+
# Check for partial match
|
479 |
+
for col in df.columns:
|
480 |
+
if column_name.lower() in col.lower():
|
481 |
+
return col
|
482 |
+
|
483 |
+
return None
|
484 |
+
|
485 |
+
def process_csv_data(self, data: pd.DataFrame, question: str) -> str:
|
486 |
+
"""
|
487 |
+
Process data extracted from a CSV file.
|
488 |
+
|
489 |
+
Args:
|
490 |
+
data: DataFrame containing the CSV data
|
491 |
+
question: The question to answer
|
492 |
+
|
493 |
+
Returns:
|
494 |
+
Answer to the question
|
495 |
+
"""
|
496 |
+
# Wrap in a dictionary to reuse Excel processing logic
|
497 |
+
return self.process_excel_data({'Sheet1': data}, question)
|
498 |
+
|
499 |
+
def process_text_data(self, data: str, question: str) -> str:
|
500 |
+
"""
|
501 |
+
Process data extracted from a text file.
|
502 |
+
|
503 |
+
Args:
|
504 |
+
data: Text content of the file
|
505 |
+
question: The question to answer
|
506 |
+
|
507 |
+
Returns:
|
508 |
+
Answer to the question
|
509 |
+
"""
|
510 |
+
question_lower = question.lower()
|
511 |
+
|
512 |
+
# Handle specific question types
|
513 |
+
if 'count' in question_lower or 'how many' in question_lower:
|
514 |
+
# Count occurrences of a word or phrase
|
515 |
+
count_pattern = r'(?:count|how many) (?:occurrences of|instances of|times) ["\']?([^"\']+)["\']?'
|
516 |
+
match = re.search(count_pattern, question_lower)
|
517 |
+
if match:
|
518 |
+
term = match.group(1)
|
519 |
+
count = data.lower().count(term.lower())
|
520 |
+
return str(count)
|
521 |
+
|
522 |
+
# Check if the question is asking for a specific line
|
523 |
+
line_pattern = r'(?:what is|what does|what are|show|return) (?:the|on) (?:line|lines) (\d+)(?:\s*(?:to|-)\s*(\d+))?'
|
524 |
+
match = re.search(line_pattern, question_lower)
|
525 |
+
if match:
|
526 |
+
start_line = int(match.group(1))
|
527 |
+
end_line = int(match.group(2)) if match.group(2) else start_line
|
528 |
+
|
529 |
+
lines = data.split('\n')
|
530 |
+
if start_line <= len(lines) and end_line <= len(lines):
|
531 |
+
return '\n'.join(lines[start_line-1:end_line])
|
532 |
+
|
533 |
+
# Check if the question is asking for a specific paragraph
|
534 |
+
para_pattern = r'(?:what is|what does|what are|show|return) (?:the|in) paragraph (\d+)(?:\s*(?:to|-)\s*(\d+))?'
|
535 |
+
match = re.search(para_pattern, question_lower)
|
536 |
+
if match:
|
537 |
+
start_para = int(match.group(1))
|
538 |
+
end_para = int(match.group(2)) if match.group(2) else start_para
|
539 |
+
|
540 |
+
paragraphs = re.split(r'\n\s*\n', data)
|
541 |
+
if start_para <= len(paragraphs) and end_para <= len(paragraphs):
|
542 |
+
return '\n\n'.join(paragraphs[start_para-1:end_para])
|
543 |
+
|
544 |
+
# Check for specific information requests
|
545 |
+
info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
|
546 |
+
match = re.search(info_pattern, question_lower)
|
547 |
+
if match:
|
548 |
+
info = match.group(1).strip()
|
549 |
+
|
550 |
+
# Look for this information in the text
|
551 |
+
sentences = re.split(r'(?<=[.!?])\s+', data)
|
552 |
+
for sentence in sentences:
|
553 |
+
if info.lower() in sentence.lower():
|
554 |
+
return sentence.strip()
|
555 |
+
|
556 |
+
# If nothing specific was found, return a generic summary
|
557 |
+
words = data.split()
|
558 |
+
return f"The text contains {len(words)} words and {len(data.split('. '))} sentences."
|
559 |
+
|
560 |
+
def process_pdf_data(self, data: Dict[int, str], question: str) -> str:
|
561 |
+
"""
|
562 |
+
Process data extracted from a PDF file.
|
563 |
+
|
564 |
+
Args:
|
565 |
+
data: Dictionary mapping page numbers to text content
|
566 |
+
question: The question to answer
|
567 |
+
|
568 |
+
Returns:
|
569 |
+
Answer to the question
|
570 |
+
"""
|
571 |
+
question_lower = question.lower()
|
572 |
+
|
573 |
+
# Check if the question is asking for a specific page
|
574 |
+
page_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) page (\d+)'
|
575 |
+
match = re.search(page_pattern, question_lower)
|
576 |
+
if match:
|
577 |
+
page_num = int(match.group(1))
|
578 |
+
if page_num in data:
|
579 |
+
return data[page_num]
|
580 |
+
else:
|
581 |
+
return f"Page {page_num} not found in the PDF."
|
582 |
+
|
583 |
+
# Check if the question is asking for a specific information across all pages
|
584 |
+
info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
|
585 |
+
match = re.search(info_pattern, question_lower)
|
586 |
+
if match:
|
587 |
+
info = match.group(1).strip()
|
588 |
+
|
589 |
+
# Look for this information in all pages
|
590 |
+
for page_num, content in data.items():
|
591 |
+
sentences = re.split(r'(?<=[.!?])\s+', content)
|
592 |
+
for sentence in sentences:
|
593 |
+
if info.lower() in sentence.lower():
|
594 |
+
return sentence.strip()
|
595 |
+
|
596 |
+
# If nothing specific was found, combine all text and return a summary
|
597 |
+
all_text = ' '.join(data.values())
|
598 |
+
words = all_text.split()
|
599 |
+
return f"The PDF contains {len(data)} pages and approximately {len(words)} words."
|
600 |
+
|
601 |
+
def process_image_metadata(self, metadata: Dict[str, Any], question: str) -> str:
|
602 |
+
"""
|
603 |
+
Process metadata extracted from an image file.
|
604 |
+
|
605 |
+
Args:
|
606 |
+
metadata: Dictionary containing image metadata
|
607 |
+
question: The question to answer
|
608 |
+
|
609 |
+
Returns:
|
610 |
+
Answer to the question
|
611 |
+
"""
|
612 |
+
question_lower = question.lower()
|
613 |
+
|
614 |
+
# Handle specific question types
|
615 |
+
if 'format' in question_lower or 'type' in question_lower:
|
616 |
+
return metadata.get('format', 'Unknown format')
|
617 |
+
elif 'size' in question_lower or 'resolution' in question_lower:
|
618 |
+
width = metadata.get('width', 0)
|
619 |
+
height = metadata.get('height', 0)
|
620 |
+
return f"{width}x{height}"
|
621 |
+
elif 'width' in question_lower:
|
622 |
+
return str(metadata.get('width', 0))
|
623 |
+
elif 'height' in question_lower:
|
624 |
+
return str(metadata.get('height', 0))
|
625 |
+
elif 'mode' in question_lower or 'color' in question_lower:
|
626 |
+
return metadata.get('mode', 'Unknown mode')
|
627 |
+
elif 'exif' in question_lower:
|
628 |
+
exif = metadata.get('exif', {})
|
629 |
+
if exif:
|
630 |
+
return str(exif)
|
631 |
+
else:
|
632 |
+
return "No EXIF data found."
|
633 |
+
|
634 |
+
# If nothing specific was found, return basic information
|
635 |
+
return f"Image format: {metadata.get('format', 'Unknown')}, Size: {metadata.get('width', 0)}x{metadata.get('height', 0)}, Mode: {metadata.get('mode', 'Unknown')}"
|
636 |
+
|
637 |
+
def process_docx_data(self, data: str, question: str) -> str:
|
638 |
+
"""
|
639 |
+
Process data extracted from a Word document.
|
640 |
+
|
641 |
+
Args:
|
642 |
+
data: Text content of the document
|
643 |
+
question: The question to answer
|
644 |
+
|
645 |
+
Returns:
|
646 |
+
Answer to the question
|
647 |
+
"""
|
648 |
+
# Similar to text processing
|
649 |
+
return self.process_text_data(data, question)
|
650 |
+
|
651 |
+
def process_pptx_data(self, data: Dict[int, str], question: str) -> str:
|
652 |
+
"""
|
653 |
+
Process data extracted from a PowerPoint presentation.
|
654 |
+
|
655 |
+
Args:
|
656 |
+
data: Dictionary mapping slide numbers to text content
|
657 |
+
question: The question to answer
|
658 |
+
|
659 |
+
Returns:
|
660 |
+
Answer to the question
|
661 |
+
"""
|
662 |
+
question_lower = question.lower()
|
663 |
+
|
664 |
+
# Check if the question is asking for a specific slide
|
665 |
+
slide_pattern = r'(?:what is|what does|what are|show|return) (?:on|in) slide (\d+)'
|
666 |
+
match = re.search(slide_pattern, question_lower)
|
667 |
+
if match:
|
668 |
+
slide_num = int(match.group(1))
|
669 |
+
if slide_num in data:
|
670 |
+
return data[slide_num]
|
671 |
+
else:
|
672 |
+
return f"Slide {slide_num} not found in the presentation."
|
673 |
+
|
674 |
+
# Check if the question is asking for a specific information across all slides
|
675 |
+
info_pattern = r'(?:what|who|where|when|why|how) (?:is|are|was|were|does|do|did) ([^?]+)'
|
676 |
+
match = re.search(info_pattern, question_lower)
|
677 |
+
if match:
|
678 |
+
info = match.group(1).strip()
|
679 |
+
|
680 |
+
# Look for this information in all slides
|
681 |
+
for slide_num, content in data.items():
|
682 |
+
if info.lower() in content.lower():
|
683 |
+
return content.strip()
|
684 |
+
|
685 |
+
# If nothing specific was found, return a summary
|
686 |
+
return f"The presentation contains {len(data)} slides."
|
687 |
+
|
688 |
+
def process_json_data(self, data: Dict[str, Any], question: str) -> str:
|
689 |
+
"""
|
690 |
+
Process data extracted from a JSON file.
|
691 |
+
|
692 |
+
Args:
|
693 |
+
data: Parsed JSON content
|
694 |
+
question: The question to answer
|
695 |
+
|
696 |
+
Returns:
|
697 |
+
Answer to the question
|
698 |
+
"""
|
699 |
+
question_lower = question.lower()
|
700 |
+
|
701 |
+
# Check if the question is asking for a specific key
|
702 |
+
key_pattern = r'(?:what is|what are|show|return) (?:the|in) ["\']?(\w+)["\']?'
|
703 |
+
match = re.search(key_pattern, question_lower)
|
704 |
+
if match:
|
705 |
+
key = match.group(1)
|
706 |
+
|
707 |
+
# Look for this key in the JSON
|
708 |
+
if key in data:
|
709 |
+
return str(data[key])
|
710 |
+
|
711 |
+
# Look for nested keys
|
712 |
+
for k, v in data.items():
|
713 |
+
if isinstance(v, dict) and key in v:
|
714 |
+
return str(v[key])
|
715 |
+
|
716 |
+
# If nothing specific was found, return a summary
|
717 |
+
return f"The JSON contains {len(data)} top-level keys: {', '.join(data.keys())}"
|
718 |
+
|
719 |
+
def process_zip_data(self, data: Dict[str, Any], question: str) -> str:
|
720 |
+
"""
|
721 |
+
Process data extracted from a ZIP archive.
|
722 |
+
|
723 |
+
Args:
|
724 |
+
data: Dictionary containing information about the archive
|
725 |
+
question: The question to answer
|
726 |
+
|
727 |
+
Returns:
|
728 |
+
Answer to the question
|
729 |
+
"""
|
730 |
+
question_lower = question.lower()
|
731 |
+
|
732 |
+
# Handle specific question types
|
733 |
+
if 'how many' in question_lower or 'count' in question_lower:
|
734 |
+
if 'files' in question_lower:
|
735 |
+
return str(len(data.get('files', [])))
|
736 |
+
|
737 |
+
# Check if the question is asking for a specific file
|
738 |
+
file_pattern = r'(?:does it contain|is there) (?:a file named|a file called) ["\']?([^"\']+)["\']?'
|
739 |
+
match = re.search(file_pattern, question_lower)
|
740 |
+
if match:
|
741 |
+
filename = match.group(1)
|
742 |
+
|
743 |
+
# Check if the file exists in the archive
|
744 |
+
for file_info in data.get('files', []):
|
745 |
+
if filename.lower() in file_info.get('filename', '').lower():
|
746 |
+
return f"Yes, the archive contains {file_info['filename']} ({file_info['size']} bytes)"
|
747 |
+
|
748 |
+
return f"No, the archive does not contain a file named {filename}."
|
749 |
+
|
750 |
+
# If nothing specific was found, return a summary
|
751 |
+
return f"The ZIP archive contains {len(data.get('files', []))} files."
|
752 |
+
|
753 |
+
def process_pdb_data(self, data: Dict[str, Any], question: str) -> str:
|
754 |
+
"""
|
755 |
+
Process data extracted from a PDB file.
|
756 |
+
|
757 |
+
Args:
|
758 |
+
data: Dictionary containing information about the PDB file
|
759 |
+
question: The question to answer
|
760 |
+
|
761 |
+
Returns:
|
762 |
+
Answer to the question
|
763 |
+
"""
|
764 |
+
question_lower = question.lower()
|
765 |
+
|
766 |
+
# Handle specific question types
|
767 |
+
if 'title' in question_lower:
|
768 |
+
return data.get('title', 'No title found.')
|
769 |
+
elif 'header' in question_lower:
|
770 |
+
return data.get('header', 'No header found.')
|
771 |
+
elif 'compound' in question_lower or 'compounds' in question_lower:
|
772 |
+
compounds = data.get('compounds', [])
|
773 |
+
if compounds:
|
774 |
+
return '\n'.join(compounds)
|
775 |
+
else:
|
776 |
+
return 'No compounds found.'
|
777 |
+
elif 'author' in question_lower or 'authors' in question_lower:
|
778 |
+
authors = data.get('authors', [])
|
779 |
+
if authors:
|
780 |
+
return '\n'.join(authors)
|
781 |
+
else:
|
782 |
+
return 'No authors found.'
|
783 |
+
elif 'atoms' in question_lower or 'atom count' in question_lower:
|
784 |
+
return str(data.get('atoms_count', 0))
|
785 |
+
|
786 |
+
# If nothing specific was found, return a summary
|
787 |
+
return f"PDB file with title: {data.get('title', 'No title')}, containing {data.get('atoms_count', 0)} atoms."
|
788 |
+
|
789 |
+
def process_python_data(self, data: Dict[str, Any], question: str) -> str:
|
790 |
+
"""
|
791 |
+
Process data extracted from a Python file.
|
792 |
+
|
793 |
+
Args:
|
794 |
+
data: Dictionary containing information about the Python file
|
795 |
+
question: The question to answer
|
796 |
+
|
797 |
+
Returns:
|
798 |
+
Answer to the question
|
799 |
+
"""
|
800 |
+
question_lower = question.lower()
|
801 |
+
|
802 |
+
# Handle specific question types
|
803 |
+
if 'class' in question_lower or 'classes' in question_lower:
|
804 |
+
classes = data.get('classes', [])
|
805 |
+
if classes:
|
806 |
+
class_names = [c['name'] for c in classes]
|
807 |
+
return ', '.join(class_names)
|
808 |
+
else:
|
809 |
+
return 'No classes found in the file.'
|
810 |
+
elif 'function' in question_lower or 'functions' in question_lower:
|
811 |
+
functions = data.get('functions', [])
|
812 |
+
if functions:
|
813 |
+
func_names = [f['name'] for f in functions]
|
814 |
+
return ', '.join(func_names)
|
815 |
+
else:
|
816 |
+
return 'No functions found in the file.'
|
817 |
+
elif 'import' in question_lower or 'imports' in question_lower:
|
818 |
+
imports = data.get('imports', [])
|
819 |
+
if imports:
|
820 |
+
import_strs = []
|
821 |
+
for imp in imports:
|
822 |
+
if imp.get('from'):
|
823 |
+
import_strs.append(f"from {imp['from']} import {imp['import']}")
|
824 |
+
else:
|
825 |
+
import_strs.append(f"import {imp['import']}")
|
826 |
+
return '\n'.join(import_strs)
|
827 |
+
else:
|
828 |
+
return 'No imports found in the file.'
|
829 |
+
|
830 |
+
# Check if the question is asking for a specific class or function
|
831 |
+
class_pattern = r'(?:what is|what does) (?:the class|class) ["\']?(\w+)["\']?'
|
832 |
+
match = re.search(class_pattern, question_lower)
|
833 |
+
if match:
|
834 |
+
class_name = match.group(1)
|
835 |
+
|
836 |
+
# Look for this class in the data
|
837 |
+
for cls in data.get('classes', []):
|
838 |
+
if cls['name'].lower() == class_name.lower():
|
839 |
+
parent = f", inherits from {cls['parent']}" if cls['parent'] else ""
|
840 |
+
return f"Class {cls['name']}{parent}"
|
841 |
+
|
842 |
+
func_pattern = r'(?:what is|what does) (?:the function|function) ["\']?(\w+)["\']?'
|
843 |
+
match = re.search(func_pattern, question_lower)
|
844 |
+
if match:
|
845 |
+
func_name = match.group(1)
|
846 |
+
|
847 |
+
# Look for this function in the data
|
848 |
+
for func in data.get('functions', []):
|
849 |
+
if func['name'].lower() == func_name.lower():
|
850 |
+
return f"Function {func['name']}({func['params']})"
|
851 |
+
|
852 |
+
# If nothing specific was found, look for the code of a specific function or class
|
853 |
+
code_pattern = r'(?:show|return) (?:the code for|code of) (?:the )?(?:function|class) ["\']?(\w+)["\']?'
|
854 |
+
match = re.search(code_pattern, question_lower)
|
855 |
+
if match:
|
856 |
+
entity_name = match.group(1)
|
857 |
+
content = data.get('content', '')
|
858 |
+
|
859 |
+
# Look for the code of this entity
|
860 |
+
lines = content.split('\n')
|
861 |
+
entity_lines = []
|
862 |
+
in_entity = False
|
863 |
+
indent = 0
|
864 |
+
|
865 |
+
for i, line in enumerate(lines):
|
866 |
+
# Check for class or function definition
|
867 |
+
if re.match(rf'(class|def)\s+{re.escape(entity_name)}\s*\(', line):
|
868 |
+
in_entity = True
|
869 |
+
entity_lines.append(line)
|
870 |
+
indent = len(line) - len(line.lstrip())
|
871 |
+
continue
|
872 |
+
|
873 |
+
if in_entity:
|
874 |
+
# Check if we're still in the entity based on indentation
|
875 |
+
if line.strip() and len(line) - len(line.lstrip()) <= indent:
|
876 |
+
in_entity = False
|
877 |
+
else:
|
878 |
+
entity_lines.append(line)
|
879 |
+
|
880 |
+
if entity_lines:
|
881 |
+
return '\n'.join(entity_lines)
|
882 |
+
|
883 |
+
# If nothing specific was found, return a summary
|
884 |
+
return f"Python file with {len(data.get('classes', []))} classes and {len(data.get('functions', []))} functions."
|
885 |
+
|
886 |
+
def process_jsonl_data(self, data: List[Dict[str, Any]], question: str) -> str:
|
887 |
+
"""
|
888 |
+
Process data extracted from a JSONL file.
|
889 |
+
|
890 |
+
Args:
|
891 |
+
data: List of parsed JSON objects
|
892 |
+
question: The question to answer
|
893 |
+
|
894 |
+
Returns:
|
895 |
+
Answer to the question
|
896 |
+
"""
|
897 |
+
question_lower = question.lower()
|
898 |
+
|
899 |
+
# Handle specific question types
|
900 |
+
if 'how many' in question_lower or 'count' in question_lower:
|
901 |
+
return str(len(data))
|
902 |
+
|
903 |
+
# Check if the question is asking for a specific entry
|
904 |
+
entry_pattern = r'(?:what is|what are|show|return) (?:the|in) entry (\d+)'
|
905 |
+
match = re.search(entry_pattern, question_lower)
|
906 |
+
if match:
|
907 |
+
entry_num = int(match.group(1))
|
908 |
+
if 0 <= entry_num < len(data):
|
909 |
+
return str(data[entry_num])
|
910 |
+
else:
|
911 |
+
return f"Entry {entry_num} not found in the data."
|
912 |
+
|
913 |
+
# Check if the question is asking for entries with a specific key-value pair
|
914 |
+
kv_pattern = r'(?:entries|items) where ["\']?(\w+)["\']? (?:is|=|equals|contains) ["\']?([^"\']+)["\']?'
|
915 |
+
match = re.search(kv_pattern, question_lower)
|
916 |
+
if match:
|
917 |
+
key = match.group(1)
|
918 |
+
value = match.group(2)
|
919 |
+
|
920 |
+
# Find entries matching the criteria
|
921 |
+
matching_entries = []
|
922 |
+
for entry in data:
|
923 |
+
if key in entry and str(entry[key]).lower() == value.lower():
|
924 |
+
matching_entries.append(entry)
|
925 |
+
|
926 |
+
if matching_entries:
|
927 |
+
return str(matching_entries)
|
928 |
+
else:
|
929 |
+
return f"No entries found where {key} = {value}."
|
930 |
+
|
931 |
+
# If nothing specific was found, return a summary
|
932 |
+
if data and isinstance(data[0], dict):
|
933 |
+
keys = list(data[0].keys())
|
934 |
+
return f"The data contains {len(data)} entries with keys: {', '.join(keys)}"
|
935 |
+
else:
|
936 |
+
return f"The data contains {len(data)} entries."
|
agent/utils/question_analyzer.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utilities for analyzing and understanding questions.
|
3 |
+
"""
|
4 |
+
import re
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from typing import Dict, Any, List, Optional, Tuple, Set
|
8 |
+
|
9 |
+
class QuestionAnalyzer:
|
10 |
+
"""
|
11 |
+
Class for analyzing and understanding questions.
|
12 |
+
"""
|
13 |
+
|
14 |
+
def __init__(self, resource_dir: str, metadata_path: Optional[str] = None):
|
15 |
+
"""
|
16 |
+
Initialize the question analyzer.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
resource_dir: Directory containing resource files
|
20 |
+
metadata_path: Path to the metadata file (optional)
|
21 |
+
"""
|
22 |
+
self.resource_dir = resource_dir
|
23 |
+
self.metadata_path = metadata_path or os.path.join(resource_dir, 'metadata.jsonl')
|
24 |
+
self.metadata = self._load_metadata()
|
25 |
+
|
26 |
+
def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
|
27 |
+
"""
|
28 |
+
Load metadata from the metadata file.
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
Dictionary mapping task IDs to metadata
|
32 |
+
"""
|
33 |
+
metadata = {}
|
34 |
+
|
35 |
+
if os.path.exists(self.metadata_path):
|
36 |
+
try:
|
37 |
+
with open(self.metadata_path, 'r', encoding='utf-8') as f:
|
38 |
+
for line in f:
|
39 |
+
entry = json.loads(line.strip())
|
40 |
+
task_id = entry.get('task_id')
|
41 |
+
if task_id:
|
42 |
+
metadata[task_id] = entry
|
43 |
+
except Exception as e:
|
44 |
+
print(f"Error loading metadata: {e}")
|
45 |
+
|
46 |
+
return metadata
|
47 |
+
|
48 |
+
def extract_file_mention(self, question: str) -> Optional[str]:
|
49 |
+
"""
|
50 |
+
Extract mentioned file name from the question.
|
51 |
+
|
52 |
+
Args:
|
53 |
+
question: The question to analyze
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
Mentioned file name, or None if no file is mentioned
|
57 |
+
"""
|
58 |
+
# Look for "attached file" or "attached spreadsheet" patterns
|
59 |
+
attached_pattern = r'attached (?:file|spreadsheet|document|image|picture|pdf|excel|csv|text file|zip|archive) (?:named |called |")?([\w\.-]+)'
|
60 |
+
match = re.search(attached_pattern, question, re.IGNORECASE)
|
61 |
+
if match:
|
62 |
+
return match.group(1)
|
63 |
+
|
64 |
+
# Look for file extensions
|
65 |
+
extensions = [
|
66 |
+
'.xlsx', '.xls', '.csv', '.txt', '.pdf', '.jpg', '.jpeg',
|
67 |
+
'.png', '.docx', '.pptx', '.json', '.jsonld', '.zip', '.pdb', '.py'
|
68 |
+
]
|
69 |
+
for ext in extensions:
|
70 |
+
pattern = r'(\w+(?:-\w+)*' + re.escape(ext) + r')'
|
71 |
+
match = re.search(pattern, question, re.IGNORECASE)
|
72 |
+
if match:
|
73 |
+
return match.group(1)
|
74 |
+
|
75 |
+
return None
|
76 |
+
|
77 |
+
def find_relevant_file(self, question: str, task_id: Optional[str] = None) -> Optional[str]:
|
78 |
+
"""
|
79 |
+
Find the relevant file for a question.
|
80 |
+
|
81 |
+
Args:
|
82 |
+
question: The question to analyze
|
83 |
+
task_id: The task ID (optional)
|
84 |
+
|
85 |
+
Returns:
|
86 |
+
Path to the relevant file, or None if no file is found
|
87 |
+
"""
|
88 |
+
# Check if task_id is in metadata and has a file_name
|
89 |
+
if task_id and task_id in self.metadata:
|
90 |
+
file_name = self.metadata[task_id].get('file_name')
|
91 |
+
if file_name:
|
92 |
+
file_path = os.path.join(self.resource_dir, file_name)
|
93 |
+
if os.path.exists(file_path):
|
94 |
+
return file_path
|
95 |
+
|
96 |
+
# Extract file mention from question
|
97 |
+
file_mention = self.extract_file_mention(question)
|
98 |
+
if file_mention:
|
99 |
+
# Check if the mentioned file exists
|
100 |
+
file_path = os.path.join(self.resource_dir, file_mention)
|
101 |
+
if os.path.exists(file_path):
|
102 |
+
return file_path
|
103 |
+
|
104 |
+
# Check if there's a file with a similar name
|
105 |
+
for file_name in os.listdir(self.resource_dir):
|
106 |
+
if file_mention.lower() in file_name.lower():
|
107 |
+
return os.path.join(self.resource_dir, file_name)
|
108 |
+
|
109 |
+
# If no file is found, try to find a file mentioned in the metadata
|
110 |
+
if task_id and task_id in self.metadata:
|
111 |
+
# Extract keywords from the question
|
112 |
+
keywords = self._extract_keywords(question)
|
113 |
+
|
114 |
+
# Check all files in the resource directory
|
115 |
+
best_match = None
|
116 |
+
best_score = 0
|
117 |
+
|
118 |
+
for file_name in os.listdir(self.resource_dir):
|
119 |
+
# Skip metadata file
|
120 |
+
if file_name == 'metadata.jsonl':
|
121 |
+
continue
|
122 |
+
|
123 |
+
# Calculate score based on keyword matches
|
124 |
+
score = 0
|
125 |
+
for keyword in keywords:
|
126 |
+
if keyword.lower() in file_name.lower():
|
127 |
+
score += 1
|
128 |
+
|
129 |
+
if score > best_score:
|
130 |
+
best_score = score
|
131 |
+
best_match = file_name
|
132 |
+
|
133 |
+
if best_match:
|
134 |
+
return os.path.join(self.resource_dir, best_match)
|
135 |
+
|
136 |
+
return None
|
137 |
+
|
138 |
+
def _extract_keywords(self, text: str) -> Set[str]:
|
139 |
+
"""
|
140 |
+
Extract keywords from text.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
text: The text to analyze
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
Set of keywords
|
147 |
+
"""
|
148 |
+
# Remove common stop words
|
149 |
+
stop_words = {
|
150 |
+
'a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when',
|
151 |
+
'at', 'from', 'by', 'for', 'with', 'about', 'against', 'between',
|
152 |
+
'into', 'through', 'during', 'before', 'after', 'above', 'below',
|
153 |
+
'to', 'of', 'in', 'on', 'is', 'are', 'was', 'were', 'be', 'been',
|
154 |
+
'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
|
155 |
+
'doing', 'would', 'should', 'could', 'might', 'will', 'shall',
|
156 |
+
'can', 'may', 'must', 'ought'
|
157 |
+
}
|
158 |
+
|
159 |
+
# Extract words
|
160 |
+
words = re.findall(r'\b\w+\b', text.lower())
|
161 |
+
|
162 |
+
# Filter out stop words and short words
|
163 |
+
keywords = {word for word in words if word not in stop_words and len(word) > 2}
|
164 |
+
|
165 |
+
return keywords
|
166 |
+
|
167 |
+
def analyze_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
|
168 |
+
"""
|
169 |
+
Analyze a question to understand what it's asking.
|
170 |
+
|
171 |
+
Args:
|
172 |
+
question: The question to analyze
|
173 |
+
task_id: The task ID (optional)
|
174 |
+
|
175 |
+
Returns:
|
176 |
+
Dictionary containing analysis results
|
177 |
+
"""
|
178 |
+
result = {
|
179 |
+
'question': question,
|
180 |
+
'task_id': task_id,
|
181 |
+
'file_path': None,
|
182 |
+
'keywords': list(self._extract_keywords(question)),
|
183 |
+
'expected_answer': None,
|
184 |
+
}
|
185 |
+
|
186 |
+
# Find relevant file
|
187 |
+
file_path = self.find_relevant_file(question, task_id)
|
188 |
+
if file_path:
|
189 |
+
result['file_path'] = file_path
|
190 |
+
|
191 |
+
# Get expected answer if available
|
192 |
+
if task_id and task_id in self.metadata:
|
193 |
+
result['expected_answer'] = self.metadata[task_id].get('Final answer')
|
194 |
+
|
195 |
+
return result
|
app.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import requests
|
4 |
+
import inspect
|
5 |
+
import pandas as pd
|
6 |
+
import logging
|
7 |
+
import sys
|
8 |
+
|
9 |
+
# Configure logging
|
10 |
+
logging.basicConfig(
|
11 |
+
level=logging.INFO,
|
12 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
13 |
+
handlers=[
|
14 |
+
logging.StreamHandler(sys.stdout)
|
15 |
+
]
|
16 |
+
)
|
17 |
+
logger = logging.getLogger('app')
|
18 |
+
|
19 |
+
# Add the current directory to sys.path to import local modules
|
20 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
21 |
+
|
22 |
+
# Import the MultiModalAgent
|
23 |
+
from agent import MultiModalAgent
|
24 |
+
|
25 |
+
# (Keep Constants as is)
|
26 |
+
# --- Constants ---
|
27 |
+
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
28 |
+
|
29 |
+
# --- Agent Definition ---
|
30 |
+
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
31 |
+
class BasicAgent:
|
32 |
+
def __init__(self):
|
33 |
+
print("BasicAgent initialized.")
|
34 |
+
def __call__(self, question: str) -> str:
|
35 |
+
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
36 |
+
fixed_answer = "This is a default answer."
|
37 |
+
print(f"Agent returning fixed answer: {fixed_answer}")
|
38 |
+
return fixed_answer
|
39 |
+
|
40 |
+
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
41 |
+
"""
|
42 |
+
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
43 |
+
and displays the results.
|
44 |
+
"""
|
45 |
+
# --- Determine HF Space Runtime URL and Repo URL ---
|
46 |
+
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
47 |
+
|
48 |
+
if profile:
|
49 |
+
username= f"{profile.username}"
|
50 |
+
print(f"User logged in: {username}")
|
51 |
+
else:
|
52 |
+
print("User not logged in.")
|
53 |
+
return "Please Login to Hugging Face with the button.", None
|
54 |
+
|
55 |
+
api_url = DEFAULT_API_URL
|
56 |
+
questions_url = f"{api_url}/questions"
|
57 |
+
submit_url = f"{api_url}/submit"
|
58 |
+
|
59 |
+
# 1. Instantiate Agent ( modify this part to create your agent)
|
60 |
+
try:
|
61 |
+
logger.info("Creating MultiModalAgent instance...")
|
62 |
+
resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
|
63 |
+
agent = MultiModalAgent(resource_dir=resource_dir)
|
64 |
+
logger.info("MultiModalAgent initialized successfully")
|
65 |
+
except Exception as e:
|
66 |
+
logger.error(f"Error instantiating agent: {e}", exc_info=True)
|
67 |
+
return f"Error initializing agent: {e}", None
|
68 |
+
# In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
|
69 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
70 |
+
print(agent_code)
|
71 |
+
|
72 |
+
# 2. Fetch Questions
|
73 |
+
print(f"Fetching questions from: {questions_url}")
|
74 |
+
try:
|
75 |
+
response = requests.get(questions_url, timeout=15)
|
76 |
+
response.raise_for_status()
|
77 |
+
questions_data = response.json()
|
78 |
+
if not questions_data:
|
79 |
+
print("Fetched questions list is empty.")
|
80 |
+
return "Fetched questions list is empty or invalid format.", None
|
81 |
+
print(f"Fetched {len(questions_data)} questions.")
|
82 |
+
except requests.exceptions.RequestException as e:
|
83 |
+
print(f"Error fetching questions: {e}")
|
84 |
+
return f"Error fetching questions: {e}", None
|
85 |
+
except requests.exceptions.JSONDecodeError as e:
|
86 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
87 |
+
print(f"Response text: {response.text[:500]}")
|
88 |
+
return f"Error decoding server response for questions: {e}", None
|
89 |
+
except Exception as e:
|
90 |
+
print(f"An unexpected error occurred fetching questions: {e}")
|
91 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
92 |
+
|
93 |
+
# 3. Run your Agent
|
94 |
+
results_log = []
|
95 |
+
answers_payload = []
|
96 |
+
print(f"Running agent on {len(questions_data)} questions...")
|
97 |
+
for item in questions_data:
|
98 |
+
task_id = item.get("task_id")
|
99 |
+
question_text = item.get("question")
|
100 |
+
if not task_id or question_text is None:
|
101 |
+
print(f"Skipping item with missing task_id or question: {item}")
|
102 |
+
continue
|
103 |
+
try:
|
104 |
+
submitted_answer = agent(question_text)
|
105 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
106 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
107 |
+
except Exception as e:
|
108 |
+
print(f"Error running agent on task {task_id}: {e}")
|
109 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
110 |
+
|
111 |
+
if not answers_payload:
|
112 |
+
print("Agent did not produce any answers to submit.")
|
113 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
114 |
+
|
115 |
+
# 4. Prepare Submission
|
116 |
+
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
117 |
+
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
118 |
+
print(status_update)
|
119 |
+
|
120 |
+
# 5. Submit
|
121 |
+
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
122 |
+
try:
|
123 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
124 |
+
response.raise_for_status()
|
125 |
+
result_data = response.json()
|
126 |
+
final_status = (
|
127 |
+
f"Submission Successful!\n"
|
128 |
+
f"User: {result_data.get('username')}\n"
|
129 |
+
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
130 |
+
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
131 |
+
f"Message: {result_data.get('message', 'No message received.')}"
|
132 |
+
)
|
133 |
+
print("Submission successful.")
|
134 |
+
results_df = pd.DataFrame(results_log)
|
135 |
+
return final_status, results_df
|
136 |
+
except requests.exceptions.HTTPError as e:
|
137 |
+
error_detail = f"Server responded with status {e.response.status_code}."
|
138 |
+
try:
|
139 |
+
error_json = e.response.json()
|
140 |
+
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
141 |
+
except requests.exceptions.JSONDecodeError:
|
142 |
+
error_detail += f" Response: {e.response.text[:500]}"
|
143 |
+
status_message = f"Submission Failed: {error_detail}"
|
144 |
+
print(status_message)
|
145 |
+
results_df = pd.DataFrame(results_log)
|
146 |
+
return status_message, results_df
|
147 |
+
except requests.exceptions.Timeout:
|
148 |
+
status_message = "Submission Failed: The request timed out."
|
149 |
+
print(status_message)
|
150 |
+
results_df = pd.DataFrame(results_log)
|
151 |
+
return status_message, results_df
|
152 |
+
except requests.exceptions.RequestException as e:
|
153 |
+
status_message = f"Submission Failed: Network error - {e}"
|
154 |
+
print(status_message)
|
155 |
+
results_df = pd.DataFrame(results_log)
|
156 |
+
return status_message, results_df
|
157 |
+
except Exception as e:
|
158 |
+
status_message = f"An unexpected error occurred during submission: {e}"
|
159 |
+
print(status_message)
|
160 |
+
results_df = pd.DataFrame(results_log)
|
161 |
+
return status_message, results_df
|
162 |
+
|
163 |
+
|
164 |
+
# --- Build Gradio Interface using Blocks ---
|
165 |
+
with gr.Blocks() as demo:
|
166 |
+
gr.Markdown("# Basic Agent Evaluation Runner")
|
167 |
+
gr.Markdown(
|
168 |
+
"""
|
169 |
+
**Instructions:**
|
170 |
+
|
171 |
+
1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
|
172 |
+
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
173 |
+
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
174 |
+
|
175 |
+
---
|
176 |
+
**Disclaimers:**
|
177 |
+
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
178 |
+
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
179 |
+
"""
|
180 |
+
)
|
181 |
+
|
182 |
+
gr.LoginButton()
|
183 |
+
|
184 |
+
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
185 |
+
|
186 |
+
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
187 |
+
# Removed max_rows=10 from DataFrame constructor
|
188 |
+
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
189 |
+
|
190 |
+
run_button.click(
|
191 |
+
fn=run_and_submit_all,
|
192 |
+
outputs=[status_output, results_table]
|
193 |
+
)
|
194 |
+
|
195 |
+
if __name__ == "__main__":
|
196 |
+
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
197 |
+
# Check for SPACE_HOST and SPACE_ID at startup for information
|
198 |
+
space_host_startup = os.getenv("SPACE_HOST")
|
199 |
+
space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
|
200 |
+
|
201 |
+
if space_host_startup:
|
202 |
+
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
203 |
+
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
204 |
+
else:
|
205 |
+
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
206 |
+
|
207 |
+
if space_id_startup: # Print repo URLs if SPACE_ID is found
|
208 |
+
print(f"✅ SPACE_ID found: {space_id_startup}")
|
209 |
+
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
210 |
+
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
211 |
+
else:
|
212 |
+
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
213 |
+
|
214 |
+
print("-"*(60 + len(" App Starting ")) + "\n")
|
215 |
+
|
216 |
+
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
217 |
+
demo.launch(debug=True, share=False)
|
index.html
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
<!doctype html>
|
2 |
-
<html>
|
3 |
-
<head>
|
4 |
-
<meta charset="utf-8" />
|
5 |
-
<meta name="viewport" content="width=device-width" />
|
6 |
-
<title>My static Space</title>
|
7 |
-
<link rel="stylesheet" href="style.css" />
|
8 |
-
</head>
|
9 |
-
<body>
|
10 |
-
<div class="card">
|
11 |
-
<h1>Welcome to your static Space!</h1>
|
12 |
-
<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
|
13 |
-
<p>
|
14 |
-
Also don't forget to check the
|
15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
16 |
-
</p>
|
17 |
-
</div>
|
18 |
-
</body>
|
19 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas>=1.3.0
|
2 |
+
gradio>=3.0.0
|
3 |
+
requests>=2.25.0
|
4 |
+
openpyxl>=3.0.9
|
5 |
+
PyPDF2>=2.0.0
|
6 |
+
python-docx>=0.8.11
|
7 |
+
python-pptx>=0.6.19
|
8 |
+
Pillow>=8.0.0
|
9 |
+
jsonschema>=4.0.0
|
10 |
+
zipfile36>=0.1.3
|
11 |
+
scikit-learn>=1.0.0
|
12 |
+
nltk>=3.6.0
|
13 |
+
python-dotenv>=0.19.0
|
14 |
+
pytest>=6.0.0
|
15 |
+
PyYAML>=6.0
|
16 |
+
biopython>=1.79
|
style.css
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
body {
|
2 |
-
padding: 2rem;
|
3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
4 |
-
}
|
5 |
-
|
6 |
-
h1 {
|
7 |
-
font-size: 16px;
|
8 |
-
margin-top: 0;
|
9 |
-
}
|
10 |
-
|
11 |
-
p {
|
12 |
-
color: rgb(107, 114, 128);
|
13 |
-
font-size: 15px;
|
14 |
-
margin-bottom: 10px;
|
15 |
-
margin-top: 5px;
|
16 |
-
}
|
17 |
-
|
18 |
-
.card {
|
19 |
-
max-width: 620px;
|
20 |
-
margin: 0 auto;
|
21 |
-
padding: 16px;
|
22 |
-
border: 1px solid lightgray;
|
23 |
-
border-radius: 16px;
|
24 |
-
}
|
25 |
-
|
26 |
-
.card p:last-child {
|
27 |
-
margin-bottom: 0;
|
28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_agent.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Test the MultiModalAgent.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import logging
|
7 |
+
import json
|
8 |
+
|
9 |
+
# Add the current directory to sys.path to import local modules
|
10 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
11 |
+
|
12 |
+
# Import the MultiModalAgent
|
13 |
+
from agent import MultiModalAgent
|
14 |
+
|
15 |
+
# Configure logging
|
16 |
+
logging.basicConfig(
|
17 |
+
level=logging.INFO,
|
18 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
19 |
+
)
|
20 |
+
logger = logging.getLogger('test_agent')
|
21 |
+
|
22 |
+
def main():
|
23 |
+
"""Test the MultiModalAgent with some sample questions."""
|
24 |
+
# Initialize the agent
|
25 |
+
resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource')
|
26 |
+
agent = MultiModalAgent(resource_dir=resource_dir)
|
27 |
+
|
28 |
+
# Load test questions from metadata.jsonl
|
29 |
+
metadata_path = os.path.join(resource_dir, 'metadata.jsonl')
|
30 |
+
test_questions = []
|
31 |
+
|
32 |
+
with open(metadata_path, 'r', encoding='utf-8') as f:
|
33 |
+
for line in f:
|
34 |
+
entry = json.loads(line.strip())
|
35 |
+
if 'Question' in entry and 'file_name' in entry and entry['file_name']:
|
36 |
+
test_questions.append({
|
37 |
+
'task_id': entry.get('task_id'),
|
38 |
+
'question': entry['Question'],
|
39 |
+
'file_name': entry['file_name'],
|
40 |
+
'expected_answer': entry.get('Final answer')
|
41 |
+
})
|
42 |
+
if len(test_questions) >= 5: # Limit to 5 questions
|
43 |
+
break
|
44 |
+
|
45 |
+
# If no questions with files were found, use some generic questions
|
46 |
+
if not test_questions:
|
47 |
+
test_questions = [
|
48 |
+
{
|
49 |
+
'question': "What's the oldest Blu-Ray in the inventory spreadsheet?",
|
50 |
+
'file_name': None,
|
51 |
+
'expected_answer': None
|
52 |
+
},
|
53 |
+
{
|
54 |
+
'question': "How many files are in the resource directory?",
|
55 |
+
'file_name': None,
|
56 |
+
'expected_answer': None
|
57 |
+
}
|
58 |
+
]
|
59 |
+
|
60 |
+
# Test the agent with each question
|
61 |
+
for i, q in enumerate(test_questions):
|
62 |
+
question = q['question']
|
63 |
+
logger.info(f"Testing question {i+1}: {question}")
|
64 |
+
|
65 |
+
answer = agent(question)
|
66 |
+
logger.info(f"Answer: {answer}")
|
67 |
+
|
68 |
+
if q['expected_answer']:
|
69 |
+
logger.info(f"Expected answer: {q['expected_answer']}")
|
70 |
+
if answer.strip() == q['expected_answer'].strip():
|
71 |
+
logger.info("Correct answer!")
|
72 |
+
else:
|
73 |
+
logger.warning("Incorrect answer.")
|
74 |
+
|
75 |
+
logger.info("-" * 80)
|
76 |
+
|
77 |
+
if __name__ == "__main__":
|
78 |
+
main()
|