Arbnor Tefiki committed on
Commit
f011b22
·
1 Parent(s): 6fd83b8

Add some debugging tools

Browse files
agent/agent.py CHANGED
@@ -40,26 +40,30 @@ class MultiModalAgent:
40
  # Cache for answers
41
  self.answer_cache = {}
42
 
43
- def __call__(self, question: str) -> str:
44
  """
45
  Process a question and return an answer.
46
 
47
  Args:
48
  question: The question to answer
 
49
 
50
  Returns:
51
  Answer to the question
52
  """
53
  logger.info(f"Processing question: {question[:100]}...")
 
 
54
 
55
  # Check answer cache
56
- if question in self.answer_cache:
 
57
  logger.info("Answer found in cache")
58
- return self.answer_cache[question]
59
 
60
  try:
61
  # Analyze the question
62
- analysis = self.question_analyzer.analyze_question(question)
63
  logger.info(f"Question analysis: {analysis}")
64
 
65
  # Handle general questions that don't require file processing
@@ -67,7 +71,7 @@ class MultiModalAgent:
67
  logger.info("No file reference found in question, trying to answer directly")
68
  direct_answer = self._answer_without_file(question)
69
  if direct_answer:
70
- self.answer_cache[question] = direct_answer
71
  return direct_answer
72
 
73
  # If direct answering failed, try to find a file in the resource directory
@@ -75,7 +79,19 @@ class MultiModalAgent:
75
  analysis['file_path'] = self._find_most_relevant_file(question)
76
  if not analysis['file_path']:
77
  logger.warning("No relevant file found for the question")
78
- return "I couldn't find a relevant file to answer this question."
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  # Extract content from the file
81
  file_path = analysis['file_path']
@@ -95,7 +111,7 @@ class MultiModalAgent:
95
  answer = self._process_content(content, handler, question)
96
 
97
  # Cache the answer
98
- self.answer_cache[question] = answer
99
 
100
  return answer
101
  except Exception as e:
@@ -232,10 +248,21 @@ class MultiModalAgent:
232
 
233
  # Return the most relevant file if it has a non-zero score
234
  if scores and scores[0][1] > 0:
 
235
  return scores[0][0]
236
 
237
- # If no relevant file is found based on the question, return None
 
 
 
 
 
 
 
 
 
238
  return None
 
239
  except Exception as e:
240
  logger.error(f"Error finding relevant file: {e}")
241
  return None
 
40
  # Cache for answers
41
  self.answer_cache = {}
42
 
43
+ def __call__(self, question: str, task_id: Optional[str] = None) -> str:
44
  """
45
  Process a question and return an answer.
46
 
47
  Args:
48
  question: The question to answer
49
+ task_id: The task ID (optional)
50
 
51
  Returns:
52
  Answer to the question
53
  """
54
  logger.info(f"Processing question: {question[:100]}...")
55
+ if task_id:
56
+ logger.info(f"Task ID: {task_id}")
57
 
58
  # Check answer cache
59
+ cache_key = f"{task_id}:{question}" if task_id else question
60
+ if cache_key in self.answer_cache:
61
  logger.info("Answer found in cache")
62
+ return self.answer_cache[cache_key]
63
 
64
  try:
65
  # Analyze the question
66
+ analysis = self.question_analyzer.analyze_question(question, task_id)
67
  logger.info(f"Question analysis: {analysis}")
68
 
69
  # Handle general questions that don't require file processing
 
71
  logger.info("No file reference found in question, trying to answer directly")
72
  direct_answer = self._answer_without_file(question)
73
  if direct_answer:
74
+ self.answer_cache[cache_key] = direct_answer
75
  return direct_answer
76
 
77
  # If direct answering failed, try to find a file in the resource directory
 
79
  analysis['file_path'] = self._find_most_relevant_file(question)
80
  if not analysis['file_path']:
81
  logger.warning("No relevant file found for the question")
82
+ # List available files for debugging
83
+ try:
84
+ files = os.listdir(self.resource_dir)
85
+ logger.info(f"Available files in {self.resource_dir}: {files}")
86
+ except Exception as e:
87
+ logger.error(f"Error listing files in resource directory: {e}")
88
+
89
+ # Check if resource directory exists
90
+ if not os.path.exists(self.resource_dir):
91
+ logger.error(f"Resource directory does not exist: {self.resource_dir}")
92
+ return f"Error: Resource directory not found at {self.resource_dir}. Please check the path."
93
+
94
+ return "I couldn't find a relevant file to answer this question. Please provide more context or specify a file."
95
 
96
  # Extract content from the file
97
  file_path = analysis['file_path']
 
111
  answer = self._process_content(content, handler, question)
112
 
113
  # Cache the answer
114
+ self.answer_cache[cache_key] = answer
115
 
116
  return answer
117
  except Exception as e:
 
248
 
249
  # Return the most relevant file if it has a non-zero score
250
  if scores and scores[0][1] > 0:
251
+ logger.info(f"Found relevant file: {scores[0][0]} with score {scores[0][1]}")
252
  return scores[0][0]
253
 
254
+ # If no relevant file is found based on the question, try to default to the metadata file
255
+ if not scores or scores[0][1] == 0:
256
+ # Look for metadata file as a fallback
257
+ metadata_path = os.path.join(self.resource_dir, 'metadata.jsonl')
258
+ if os.path.exists(metadata_path):
259
+ logger.info("No specific file found, defaulting to metadata.jsonl")
260
+ return metadata_path
261
+
262
+ # If we get here, no relevant file was found
263
+ logger.warning("No relevant file found for the question")
264
  return None
265
+
266
  except Exception as e:
267
  logger.error(f"Error finding relevant file: {e}")
268
  return None
agent/utils/debug_metadata.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script to debug metadata loading and file finding.
3
+ """
4
+ import os
5
+ import json
6
+ import sys
7
+
8
+ # Add the parent directory to sys.path
9
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
+
11
+ from agent.utils.question_analyzer import QuestionAnalyzer
12
+
13
def main():
    """Print diagnostics about the resource directory, metadata and file lookup.

    The script performs these steps, printing the outcome of each one:

    1. Resolve the ``resource`` directory two levels above this file's
       location and list its contents; stop if it does not exist.
    2. Stop if ``metadata.jsonl`` is missing from that directory.
    3. Build a ``QuestionAnalyzer`` for the directory and print the first
       few entries of its ``metadata`` mapping, checking whether each
       referenced file exists on disk.
    4. Read the first few entries of ``metadata.jsonl`` that have both a
       question and a file name, and report what
       ``QuestionAnalyzer.find_relevant_file`` returns for each of them.
    """
    # How many metadata entries / test questions to sample.
    sample_size = 5

    # Get the resource directory
    package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    resource_dir = os.path.join(package_dir, 'resource')
    print(f"Resource directory: {resource_dir}")

    # Check if the directory exists
    if not os.path.exists(resource_dir):
        print(f"Resource directory does not exist: {resource_dir}")
        return

    # List files in the directory
    print("Files in resource directory:")
    for file in os.listdir(resource_dir):
        print(f" {file}")

    # Check for metadata.jsonl
    metadata_path = os.path.join(resource_dir, 'metadata.jsonl')
    if not os.path.exists(metadata_path):
        print(f"Metadata file does not exist: {metadata_path}")
        return

    # Load metadata
    print("\nLoading metadata...")
    question_analyzer = QuestionAnalyzer(resource_dir)

    # Print metadata entries
    print(f"Metadata entries: {len(question_analyzer.metadata)}")

    # Print first few entries
    for index, (task_id, entry) in enumerate(question_analyzer.metadata.items()):
        if index >= sample_size:
            break

        # A stored value of None would break slicing, so fall back to 'N/A'.
        question_text = entry.get('Question') or 'N/A'
        print(f"\nTask ID: {task_id}")
        print(f"Question: {question_text[:100]}...")
        print(f"File Name: {entry.get('file_name', 'N/A')}")
        print(f"Expected Answer: {entry.get('Final answer', 'N/A')}")

        # Check if the file exists
        if entry.get('file_name'):
            file_path = os.path.join(resource_dir, entry['file_name'])
            if os.path.exists(file_path):
                print(f"✅ File exists: {file_path}")
            else:
                print(f"❌ File does not exist: {file_path}")

    # Test file finding
    print("\nTesting file finding...")
    test_questions = []

    with open(metadata_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            if not line:
                continue
            entry = json.loads(line)
            if entry.get('Question') and entry.get('file_name'):
                test_questions.append({
                    'task_id': entry.get('task_id'),
                    'question': entry['Question'],
                    'file_name': entry['file_name'],
                })
                if len(test_questions) >= sample_size:
                    break

    for q in test_questions:
        print(f"\nQuestion: {q['question'][:100]}...")
        print(f"Expected file: {q['file_name']}")

        file_path = question_analyzer.find_relevant_file(q['question'], q['task_id'])
        if file_path:
            print(f"✅ Found file: {os.path.basename(file_path)}")
        else:
            print("❌ No file found")
87
+
88
+ if __name__ == "__main__":
89
+ main()
agent/utils/question_analyzer.py CHANGED
@@ -88,23 +88,49 @@ class QuestionAnalyzer:
88
  # Check if task_id is in metadata and has a file_name
89
  if task_id and task_id in self.metadata:
90
  file_name = self.metadata[task_id].get('file_name')
91
- if file_name:
92
  file_path = os.path.join(self.resource_dir, file_name)
93
  if os.path.exists(file_path):
 
94
  return file_path
95
 
 
 
 
 
 
 
 
 
 
 
 
96
  # Extract file mention from question
97
  file_mention = self.extract_file_mention(question)
98
  if file_mention:
99
  # Check if the mentioned file exists
100
  file_path = os.path.join(self.resource_dir, file_mention)
101
  if os.path.exists(file_path):
 
102
  return file_path
103
 
104
  # Check if there's a file with a similar name
105
  for file_name in os.listdir(self.resource_dir):
106
  if file_mention.lower() in file_name.lower():
107
- return os.path.join(self.resource_dir, file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  # If no file is found, try to find a file mentioned in the metadata
110
  if task_id and task_id in self.metadata:
@@ -131,9 +157,58 @@ class QuestionAnalyzer:
131
  best_match = file_name
132
 
133
  if best_match:
134
- return os.path.join(self.resource_dir, best_match)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  def _extract_keywords(self, text: str) -> Set[str]:
139
  """
@@ -183,6 +258,13 @@ class QuestionAnalyzer:
183
  'expected_answer': None,
184
  }
185
 
 
 
 
 
 
 
 
186
  # Find relevant file
187
  file_path = self.find_relevant_file(question, task_id)
188
  if file_path:
 
88
  # Check if task_id is in metadata and has a file_name
89
  if task_id and task_id in self.metadata:
90
  file_name = self.metadata[task_id].get('file_name')
91
+ if file_name and file_name.strip(): # Make sure file_name is not empty
92
  file_path = os.path.join(self.resource_dir, file_name)
93
  if os.path.exists(file_path):
94
+ print(f"Found file in metadata for task_id {task_id}: {file_path}")
95
  return file_path
96
 
97
+ # Try to find task_id in all metadata entries by matching the question
98
+ if not task_id:
99
+ for entry_id, entry in self.metadata.items():
100
+ if entry.get('Question') and entry.get('Question') == question:
101
+ file_name = entry.get('file_name')
102
+ if file_name and file_name.strip():
103
+ file_path = os.path.join(self.resource_dir, file_name)
104
+ if os.path.exists(file_path):
105
+ print(f"Found file in metadata by matching question: {file_path}")
106
+ return file_path
107
+
108
  # Extract file mention from question
109
  file_mention = self.extract_file_mention(question)
110
  if file_mention:
111
  # Check if the mentioned file exists
112
  file_path = os.path.join(self.resource_dir, file_mention)
113
  if os.path.exists(file_path):
114
+ print(f"Found file by direct mention: {file_path}")
115
  return file_path
116
 
117
  # Check if there's a file with a similar name
118
  for file_name in os.listdir(self.resource_dir):
119
  if file_mention.lower() in file_name.lower():
120
+ file_path = os.path.join(self.resource_dir, file_name)
121
+ print(f"Found file by partial name match: {file_path}")
122
+ return file_path
123
+
124
+ # Look for UUID pattern in the question which might be a file name without extension
125
+ uuid_pattern = r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
126
+ uuid_match = re.search(uuid_pattern, question, re.IGNORECASE)
127
+ if uuid_match:
128
+ uuid = uuid_match.group(1)
129
+ for file_name in os.listdir(self.resource_dir):
130
+ if uuid in file_name:
131
+ file_path = os.path.join(self.resource_dir, file_name)
132
+ print(f"Found file by UUID match: {file_path}")
133
+ return file_path
134
 
135
  # If no file is found, try to find a file mentioned in the metadata
136
  if task_id and task_id in self.metadata:
 
157
  best_match = file_name
158
 
159
  if best_match:
160
+ file_path = os.path.join(self.resource_dir, best_match)
161
+ print(f"Found file by keyword matching: {file_path}")
162
+ return file_path
163
+
164
+ # If still no match, check the content of metadata.jsonl for clues
165
+ try:
166
+ with open(self.metadata_path, 'r', encoding='utf-8') as f:
167
+ for line in f:
168
+ entry = json.loads(line.strip())
169
+ if 'Question' in entry and entry['Question'] and 'file_name' in entry and entry['file_name']:
170
+ # Compare with current question
171
+ if self._questions_are_similar(question, entry['Question']):
172
+ file_name = entry['file_name']
173
+ file_path = os.path.join(self.resource_dir, file_name)
174
+ if os.path.exists(file_path):
175
+ print(f"Found file by similar question in metadata: {file_path}")
176
+ return file_path
177
+ except Exception as e:
178
+ print(f"Error searching metadata for similar questions: {e}")
179
 
180
  return None
181
+
182
+ def _questions_are_similar(self, q1: str, q2: str) -> bool:
183
+ """
184
+ Check if two questions are similar.
185
+
186
+ Args:
187
+ q1: First question
188
+ q2: Second question
189
+
190
+ Returns:
191
+ True if the questions are similar, False otherwise
192
+ """
193
+ # Convert to lowercase and remove punctuation
194
+ q1 = re.sub(r'[^\w\s]', '', q1.lower())
195
+ q2 = re.sub(r'[^\w\s]', '', q2.lower())
196
+
197
+ # Split into words
198
+ words1 = set(q1.split())
199
+ words2 = set(q2.split())
200
+
201
+ # Calculate Jaccard similarity
202
+ intersection = len(words1.intersection(words2))
203
+ union = len(words1.union(words2))
204
+
205
+ if union == 0:
206
+ return False
207
+
208
+ similarity = intersection / union
209
+
210
+ # Return True if similarity is above threshold
211
+ return similarity > 0.5
212
 
213
  def _extract_keywords(self, text: str) -> Set[str]:
214
  """
 
258
  'expected_answer': None,
259
  }
260
 
261
+ # Try to extract task_id from the question if not provided
262
+ if not task_id:
263
+ task_id_match = re.search(r'task_id[: ]+([\w\-]+)', question, re.IGNORECASE)
264
+ if task_id_match:
265
+ result['task_id'] = task_id_match.group(1)
266
+ task_id = result['task_id']
267
+
268
  # Find relevant file
269
  file_path = self.find_relevant_file(question, task_id)
270
  if file_path:
app.py CHANGED
@@ -101,11 +101,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
101
  print(f"Skipping item with missing task_id or question: {item}")
102
  continue
103
  try:
104
- submitted_answer = agent(question_text)
 
 
105
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
106
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
107
  except Exception as e:
108
- print(f"Error running agent on task {task_id}: {e}")
109
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
110
 
111
  if not answers_payload:
 
101
  print(f"Skipping item with missing task_id or question: {item}")
102
  continue
103
  try:
104
+ logger.info(f"Processing question with task_id: {task_id}")
105
+ logger.info(f"Question text: {question_text[:100]}...")
106
+ submitted_answer = agent(question_text, task_id)
107
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
108
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
109
  except Exception as e:
110
+ logger.error(f"Error running agent on task {task_id}: {e}", exc_info=True)
111
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
112
 
113
  if not answers_payload: