martinsu committed on
Commit
c48121b
·
1 Parent(s): 81917a3

Add .gitignore, implement BasicAgent in agent.py, and enhance app.py for checkpointing

Browse files
Files changed (4) hide show
  1. .gitignore +15 -0
  2. agent.py +1009 -0
  3. app.py +156 -43
  4. requirements.txt +9 -1
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ # Documentation
6
+ DOCUMENTATION/
7
+
8
+ #test sets
9
+ TEST_SET/
10
+
11
+ #test results
12
+ test_results/
13
+
14
+ #cursor
15
+ .cursor/
agent.py ADDED
@@ -0,0 +1,1009 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, TypedDict, Dict, Any, Literal
2
+ from langgraph.graph import StateGraph, START, END
3
+ from langgraph.types import Command
4
+ from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
5
+ from langchain_anthropic import ChatAnthropic
6
+ from langchain_core.tools import tool
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langgraph.prebuilt import ToolNode
9
+ import os
10
+ from dotenv import load_dotenv
11
+ from datetime import datetime
12
+ from tavily import TavilyClient
13
+ from langfuse.callback import CallbackHandler
14
+ import requests
15
+ import json
16
+ import time
17
+ from daytona_sdk import Daytona, DaytonaConfig
18
+
19
+
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # Define the state schema with messages that ToolNode can use
25
class AgentState(TypedDict):
    """State dictionary shared by the nodes of the agent graph."""

    # Conversation history; the supervisor seeds it with the question and its
    # instructions and later appends feedback messages.
    messages: List
    # The question being answered; read by the supervisor on every visit.
    current_question: str
    # NOTE(review): presumably the final answer text; not written by the code
    # visible here — confirm against the validator node.
    final_answer: str
    # NOTE(review): presumably the validator's verdict text — confirm.
    validation_result: str
    # Number of worker rounds so far; set to 1 on the first supervisor pass
    # and incremented each time the supervisor sends feedback.
    worker_iterations: int
    # True once the supervisor's evaluation starts with "SATISFIED".
    supervisor_satisfaction: bool
    # NOTE(review): presumably set by the validator node — confirm.
    validator_approval: bool
33
+
34
+ # Define tools following Langgraph guide
35
+
36
+
37
@tool
def search_web_tavily(query: str) -> str:
    """Search the web for information using the Tavily search API.

    Parameters:
    - query: The search query string

    Returns:
    - A numbered list of results (title, URL, content) or an error message
    """
    # Fail with a readable message, like the other tools in this module,
    # instead of letting a missing key surface as a client exception.
    api_key = os.getenv("TAVILY_API_KEY")
    if not api_key:
        return "Error: TAVILY_API_KEY environment variable not set"

    try:
        client = TavilyClient(api_key)
        response = client.search(query=query)
    except Exception as e:
        # Report the failure as text so the calling agent can react to it.
        return f"Error searching with Tavily: {str(e)}"

    # One numbered entry per result.
    results = [
        f"{i}. {result.get('title')}\n URL: {result.get('url')}\n {result.get('content')}\n"
        for i, result in enumerate(response.get("results", []), 1)
    ]

    return f"Search results for '{query}':\n\n" + "\n".join(results)
55
+
56
@tool
def search_web_serper(query: str, result_limit: int = 5, search_type: str = "search") -> str:
    """Search the web for information using the Serper.dev API.

    This tool provides comprehensive search results including:
    1. Knowledge Graph data when available (title, description, attributes)
    2. Organic search results (titles, links, snippets)
    3. Related questions from "People Also Ask" section
    4. Top stories and news articles related to the query

    It's particularly useful for gathering factual information, current events,
    and general knowledge from across the web. The results are formatted in a
    readable structure with clear sections.

    Parameters:
    - query: The search query string
    - result_limit: Maximum number of results to return per section (default: 5)
    - search_type: Type of search ('search', 'news', 'places', 'images', 'shopping')

    Returns:
    - The formatted search results or an error message
    """
    # Fail early with a readable message, like the other tools in this module.
    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        return "Error: SERPER_API_KEY environment variable not set"

    url = "https://google.serper.dev/search"
    headers = {
        'X-API-KEY': api_key,
        'Content-Type': 'application/json'
    }

    # NOTE(review): search_type is sent as a "type" field to the /search
    # endpoint — confirm that the Serper API honours it there.
    payload = json.dumps({
        "q": query,
        "type": search_type
    })

    try:
        response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors

        # Decode separately: the JSON error raised by requests is also a
        # RequestException, so a handler placed after RequestException below
        # would never be reached.
        try:
            data = response.json()
        except ValueError:
            return "Error: Received invalid JSON response from Serper API"

        results = []

        # Knowledge graph, when present
        if "knowledgeGraph" in data:
            kg = data["knowledgeGraph"]
            results.append(f"Knowledge Graph:\n{kg.get('title', 'Unknown')} - {kg.get('type', '')}")
            results.append(f"Description: {kg.get('description', 'No description available')}")

            if "attributes" in kg:
                results.append("Attributes:")
                for key, value in kg["attributes"].items():
                    results.append(f"- {key}: {value}")

            results.append("")  # Empty line for separation

        # Organic search results
        if "organic" in data:
            results.append("Organic Search Results:")
            for i, result in enumerate(data["organic"][:result_limit], 1):
                results.append(f"{i}. {result.get('title', 'No title')}")
                results.append(f" URL: {result.get('link', 'No link')}")
                results.append(f" {result.get('snippet', 'No snippet')}")
                results.append("")  # Empty line for separation

        # "People Also Ask" entries, capped at three
        if "peopleAlsoAsk" in data and data["peopleAlsoAsk"]:
            results.append("People Also Ask:")
            for i, qa in enumerate(data["peopleAlsoAsk"][:min(3, result_limit)], 1):
                results.append(f"{i}. Q: {qa.get('question', 'No question')}")
                results.append(f" A: {qa.get('snippet', 'No answer')}")
                results.append("")  # Empty line for separation

        # Top stories, capped at three
        if "topStories" in data and data["topStories"]:
            results.append("Top Stories:")
            for i, story in enumerate(data["topStories"][:min(3, result_limit)], 1):
                results.append(f"{i}. {story.get('title', 'No title')}")
                results.append(f" Source: {story.get('source', 'Unknown source')}")
                if "date" in story:
                    results.append(f" Published: {story.get('date')}")
                results.append(f" URL: {story.get('link', 'No link')}")
                results.append("")  # Empty line for separation

        return f"Search results for '{query}':\n\n" + "\n".join(results)

    except requests.exceptions.Timeout:
        return "Error: Request to Serper API timed out after 30 seconds"
    except requests.exceptions.RequestException as e:
        return f"Error making request to Serper API: {str(e)}"
    except Exception as e:
        return f"Error processing search results: {str(e)}"
153
+
154
+ # Initialize a global Daytona sandbox for reuse
155
+ _daytona_sandbox = None
156
+
157
@tool
def execute_code_securely(code: str, language: str = "python", timeout: int = 300) -> str:
    """Execute code securely in an isolated sandbox environment using Daytona.

    This tool runs code in a secure, isolated environment to prevent security issues.
    It's particularly useful for solving computational problems, data processing tasks,
    mathematical calculations, and other scenarios where code execution is needed.

    The tool supports multiple languages, with Python as the default.

    Parameters:
    - code: The code to execute
    - language: The programming language (default: "python")
    - timeout: Maximum execution time in seconds (default: 300)

    Returns:
    - The execution result or error message
    """
    global _daytona_sandbox

    # language -> (source file extension, command template).
    # {src} is the uploaded source file, {bin} the compiled binary and
    # {stem} the source file name without its extension.
    runners = {
        "javascript": ("js", "node {src}"),
        "nodejs": ("js", "node {src}"),
        "ruby": ("rb", "ruby {src}"),
        "php": ("php", "php {src}"),
        "bash": ("sh", "bash {src}"),
        "shell": ("sh", "sh {src}"),
        "powershell": ("ps1", "pwsh {src}"),
        "c": ("c", "gcc {src} -o {bin} && {bin}"),
        "cpp": ("cpp", "g++ {src} -o {bin} && {bin}"),
        # NOTE(review): javac requires a public class to match the file name,
        # so this only works for sources without a public class — confirm.
        "java": ("java", "javac {src} && java -cp /tmp {stem}"),
        "go": ("go", "go run {src}"),
        "rust": ("rs", "rustc {src} -o {bin} && {bin}"),
    }

    lang = language.lower()
    if lang != "python" and lang not in runners:
        # Previously the source was uploaded and echoed back with `cat`,
        # which presented the unexecuted code as an execution result.
        return f"Error: Unsupported language: {language}"

    try:
        # Lazily create the shared sandbox on first use
        if _daytona_sandbox is None:
            api_key = os.getenv("DAYTONA_API_KEY")
            if not api_key:
                return "Error: DAYTONA_API_KEY environment variable not set"

            config = DaytonaConfig(api_key=api_key)
            daytona_client = Daytona(config)
            _daytona_sandbox = daytona_client.create()

        if lang == "python":
            response = _daytona_sandbox.process.code_run(code, timeout=timeout)
        else:
            extension, template = runners[lang]

            # Microsecond resolution keeps file names from colliding when
            # two runs start within the same second.
            run_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
            stem = f"code_{run_id}"
            filename = f"/tmp/{stem}.{extension}"

            # Upload the code file to the sandbox, then run it
            _daytona_sandbox.fs.upload_file(filename, code.encode('utf-8'))
            exec_cmd = template.format(src=filename, bin=f"/tmp/prog_{run_id}", stem=stem)
            response = _daytona_sandbox.process.exec(exec_cmd, cwd="/tmp", timeout=timeout)

        # The response shape differs between calls; prefer `result`,
        # then `stdout`, then the plain string form.
        if hasattr(response, 'result'):
            result = response.result
        elif hasattr(response, 'stdout'):
            result = response.stdout
        else:
            result = str(response)

        return f"Code Execution Result ({language}):\n{result}"

    except Exception as e:
        # Discard the sandbox so the next call starts from a fresh one.
        # Remove it remotely first (best effort) rather than only dropping
        # the reference, which would leave the old sandbox running.
        if _daytona_sandbox is not None:
            cleanup_daytona_sandbox()
            _daytona_sandbox = None

        return f"Error executing code: {str(e)}"
253
+
254
@tool
def execute_shell_command(command: str, working_dir: str = "/tmp", timeout: int = 300) -> str:
    """Execute a shell command securely in an isolated sandbox environment using Daytona.

    This tool runs shell commands in a secure, isolated environment to prevent security issues.
    It's useful for file operations, system tasks, and other command-line operations.

    Parameters:
    - command: The shell command to execute
    - working_dir: The working directory (default: "/tmp")
    - timeout: Maximum execution time in seconds (default: 300)

    Returns:
    - The command execution output or error message
    """
    global _daytona_sandbox

    try:
        # Lazily create the shared sandbox on first use
        if _daytona_sandbox is None:
            api_key = os.getenv("DAYTONA_API_KEY")
            if not api_key:
                return "Error: DAYTONA_API_KEY environment variable not set"

            config = DaytonaConfig(api_key=api_key)
            daytona_client = Daytona(config)
            _daytona_sandbox = daytona_client.create()

        response = _daytona_sandbox.process.exec(command, cwd=working_dir, timeout=timeout)

        # Prefer `result`, then `stdout`, then the plain string form.
        if hasattr(response, 'result'):
            result = response.result
        elif hasattr(response, 'stdout'):
            result = response.stdout
        else:
            result = str(response)

        return f"Shell Command Execution Result:\n{result}"

    except Exception as e:
        # Discard the sandbox so the next call starts from a fresh one.
        # Remove it remotely first (best effort) rather than only dropping
        # the reference, which would leave the old sandbox running.
        if _daytona_sandbox is not None:
            cleanup_daytona_sandbox()
            _daytona_sandbox = None

        return f"Error executing shell command: {str(e)}"
305
+
306
@tool
def sandbox_file_operation(operation: str, file_path: str, content: str = "", target_path: str = "") -> str:
    """Perform file operations in the secure sandbox environment.

    This tool allows secure file manipulation in an isolated sandbox.
    It supports creating, reading, writing, moving, copying and deleting files.

    Parameters:
    - operation: The operation to perform ('create', 'read', 'write', 'append', 'delete', 'move', 'copy', 'list')
    - file_path: Path to the file to operate on
    - content: Content to write (for 'create', 'write', 'append' operations)
    - target_path: Target path for 'move' and 'copy' operations

    Returns:
    - Operation result or file content
    """
    global _daytona_sandbox

    import shlex  # local import: only needed to quote paths for the shell

    try:
        # Lazily create the shared sandbox on first use
        if _daytona_sandbox is None:
            api_key = os.getenv("DAYTONA_API_KEY")
            if not api_key:
                return "Error: DAYTONA_API_KEY environment variable not set"

            config = DaytonaConfig(api_key=api_key)
            daytona_client = Daytona(config)
            _daytona_sandbox = daytona_client.create()

        operation = operation.lower()

        # Paths are quoted before being placed in a shell command so that
        # spaces and shell metacharacters are taken literally; `--` stops a
        # path starting with "-" from being read as an option.
        quoted_path = shlex.quote(file_path)
        quoted_target = shlex.quote(target_path)

        if operation == "create" or operation == "write":
            # Create or overwrite file
            _daytona_sandbox.fs.upload_file(file_path, content.encode('utf-8'))
            return f"File {file_path} created/written successfully"

        elif operation == "append":
            # A file that cannot be read is treated as empty, so appending
            # to a missing file creates it.
            try:
                existing_content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
            except Exception:
                existing_content = ""

            new_content = existing_content + content
            _daytona_sandbox.fs.upload_file(file_path, new_content.encode('utf-8'))
            return f"Content appended to {file_path} successfully"

        elif operation == "read":
            try:
                content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
                return f"Content of {file_path}:\n{content}"
            except Exception as e:
                return f"Error reading {file_path}: {str(e)}"

        elif operation == "delete":
            _daytona_sandbox.process.exec(f"rm -f -- {quoted_path}", cwd="/tmp")
            return f"File {file_path} deleted"

        elif operation == "move":
            if not target_path:
                return "Error: Target path required for move operation"
            _daytona_sandbox.process.exec(f"mv -- {quoted_path} {quoted_target}", cwd="/tmp")
            return f"File moved from {file_path} to {target_path}"

        elif operation == "copy":
            if not target_path:
                return "Error: Target path required for copy operation"
            _daytona_sandbox.process.exec(f"cp -- {quoted_path} {quoted_target}", cwd="/tmp")
            return f"File copied from {file_path} to {target_path}"

        elif operation == "list":
            response = _daytona_sandbox.process.exec(f"ls -la -- {quoted_path}", cwd="/tmp")
            # Prefer `result`, then `stdout`, then the plain string form.
            if hasattr(response, 'result'):
                result = response.result
            elif hasattr(response, 'stdout'):
                result = response.stdout
            else:
                result = str(response)
            return f"Directory listing of {file_path}:\n{result}"

        else:
            return f"Unsupported operation: {operation}"

    except Exception as e:
        return f"Error performing file operation: {str(e)}"
399
+
400
def cleanup_daytona_sandbox():
    """Remove the shared Daytona sandbox, if one exists.

    Does nothing when no sandbox has been created or when DAYTONA_API_KEY
    is not set. Failures are printed rather than raised.
    """
    global _daytona_sandbox

    try:
        # Nothing to remove
        if _daytona_sandbox is None:
            return

        # A client cannot be built without the key
        token = os.getenv("DAYTONA_API_KEY")
        if not token:
            return

        # Build a fresh client, remove the sandbox, then forget the reference
        Daytona(DaytonaConfig(api_key=token)).remove(_daytona_sandbox)
        _daytona_sandbox = None
        print("Daytona sandbox cleaned up successfully")
    except Exception as err:
        print(f"Error cleaning up Daytona sandbox: {str(err)}")
418
+
419
+ # Track last execution time for rate limiting
420
+ _last_extract_url_time = 0
421
+
422
@tool
def extract_document_data(input_method: str, files: list, prompt: str, json_mode: bool = False) -> str:
    """Extract structured data from documents using Dumpling AI.

    This tool allows you to extract information from various document formats including PDFs,
    Office documents, images, and many other file types. It uses vision-capable Large Language
    Models (LLMs) to interpret and extract data based on your specific prompt.

    Parameters:
    - input_method: How to input files, either "url" or "base64"
    - files: List of file URLs or base64-encoded strings depending on input_method
    - prompt: Specific instructions for what data to extract from the document
    - json_mode: Whether to return structured JSON (true) or free text (false)

    Returns:
    - Extracted data from the document based on your prompt

    Supported file extensions include PDFs, Word docs, Excel files, PowerPoint, images, HTML, and many others.
    """
    api_key = os.getenv("DUMPLING_API_KEY")
    if not api_key:
        return "Error: DUMPLING_API_KEY environment variable not set"

    try:
        url = "https://app.dumplingai.com/api/v1/extract-document"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        data = {
            "inputMethod": input_method,
            "files": files,
            "prompt": prompt,
            "jsonMode": json_mode
        }

        response = requests.post(url, headers=headers, json=data, timeout=120)
        response.raise_for_status()

        result = response.json()

        # Format the response in a readable way
        formatted_response = "Document Extraction Results:\n\n"
        formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
        formatted_response += f"Pages Processed: {result.get('pages', 'Unknown')}\n"
        formatted_response += f"Files Processed: {result.get('fileCount', 'Unknown')}\n"
        formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"

        return formatted_response

    except requests.exceptions.Timeout:
        return "Error: Request to Dumpling AI API timed out after 120 seconds"
    except requests.exceptions.HTTPError as e:
        error_detail = f"HTTP Error: {e.response.status_code}"
        try:
            error_json = e.response.json()
            error_detail += f" - {error_json.get('detail', error_json)}"
        except (ValueError, AttributeError):
            # ValueError: the error body is not JSON.
            # AttributeError: the JSON body is not an object (no `.get`).
            # Either way, fall back to the first 500 characters of raw text.
            error_detail += f" - {e.response.text[:500]}"
        return error_detail
    except requests.exceptions.RequestException as e:
        return f"Error making request to Dumpling AI API: {str(e)}"
    except Exception as e:
        return f"Error extracting document data: {str(e)}"
487
+
488
@tool
def extract_url_content(url: str) -> str:
    """Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
    This function is rate-limited to execute no more frequently than once every 20 seconds."""
    global _last_extract_url_time

    # Check configuration first, so a missing token is reported immediately
    # instead of after a rate-limit sleep of up to 20 seconds.
    token = os.getenv("DIFFBOT_TOKEN")
    if not token:
        return "Error: DIFFBOT_TOKEN environment variable not set"

    # Minimum number of seconds between two API calls
    min_interval = 20

    current_time = time.time()
    time_since_last_call = current_time - _last_extract_url_time

    # _last_extract_url_time is 0 until the first call, which never waits
    if time_since_last_call < min_interval and _last_extract_url_time > 0:
        wait_time = min_interval - time_since_last_call
        print(f"Rate limiting: waiting {wait_time:.2f} seconds before next API call")
        time.sleep(wait_time)
        current_time = time.time()  # Update current time after sleeping

    # Record when this call was issued
    _last_extract_url_time = current_time

    api_url = "https://api.diffbot.com/v3/article"
    params = {
        "token": token,
        "url": url
    }

    try:
        response = requests.get(api_url, params=params, timeout=30)  # 30 second timeout
        response.raise_for_status()  # Raise exception for HTTP errors

        data = response.json()

        # Only the first extracted object is reported
        if "objects" in data and len(data["objects"]) > 0:
            obj = data["objects"][0]

            result = f"Title: {obj.get('title', 'No title')}\n\n"

            if "text" in obj:
                result += f"Content:\n{obj.get('text')}\n\n"

            if "categories" in obj and obj["categories"]:
                categories = ", ".join([f"{cat.get('name')} ({cat.get('score', 0):.2f})"
                                        for cat in obj["categories"]])
                result += f"Categories: {categories}\n"

            result += f"Source: {obj.get('siteName', 'Unknown')}\n"
            result += f"URL: {obj.get('pageUrl', url)}"

            return result
        else:
            # Include the raw response so the caller can see why nothing
            # was extracted.
            return f"No content could be extracted from {url}. Response: {data}"

    except requests.exceptions.Timeout:
        return f"Error: Request to extract content from {url} timed out after 30 seconds"
    except requests.exceptions.RequestException as e:
        return f"Error: Failed to extract content from {url}: {str(e)}"
    except Exception as e:
        return f"Error extracting content from {url}: {str(e)}"
561
+
562
+ class BasicAgent:
563
+ def __init__(self):
564
+ print("BasicAgent initialized.")
565
+ # Initialize the Anthropic models
566
+ # Standard model for supervisor and validator
567
+ self.langfuse_handler = CallbackHandler()
568
+
569
+ self.supervisor_model = ChatAnthropic(
570
+ model="claude-3-7-sonnet-20250219",
571
+ max_tokens=20000,
572
+ anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"),
573
+ temperature=0.6,
574
+ # thinking={
575
+ # "type": "enabled",
576
+ # "budget_tokens": 5000
577
+ # }
578
+ )
579
+
580
+ # Standard model for validator
581
+ self.validator_model = ChatAnthropic(
582
+ model="claude-3-7-sonnet-20250219",
583
+ max_tokens=20000,
584
+ temperature=0.5, # Lower temperature for more consistent validation
585
+ anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
586
+ )
587
+
588
+ # Tool-enabled model for worker
589
+ self.worker_model_base = ChatAnthropic(
590
+ model="claude-3-7-sonnet-20250219",
591
+ max_tokens=20000,
592
+ temperature=0.75,
593
+ anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
594
+ )
595
+
596
+ # Initialize tools
597
+ self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
598
+
599
+ # Bind tools only to the worker model
600
+ self.worker_model = self.worker_model_base.bind_tools(self.tools)
601
+
602
+ # Create the tool node for executing tools
603
+ self.tool_node = ToolNode(self.tools)
604
+
605
+ # Create the workflow
606
+ self.app = self._create_workflow()
607
+
608
+ def _process_messages_after_tools(self, messages):
609
+ """Process messages to ensure tool calls and tool results are properly paired.
610
+ This helps prevent the Anthropic error: unexpected `tool_use_id` found in `tool_result` blocks."""
611
+ # Create a mapping of tool_call_id to AIMessage index
612
+ tool_call_map = {}
613
+ for i, msg in enumerate(messages):
614
+ if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
615
+ for tool_call in msg.tool_calls:
616
+ if "id" in tool_call:
617
+ tool_call_map[tool_call["id"]] = i
618
+
619
+ # Filter out ToolMessages that don't have a matching AIMessage with tool_calls
620
+ processed_messages = []
621
+ for i, msg in enumerate(messages):
622
+ if isinstance(msg, ToolMessage) and hasattr(msg, "tool_call_id"):
623
+ # Only include if there is a matching AIMessage with this tool_call_id
624
+ if msg.tool_call_id in tool_call_map:
625
+ ai_msg_index = tool_call_map[msg.tool_call_id]
626
+ # Make sure this tool message comes right after its AIMessage
627
+ if i > ai_msg_index and not any(
628
+ isinstance(messages[j], ToolMessage) and
629
+ hasattr(messages[j], "tool_call_id") and
630
+ messages[j].tool_call_id == msg.tool_call_id
631
+ for j in range(ai_msg_index + 1, i)
632
+ ):
633
+ processed_messages.append(msg)
634
+ else:
635
+ processed_messages.append(msg)
636
+
637
+ return processed_messages
638
+
639
+ def _create_workflow(self):
640
+ workflow = StateGraph(AgentState)
641
+
642
+ # Add nodes
643
+ workflow.add_node("supervisor", self._supervisor_agent)
644
+ workflow.add_node("worker", self._worker_agent)
645
+ workflow.add_node("tools", self._handle_tools)
646
+ workflow.add_node("validator", self._validation_agent)
647
+
648
+ # Add edges using the START and END constants
649
+ workflow.add_edge(START, "supervisor")
650
+
651
+ # All nodes use Command to specify their next destination, so we don't need conditional edges
652
+ # Each node's Command(goto=...) specifies the next node
653
+
654
+ # Compile the graph
655
+ return workflow.compile()
656
+
657
+ def _supervisor_agent(self, state: AgentState) -> Command:
658
+ """Supervisor agent that coordinates the workflow."""
659
+ # Get the question from state
660
+ current_question = state["current_question"]
661
+ messages = state["messages"]
662
+ worker_iterations = state.get("worker_iterations", 0)
663
+
664
+ # If we have messages and this isn't the first iteration, evaluate worker's response
665
+ if messages and worker_iterations > 0:
666
+ # Find the last worker response
667
+ worker_response = None
668
+ for msg in reversed(messages):
669
+ if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
670
+ worker_response = msg.content
671
+ break
672
+
673
+ if worker_response:
674
+ # Evaluate the worker's response
675
+ eval_prompt = ChatPromptTemplate.from_messages([
676
+ ("system", """You are a supervisor agent evaluating a worker's research report about user's question.
677
+ Analyze whether the report with answer completely and accurately answers the question.
678
+
679
+ Your evaluation criteria:
680
+ - Completeness: Does the answer address all aspects of the question?
681
+ - Accuracy: Are the facts, references and reasoning correct?
682
+ - Path clarity: Is the path to the answer logical and well-explained?
683
+ - Evidence quality: Are the references reliable and directly relevant?
684
+
685
+ Worker has access to search and web content extraction tools, also python code execution tool.
686
+
687
+ Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
688
+
689
+ If all criteria are met, respond with "SATISFIED".
690
+ If any criteria are not met, respond with "UNSATISFIED: [specific detailed feedback]".
691
+ Be precise in your feedback so the worker knows exactly what to improve."""),
692
+ ("human", f"Question: {current_question}\nWorker's report with answer: {worker_response}")
693
+ ])
694
+
695
+ evaluation = self.supervisor_model.invoke(eval_prompt.format_prompt().to_messages()).content
696
+
697
+ # Determine if supervisor is satisfied
698
+ supervisor_satisfaction = evaluation.startswith("SATISFIED")
699
+
700
+ if supervisor_satisfaction:
701
+ # If satisfied, prepare to move to validator
702
+ return Command(
703
+ goto="validator",
704
+ update={
705
+ "supervisor_satisfaction": True
706
+ }
707
+ )
708
+ else:
709
+ # If not satisfied, give feedback to worker
710
+ feedback = evaluation.replace("UNSATISFIED: ", "")
711
+
712
+ prompt = ChatPromptTemplate.from_messages([
713
+ ("system", """You are a supervisor agent providing targeted feedback to the worker agent.
714
+
715
+ Your role is to guide the worker to improve their research report by:
716
+ 1) Highlighting specific areas that need improvement
717
+ 2) Providing clear, actionable guidance on what additional research is needed
718
+ 3) Explaining exactly how the worker should revise their approach
719
+ 4) Reminding them of any specific formatting requirements in the original question
720
+
721
+ Worker has access to the following tools:
722
+ - Web search (using Tavily and Serper)
723
+ - Web content extraction
724
+ - Secure code execution (for Python and other languages)
725
+ - Secure shell command execution
726
+ - Secure file operations
727
+
728
+ For computational puzzles, math problems, data processing, or tasks requiring exact precision,
729
+ recommend using the code execution tools rather than relying on reasoning alone.
730
+
731
+ Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
732
+
733
+ Focus on being constructive and precise. The worker should understand exactly what to do next."""),
734
+ ("human", f"Question: {current_question}\nWorker's current response: {worker_response}\nImprovement needed: {feedback}")
735
+ ])
736
+
737
+ feedback_message = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
738
+
739
+ # Update messages with feedback and increment worker iterations
740
+ return Command(
741
+ goto="worker",
742
+ update={
743
+ "messages": messages + [HumanMessage(content=feedback_message)],
744
+ "worker_iterations": worker_iterations + 1,
745
+ "supervisor_satisfaction": False
746
+ }
747
+ )
748
+
749
+ # First iteration, provide initial instructions
750
+ prompt = ChatPromptTemplate.from_messages([
751
+ ("system", """You are a supervisor agent responsible for coordinating a research workflow.
752
+
753
+ Your responsibilities:
754
+ 1) Analyze the question to identify required knowledge, tools, and research strategy
755
+ 2) Provide clear, specific instructions to the worker agent
756
+ 3) Specify exactly what information to gather and what analysis to perform
757
+
758
+ The worker will prepare a concise research report containing:
759
+ 1) Their research path - the logical sequence of steps taken to reach the answer
760
+ 2) The specific references used with clear citations
761
+ 3) A proposed final answer formatted EXACTLY as requested in the question in separate section
762
+
763
+ Worker has access to the following powerful tools:
764
+ - Web search (using Tavily and Serper)
765
+ - Web content extraction
766
+ - Secure code execution (for Python and other languages)
767
+ - Secure shell command execution
768
+ - Secure file operations
769
+
770
+ You must understand LLM limitations of solving puzzles that can be solved only by code execution,
771
+ for example math problems, word character flipping, counting and similar tasks that typically plain LLM will fail at.
772
+
773
+ In case of such tasks, worker should use the code execution tools to solve the puzzle.
774
+
775
+ Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
776
+
777
+ Worker should give You full report with all sections for You to evaluate."""
778
+ ),
779
+ ("human", current_question)
780
+ ])
781
+
782
+ response = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
783
+
784
+ # Use Command pattern to update state and move to worker
785
+ return Command(
786
+ goto="worker",
787
+ update={
788
+ "messages": [HumanMessage(content=current_question), AIMessage(content=response)],
789
+ "worker_iterations": 1,
790
+ "supervisor_satisfaction": False
791
+ }
792
+ )
793
+
794
def _worker_agent(self, state: AgentState) -> Command:
    """Run one worker-model turn and route to the tool node or the supervisor.

    The stored history is first normalised by
    ``self._process_messages_after_tools``.  Any ``ToolMessage`` whose
    ``tool_call_id`` is not announced by an ``AIMessage`` in that history is
    then left out of the model input (the provider rejects tool results it
    never asked for).  The model reply is appended to the *unfiltered*
    history so nothing is lost from state.

    Args:
        state: Graph state; only ``state["messages"]`` is read.

    Returns:
        A ``Command`` going to ``"tools"`` when the reply contains tool
        calls, otherwise to ``"supervisor"``; both carry the extended
        message list.
    """
    history = state["messages"]
    paired = self._process_messages_after_tools(history)

    # Every tool-call id that some AIMessage in the history announced.
    announced_ids = {
        call["id"]
        for candidate in paired
        if isinstance(candidate, AIMessage) and getattr(candidate, "tool_calls", None)
        for call in candidate.tool_calls
        if "id" in call
    }

    def _is_orphan_tool_result(candidate) -> bool:
        # Only ToolMessages that carry an id can be orphans; everything else
        # (including ToolMessages without an id) is passed through untouched.
        return (
            isinstance(candidate, ToolMessage)
            and bool(getattr(candidate, "tool_call_id", None))
            and candidate.tool_call_id not in announced_ids
        )

    model_input = [
        candidate for candidate in paired if not _is_orphan_tool_result(candidate)
    ]
    reply = self.worker_model.invoke(model_input)

    # Tool calls still pending -> execute them; otherwise hand the report back.
    next_node = "tools" if reply.tool_calls else "supervisor"
    return Command(
        goto=next_node,
        update={"messages": history + [reply]}
    )
841
+
842
def _validation_agent(self, state: AgentState) -> Command:
    """Validate the worker's report and extract the final formatted answer.

    The most recent ``AIMessage`` without tool calls is treated as the report
    under review.  The validator model is asked to answer with either
    ``APPROVED: <answer>`` or ``REJECTED: <reason>``.

    Args:
        state: Graph state; ``state["messages"]`` and
            ``state["current_question"]`` are read.

    Returns:
        On approval, a ``Command`` to ``END`` carrying ``final_answer``
        (verdict text with the ``APPROVED`` marker removed),
        ``validation_result`` and ``validator_approval=True``.  Otherwise a
        ``Command`` back to ``"supervisor"`` with the message history reset to
        the bare question and the iteration counters cleared.
    """
    messages = state["messages"]
    question = state["current_question"]

    # The report is the latest AI message that is not a pending tool request.
    final_answer = ""
    for msg in reversed(messages):
        if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
            final_answer = msg.content
            break

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a quality assurance agent responsible for final verification of research reports and precise formatting of final answers.

Your critical responsibilities:
1) Verify the factual accuracy and completeness of the report, ensuring you can extract and format the final answer exactly as requested in the question
2) Ensure EXACT compliance with any formatting instructions in the question by producing a properly structured final answer

Pay extremely close attention to formatting requirements. The user may request:
- Only specific parts of information (first/last names, specific data points, numerical values)
- Particular ordering (alphabetical, chronological, size-based, relevance-based)
- Special formatting (bullet points, numbered lists, specific separators, tables)
- Exact text case, spacing, punctuation, or other presentational elements

Exact formatting compliance is MANDATORY for this challenge evaluation. Your role is to ensure the final answer meets all specified requirements.
If numerical values are requested, ensure they are formatted as numbers, not text.

Remember that the worker had access to:
- Web search tools
- Web content extraction
- Secure code execution
- Secure shell commands
- Secure file operations

For computational or precision-based questions, check if code execution was appropriately used and validate the results.

When evaluating the answer:
- Check if all required information is present and accurate
- Verify that the answer directly addresses the specific question asked
- Ensure any numerical values, dates, names, or technical terms are correct
- Confirm that the formatting precisely matches what was requested
- Do not add units to the final answer if not explicitly requested
- Answers tend to be as short as possible, so do not add extra data unless explicitly requested

If the answer report is correct, format it exactly as asked in the question, and respond with:
"APPROVED: [THE PROPERLY FORMATTED ANSWER]"

If there are issues with overall answer quality and you cannot format the final answer as requested, respond with:
"REJECTED: [DETAILED EXPLANATION OF ISSUES]"

Be extremely precise in your evaluation - the success of this task depends on your attention to detail.
"""
        ),
        # Real template variables instead of f-string interpolation: text that
        # is spliced into the template itself gets re-parsed, so a "{" or "}"
        # in the question or report (code, JSON, sets) would make
        # format_prompt() raise.  Values passed as variables are inserted
        # verbatim.
        ("human", "Question: {question}\nReport to validate: {final_answer}")
    ])
    prompt_messages = prompt.format_prompt(
        question=question,
        final_answer=final_answer,
    ).to_messages()
    validation_result = self.validator_model.invoke(prompt_messages).content

    # Tolerate surrounding whitespace and "APPROVED:" without a trailing space
    # rather than slicing off a fixed ten characters.
    approval_marker = "APPROVED"
    verdict = validation_result.strip()
    validator_approval = verdict.startswith(approval_marker)

    if validator_approval:
        formatted_answer = verdict[len(approval_marker):].lstrip(":").strip()
        # Approved - end the workflow
        return Command(
            goto=END,
            update={
                "final_answer": formatted_answer,
                "validation_result": validation_result,
                "validator_approval": True
            }
        )

    # Rejected - restart from supervisor with reset state
    return Command(
        goto="supervisor",
        update={
            "messages": [HumanMessage(content=question)],
            "validation_result": validation_result,
            "validator_approval": False,
            "worker_iterations": 0,
            "supervisor_satisfaction": False
        }
    )
923
+
924
def _handle_tools(self, state: AgentState) -> Command:
    """Execute pending tool calls and splice their results into the history.

    ``self.tool_node`` is invoked on the state.  Each returned ``ToolMessage``
    is inserted directly after the ``AIMessage`` that issued the matching tool
    call (after any results for *other* calls that already sit there).  A
    ``ToolMessage`` whose id matches no recorded call is appended at the end.

    Args:
        state: Graph state; ``state["messages"]`` is read.

    Returns:
        A ``Command`` to ``"worker"``.  When the tool node produced messages
        the update carries the merged list; otherwise the tool node's output
        is forwarded unchanged as the update.
    """
    outcome = self.tool_node.invoke(state)

    # Nothing to merge: forward whatever the tool node returned.
    if "messages" not in outcome:
        return Command(
            goto="worker",
            update=outcome
        )

    history = state["messages"]

    # Map each tool-call id to the index of the AIMessage that issued it.
    issuer_index = {}
    for position, entry in enumerate(history):
        if not (isinstance(entry, AIMessage) and getattr(entry, "tool_calls", None)):
            continue
        for call in entry.tool_calls:
            if "id" in call:
                issuer_index[call["id"]] = position

    merged = list(history)
    for produced in outcome["messages"]:
        # NOTE(review): entries that are not ToolMessages are not carried over
        # into the merged list - confirm this is intended.
        if not (isinstance(produced, ToolMessage) and hasattr(produced, "tool_call_id")):
            continue

        call_id = produced.tool_call_id
        if call_id not in issuer_index:
            # No matching tool call found, just append
            merged.append(produced)
            continue

        # Start right behind the issuing AIMessage and step over results that
        # belong to other tool calls.
        slot = issuer_index[call_id] + 1
        while slot < len(merged):
            occupant = merged[slot]
            steps_over = (
                isinstance(occupant, ToolMessage)
                and hasattr(occupant, "tool_call_id")
                and occupant.tool_call_id != call_id
            )
            if not steps_over:
                break
            slot += 1
        merged.insert(slot, produced)

        # The insertion shifted everything at or after `slot` one to the right.
        for known_id in issuer_index:
            if issuer_index[known_id] >= slot:
                issuer_index[known_id] += 1

    return Command(
        goto="worker",
        update={"messages": merged}
    )
974
+
975
def __call__(self, question: str) -> str:
    """Answer a single question by running the compiled agent graph.

    Args:
        question: The task text handed to the graph as ``current_question``.

    Returns:
        The ``final_answer`` from the final state; when that is empty, the
        content of the latest ``AIMessage`` without tool calls; when the run
        raises, a human-readable error string.  ``cleanup_daytona_sandbox()``
        is called on every path.
    """
    print(f"Agent received question (first 50 chars): {question[:50]}...")

    # Fresh state for this run.
    seed_state = {
        "messages": [],
        "current_question": question,
        "final_answer": "",
        "validation_result": "",
        "worker_iterations": 0,
        "supervisor_satisfaction": False,
        "validator_approval": False
    }

    try:
        final_state = self.app.invoke(
            seed_state,
            config={"callbacks": [self.langfuse_handler], "recursion_limit": 50},
        )

        answer = final_state.get("final_answer", "")
        if not answer and final_state["messages"]:
            # No validated answer: fall back to the most recent plain AI reply,
            # keeping the current value when there is none.
            answer = next(
                (
                    candidate.content
                    for candidate in reversed(final_state["messages"])
                    if isinstance(candidate, AIMessage)
                    and not getattr(candidate, "tool_calls", None)
                ),
                answer,
            )

        print(f"Agent returning answer: {answer[:50]}...")
        return answer
    except Exception as e:
        print(f"Error in agent processing: {str(e)}")
        # The error text itself is returned as the answer; nothing is retried.
        return f"I encountered an error while processing your question: {str(e)}. Please try reformulating your question."
    finally:
        # Clean up resources
        cleanup_daytona_sandbox()
app.py CHANGED
@@ -1,23 +1,20 @@
1
  import os
 
 
2
  import gradio as gr
3
  import requests
4
  import inspect
5
  import pandas as pd
 
 
 
 
 
 
6
 
7
- # (Keep Constants as is)
8
  # --- Constants ---
9
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
-
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
- class BasicAgent:
14
- def __init__(self):
15
- print("BasicAgent initialized.")
16
- def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
21
 
22
  def run_and_submit_all( profile: gr.OAuthProfile | None):
23
  """
@@ -48,44 +45,105 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
48
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
49
  print(agent_code)
50
 
51
- # 2. Fetch Questions
52
- print(f"Fetching questions from: {questions_url}")
53
- try:
54
- response = requests.get(questions_url, timeout=15)
55
- response.raise_for_status()
56
- questions_data = response.json()
57
- if not questions_data:
58
- print("Fetched questions list is empty.")
59
- return "Fetched questions list is empty or invalid format.", None
60
- print(f"Fetched {len(questions_data)} questions.")
61
- except requests.exceptions.RequestException as e:
62
- print(f"Error fetching questions: {e}")
63
- return f"Error fetching questions: {e}", None
64
- except requests.exceptions.JSONDecodeError as e:
65
- print(f"Error decoding JSON response from questions endpoint: {e}")
66
- print(f"Response text: {response.text[:500]}")
67
- return f"Error decoding server response for questions: {e}", None
68
- except Exception as e:
69
- print(f"An unexpected error occurred fetching questions: {e}")
70
- return f"An unexpected error occurred fetching questions: {e}", None
71
 
72
- # 3. Run your Agent
73
  results_log = []
74
  answers_payload = []
75
- print(f"Running agent on {len(questions_data)} questions...")
76
- for item in questions_data:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  task_id = item.get("task_id")
78
  question_text = item.get("question")
79
  if not task_id or question_text is None:
80
  print(f"Skipping item with missing task_id or question: {item}")
81
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  try:
 
83
  submitted_answer = agent(question_text)
84
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
86
  except Exception as e:
87
- print(f"Error running agent on task {task_id}: {e}")
88
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
89
 
90
  if not answers_payload:
91
  print("Agent did not produce any answers to submit.")
@@ -99,17 +157,42 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
99
  # 5. Submit
100
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
101
  try:
102
- response = requests.post(submit_url, json=submission_data, timeout=60)
103
- response.raise_for_status()
104
- result_data = response.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  final_status = (
106
- f"Submission Successful!\n"
107
  f"User: {result_data.get('username')}\n"
108
  f"Overall Score: {result_data.get('score', 'N/A')}% "
109
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
110
  f"Message: {result_data.get('message', 'No message received.')}"
111
  )
112
- print("Submission successful.")
 
 
 
 
 
 
 
 
 
113
  results_df = pd.DataFrame(results_log)
114
  return final_status, results_df
115
  except requests.exceptions.HTTPError as e:
@@ -139,6 +222,24 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
139
  results_df = pd.DataFrame(results_log)
140
  return status_message, results_df
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # --- Build Gradio Interface using Blocks ---
144
  with gr.Blocks() as demo:
@@ -190,6 +291,18 @@ if __name__ == "__main__":
190
  else:
191
  print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
192
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  print("-"*(60 + len(" App Starting ")) + "\n")
194
 
195
  print("Launching Gradio Interface for Basic Agent Evaluation...")
 
1
  import os
2
+ import json
3
+ from dotenv import load_dotenv
4
  import gradio as gr
5
  import requests
6
  import inspect
7
  import pandas as pd
8
+ from agent import BasicAgent
9
+ import time
10
+ from datetime import datetime
11
+
12
+ # Load environment variables from .env file
13
+ load_dotenv()
14
 
 
15
  # --- Constants ---
16
+ DEFAULT_API_URL = os.getenv('DEFAULT_API_URL', "https://agents-course-unit4-scoring.hf.space")
17
+ CHECKPOINT_FILE = "agent_checkpoint.json"
 
 
 
 
 
 
 
 
 
 
18
 
19
  def run_and_submit_all( profile: gr.OAuthProfile | None):
20
  """
 
45
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
46
  print(agent_code)
47
 
48
+ # Check for existing checkpoint
49
+ checkpoint_data = None
50
+ if os.path.exists(CHECKPOINT_FILE):
51
+ try:
52
+ with open(CHECKPOINT_FILE, 'r') as f:
53
+ checkpoint_data = json.load(f)
54
+ print(f"Found checkpoint with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
55
+ except Exception as e:
56
+ print(f"Error loading checkpoint: {e}")
57
+ # If checkpoint is corrupt, remove it
58
+ try:
59
+ os.remove(CHECKPOINT_FILE)
60
+ except:
61
+ pass
62
+ checkpoint_data = None
 
 
 
 
 
63
 
64
+ # Initialize results tracking
65
  results_log = []
66
  answers_payload = []
67
+
68
+ if checkpoint_data:
69
+ # If we have a checkpoint, use it
70
+ questions_data = checkpoint_data.get('questions', [])
71
+ # Load any answers we already have
72
+ existing_answers = checkpoint_data.get('answers', [])
73
+ existing_answers_dict = {a.get('task_id'): a.get('submitted_answer') for a in existing_answers}
74
+ print(f"Loaded {len(existing_answers)} existing answers from checkpoint")
75
+
76
+ # Load existing results log
77
+ if 'results_log' in checkpoint_data:
78
+ results_log = checkpoint_data.get('results_log', [])
79
+
80
+ # We'll use the checkpoint data
81
+ print(f"Resuming from checkpoint with {len(questions_data)} questions")
82
+ else:
83
+ # 2. Fetch Questions from server
84
+ print(f"Fetching questions from: {questions_url}")
85
+ try:
86
+ response = requests.get(questions_url, timeout=15)
87
+ response.raise_for_status()
88
+ questions_data = response.json()
89
+ if not questions_data:
90
+ print("Fetched questions list is empty.")
91
+ return "Fetched questions list is empty or invalid format.", None
92
+ print(f"Fetched {len(questions_data)} questions.")
93
+
94
+ # Save questions to checkpoint immediately
95
+ save_checkpoint(questions_data, [], username, [])
96
+
97
+ # No existing answers
98
+ existing_answers_dict = {}
99
+
100
+ except requests.exceptions.RequestException as e:
101
+ print(f"Error fetching questions: {e}")
102
+ return f"Error fetching questions: {e}", None
103
+ except requests.exceptions.JSONDecodeError as e:
104
+ print(f"Error decoding JSON response from questions endpoint: {e}")
105
+ print(f"Response text: {response.text[:500]}")
106
+ return f"Error decoding server response for questions: {e}", None
107
+ except Exception as e:
108
+ print(f"An unexpected error occurred fetching questions: {e}")
109
+ return f"An unexpected error occurred fetching questions: {e}", None
110
+
111
+ # 3. Run your Agent on questions we haven't answered yet
112
+ print(f"Running agent on questions...")
113
+ for idx, item in enumerate(questions_data):
114
  task_id = item.get("task_id")
115
  question_text = item.get("question")
116
  if not task_id or question_text is None:
117
  print(f"Skipping item with missing task_id or question: {item}")
118
  continue
119
+
120
+ # Skip if we already have an answer for this question
121
+ if task_id in existing_answers_dict:
122
+ submitted_answer = existing_answers_dict[task_id]
123
+ print(f"Using cached answer for task_id {task_id}")
124
+ answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
125
+
126
+ # Check if we already have this in results_log
127
+ if not any(r.get("Task ID") == task_id for r in results_log):
128
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
129
+
130
+ continue
131
+
132
  try:
133
+ print(f"Processing question {idx+1}/{len(questions_data)}: {task_id}")
134
  submitted_answer = agent(question_text)
135
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
136
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
137
+
138
+ # Save checkpoint after each answer
139
+ save_checkpoint(questions_data, answers_payload, username, results_log)
140
+
141
  except Exception as e:
142
+ print(f"Error running agent on task {task_id}: {e}")
143
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
144
+
145
+ # Save checkpoint even if there was an error
146
+ save_checkpoint(questions_data, answers_payload, username, results_log)
147
 
148
  if not answers_payload:
149
  print("Agent did not produce any answers to submit.")
 
157
  # 5. Submit
158
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
159
  try:
160
+ # Check if we're in production mode
161
+ is_production = os.getenv('PRODUCTION_RUN', 'FALSE').upper() == 'TRUE'
162
+
163
+ if is_production:
164
+ print("Running in PRODUCTION mode - making actual submission")
165
+ response = requests.post(submit_url, json=submission_data, timeout=60)
166
+ response.raise_for_status()
167
+ result_data = response.json()
168
+ else:
169
+ print("Running in SIMULATION mode - generating mock response")
170
+ # Simulate a successful response
171
+ result_data = {
172
+ "username": username,
173
+ "score": 85,
174
+ "correct_count": len(answers_payload) - 2, # Simulate some incorrect answers
175
+ "total_attempted": len(answers_payload),
176
+ "message": "Simulation mode: This is a mock response"
177
+ }
178
+
179
  final_status = (
180
+ f"Submission {'Successful' if is_production else 'Simulated'}!\n"
181
  f"User: {result_data.get('username')}\n"
182
  f"Overall Score: {result_data.get('score', 'N/A')}% "
183
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
184
  f"Message: {result_data.get('message', 'No message received.')}"
185
  )
186
+ print(f"Submission {'completed' if is_production else 'simulated'} successfully.")
187
+
188
+ # Delete checkpoint file after successful submission
189
+ if os.path.exists(CHECKPOINT_FILE):
190
+ try:
191
+ os.remove(CHECKPOINT_FILE)
192
+ print(f"Checkpoint file removed after successful submission")
193
+ except Exception as e:
194
+ print(f"Warning: Could not remove checkpoint file: {e}")
195
+
196
  results_df = pd.DataFrame(results_log)
197
  return final_status, results_df
198
  except requests.exceptions.HTTPError as e:
 
222
  results_df = pd.DataFrame(results_log)
223
  return status_message, results_df
224
 
225
def save_checkpoint(questions_data, answers_payload, username, results_log, checkpoint_file=None):
    """Save checkpoint data to a local file without risking the previous one.

    The JSON is written to a sibling ``<path>.tmp`` file first and then moved
    over the real checkpoint with ``os.replace``.  Writing straight into the
    checkpoint would truncate it up front, so an interruption or serialization
    error mid-write would leave a corrupt file and lose every stored answer.

    Failures are reported with ``print`` and never raised, so a checkpointing
    problem does not abort the evaluation run.

    Args:
        questions_data: Question items, stored under ``'questions'``.
        answers_payload: Answers collected so far, stored under ``'answers'``.
        username: Stored under ``'username'``.
        results_log: Stored under ``'results_log'``.
        checkpoint_file: Optional destination path; defaults to the
            module-level ``CHECKPOINT_FILE``.
    """
    target_path = CHECKPOINT_FILE if checkpoint_file is None else checkpoint_file
    staging_path = f"{target_path}.tmp"

    try:
        checkpoint_data = {
            'questions': questions_data,
            'answers': answers_payload,
            'username': username,
            'timestamp': time.time(),
            'results_log': results_log
        }

        with open(staging_path, 'w') as f:
            json.dump(checkpoint_data, f)

        # Swap in the fully written file; the old checkpoint stays intact
        # until this point.
        os.replace(staging_path, target_path)

        print(f"Checkpoint saved with {len(questions_data)} questions and {len(answers_payload)} answers")
    except Exception as e:
        print(f"Error saving checkpoint: {e}")
        # Best effort: do not leave a partial staging file behind.
        try:
            os.remove(staging_path)
        except OSError:
            pass
242
+
243
 
244
  # --- Build Gradio Interface using Blocks ---
245
  with gr.Blocks() as demo:
 
291
  else:
292
  print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
293
 
294
+ # Check for existing checkpoint
295
+ if os.path.exists(CHECKPOINT_FILE):
296
+ try:
297
+ with open(CHECKPOINT_FILE, 'r') as f:
298
+ checkpoint_data = json.load(f)
299
+ print(f"✅ Checkpoint found with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
300
+ print(f" Created at: {datetime.fromtimestamp(checkpoint_data.get('timestamp', 0)).strftime('%Y-%m-%d %H:%M:%S')}")
301
+ except Exception as e:
302
+ print(f"⚠️ Checkpoint file exists but could not be read: {e}")
303
+ else:
304
+ print("ℹ️ No checkpoint file found. Will start fresh.")
305
+
306
  print("-"*(60 + len(" App Starting ")) + "\n")
307
 
308
  print("Launching Gradio Interface for Basic Agent Evaluation...")
requirements.txt CHANGED
@@ -1,2 +1,10 @@
1
  gradio
2
- requests
 
 
 
 
 
 
 
 
 
1
  gradio
2
+ requests
3
+ python-dotenv
4
+ langgraph
5
+ langchain-core
6
+ langchain-anthropic
7
+ anthropic
8
+ python-Levenshtein
9
+ daytona_sdk
10
+