Add .gitignore, implement BasicAgent in agent.py, and enhance app.py for checkpointing
Browse files- .gitignore +15 -0
- agent.py +1009 -0
- app.py +156 -43
- requirements.txt +9 -1
.gitignore
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
|
5 |
+
# Documentation
|
6 |
+
DOCUMENTATION/
|
7 |
+
|
8 |
+
#test sets
|
9 |
+
TEST_SET/
|
10 |
+
|
11 |
+
#test results
|
12 |
+
test_results/
|
13 |
+
|
14 |
+
#cursor
|
15 |
+
.cursor/
|
agent.py
ADDED
@@ -0,0 +1,1009 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, TypedDict, Dict, Any, Literal
|
2 |
+
from langgraph.graph import StateGraph, START, END
|
3 |
+
from langgraph.types import Command
|
4 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
5 |
+
from langchain_anthropic import ChatAnthropic
|
6 |
+
from langchain_core.tools import tool
|
7 |
+
from langchain_core.prompts import ChatPromptTemplate
|
8 |
+
from langgraph.prebuilt import ToolNode
|
9 |
+
import os
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
from datetime import datetime
|
12 |
+
from tavily import TavilyClient
|
13 |
+
from langfuse.callback import CallbackHandler
|
14 |
+
import requests
|
15 |
+
import json
|
16 |
+
import time
|
17 |
+
from daytona_sdk import Daytona, DaytonaConfig
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
# Load environment variablesTuple
|
22 |
+
load_dotenv()
|
23 |
+
|
24 |
+
# Define the state schema with messages that ToolNode can use
|
25 |
+
class AgentState(TypedDict):
|
26 |
+
messages: List
|
27 |
+
current_question: str
|
28 |
+
final_answer: str
|
29 |
+
validation_result: str
|
30 |
+
worker_iterations: int
|
31 |
+
supervisor_satisfaction: bool
|
32 |
+
validator_approval: bool
|
33 |
+
|
34 |
+
# Define tools following Langgraph guide
|
35 |
+
|
36 |
+
|
37 |
+
@tool
|
38 |
+
def search_web_tavily(query: str) -> str:
|
39 |
+
"""Search the web for information using the Tavily search API."""
|
40 |
+
# Initialize the Tavily client with API key from environment variables
|
41 |
+
client = TavilyClient(os.getenv("TAVILY_API_KEY"))
|
42 |
+
|
43 |
+
# Perform the search
|
44 |
+
response = client.search(query=query)
|
45 |
+
|
46 |
+
# Process the results into a readable format
|
47 |
+
results = []
|
48 |
+
for i, result in enumerate(response.get("results", []), 1):
|
49 |
+
results.append(f"{i}. {result.get('title')}\n URL: {result.get('url')}\n {result.get('content')}\n")
|
50 |
+
|
51 |
+
# Format the final response
|
52 |
+
formatted_response = f"Search results for '{query}':\n\n" + "\n".join(results)
|
53 |
+
|
54 |
+
return formatted_response
|
55 |
+
|
56 |
+
@tool
|
57 |
+
def search_web_serper(query: str, result_limit: int = 5, search_type: str = "search") -> str:
|
58 |
+
"""Search the web for information using the Serper.dev API.
|
59 |
+
|
60 |
+
This tool provides comprehensive search results including:
|
61 |
+
1. Knowledge Graph data when available (title, description, attributes)
|
62 |
+
2. Organic search results (titles, links, snippets)
|
63 |
+
3. Related questions from "People Also Ask" section
|
64 |
+
4. Top stories and news articles related to the query
|
65 |
+
|
66 |
+
It's particularly useful for gathering factual information, current events,
|
67 |
+
and general knowledge from across the web. The results are formatted in a
|
68 |
+
readable structure with clear sections.
|
69 |
+
|
70 |
+
Parameters:
|
71 |
+
- query: The search query string
|
72 |
+
- result_limit: Maximum number of results to return per section (default: 5)
|
73 |
+
- search_type: Type of search ('search', 'news', 'places', 'images', 'shopping')
|
74 |
+
"""
|
75 |
+
# API URL and headers setup
|
76 |
+
url = "https://google.serper.dev/search"
|
77 |
+
headers = {
|
78 |
+
'X-API-KEY': os.getenv("SERPER_API_KEY"),
|
79 |
+
'Content-Type': 'application/json'
|
80 |
+
}
|
81 |
+
|
82 |
+
# Prepare the payload with the query and search type
|
83 |
+
payload = json.dumps({
|
84 |
+
"q": query,
|
85 |
+
"type": search_type
|
86 |
+
})
|
87 |
+
|
88 |
+
try:
|
89 |
+
# Make the API request
|
90 |
+
response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
|
91 |
+
response.raise_for_status() # Raise exception for HTTP errors
|
92 |
+
|
93 |
+
# Parse the JSON response
|
94 |
+
data = response.json()
|
95 |
+
|
96 |
+
# Format the results
|
97 |
+
results = []
|
98 |
+
|
99 |
+
# Add knowledge graph if available
|
100 |
+
if "knowledgeGraph" in data:
|
101 |
+
kg = data["knowledgeGraph"]
|
102 |
+
results.append(f"Knowledge Graph:\n{kg.get('title', 'Unknown')} - {kg.get('type', '')}")
|
103 |
+
results.append(f"Description: {kg.get('description', 'No description available')}")
|
104 |
+
|
105 |
+
if "attributes" in kg:
|
106 |
+
results.append("Attributes:")
|
107 |
+
for key, value in kg["attributes"].items():
|
108 |
+
results.append(f"- {key}: {value}")
|
109 |
+
|
110 |
+
results.append("") # Empty line for separation
|
111 |
+
|
112 |
+
# Add organic search results
|
113 |
+
if "organic" in data:
|
114 |
+
results.append("Organic Search Results:")
|
115 |
+
for i, result in enumerate(data["organic"][:result_limit], 1):
|
116 |
+
results.append(f"{i}. {result.get('title', 'No title')}")
|
117 |
+
results.append(f" URL: {result.get('link', 'No link')}")
|
118 |
+
results.append(f" {result.get('snippet', 'No snippet')}")
|
119 |
+
results.append("") # Empty line for separation
|
120 |
+
|
121 |
+
# Add people also ask if available
|
122 |
+
if "peopleAlsoAsk" in data and data["peopleAlsoAsk"]:
|
123 |
+
results.append("People Also Ask:")
|
124 |
+
for i, qa in enumerate(data["peopleAlsoAsk"][:min(3, result_limit)], 1):
|
125 |
+
results.append(f"{i}. Q: {qa.get('question', 'No question')}")
|
126 |
+
results.append(f" A: {qa.get('snippet', 'No answer')}")
|
127 |
+
results.append("") # Empty line for separation
|
128 |
+
|
129 |
+
# Add top stories if available
|
130 |
+
if "topStories" in data and data["topStories"]:
|
131 |
+
results.append("Top Stories:")
|
132 |
+
for i, story in enumerate(data["topStories"][:min(3, result_limit)], 1):
|
133 |
+
results.append(f"{i}. {story.get('title', 'No title')}")
|
134 |
+
results.append(f" Source: {story.get('source', 'Unknown source')}")
|
135 |
+
if "date" in story:
|
136 |
+
results.append(f" Published: {story.get('date')}")
|
137 |
+
results.append(f" URL: {story.get('link', 'No link')}")
|
138 |
+
results.append("") # Empty line for separation
|
139 |
+
|
140 |
+
# Format the final response
|
141 |
+
formatted_response = f"Search results for '{query}':\n\n" + "\n".join(results)
|
142 |
+
|
143 |
+
return formatted_response
|
144 |
+
|
145 |
+
except requests.exceptions.Timeout:
|
146 |
+
return f"Error: Request to Serper API timed out after 30 seconds"
|
147 |
+
except requests.exceptions.RequestException as e:
|
148 |
+
return f"Error making request to Serper API: {str(e)}"
|
149 |
+
except json.JSONDecodeError:
|
150 |
+
return f"Error: Received invalid JSON response from Serper API"
|
151 |
+
except Exception as e:
|
152 |
+
return f"Error processing search results: {str(e)}"
|
153 |
+
|
154 |
+
# Initialize a global Daytona sandbox for reuse
|
155 |
+
_daytona_sandbox = None
|
156 |
+
|
157 |
+
@tool
|
158 |
+
def execute_code_securely(code: str, language: str = "python", timeout: int = 300) -> str:
|
159 |
+
"""Execute code securely in an isolated sandbox environment using Daytona.
|
160 |
+
|
161 |
+
This tool runs code in a secure, isolated environment to prevent security issues.
|
162 |
+
It's particularly useful for solving computational problems, data processing tasks,
|
163 |
+
mathematical calculations, and other scenarios where code execution is needed.
|
164 |
+
|
165 |
+
The tool supports multiple languages, with Python as the default.
|
166 |
+
|
167 |
+
Parameters:
|
168 |
+
- code: The code to execute
|
169 |
+
- language: The programming language (default: "python")
|
170 |
+
- timeout: Maximum execution time in seconds (default: 30)
|
171 |
+
|
172 |
+
Returns:
|
173 |
+
- The execution result or error message
|
174 |
+
"""
|
175 |
+
global _daytona_sandbox
|
176 |
+
|
177 |
+
try:
|
178 |
+
# Initialize Daytona client if not already done
|
179 |
+
if _daytona_sandbox is None:
|
180 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
181 |
+
if not api_key:
|
182 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
183 |
+
|
184 |
+
# Initialize the Daytona client and create a sandbox
|
185 |
+
config = DaytonaConfig(api_key=api_key)
|
186 |
+
daytona_client = Daytona(config)
|
187 |
+
_daytona_sandbox = daytona_client.create()
|
188 |
+
|
189 |
+
# Execute the code based on the specified language
|
190 |
+
if language.lower() == "python":
|
191 |
+
response = _daytona_sandbox.process.code_run(code, timeout=timeout)
|
192 |
+
else:
|
193 |
+
# For non-Python languages, create a temporary file and execute it
|
194 |
+
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
195 |
+
file_extension = {
|
196 |
+
"javascript": "js",
|
197 |
+
"nodejs": "js",
|
198 |
+
"ruby": "rb",
|
199 |
+
"php": "php",
|
200 |
+
"bash": "sh",
|
201 |
+
"shell": "sh",
|
202 |
+
"powershell": "ps1",
|
203 |
+
"c": "c",
|
204 |
+
"cpp": "cpp",
|
205 |
+
"java": "java",
|
206 |
+
"go": "go",
|
207 |
+
"rust": "rs",
|
208 |
+
}.get(language.lower(), "txt")
|
209 |
+
|
210 |
+
filename = f"/tmp/code_{timestamp}.{file_extension}"
|
211 |
+
|
212 |
+
# Upload the code file to the sandbox
|
213 |
+
_daytona_sandbox.fs.upload_file(filename, code.encode('utf-8'))
|
214 |
+
|
215 |
+
# Prepare the execution command based on language
|
216 |
+
exec_cmd = {
|
217 |
+
"javascript": f"node {filename}",
|
218 |
+
"nodejs": f"node {filename}",
|
219 |
+
"ruby": f"ruby {filename}",
|
220 |
+
"php": f"php {filename}",
|
221 |
+
"bash": f"bash {filename}",
|
222 |
+
"shell": f"sh {filename}",
|
223 |
+
"powershell": f"pwsh {filename}",
|
224 |
+
"c": f"gcc {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
225 |
+
"cpp": f"g++ {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
226 |
+
"java": f"javac {filename} && java -cp /tmp {os.path.basename(filename).split('.')[0]}",
|
227 |
+
"go": f"go run {filename}",
|
228 |
+
"rust": f"rustc {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
229 |
+
}.get(language.lower(), f"cat {filename}")
|
230 |
+
|
231 |
+
# Execute the command
|
232 |
+
response = _daytona_sandbox.process.exec(exec_cmd, cwd="/tmp", timeout=timeout)
|
233 |
+
|
234 |
+
# Extract and return the result
|
235 |
+
if hasattr(response, 'result'):
|
236 |
+
result = response.result
|
237 |
+
elif hasattr(response, 'stdout'):
|
238 |
+
result = response.stdout
|
239 |
+
else:
|
240 |
+
result = str(response)
|
241 |
+
|
242 |
+
return f"Code Execution Result ({language}):\n{result}"
|
243 |
+
|
244 |
+
except Exception as e:
|
245 |
+
# Clean up on error
|
246 |
+
try:
|
247 |
+
if _daytona_sandbox is not None:
|
248 |
+
_daytona_sandbox = None
|
249 |
+
except:
|
250 |
+
pass
|
251 |
+
|
252 |
+
return f"Error executing code: {str(e)}"
|
253 |
+
|
254 |
+
@tool
|
255 |
+
def execute_shell_command(command: str, working_dir: str = "/tmp", timeout: int = 300) -> str:
|
256 |
+
"""Execute a shell command securely in an isolated sandbox environment using Daytona.
|
257 |
+
|
258 |
+
This tool runs shell commands in a secure, isolated environment to prevent security issues.
|
259 |
+
It's useful for file operations, system tasks, and other command-line operations.
|
260 |
+
|
261 |
+
Parameters:
|
262 |
+
- command: The shell command to execute
|
263 |
+
- working_dir: The working directory (default: "/tmp")
|
264 |
+
- timeout: Maximum execution time in seconds (default: 30)
|
265 |
+
|
266 |
+
Returns:
|
267 |
+
- The command execution output or error message
|
268 |
+
"""
|
269 |
+
global _daytona_sandbox
|
270 |
+
|
271 |
+
try:
|
272 |
+
# Initialize Daytona client if not already done
|
273 |
+
if _daytona_sandbox is None:
|
274 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
275 |
+
if not api_key:
|
276 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
277 |
+
|
278 |
+
# Initialize the Daytona client and create a sandbox
|
279 |
+
config = DaytonaConfig(api_key=api_key)
|
280 |
+
daytona_client = Daytona(config)
|
281 |
+
_daytona_sandbox = daytona_client.create()
|
282 |
+
|
283 |
+
# Execute the command
|
284 |
+
response = _daytona_sandbox.process.exec(command, cwd=working_dir, timeout=timeout)
|
285 |
+
|
286 |
+
# Extract and return the result
|
287 |
+
if hasattr(response, 'result'):
|
288 |
+
result = response.result
|
289 |
+
elif hasattr(response, 'stdout'):
|
290 |
+
result = response.stdout
|
291 |
+
else:
|
292 |
+
result = str(response)
|
293 |
+
|
294 |
+
return f"Shell Command Execution Result:\n{result}"
|
295 |
+
|
296 |
+
except Exception as e:
|
297 |
+
# Clean up on error
|
298 |
+
try:
|
299 |
+
if _daytona_sandbox is not None:
|
300 |
+
_daytona_sandbox = None
|
301 |
+
except:
|
302 |
+
pass
|
303 |
+
|
304 |
+
return f"Error executing shell command: {str(e)}"
|
305 |
+
|
306 |
+
@tool
|
307 |
+
def sandbox_file_operation(operation: str, file_path: str, content: str = "", target_path: str = "") -> str:
|
308 |
+
"""Perform file operations in the secure sandbox environment.
|
309 |
+
|
310 |
+
This tool allows secure file manipulation in an isolated sandbox.
|
311 |
+
It supports creating, reading, writing, moving, copying and deleting files.
|
312 |
+
|
313 |
+
Parameters:
|
314 |
+
- operation: The operation to perform ('create', 'read', 'write', 'append', 'delete', 'move', 'copy', 'list')
|
315 |
+
- file_path: Path to the file to operate on
|
316 |
+
- content: Content to write (for 'create', 'write', 'append' operations)
|
317 |
+
- target_path: Target path for 'move' and 'copy' operations
|
318 |
+
|
319 |
+
Returns:
|
320 |
+
- Operation result or file content
|
321 |
+
"""
|
322 |
+
global _daytona_sandbox
|
323 |
+
|
324 |
+
try:
|
325 |
+
# Initialize Daytona client if not already done
|
326 |
+
if _daytona_sandbox is None:
|
327 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
328 |
+
if not api_key:
|
329 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
330 |
+
|
331 |
+
# Initialize the Daytona client and create a sandbox
|
332 |
+
config = DaytonaConfig(api_key=api_key)
|
333 |
+
daytona_client = Daytona(config)
|
334 |
+
_daytona_sandbox = daytona_client.create()
|
335 |
+
|
336 |
+
# Perform the requested operation
|
337 |
+
operation = operation.lower()
|
338 |
+
|
339 |
+
if operation == "create" or operation == "write":
|
340 |
+
# Create or overwrite file
|
341 |
+
_daytona_sandbox.fs.upload_file(file_path, content.encode('utf-8'))
|
342 |
+
return f"File {file_path} created/written successfully"
|
343 |
+
|
344 |
+
elif operation == "append":
|
345 |
+
# First try to read the existing content
|
346 |
+
try:
|
347 |
+
existing_content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
|
348 |
+
except:
|
349 |
+
existing_content = ""
|
350 |
+
|
351 |
+
# Append new content and write back
|
352 |
+
new_content = existing_content + content
|
353 |
+
_daytona_sandbox.fs.upload_file(file_path, new_content.encode('utf-8'))
|
354 |
+
return f"Content appended to {file_path} successfully"
|
355 |
+
|
356 |
+
elif operation == "read":
|
357 |
+
# Read file content
|
358 |
+
try:
|
359 |
+
content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
|
360 |
+
return f"Content of {file_path}:\n{content}"
|
361 |
+
except Exception as e:
|
362 |
+
return f"Error reading {file_path}: {str(e)}"
|
363 |
+
|
364 |
+
elif operation == "delete":
|
365 |
+
# Delete file
|
366 |
+
response = _daytona_sandbox.process.exec(f"rm -f {file_path}", cwd="/tmp")
|
367 |
+
return f"File {file_path} deleted"
|
368 |
+
|
369 |
+
elif operation == "move":
|
370 |
+
# Move file
|
371 |
+
if not target_path:
|
372 |
+
return "Error: Target path required for move operation"
|
373 |
+
response = _daytona_sandbox.process.exec(f"mv {file_path} {target_path}", cwd="/tmp")
|
374 |
+
return f"File moved from {file_path} to {target_path}"
|
375 |
+
|
376 |
+
elif operation == "copy":
|
377 |
+
# Copy file
|
378 |
+
if not target_path:
|
379 |
+
return "Error: Target path required for copy operation"
|
380 |
+
response = _daytona_sandbox.process.exec(f"cp {file_path} {target_path}", cwd="/tmp")
|
381 |
+
return f"File copied from {file_path} to {target_path}"
|
382 |
+
|
383 |
+
elif operation == "list":
|
384 |
+
# List directory contents
|
385 |
+
response = _daytona_sandbox.process.exec(f"ls -la {file_path}", cwd="/tmp")
|
386 |
+
if hasattr(response, 'result'):
|
387 |
+
result = response.result
|
388 |
+
elif hasattr(response, 'stdout'):
|
389 |
+
result = response.stdout
|
390 |
+
else:
|
391 |
+
result = str(response)
|
392 |
+
return f"Directory listing of {file_path}:\n{result}"
|
393 |
+
|
394 |
+
else:
|
395 |
+
return f"Unsupported operation: {operation}"
|
396 |
+
|
397 |
+
except Exception as e:
|
398 |
+
return f"Error performing file operation: {str(e)}"
|
399 |
+
|
400 |
+
def cleanup_daytona_sandbox():
|
401 |
+
"""Clean up the Daytona sandbox when it's no longer needed."""
|
402 |
+
global _daytona_sandbox
|
403 |
+
|
404 |
+
try:
|
405 |
+
if _daytona_sandbox is not None:
|
406 |
+
# Get the Daytona client
|
407 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
408 |
+
if api_key:
|
409 |
+
config = DaytonaConfig(api_key=api_key)
|
410 |
+
daytona_client = Daytona(config)
|
411 |
+
|
412 |
+
# Remove the sandbox
|
413 |
+
daytona_client.remove(_daytona_sandbox)
|
414 |
+
_daytona_sandbox = None
|
415 |
+
print("Daytona sandbox cleaned up successfully")
|
416 |
+
except Exception as e:
|
417 |
+
print(f"Error cleaning up Daytona sandbox: {str(e)}")
|
418 |
+
|
419 |
+
# Track last execution time for rate limiting
|
420 |
+
_last_extract_url_time = 0
|
421 |
+
|
422 |
+
@tool
|
423 |
+
def extract_document_data(input_method: str, files: list, prompt: str, json_mode: bool = False) -> str:
|
424 |
+
"""Extract structured data from documents using Dumpling AI.
|
425 |
+
|
426 |
+
This tool allows you to extract information from various document formats including PDFs,
|
427 |
+
Office documents, images, and many other file types. It uses vision-capable Large Language
|
428 |
+
Models (LLMs) to interpret and extract data based on your specific prompt.
|
429 |
+
|
430 |
+
Parameters:
|
431 |
+
- input_method: How to input files, either "url" or "base64"
|
432 |
+
- files: List of file URLs or base64-encoded strings depending on input_method
|
433 |
+
- prompt: Specific instructions for what data to extract from the document
|
434 |
+
- json_mode: Whether to return structured JSON (true) or free text (false)
|
435 |
+
|
436 |
+
Returns:
|
437 |
+
- Extracted data from the document based on your prompt
|
438 |
+
|
439 |
+
Supported file extensions include PDFs, Word docs, Excel files, PowerPoint, images, HTML, and many others.
|
440 |
+
"""
|
441 |
+
api_key = os.getenv("DUMPLING_API_KEY")
|
442 |
+
if not api_key:
|
443 |
+
return "Error: DUMPLING_API_KEY environment variable not set"
|
444 |
+
|
445 |
+
try:
|
446 |
+
url = "https://app.dumplingai.com/api/v1/extract-document"
|
447 |
+
headers = {
|
448 |
+
"Content-Type": "application/json",
|
449 |
+
"Authorization": f"Bearer {api_key}"
|
450 |
+
}
|
451 |
+
|
452 |
+
data = {
|
453 |
+
"inputMethod": input_method,
|
454 |
+
"files": files,
|
455 |
+
"prompt": prompt,
|
456 |
+
"jsonMode": json_mode
|
457 |
+
}
|
458 |
+
|
459 |
+
response = requests.post(url, headers=headers, json=data, timeout=120)
|
460 |
+
response.raise_for_status()
|
461 |
+
|
462 |
+
result = response.json()
|
463 |
+
|
464 |
+
# Format the response in a readable way
|
465 |
+
formatted_response = f"Document Extraction Results:\n\n"
|
466 |
+
formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
|
467 |
+
formatted_response += f"Pages Processed: {result.get('pages', 'Unknown')}\n"
|
468 |
+
formatted_response += f"Files Processed: {result.get('fileCount', 'Unknown')}\n"
|
469 |
+
formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"
|
470 |
+
|
471 |
+
return formatted_response
|
472 |
+
|
473 |
+
except requests.exceptions.Timeout:
|
474 |
+
return "Error: Request to Dumpling AI API timed out after 120 seconds"
|
475 |
+
except requests.exceptions.HTTPError as e:
|
476 |
+
error_detail = f"HTTP Error: {e.response.status_code}"
|
477 |
+
try:
|
478 |
+
error_json = e.response.json()
|
479 |
+
error_detail += f" - {error_json.get('detail', error_json)}"
|
480 |
+
except:
|
481 |
+
error_detail += f" - {e.response.text[:500]}"
|
482 |
+
return error_detail
|
483 |
+
except requests.exceptions.RequestException as e:
|
484 |
+
return f"Error making request to Dumpling AI API: {str(e)}"
|
485 |
+
except Exception as e:
|
486 |
+
return f"Error extracting document data: {str(e)}"
|
487 |
+
|
488 |
+
@tool
|
489 |
+
def extract_url_content(url: str) -> str:
|
490 |
+
"""Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
|
491 |
+
This function is rate-limited to execute no more frequently than once every 20 seconds."""
|
492 |
+
global _last_extract_url_time
|
493 |
+
|
494 |
+
# Check if we need to wait before executing
|
495 |
+
current_time = time.time()
|
496 |
+
time_since_last_call = current_time - _last_extract_url_time
|
497 |
+
|
498 |
+
if time_since_last_call < 20 and _last_extract_url_time > 0:
|
499 |
+
# Calculate how long to wait
|
500 |
+
wait_time = 20 - time_since_last_call
|
501 |
+
print(f"Rate limiting: waiting {wait_time:.2f} seconds before next API call")
|
502 |
+
time.sleep(wait_time)
|
503 |
+
current_time = time.time() # Update current time after sleeping
|
504 |
+
|
505 |
+
# Update last execution time
|
506 |
+
_last_extract_url_time = current_time
|
507 |
+
|
508 |
+
# Diffbot token from environment or use the fallback
|
509 |
+
token = os.getenv("DIFFBOT_TOKEN")
|
510 |
+
if not token:
|
511 |
+
return "Error: DIFFBOT_TOKEN environment variable not set"
|
512 |
+
|
513 |
+
# Set up the API endpoint
|
514 |
+
api_url = "https://api.diffbot.com/v3/article"
|
515 |
+
|
516 |
+
# Parameters for the request
|
517 |
+
params = {
|
518 |
+
"token": token,
|
519 |
+
"url": url
|
520 |
+
}
|
521 |
+
|
522 |
+
try:
|
523 |
+
# Make the API request with a timeout
|
524 |
+
response = requests.get(api_url, params=params, timeout=30) # 30 second timeout
|
525 |
+
response.raise_for_status() # Raise exception for HTTP errors
|
526 |
+
|
527 |
+
# Parse the response
|
528 |
+
data = response.json()
|
529 |
+
|
530 |
+
# Extract relevant information
|
531 |
+
if "objects" in data and len(data["objects"]) > 0:
|
532 |
+
obj = data["objects"][0]
|
533 |
+
|
534 |
+
# Create a formatted result
|
535 |
+
result = f"Title: {obj.get('title', 'No title')}\n\n"
|
536 |
+
|
537 |
+
if "text" in obj:
|
538 |
+
result += f"Content:\n{obj.get('text')}\n\n"
|
539 |
+
|
540 |
+
#if "html" in obj:
|
541 |
+
# result += f"HTML Content:\n{obj.get('html')}\n\n"
|
542 |
+
|
543 |
+
if "categories" in obj and obj["categories"]:
|
544 |
+
categories = ", ".join([f"{cat.get('name')} ({cat.get('score', 0):.2f})"
|
545 |
+
for cat in obj["categories"]])
|
546 |
+
result += f"Categories: {categories}\n"
|
547 |
+
|
548 |
+
result += f"Source: {obj.get('siteName', 'Unknown')}\n"
|
549 |
+
result += f"URL: {obj.get('pageUrl', url)}"
|
550 |
+
|
551 |
+
return result
|
552 |
+
else:
|
553 |
+
return f"No content could be extracted from {url}. Response: {data}"
|
554 |
+
|
555 |
+
except requests.exceptions.Timeout:
|
556 |
+
return f"Error: Request to extract content from {url} timed out after 30 seconds"
|
557 |
+
except requests.exceptions.RequestException as e:
|
558 |
+
return f"Error: Failed to extract content from {url}: {str(e)}"
|
559 |
+
except Exception as e:
|
560 |
+
return f"Error extracting content from {url}: {str(e)}"
|
561 |
+
|
562 |
+
class BasicAgent:
|
563 |
+
def __init__(self):
|
564 |
+
print("BasicAgent initialized.")
|
565 |
+
# Initialize the Anthropic models
|
566 |
+
# Standard model for supervisor and validator
|
567 |
+
self.langfuse_handler = CallbackHandler()
|
568 |
+
|
569 |
+
self.supervisor_model = ChatAnthropic(
|
570 |
+
model="claude-3-7-sonnet-20250219",
|
571 |
+
max_tokens=20000,
|
572 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"),
|
573 |
+
temperature=0.6,
|
574 |
+
# thinking={
|
575 |
+
# "type": "enabled",
|
576 |
+
# "budget_tokens": 5000
|
577 |
+
# }
|
578 |
+
)
|
579 |
+
|
580 |
+
# Standard model for validator
|
581 |
+
self.validator_model = ChatAnthropic(
|
582 |
+
model="claude-3-7-sonnet-20250219",
|
583 |
+
max_tokens=20000,
|
584 |
+
temperature=0.5, # Lower temperature for more consistent validation
|
585 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
|
586 |
+
)
|
587 |
+
|
588 |
+
# Tool-enabled model for worker
|
589 |
+
self.worker_model_base = ChatAnthropic(
|
590 |
+
model="claude-3-7-sonnet-20250219",
|
591 |
+
max_tokens=20000,
|
592 |
+
temperature=0.75,
|
593 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
|
594 |
+
)
|
595 |
+
|
596 |
+
# Initialize tools
|
597 |
+
self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
|
598 |
+
|
599 |
+
# Bind tools only to the worker model
|
600 |
+
self.worker_model = self.worker_model_base.bind_tools(self.tools)
|
601 |
+
|
602 |
+
# Create the tool node for executing tools
|
603 |
+
self.tool_node = ToolNode(self.tools)
|
604 |
+
|
605 |
+
# Create the workflow
|
606 |
+
self.app = self._create_workflow()
|
607 |
+
|
608 |
+
def _process_messages_after_tools(self, messages):
|
609 |
+
"""Process messages to ensure tool calls and tool results are properly paired.
|
610 |
+
This helps prevent the Anthropic error: unexpected `tool_use_id` found in `tool_result` blocks."""
|
611 |
+
# Create a mapping of tool_call_id to AIMessage index
|
612 |
+
tool_call_map = {}
|
613 |
+
for i, msg in enumerate(messages):
|
614 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
615 |
+
for tool_call in msg.tool_calls:
|
616 |
+
if "id" in tool_call:
|
617 |
+
tool_call_map[tool_call["id"]] = i
|
618 |
+
|
619 |
+
# Filter out ToolMessages that don't have a matching AIMessage with tool_calls
|
620 |
+
processed_messages = []
|
621 |
+
for i, msg in enumerate(messages):
|
622 |
+
if isinstance(msg, ToolMessage) and hasattr(msg, "tool_call_id"):
|
623 |
+
# Only include if there is a matching AIMessage with this tool_call_id
|
624 |
+
if msg.tool_call_id in tool_call_map:
|
625 |
+
ai_msg_index = tool_call_map[msg.tool_call_id]
|
626 |
+
# Make sure this tool message comes right after its AIMessage
|
627 |
+
if i > ai_msg_index and not any(
|
628 |
+
isinstance(messages[j], ToolMessage) and
|
629 |
+
hasattr(messages[j], "tool_call_id") and
|
630 |
+
messages[j].tool_call_id == msg.tool_call_id
|
631 |
+
for j in range(ai_msg_index + 1, i)
|
632 |
+
):
|
633 |
+
processed_messages.append(msg)
|
634 |
+
else:
|
635 |
+
processed_messages.append(msg)
|
636 |
+
|
637 |
+
return processed_messages
|
638 |
+
|
639 |
+
def _create_workflow(self):
|
640 |
+
workflow = StateGraph(AgentState)
|
641 |
+
|
642 |
+
# Add nodes
|
643 |
+
workflow.add_node("supervisor", self._supervisor_agent)
|
644 |
+
workflow.add_node("worker", self._worker_agent)
|
645 |
+
workflow.add_node("tools", self._handle_tools)
|
646 |
+
workflow.add_node("validator", self._validation_agent)
|
647 |
+
|
648 |
+
# Add edges using the START and END constants
|
649 |
+
workflow.add_edge(START, "supervisor")
|
650 |
+
|
651 |
+
# All nodes use Command to specify their next destination, so we don't need conditional edges
|
652 |
+
# Each node's Command(goto=...) specifies the next node
|
653 |
+
|
654 |
+
# Compile the graph
|
655 |
+
return workflow.compile()
|
656 |
+
|
657 |
+
def _supervisor_agent(self, state: AgentState) -> Command:
|
658 |
+
"""Supervisor agent that coordinates the workflow."""
|
659 |
+
# Get the question from state
|
660 |
+
current_question = state["current_question"]
|
661 |
+
messages = state["messages"]
|
662 |
+
worker_iterations = state.get("worker_iterations", 0)
|
663 |
+
|
664 |
+
# If we have messages and this isn't the first iteration, evaluate worker's response
|
665 |
+
if messages and worker_iterations > 0:
|
666 |
+
# Find the last worker response
|
667 |
+
worker_response = None
|
668 |
+
for msg in reversed(messages):
|
669 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
670 |
+
worker_response = msg.content
|
671 |
+
break
|
672 |
+
|
673 |
+
if worker_response:
|
674 |
+
# Evaluate the worker's response
|
675 |
+
eval_prompt = ChatPromptTemplate.from_messages([
|
676 |
+
("system", """You are a supervisor agent evaluating a worker's research report about user's question.
|
677 |
+
Analyze whether the report with answer completely and accurately answers the question.
|
678 |
+
|
679 |
+
Your evaluation criteria:
|
680 |
+
- Completeness: Does the answer address all aspects of the question?
|
681 |
+
- Accuracy: Are the facts, references and reasoning correct?
|
682 |
+
- Path clarity: Is the path to the answer logical and well-explained?
|
683 |
+
- Evidence quality: Are the references reliable and directly relevant?
|
684 |
+
|
685 |
+
Worker has access to search and web content extraction tools, also python code execution tool.
|
686 |
+
|
687 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
688 |
+
|
689 |
+
If all criteria are met, respond with "SATISFIED".
|
690 |
+
If any criteria are not met, respond with "UNSATISFIED: [specific detailed feedback]".
|
691 |
+
Be precise in your feedback so the worker knows exactly what to improve."""),
|
692 |
+
("human", f"Question: {current_question}\nWorker's report with answer: {worker_response}")
|
693 |
+
])
|
694 |
+
|
695 |
+
evaluation = self.supervisor_model.invoke(eval_prompt.format_prompt().to_messages()).content
|
696 |
+
|
697 |
+
# Determine if supervisor is satisfied
|
698 |
+
supervisor_satisfaction = evaluation.startswith("SATISFIED")
|
699 |
+
|
700 |
+
if supervisor_satisfaction:
|
701 |
+
# If satisfied, prepare to move to validator
|
702 |
+
return Command(
|
703 |
+
goto="validator",
|
704 |
+
update={
|
705 |
+
"supervisor_satisfaction": True
|
706 |
+
}
|
707 |
+
)
|
708 |
+
else:
|
709 |
+
# If not satisfied, give feedback to worker
|
710 |
+
feedback = evaluation.replace("UNSATISFIED: ", "")
|
711 |
+
|
712 |
+
prompt = ChatPromptTemplate.from_messages([
|
713 |
+
("system", """You are a supervisor agent providing targeted feedback to the worker agent.
|
714 |
+
|
715 |
+
Your role is to guide the worker to improve their research report by:
|
716 |
+
1) Highlighting specific areas that need improvement
|
717 |
+
2) Providing clear, actionable guidance on what additional research is needed
|
718 |
+
3) Explaining exactly how the worker should revise their approach
|
719 |
+
4) Reminding them of any specific formatting requirements in the original question
|
720 |
+
|
721 |
+
Worker has access to the following tools:
|
722 |
+
- Web search (using Tavily and Serper)
|
723 |
+
- Web content extraction
|
724 |
+
- Secure code execution (for Python and other languages)
|
725 |
+
- Secure shell command execution
|
726 |
+
- Secure file operations
|
727 |
+
|
728 |
+
For computational puzzles, math problems, data processing, or tasks requiring exact precision,
|
729 |
+
recommend using the code execution tools rather than relying on reasoning alone.
|
730 |
+
|
731 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
732 |
+
|
733 |
+
Focus on being constructive and precise. The worker should understand exactly what to do next."""),
|
734 |
+
("human", f"Question: {current_question}\nWorker's current response: {worker_response}\nImprovement needed: {feedback}")
|
735 |
+
])
|
736 |
+
|
737 |
+
feedback_message = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
|
738 |
+
|
739 |
+
# Update messages with feedback and increment worker iterations
|
740 |
+
return Command(
|
741 |
+
goto="worker",
|
742 |
+
update={
|
743 |
+
"messages": messages + [HumanMessage(content=feedback_message)],
|
744 |
+
"worker_iterations": worker_iterations + 1,
|
745 |
+
"supervisor_satisfaction": False
|
746 |
+
}
|
747 |
+
)
|
748 |
+
|
749 |
+
# First iteration, provide initial instructions
|
750 |
+
prompt = ChatPromptTemplate.from_messages([
|
751 |
+
("system", """You are a supervisor agent responsible for coordinating a research workflow.
|
752 |
+
|
753 |
+
Your responsibilities:
|
754 |
+
1) Analyze the question to identify required knowledge, tools, and research strategy
|
755 |
+
2) Provide clear, specific instructions to the worker agent
|
756 |
+
3) Specify exactly what information to gather and what analysis to perform
|
757 |
+
|
758 |
+
The worker will prepare a concise research report containing:
|
759 |
+
1) Their research path - the logical sequence of steps taken to reach the answer
|
760 |
+
2) The specific references used with clear citations
|
761 |
+
3) A proposed final answer formatted EXACTLY as requested in the question in separate section
|
762 |
+
|
763 |
+
Worker has access to the following powerful tools:
|
764 |
+
- Web search (using Tavily and Serper)
|
765 |
+
- Web content extraction
|
766 |
+
- Secure code execution (for Python and other languages)
|
767 |
+
- Secure shell command execution
|
768 |
+
- Secure file operations
|
769 |
+
|
770 |
+
You must understand LLM limitations of solving puzzles that can be solved only by code execution,
|
771 |
+
for example math problems, word character flipping, counting and similar tasks that typically plain LLM will fail at.
|
772 |
+
|
773 |
+
In case of such tasks, worker should use the code execution tools to solve the puzzle.
|
774 |
+
|
775 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
776 |
+
|
777 |
+
Worker should give You full report with all sections for You to evaluate."""
|
778 |
+
),
|
779 |
+
("human", current_question)
|
780 |
+
])
|
781 |
+
|
782 |
+
response = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
|
783 |
+
|
784 |
+
# Use Command pattern to update state and move to worker
|
785 |
+
return Command(
|
786 |
+
goto="worker",
|
787 |
+
update={
|
788 |
+
"messages": [HumanMessage(content=current_question), AIMessage(content=response)],
|
789 |
+
"worker_iterations": 1,
|
790 |
+
"supervisor_satisfaction": False
|
791 |
+
}
|
792 |
+
)
|
793 |
+
|
794 |
+
def _worker_agent(self, state: AgentState) -> Command:
|
795 |
+
"""Worker agent that performs the actual work using tools when needed."""
|
796 |
+
messages = state["messages"]
|
797 |
+
|
798 |
+
# Process messages to ensure proper tool call-result pairing
|
799 |
+
processed_messages = self._process_messages_after_tools(messages)
|
800 |
+
|
801 |
+
# Filter out any ToolMessages that don't have a corresponding AIMessage with tool_calls
|
802 |
+
# This helps prevent the "unexpected tool_use_id" error with Anthropic
|
803 |
+
filtered_messages = []
|
804 |
+
tool_call_ids = set()
|
805 |
+
|
806 |
+
# First pass: collect all tool_call_ids from AIMessages
|
807 |
+
for msg in processed_messages:
|
808 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
809 |
+
for tool_call in msg.tool_calls:
|
810 |
+
if "id" in tool_call:
|
811 |
+
tool_call_ids.add(tool_call["id"])
|
812 |
+
|
813 |
+
# Second pass: only include ToolMessages that have a corresponding tool_call_id
|
814 |
+
for msg in processed_messages:
|
815 |
+
if isinstance(msg, ToolMessage) and getattr(msg, "tool_call_id", None):
|
816 |
+
if msg.tool_call_id in tool_call_ids:
|
817 |
+
filtered_messages.append(msg)
|
818 |
+
else:
|
819 |
+
filtered_messages.append(msg)
|
820 |
+
|
821 |
+
# If messages exist, use them directly with the tool-enabled model
|
822 |
+
response = self.worker_model.invoke(filtered_messages)
|
823 |
+
|
824 |
+
# Update messages - add the response to the original messages
|
825 |
+
# We don't want to lose the original message history
|
826 |
+
updated_messages = messages + [response]
|
827 |
+
|
828 |
+
# Determine next step using Command pattern
|
829 |
+
if response.tool_calls:
|
830 |
+
# If tool calls are present, go to tools
|
831 |
+
return Command(
|
832 |
+
goto="tools",
|
833 |
+
update={"messages": updated_messages}
|
834 |
+
)
|
835 |
+
else:
|
836 |
+
# No tool calls, return to supervisor for evaluation
|
837 |
+
return Command(
|
838 |
+
goto="supervisor",
|
839 |
+
update={"messages": updated_messages}
|
840 |
+
)
|
841 |
+
|
842 |
+
def _validation_agent(self, state: AgentState) -> Command:
|
843 |
+
"""Agent that validates the final answer."""
|
844 |
+
messages = state["messages"]
|
845 |
+
question = state["current_question"]
|
846 |
+
|
847 |
+
# Get the final answer from the last message
|
848 |
+
final_answer = ""
|
849 |
+
for msg in reversed(messages):
|
850 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
851 |
+
final_answer = msg.content
|
852 |
+
break
|
853 |
+
|
854 |
+
prompt = ChatPromptTemplate.from_messages([
|
855 |
+
("system", """You are a quality assurance agent responsible for final verification of research reports and precise formatting of final answers.
|
856 |
+
|
857 |
+
Your critical responsibilities:
|
858 |
+
1) Verify the factual accuracy and completeness of the report, ensuring you can extract and format the final answer exactly as requested in the question
|
859 |
+
2) Ensure EXACT compliance with any formatting instructions in the question by producing a properly structured final answer
|
860 |
+
|
861 |
+
Pay extremely close attention to formatting requirements. The user may request:
|
862 |
+
- Only specific parts of information (first/last names, specific data points, numerical values)
|
863 |
+
- Particular ordering (alphabetical, chronological, size-based, relevance-based)
|
864 |
+
- Special formatting (bullet points, numbered lists, specific separators, tables)
|
865 |
+
- Exact text case, spacing, punctuation, or other presentational elements
|
866 |
+
|
867 |
+
Exact formatting compliance is MANDATORY for this challenge evaluation. Your role is to ensure the final answer meets all specified requirements.
|
868 |
+
If numerical values are requested, ensure they are formatted as numbers, not text.
|
869 |
+
|
870 |
+
Remember that the worker had access to:
|
871 |
+
- Web search tools
|
872 |
+
- Web content extraction
|
873 |
+
- Secure code execution
|
874 |
+
- Secure shell commands
|
875 |
+
- Secure file operations
|
876 |
+
|
877 |
+
For computational or precision-based questions, check if code execution was appropriately used and validate the results.
|
878 |
+
|
879 |
+
When evaluating the answer:
|
880 |
+
- Check if all required information is present and accurate
|
881 |
+
- Verify that the answer directly addresses the specific question asked
|
882 |
+
- Ensure any numerical values, dates, names, or technical terms are correct
|
883 |
+
- Confirm that the formatting precisely matches what was requested
|
884 |
+
- Do not add units to the final answer if not explicitly requested
|
885 |
+
- Answers tend to be as short as possible, so do not add extra data unless explicitly requested
|
886 |
+
|
887 |
+
If the answer report is correct, format it exactly as asked in the question, and respond with:
|
888 |
+
"APPROVED: [THE PROPERLY FORMATTED ANSWER]"
|
889 |
+
|
890 |
+
If there are issues with overall answer quality and you cannot format the final answer as requested, respond with:
|
891 |
+
"REJECTED: [DETAILED EXPLANATION OF ISSUES]"
|
892 |
+
|
893 |
+
Be extremely precise in your evaluation - the success of this task depends on your attention to detail.
|
894 |
+
"""
|
895 |
+
),
|
896 |
+
("human", f"Question: {question}\nReport to validate: {final_answer}")
|
897 |
+
])
|
898 |
+
validation_result = self.validator_model.invoke(prompt.format_prompt().to_messages()).content
|
899 |
+
validator_approval = validation_result.startswith("APPROVED")
|
900 |
+
|
901 |
+
if validator_approval:
|
902 |
+
# Approved - end the workflow
|
903 |
+
return Command(
|
904 |
+
goto=END,
|
905 |
+
update={
|
906 |
+
"final_answer": validation_result[10:], # Remove "APPROVED: " prefix
|
907 |
+
"validation_result": validation_result,
|
908 |
+
"validator_approval": True
|
909 |
+
}
|
910 |
+
)
|
911 |
+
else:
|
912 |
+
# Rejected - restart from supervisor with reset state
|
913 |
+
return Command(
|
914 |
+
goto="supervisor",
|
915 |
+
update={
|
916 |
+
"messages": [HumanMessage(content=question)],
|
917 |
+
"validation_result": validation_result,
|
918 |
+
"validator_approval": False,
|
919 |
+
"worker_iterations": 0,
|
920 |
+
"supervisor_satisfaction": False
|
921 |
+
}
|
922 |
+
)
|
923 |
+
|
924 |
+
def _handle_tools(self, state: AgentState) -> Command:
|
925 |
+
"""Custom wrapper around ToolNode to ensure proper message handling."""
|
926 |
+
# Execute the tool using the tool node
|
927 |
+
tool_result = self.tool_node.invoke(state)
|
928 |
+
|
929 |
+
# Process the result to ensure proper message ordering
|
930 |
+
if "messages" in tool_result:
|
931 |
+
# Get original messages
|
932 |
+
original_messages = state["messages"]
|
933 |
+
# Get all existing AIMessages with tool calls and their indices
|
934 |
+
ai_indices = {}
|
935 |
+
for i, msg in enumerate(original_messages):
|
936 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
937 |
+
for tool_call in msg.tool_calls:
|
938 |
+
if "id" in tool_call:
|
939 |
+
ai_indices[tool_call["id"]] = i
|
940 |
+
|
941 |
+
# Add the new tool messages, ensuring they come right after their corresponding tool call
|
942 |
+
updated_messages = list(original_messages)
|
943 |
+
for msg in tool_result["messages"]:
|
944 |
+
if isinstance(msg, ToolMessage) and hasattr(msg, "tool_call_id"):
|
945 |
+
tool_id = msg.tool_call_id
|
946 |
+
if tool_id in ai_indices:
|
947 |
+
# Insert after the AIMessage with the matching tool call
|
948 |
+
insert_idx = ai_indices[tool_id] + 1
|
949 |
+
# Move past any existing tool messages for this AI message
|
950 |
+
while insert_idx < len(updated_messages) and \
|
951 |
+
isinstance(updated_messages[insert_idx], ToolMessage) and \
|
952 |
+
hasattr(updated_messages[insert_idx], "tool_call_id") and \
|
953 |
+
updated_messages[insert_idx].tool_call_id != tool_id:
|
954 |
+
insert_idx += 1
|
955 |
+
updated_messages.insert(insert_idx, msg)
|
956 |
+
# Update subsequent indices
|
957 |
+
for id in ai_indices:
|
958 |
+
if ai_indices[id] >= insert_idx:
|
959 |
+
ai_indices[id] += 1
|
960 |
+
else:
|
961 |
+
# No matching tool call found, just append
|
962 |
+
updated_messages.append(msg)
|
963 |
+
|
964 |
+
return Command(
|
965 |
+
goto="worker",
|
966 |
+
update={"messages": updated_messages}
|
967 |
+
)
|
968 |
+
|
969 |
+
# If no message updates, just return the state
|
970 |
+
return Command(
|
971 |
+
goto="worker",
|
972 |
+
update=tool_result
|
973 |
+
)
|
974 |
+
|
975 |
+
def __call__(self, question: str) -> str:
|
976 |
+
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
977 |
+
|
978 |
+
# Initialize the state
|
979 |
+
initial_state = {
|
980 |
+
"messages": [],
|
981 |
+
"current_question": question,
|
982 |
+
"final_answer": "",
|
983 |
+
"validation_result": "",
|
984 |
+
"worker_iterations": 0,
|
985 |
+
"supervisor_satisfaction": False,
|
986 |
+
"validator_approval": False
|
987 |
+
}
|
988 |
+
|
989 |
+
try:
|
990 |
+
# Run the workflow
|
991 |
+
final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 50})
|
992 |
+
|
993 |
+
# Return the final answer
|
994 |
+
answer = final_state.get("final_answer", "")
|
995 |
+
if not answer and final_state["messages"]:
|
996 |
+
for msg in reversed(final_state["messages"]):
|
997 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
998 |
+
answer = msg.content
|
999 |
+
break
|
1000 |
+
|
1001 |
+
print(f"Agent returning answer: {answer[:50]}...")
|
1002 |
+
return answer
|
1003 |
+
except Exception as e:
|
1004 |
+
print(f"Error in agent processing: {str(e)}")
|
1005 |
+
# Fallback to basic workflow without tool calls if there's an error
|
1006 |
+
return f"I encountered an error while processing your question: {str(e)}. Please try reformulating your question."
|
1007 |
+
finally:
|
1008 |
+
# Clean up resources
|
1009 |
+
cleanup_daytona_sandbox()
|
app.py
CHANGED
@@ -1,23 +1,20 @@
|
|
1 |
import os
|
|
|
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import inspect
|
5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
# (Keep Constants as is)
|
8 |
# --- Constants ---
|
9 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
10 |
-
|
11 |
-
# --- Basic Agent Definition ---
|
12 |
-
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
13 |
-
class BasicAgent:
|
14 |
-
def __init__(self):
|
15 |
-
print("BasicAgent initialized.")
|
16 |
-
def __call__(self, question: str) -> str:
|
17 |
-
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
18 |
-
fixed_answer = "This is a default answer."
|
19 |
-
print(f"Agent returning fixed answer: {fixed_answer}")
|
20 |
-
return fixed_answer
|
21 |
|
22 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
23 |
"""
|
@@ -48,44 +45,105 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
48 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
49 |
print(agent_code)
|
50 |
|
51 |
-
#
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
print(f"Response text: {response.text[:500]}")
|
67 |
-
return f"Error decoding server response for questions: {e}", None
|
68 |
-
except Exception as e:
|
69 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
70 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
71 |
|
72 |
-
#
|
73 |
results_log = []
|
74 |
answers_payload = []
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
task_id = item.get("task_id")
|
78 |
question_text = item.get("question")
|
79 |
if not task_id or question_text is None:
|
80 |
print(f"Skipping item with missing task_id or question: {item}")
|
81 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
try:
|
|
|
83 |
submitted_answer = agent(question_text)
|
84 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
85 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
|
|
|
|
|
|
86 |
except Exception as e:
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
89 |
|
90 |
if not answers_payload:
|
91 |
print("Agent did not produce any answers to submit.")
|
@@ -99,17 +157,42 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
99 |
# 5. Submit
|
100 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
101 |
try:
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
final_status = (
|
106 |
-
f"Submission Successful!\n"
|
107 |
f"User: {result_data.get('username')}\n"
|
108 |
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
109 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
110 |
f"Message: {result_data.get('message', 'No message received.')}"
|
111 |
)
|
112 |
-
print("Submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
results_df = pd.DataFrame(results_log)
|
114 |
return final_status, results_df
|
115 |
except requests.exceptions.HTTPError as e:
|
@@ -139,6 +222,24 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
139 |
results_df = pd.DataFrame(results_log)
|
140 |
return status_message, results_df
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
# --- Build Gradio Interface using Blocks ---
|
144 |
with gr.Blocks() as demo:
|
@@ -190,6 +291,18 @@ if __name__ == "__main__":
|
|
190 |
else:
|
191 |
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
194 |
|
195 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
|
|
1 |
import os
|
2 |
+
import json
|
3 |
+
from dotenv import load_dotenv
|
4 |
import gradio as gr
|
5 |
import requests
|
6 |
import inspect
|
7 |
import pandas as pd
|
8 |
+
from agent import BasicAgent
|
9 |
+
import time
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
# Load environment variables from .env file
|
13 |
+
load_dotenv()
|
14 |
|
|
|
15 |
# --- Constants ---
|
16 |
+
DEFAULT_API_URL = os.getenv('DEFAULT_API_URL', "https://agents-course-unit4-scoring.hf.space")
|
17 |
+
CHECKPOINT_FILE = "agent_checkpoint.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
20 |
"""
|
|
|
45 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
46 |
print(agent_code)
|
47 |
|
48 |
+
# Check for existing checkpoint
|
49 |
+
checkpoint_data = None
|
50 |
+
if os.path.exists(CHECKPOINT_FILE):
|
51 |
+
try:
|
52 |
+
with open(CHECKPOINT_FILE, 'r') as f:
|
53 |
+
checkpoint_data = json.load(f)
|
54 |
+
print(f"Found checkpoint with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
|
55 |
+
except Exception as e:
|
56 |
+
print(f"Error loading checkpoint: {e}")
|
57 |
+
# If checkpoint is corrupt, remove it
|
58 |
+
try:
|
59 |
+
os.remove(CHECKPOINT_FILE)
|
60 |
+
except:
|
61 |
+
pass
|
62 |
+
checkpoint_data = None
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
+
# Initialize results tracking
|
65 |
results_log = []
|
66 |
answers_payload = []
|
67 |
+
|
68 |
+
if checkpoint_data:
|
69 |
+
# If we have a checkpoint, use it
|
70 |
+
questions_data = checkpoint_data.get('questions', [])
|
71 |
+
# Load any answers we already have
|
72 |
+
existing_answers = checkpoint_data.get('answers', [])
|
73 |
+
existing_answers_dict = {a.get('task_id'): a.get('submitted_answer') for a in existing_answers}
|
74 |
+
print(f"Loaded {len(existing_answers)} existing answers from checkpoint")
|
75 |
+
|
76 |
+
# Load existing results log
|
77 |
+
if 'results_log' in checkpoint_data:
|
78 |
+
results_log = checkpoint_data.get('results_log', [])
|
79 |
+
|
80 |
+
# We'll use the checkpoint data
|
81 |
+
print(f"Resuming from checkpoint with {len(questions_data)} questions")
|
82 |
+
else:
|
83 |
+
# 2. Fetch Questions from server
|
84 |
+
print(f"Fetching questions from: {questions_url}")
|
85 |
+
try:
|
86 |
+
response = requests.get(questions_url, timeout=15)
|
87 |
+
response.raise_for_status()
|
88 |
+
questions_data = response.json()
|
89 |
+
if not questions_data:
|
90 |
+
print("Fetched questions list is empty.")
|
91 |
+
return "Fetched questions list is empty or invalid format.", None
|
92 |
+
print(f"Fetched {len(questions_data)} questions.")
|
93 |
+
|
94 |
+
# Save questions to checkpoint immediately
|
95 |
+
save_checkpoint(questions_data, [], username, [])
|
96 |
+
|
97 |
+
# No existing answers
|
98 |
+
existing_answers_dict = {}
|
99 |
+
|
100 |
+
except requests.exceptions.RequestException as e:
|
101 |
+
print(f"Error fetching questions: {e}")
|
102 |
+
return f"Error fetching questions: {e}", None
|
103 |
+
except requests.exceptions.JSONDecodeError as e:
|
104 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
105 |
+
print(f"Response text: {response.text[:500]}")
|
106 |
+
return f"Error decoding server response for questions: {e}", None
|
107 |
+
except Exception as e:
|
108 |
+
print(f"An unexpected error occurred fetching questions: {e}")
|
109 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
110 |
+
|
111 |
+
# 3. Run your Agent on questions we haven't answered yet
|
112 |
+
print(f"Running agent on questions...")
|
113 |
+
for idx, item in enumerate(questions_data):
|
114 |
task_id = item.get("task_id")
|
115 |
question_text = item.get("question")
|
116 |
if not task_id or question_text is None:
|
117 |
print(f"Skipping item with missing task_id or question: {item}")
|
118 |
continue
|
119 |
+
|
120 |
+
# Skip if we already have an answer for this question
|
121 |
+
if task_id in existing_answers_dict:
|
122 |
+
submitted_answer = existing_answers_dict[task_id]
|
123 |
+
print(f"Using cached answer for task_id {task_id}")
|
124 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
125 |
+
|
126 |
+
# Check if we already have this in results_log
|
127 |
+
if not any(r.get("Task ID") == task_id for r in results_log):
|
128 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
129 |
+
|
130 |
+
continue
|
131 |
+
|
132 |
try:
|
133 |
+
print(f"Processing question {idx+1}/{len(questions_data)}: {task_id}")
|
134 |
submitted_answer = agent(question_text)
|
135 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
136 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
137 |
+
|
138 |
+
# Save checkpoint after each answer
|
139 |
+
save_checkpoint(questions_data, answers_payload, username, results_log)
|
140 |
+
|
141 |
except Exception as e:
|
142 |
+
print(f"Error running agent on task {task_id}: {e}")
|
143 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
144 |
+
|
145 |
+
# Save checkpoint even if there was an error
|
146 |
+
save_checkpoint(questions_data, answers_payload, username, results_log)
|
147 |
|
148 |
if not answers_payload:
|
149 |
print("Agent did not produce any answers to submit.")
|
|
|
157 |
# 5. Submit
|
158 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
159 |
try:
|
160 |
+
# Check if we're in production mode
|
161 |
+
is_production = os.getenv('PRODUCTION_RUN', 'FALSE').upper() == 'TRUE'
|
162 |
+
|
163 |
+
if is_production:
|
164 |
+
print("Running in PRODUCTION mode - making actual submission")
|
165 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
166 |
+
response.raise_for_status()
|
167 |
+
result_data = response.json()
|
168 |
+
else:
|
169 |
+
print("Running in SIMULATION mode - generating mock response")
|
170 |
+
# Simulate a successful response
|
171 |
+
result_data = {
|
172 |
+
"username": username,
|
173 |
+
"score": 85,
|
174 |
+
"correct_count": len(answers_payload) - 2, # Simulate some incorrect answers
|
175 |
+
"total_attempted": len(answers_payload),
|
176 |
+
"message": "Simulation mode: This is a mock response"
|
177 |
+
}
|
178 |
+
|
179 |
final_status = (
|
180 |
+
f"Submission {'Successful' if is_production else 'Simulated'}!\n"
|
181 |
f"User: {result_data.get('username')}\n"
|
182 |
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
183 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
184 |
f"Message: {result_data.get('message', 'No message received.')}"
|
185 |
)
|
186 |
+
print(f"Submission {'completed' if is_production else 'simulated'} successfully.")
|
187 |
+
|
188 |
+
# Delete checkpoint file after successful submission
|
189 |
+
if os.path.exists(CHECKPOINT_FILE):
|
190 |
+
try:
|
191 |
+
os.remove(CHECKPOINT_FILE)
|
192 |
+
print(f"Checkpoint file removed after successful submission")
|
193 |
+
except Exception as e:
|
194 |
+
print(f"Warning: Could not remove checkpoint file: {e}")
|
195 |
+
|
196 |
results_df = pd.DataFrame(results_log)
|
197 |
return final_status, results_df
|
198 |
except requests.exceptions.HTTPError as e:
|
|
|
222 |
results_df = pd.DataFrame(results_log)
|
223 |
return status_message, results_df
|
224 |
|
225 |
+
def save_checkpoint(questions_data, answers_payload, username, results_log):
|
226 |
+
"""Save checkpoint data to a local file."""
|
227 |
+
try:
|
228 |
+
checkpoint_data = {
|
229 |
+
'questions': questions_data,
|
230 |
+
'answers': answers_payload,
|
231 |
+
'username': username,
|
232 |
+
'timestamp': time.time(),
|
233 |
+
'results_log': results_log
|
234 |
+
}
|
235 |
+
|
236 |
+
with open(CHECKPOINT_FILE, 'w') as f:
|
237 |
+
json.dump(checkpoint_data, f)
|
238 |
+
|
239 |
+
print(f"Checkpoint saved with {len(questions_data)} questions and {len(answers_payload)} answers")
|
240 |
+
except Exception as e:
|
241 |
+
print(f"Error saving checkpoint: {e}")
|
242 |
+
|
243 |
|
244 |
# --- Build Gradio Interface using Blocks ---
|
245 |
with gr.Blocks() as demo:
|
|
|
291 |
else:
|
292 |
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
293 |
|
294 |
+
# Check for existing checkpoint
|
295 |
+
if os.path.exists(CHECKPOINT_FILE):
|
296 |
+
try:
|
297 |
+
with open(CHECKPOINT_FILE, 'r') as f:
|
298 |
+
checkpoint_data = json.load(f)
|
299 |
+
print(f"✅ Checkpoint found with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
|
300 |
+
print(f" Created at: {datetime.fromtimestamp(checkpoint_data.get('timestamp', 0)).strftime('%Y-%m-%d %H:%M:%S')}")
|
301 |
+
except Exception as e:
|
302 |
+
print(f"⚠️ Checkpoint file exists but could not be read: {e}")
|
303 |
+
else:
|
304 |
+
print("ℹ️ No checkpoint file found. Will start fresh.")
|
305 |
+
|
306 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
307 |
|
308 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
requirements.txt
CHANGED
@@ -1,2 +1,10 @@
|
|
1 |
gradio
|
2 |
-
requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
gradio
|
2 |
+
requests
|
3 |
+
python-dotenv
|
4 |
+
langgraph
|
5 |
+
langchain-core
|
6 |
+
langchain-anthropic
|
7 |
+
anthropic
|
8 |
+
python-Levenshtein
|
9 |
+
daytona_sdk
|
10 |
+
|