Yago Bolivar committed
Commit 9bdf620 · 1 Parent(s): 556b9b5
feat: add tests for chess position analysis and prompt formatting scenarios
Browse files
- tests/test_chess_formatting.py +202 -0
- tests/test_formatting.sh +16 -0
- tests/test_prompt_formatting.py +108 -0
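All three tests exercise the same response contract that prompts.yaml is meant to enforce: every model reply should contain a Thought: line followed by a ```py code block terminated with ```<end_code>. As a rough illustration of that contract (the helper below is only a sketch and is not part of this commit):

import re

# Sketch only: check that a single model response follows the
# Thought + ```py ... ```<end_code> format the prompts require.
RESPONSE_FORMAT = re.compile(r"Thought:.*?```py\n.*?```<end_code>", re.DOTALL)

def follows_format(response: str) -> bool:
    return bool(RESPONSE_FORMAT.search(response))

assert follows_format('Thought: add two numbers.\n```py\nprint(2 + 2)\n```<end_code>')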
tests/test_chess_formatting.py
ADDED
@@ -0,0 +1,202 @@
import yaml
import os
import pytest
from unittest.mock import MagicMock

# Create mock classes for testing
class MockModel:
    def __init__(self):
        pass

    def __call__(self, prompt, **kwargs):
        return self.generate_text(prompt, **kwargs)

    def generate_text(self, prompt, **kwargs):
        # This method will be implemented in child classes
        pass

class CodeAgent:
    def __init__(self, model=None, tools=None, max_steps=None, verbosity_level=None,
                 name=None, description=None, prompt_templates=None):
        self.model = model
        self.tools = tools
        self.max_steps = max_steps
        self.verbosity_level = verbosity_level
        self.name = name
        self.description = description
        self.prompt_templates = prompt_templates
        self.step_counter = 0

    def run(self, query):
        """Simulate running the agent for testing purposes."""
        response = None
        for step in range(self.max_steps):
            response = self.model.generate_text("", step=step)
            if isinstance(response, dict) and "choices" in response:
                response = response["choices"][0]["message"]["content"]
            if "final_answer" in response:
                break
        return response

    def __call__(self, query):
        return self.run(query)

# Load your updated prompts.yaml
# Get the correct path relative to this script
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)  # Go up one level from tests/ to project root
    prompts_path = os.path.join(project_root, "prompts.yaml")

    print(f"Looking for prompts.yaml at: {prompts_path}")

    with open(prompts_path, 'r') as stream:
        prompt_templates = yaml.safe_load(stream)
    print("Successfully loaded prompts.yaml")
except FileNotFoundError:
    print(f"Warning: prompts.yaml not found at {prompts_path}. Tests may fail.")
    prompt_templates = {}
except yaml.YAMLError as e:
    print(f"Error parsing prompts.yaml: {e}")
    prompt_templates = {}

# Create a specialized model for testing chess position scenario
class ChessPositionTestModel(MockModel):
    def __init__(self):
        super().__init__()

    def generate_text(self, prompt, **kwargs):
        # For testing purposes, we'll simulate a series of responses
        # to see how the agent handles multi-step chess analysis
        step = kwargs.get("step", 0)

        responses = [
            # Step 0: Initial response asking for the chess image
            """Thought: I need to see the chess image to analyze the position.
```py
print("I need the chess image to analyze the position. Please provide the image.")
```<end_code>""",

            # Step 1: After receiving the image
            """Thought: Now I can see the chess position. I'll analyze it.
```py
from src.image_processing_tool import ImageProcessor

image_processor = ImageProcessor()
analysis = image_processor.analyze_chess_position(image_path="chess_image.png")
print(f"Chess position analysis: {analysis}")
```<end_code>""",

            # Step 2: Error handling when image analysis fails
            """Thought: There was an error analyzing the chess position. I'll try a different approach.
```py
print("The image analysis failed. Let me try a different method.")
# Alternative approach
```<end_code>""",

            # Step 3: Final answer
            """Thought: I've analyzed the chess position and determined the best move.
```py
final_answer("e4 to e5")
```<end_code>"""
        ]

        # Return the appropriate response for this step
        if step < len(responses):
            return {"choices": [{"message": {"content": responses[step]}}]}
        else:
            return {"choices": [{"message": {"content": "Test complete"}}]}

# Simulating a chess position analysis
def test_chess_position_scenario():
    print("\nTesting chess position analysis scenario\n")

    # Create a minimal version of your tools for testing
    class DummyImageProcessorTool:
        def __init__(self):
            self.name = "image_processor"
            self.description = "Analyze images including chess positions"
            self.inputs = {"image_path": "string"}
            self.output_type = "string"

        def analyze_chess_position(self, image_path):
            return "Position analyzed: white king on e1, black king on e8"

    class DummyFinalAnswerTool:
        def __init__(self):
            self.name = "final_answer"
            self.description = "Use this to provide the final answer"
            self.inputs = {"answer": "string"}
            self.output_type = "string"

        def __call__(self, answer):
            return f"Final answer submitted: {answer}"

    # Create the test model
    model = ChessPositionTestModel()

    # Create agent with your updated prompts
    tools = [DummyImageProcessorTool(), DummyFinalAnswerTool()]
    try:
        agent = CodeAgent(
            model=model,
            tools=tools,
            max_steps=4,  # Allow for 4 steps to see all responses
            verbosity_level=2,  # Increased verbosity to see more details
            name="ChessTestAgent",
            description="Testing chess position analysis formatting",
            prompt_templates=prompt_templates
        )
    except Exception as e:
        print(f"Error creating agent: {e}")
        return

    # Test with a chess position analysis task
    print("Starting chess position analysis test...")
    result = agent("Analyze this chess position and determine the best move for white.")

    print(f"Final result: {result}")
    print("-"*50)
    return result

def test_prompt_structure():
    """Test that the prompt structure includes proper formatting instructions."""
    print("\nTesting prompt structure for formatting instructions\n")

    # Check if prompts.yaml was loaded successfully
    if not prompt_templates:
        pytest.skip("No prompt templates available to test")

    # Get the system prompt from the templates
    system_prompt = prompt_templates.get("system_prompt", {}).get("main", "")

    # Check that the system prompt contains the necessary elements
    formatting_elements = [
        "IMPORTANT FORMATTING RULES for ALL responses:",  # Section header
        "EVERY response MUST follow the format",  # Format requirement
        "```py",  # Code block start
        "```<end_code>",  # Code block end
        "MUST include a code block",  # Code block requirement
        "Example of correct formatting:"  # Example section
    ]

    for element in formatting_elements:
        assert element in system_prompt, f"Missing required formatting element: {element}"

    # Check that the example shows proper formatting
    example_start = system_prompt.find("Example of correct formatting:")
    if example_start != -1:
        example_content = system_prompt[example_start:system_prompt.find("\n\n", example_start)]

        assert "Thought:" in example_content, "Example missing Thought: section"
        assert "```py" in example_content, "Example missing code block start"
        assert "```<end_code>" in example_content, "Example missing code block end"
    else:
        pytest.fail("No formatting example found in system prompt")

    print("✓ Prompt structure contains all required formatting elements")

# Run the tests if executed directly
if __name__ == "__main__":
    test_prompt_structure()
    test_chess_position_scenario()
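Note that test_prompt_structure reads the system prompt via prompt_templates.get("system_prompt", {}).get("main", ""), i.e. it assumes prompts.yaml nests the prompt under system_prompt.main rather than storing it as a flat string. prompts.yaml itself is not part of this commit, so the snippet below is only an illustrative sketch of the layout and markers the test looks for:

import yaml

# Illustrative only: the nested layout and the formatting markers that
# test_prompt_structure expects to find in prompts.yaml.
sample = yaml.safe_load("""
system_prompt:
  main: |
    IMPORTANT FORMATTING RULES for ALL responses:
    EVERY response MUST follow the format Thought, then a code block.
    Each step MUST include a code block opened with ```py and closed with ```<end_code>.
    Example of correct formatting:
    Thought: I will add two numbers.
    ```py
    print(2 + 2)
    ```<end_code>
""")

system_prompt = sample.get("system_prompt", {}).get("main", "")
assert "Example of correct formatting:" in system_prompt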
tests/test_formatting.sh
ADDED
@@ -0,0 +1,16 @@
#!/bin/bash
# Script to test the formatting in prompts.yaml

echo "Starting prompt formatting tests..."

# Set working directory to project root
cd "$(dirname "$0")/.."  # Go up one level to project root

echo "Current directory: $(pwd)"
echo "Checking if prompts.yaml exists: $([ -f 'prompts.yaml' ] && echo 'Yes' || echo 'No')"

# Run the chess formatting test
echo "Running chess formatting tests..."
python3 -m tests.test_chess_formatting

echo -e "\nTests completed."
tests/test_prompt_formatting.py
ADDED
@@ -0,0 +1,108 @@
import yaml
import os
import sys
from smolagents import CodeAgent, DummyModel

# Load your updated prompts.yaml
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# Create a simple dummy model that will help us test the formatting
class TestFormattingModel(DummyModel):
    def __init__(self):
        super().__init__()

    def __call__(self, prompt, **kwargs):
        # Print the prompt for inspection
        print("="*50)
        print("PROMPT:")
        print("="*50)
        print(prompt)
        print("="*50)

        # Return a response that simulates different scenarios
        scenario = kwargs.get("scenario", "normal")

        if scenario == "normal":
            return {
                "choices": [{
                    "message": {
                        "content": """Thought: I'll solve this task step by step.
```py
print("Starting to solve the task")
result = 2 + 2
print(f"The result is {result}")
```<end_code>"""
                    }
                }]
            }
        elif scenario == "error":
            return {
                "choices": [{
                    "message": {
                        "content": """Thought: I encountered an error.
```py
print("An error occurred: file not found")
```<end_code>"""
                    }
                }]
            }
        elif scenario == "chess":
            return {
                "choices": [{
                    "message": {
                        "content": """Thought: I need more information about the chess position.
```py
print("I need to see the chess image to analyze the position. Please provide the image.")
```<end_code>"""
                    }
                }]
            }

        return {"choices": [{"message": {"content": "Test failed"}}]}

# Create a minimal agent to test your prompts
def test_scenario(scenario_name):
    print(f"\nTesting scenario: {scenario_name}")
    model = TestFormattingModel()

    # Create a minimal version of your tools for testing
    class DummyFinalAnswerTool:
        def __init__(self):
            self.name = "final_answer"
            self.description = "Use this to provide the final answer"
            self.inputs = {"answer": "string"}
            self.output_type = "string"

        def __call__(self, answer):
            return f"Final answer submitted: {answer}"

    # Create agent with your updated prompts
    agent = CodeAgent(
        model=model,
        tools=[DummyFinalAnswerTool()],
        max_steps=2,
        verbosity_level=1,
        name="TestAgent",
        description="Testing prompt formatting",
        prompt_templates=prompt_templates
    )

    # Test with a simple task
    result = agent(f"This is a test task for the {scenario_name} scenario.", scenario=scenario_name)

    print(f"Result: {result}")
    print("-"*50)
    return result

if __name__ == "__main__":
    print("Testing prompt formatting with different scenarios\n")

    # Test normal scenario
    test_scenario("normal")

    # Test error scenario
    test_scenario("error")

    # Test chess scenario
    test_scenario("chess")
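Note that tests/test_prompt_formatting.py assumes smolagents exposes a DummyModel class and that CodeAgent forwards extra keyword arguments (scenario=...) through to the model; neither assumption is verified by this commit. If they do not hold for the installed smolagents version, the three scenarios can still be exercised against the model directly, for example:

# Sketch only: drive TestFormattingModel without going through CodeAgent.
model = TestFormattingModel()
for scenario in ("normal", "error", "chess"):
    reply = model("dummy prompt", scenario=scenario)
    content = reply["choices"][0]["message"]["content"]
    assert content.startswith("Thought:")
    assert "```<end_code>" in content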