Spaces:
Running
Running
import pytest | |
import json | |
import logging | |
from starfish.common.exceptions import JsonParserError, SchemaValidationError | |
from starfish.llm.parser.json_parser import JSONParser | |
from tests.llm.parser.fixtures.json_problem_cases import problem_data_list | |
logger = logging.getLogger(__name__) | |
class TestJSONParser: | |
"""Test cases for the JSONParser class.""" | |
# --------------------------------------------------------------------------- | |
# Tests for schema conversion and format instructions | |
# --------------------------------------------------------------------------- | |
def test_convert_to_schema_basic(self): | |
"""Test converting basic field definitions to JSON schema.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{"name": "age", "type": "int", "description": "Person's age"}, | |
{"name": "is_active", "type": "bool", "description": "Activity status", "required": False}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
assert schema["type"] == "object" | |
assert "name" in schema["properties"] | |
assert "age" in schema["properties"] | |
assert "is_active" in schema["properties"] | |
assert schema["properties"]["name"]["type"] == "string" | |
assert schema["properties"]["age"]["type"] == "integer" | |
assert schema["properties"]["is_active"]["type"] == "boolean" | |
assert "name" in schema["required"] | |
assert "age" in schema["required"] | |
assert "is_active" not in schema["required"] | |
def test_convert_to_schema_nested_object(self): | |
"""Test converting nested object field definitions to JSON schema.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "address", | |
"type": "dict", | |
"description": "Person's address", | |
"properties": { | |
"street": {"type": "string", "description": "Street name"}, | |
"city": {"type": "string", "description": "City name"}, | |
"zip": {"type": "string", "description": "Zip code"}, | |
}, | |
"required": ["street", "city"], | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
assert "address" in schema["properties"] | |
assert schema["properties"]["address"]["type"] == "object" | |
assert "properties" in schema["properties"]["address"] | |
assert "street" in schema["properties"]["address"]["properties"] | |
assert "city" in schema["properties"]["address"]["properties"] | |
assert "zip" in schema["properties"]["address"]["properties"] | |
assert schema["properties"]["address"]["required"] == ["street", "city"] | |
def test_convert_to_schema_nested_array(self): | |
"""Test converting array field with nested objects to JSON schema.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "contacts", | |
"type": "list", | |
"description": "Person's contacts", | |
"items": { | |
"type": "object", | |
"properties": { | |
"name": {"type": "string", "description": "Contact name"}, | |
"phone": {"type": "string", "description": "Phone number"}, | |
"relationship": {"type": "string", "description": "Relationship type"}, | |
}, | |
"required": ["name", "phone"], | |
}, | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
assert "contacts" in schema["properties"] | |
assert schema["properties"]["contacts"]["type"] == "array" | |
assert "items" in schema["properties"]["contacts"] | |
assert schema["properties"]["contacts"]["items"]["type"] == "object" | |
assert "name" in schema["properties"]["contacts"]["items"]["properties"] | |
assert "phone" in schema["properties"]["contacts"]["items"]["properties"] | |
assert schema["properties"]["contacts"]["items"]["required"] == ["name", "phone"] | |
def test_format_instructions_basic(self): | |
"""Test generating format instructions for a basic schema.""" | |
fields = [{"name": "name", "type": "str", "description": "Person's name"}, {"name": "age", "type": "int", "description": "Person's age"}] | |
schema = JSONParser.convert_to_schema(fields) | |
instructions = JSONParser.get_format_instructions(schema) | |
# Check for expected output elements | |
assert "[" in instructions # Output should be wrapped in an array | |
assert '"name": ""' in instructions | |
assert '"age": number' in instructions | |
assert "Person's name (required)" in instructions | |
assert "Person's age (required)" in instructions | |
def test_format_instructions_nested_object(self): | |
"""Test generating format instructions for schema with nested objects.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "address", | |
"type": "dict", | |
"description": "Person's address", | |
"properties": {"street": {"type": "string", "description": "Street name"}, "city": {"type": "string", "description": "City name"}}, | |
"required": ["street"], | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
instructions = JSONParser.get_format_instructions(schema) | |
# Check for nested object formatting | |
assert '"address": {' in instructions | |
assert '"street": ""' in instructions | |
assert '"city": ""' in instructions | |
assert "Street name (required)" in instructions | |
assert "City name (optional)" in instructions | |
def test_format_instructions_nested_array(self): | |
"""Test generating format instructions for schema with arrays of objects.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "contacts", | |
"type": "list", | |
"description": "Person's contacts", | |
"items": { | |
"type": "object", | |
"properties": {"name": {"type": "string", "description": "Contact name"}, "phone": {"type": "string", "description": "Phone number"}}, | |
"required": ["name"], | |
}, | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
instructions = JSONParser.get_format_instructions(schema) | |
# Check for array with nested object formatting | |
assert '"contacts": [' in instructions | |
assert '"name": ""' in instructions # Both root name and contact name | |
assert '"phone": ""' in instructions | |
assert "Contact name (required)" in instructions | |
assert "Phone number (optional)" in instructions | |
assert "// ... more items ..." in instructions | |
def test_format_instructions_deeply_nested(self): | |
"""Test generating format instructions for deeply nested structures.""" | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "family", | |
"type": "dict", | |
"description": "Family information", | |
"properties": { | |
"spouse": { | |
"type": "object", | |
"description": "Spouse information", | |
"properties": { | |
"name": {"type": "string", "description": "Spouse name"}, | |
"occupation": {"type": "string", "description": "Spouse occupation"}, | |
}, | |
"required": ["name"], | |
}, | |
"children": { | |
"type": "array", | |
"description": "Children information", | |
"items": { | |
"type": "object", | |
"properties": { | |
"name": {"type": "string", "description": "Child name"}, | |
"age": {"type": "integer", "description": "Child age"}, | |
"hobbies": {"type": "array", "description": "Child hobbies", "items": {"type": "string"}}, | |
}, | |
"required": ["name", "age"], | |
}, | |
}, | |
}, | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
instructions = JSONParser.get_format_instructions(schema) | |
# Check for deeply nested structure elements | |
assert '"family": {' in instructions | |
assert '"spouse": {' in instructions | |
assert '"children": [' in instructions | |
assert '"name": ""' in instructions # Multiple occurrences | |
assert '"age": number' in instructions | |
assert '"hobbies": [' in instructions | |
assert "Spouse name (required)" in instructions | |
assert "Child name (required)" in instructions | |
assert "Child age (required)" in instructions | |
# --------------------------------------------------------------------------- | |
# Tests for parsing LLM output text | |
# --------------------------------------------------------------------------- | |
def test_extract_json_from_text_simple(self): | |
"""Test extracting JSON from text without markdown.""" | |
text = '{"name": "John", "age": 30}' | |
json_text = JSONParser._extract_json_from_text(text) | |
assert json_text == '{"name": "John", "age": 30}' | |
# Test with surrounding text | |
text = 'Here is the data: {"name": "John", "age": 30} as requested.' | |
json_text = JSONParser._extract_json_from_text(text) | |
assert json_text == '{"name": "John", "age": 30}' | |
def test_extract_json_from_text_markdown(self): | |
"""Test extracting JSON from markdown code blocks.""" | |
# With json tag | |
text = 'Here is the data:\n```json\n{"name": "John", "age": 30}\n```\nAs requested.' | |
json_text = JSONParser._extract_json_from_text(text) | |
assert json_text == '{"name": "John", "age": 30}' | |
# Without json tag | |
text = 'Here is the data:\n```\n{"name": "John", "age": 30}\n```\nAs requested.' | |
json_text = JSONParser._extract_json_from_text(text) | |
assert json_text == '{"name": "John", "age": 30}' | |
def test_extract_json_from_text_array(self): | |
"""Test extracting JSON array from text.""" | |
text = '[{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]' | |
json_text = JSONParser._extract_json_from_text(text) | |
assert json_text == '[{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]' | |
def test_extract_json_from_text_error(self): | |
"""Test error handling when no JSON is found.""" | |
text = "This text does not contain any JSON." | |
with pytest.raises(JsonParserError): | |
JSONParser._extract_json_from_text(text) | |
def test_unwrap_json_data_single(self): | |
"""Test unwrapping single object JSON data.""" | |
data = {"name": "John", "age": 30} | |
result = JSONParser._unwrap_json_data(data) | |
assert result == [{"name": "John", "age": 30}] | |
def test_unwrap_json_data_list(self): | |
"""Test unwrapping list of objects JSON data.""" | |
data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] | |
result = JSONParser._unwrap_json_data(data) | |
assert result == [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] | |
def test_unwrap_json_data_with_wrapper(self): | |
"""Test unwrapping data with a wrapper key.""" | |
data = {"results": [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]} | |
result = JSONParser._unwrap_json_data(data, json_wrapper_key="results") | |
assert result == [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] | |
def test_unwrap_json_data_wrapper_error(self): | |
"""Test error when wrapper key is missing.""" | |
data = {"data": [{"name": "John", "age": 30}]} | |
with pytest.raises(KeyError): | |
JSONParser._unwrap_json_data(data, json_wrapper_key="results") | |
def test_parse_llm_output_complete(self): | |
"""Test complete parsing flow with schema validation.""" | |
# Define a schema | |
fields = [{"name": "name", "type": "str", "description": "Person's name"}, {"name": "age", "type": "int", "description": "Person's age"}] | |
schema = JSONParser.convert_to_schema(fields) | |
# Test with valid data | |
text = '{"name": "John", "age": 30}' | |
result = JSONParser.parse_llm_output(text, schema=schema) | |
assert result == [{"name": "John", "age": 30}] | |
# Test with invalid data (missing required field) | |
text = '{"name": "Jane"}' | |
with pytest.raises(SchemaValidationError): | |
JSONParser.parse_llm_output(text, schema=schema, strict=True) | |
# In non-strict mode, should return None | |
result = JSONParser.parse_llm_output(text, schema=schema, strict=False) | |
assert result is None | |
def test_parse_llm_output_nested(self): | |
"""Test parsing nested structures with validation.""" | |
# Define a schema with nested objects | |
fields = [ | |
{"name": "name", "type": "str", "description": "Person's name"}, | |
{ | |
"name": "address", | |
"type": "dict", | |
"description": "Person's address", | |
"properties": {"street": {"type": "string", "description": "Street name"}, "city": {"type": "string", "description": "City name"}}, | |
"required": ["street", "city"], | |
}, | |
] | |
schema = JSONParser.convert_to_schema(fields) | |
# Test with valid nested data | |
text = '{"name": "John", "address": {"street": "123 Main St", "city": "Anytown"}}' | |
result = JSONParser.parse_llm_output(text, schema=schema) | |
assert result[0]["name"] == "John" | |
assert result[0]["address"]["street"] == "123 Main St" | |
assert result[0]["address"]["city"] == "Anytown" | |
# Test with invalid nested data (missing city) | |
text = '{"name": "Jane", "address": {"street": "456 Oak Ave"}}' | |
# We need to use type_check=True to properly validate nested object fields | |
with pytest.raises(SchemaValidationError): | |
JSONParser.parse_llm_output(text, schema=schema, strict=True, type_check=True) | |
def test_preprocess_latex_json(self): | |
"""Test preprocessing JSON text with LaTeX notation.""" | |
# Normal JSON - should be returned as-is | |
json_text = '{"name": "John", "age": 30}' | |
result = JSONParser._try_parse_json(json_text) | |
assert result == {"name": "John", "age": 30} | |
# JSON with basic LaTeX notation | |
latex_json = '{"formula": "\\\\(x^2 + y^2 = z^2\\\\)"}' | |
result = JSONParser._try_parse_json(latex_json) | |
# The backslashes should be properly parsed | |
assert result["formula"] == "\\(x^2 + y^2 = z^2\\)" | |
def test_parse_llm_output_with_latex(self): | |
"""Test parsing LLM output containing LaTeX notation.""" | |
# JSON with LaTeX notation that would normally fail to parse | |
latex_input = """[ | |
{ | |
"problem": "Find positive integer solutions to the equation", | |
"answer": "5" | |
} | |
]""" | |
# Define a simple schema for validation | |
fields = [{"name": "problem", "type": "str", "description": "Math problem"}, {"name": "answer", "type": "str", "description": "Answer to the problem"}] | |
schema = JSONParser.convert_to_schema(fields) | |
# This should parse successfully with our preprocessing | |
result = JSONParser.parse_llm_output(latex_input, schema=schema) | |
assert result is not None | |
assert len(result) == 1 | |
assert result[0]["answer"] == "5" | |
assert "Find positive integer solutions" in result[0]["problem"] | |
def test_parse_complex_latex_math(self): | |
"""Test parsing complex mathematical LaTeX notation in JSON.""" | |
# The example with complex LaTeX split into parts for readability | |
latex_part1 = '[\n {\n "cot": "We are asked to find the number of ' | |
latex_part2 = "positive integer solutions \\\\((x,y)\\\\) to the equation " | |
latex_part3 = "\\\\(7x + 11y = 2024\\\\) such that \\\\(x \\\\equiv y \\\\pmod{5}\\\\)." | |
# Define the remaining JSON parts | |
latex_ending = """", | |
"problem": "Find the number of positive integer solutions", | |
"answer": "5", | |
"reasoning": "First, express x in terms of y from the equation" | |
} | |
]""" | |
# Concatenate all the parts to form the complete test data | |
complex_latex_json = latex_part1 + latex_part2 + latex_part3 + latex_ending | |
# This should parse successfully with our preprocessing | |
result = JSONParser.parse_llm_output(complex_latex_json) | |
# Check that parsing worked and content is preserved | |
assert result is not None | |
assert len(result) == 1 | |
assert "cot" in result[0] | |
assert "problem" in result[0] | |
assert "answer" in result[0] | |
assert "reasoning" in result[0] | |
assert result[0]["answer"] == "5" | |
# Check that a LaTeX expression is present in the content | |
assert "Find the number of positive integer solutions" in result[0]["problem"] | |
assert "7x + 11y = 2024" in result[0]["cot"] | |
def test_parse_problem_cases_with_latex(self): | |
"""Test parsing real problematic cases containing LaTeX and other issues.""" | |
# Import the problem data | |
# Define a simple schema that matches the general structure | |
problem_schema_fields = [ | |
{"name": "problem", "type": "str", "description": "Problem description"}, | |
{"name": "topic", "type": "str", "description": "Problem topic", "required": False}, | |
{"name": "answer", "type": "str", "description": "Problem answer"}, | |
{"name": "reasoning", "type": "str", "description": "Problem reasoning"}, | |
] | |
problem_schema = JSONParser.convert_to_schema(problem_schema_fields) | |
for i, text in enumerate(problem_data_list): | |
try: | |
# Use non-strict mode which better matches real-world usage | |
result = JSONParser.parse_llm_output(text, schema=problem_schema, strict=False, type_check=False) | |
assert result is not None, f"Case {i+1}: Parsing returned None" | |
assert isinstance(result, list), f"Case {i+1}: Result is not a list" | |
assert len(result) > 0, f"Case {i+1}: Result list is empty" | |
assert isinstance(result[0], dict), f"Case {i+1}: First item in result is not a dict" | |
except (JsonParserError, SchemaValidationError, json.JSONDecodeError) as e: | |
pytest.fail(f"Case {i+1}: Failed to parse problematic JSON. Error: {e}\\nInput text:\\n{text[:500]}...") # Show first 500 chars | |