In [1]:
# Import required packages
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display


In [None]:
# Read settings (such as API keys) from the local .env file into the process
# environment. override=True asks python-dotenv to let values from .env take
# precedence over variables that are already set in the environment.
load_dotenv(override=True)


In [3]:
# Build one SDK client per provider. No explicit credentials are passed, so
# each client falls back to its own default configuration.
openai, claude = OpenAI(), Anthropic()


In [None]:
# Ask a model to invent the benchmark question that every competitor will answer.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
messages = [dict(role="user", content=request)]

response = openai.chat.completions.create(messages=messages, model="gpt-4o-mini")

# Keep only the text of the first choice: that is the question itself.
question = response.choices[0].message.content
print(question)


In [None]:
# Put the generated question to each competitor model and collect the replies.
# `competitors` and `answers` are parallel lists: index i of one matches index i
# of the other.
competitors = []
answers = []
messages = [{"role": "user", "content": question}]


def _record_answer(model_name, text):
    """Store one model's reply in the parallel lists and render it as Markdown."""
    competitors.append(model_name)
    answers.append(text)
    display(Markdown(text))


# OpenAI: the reply text lives on the first choice's message.
response = openai.chat.completions.create(messages=messages, model="gpt-4o-mini")
answer = response.choices[0].message.content
_record_answer("gpt-4o-mini", answer)

# Claude: the reply text lives on the first content block; max_tokens caps the
# length of the generated answer.
response = claude.messages.create(
    max_tokens=1000,
    messages=messages,
    model="claude-3-7-sonnet-latest",
)
answer = response.content[0].text
_record_answer("claude-3-7-sonnet-latest", answer)


In [6]:
# NEW: Chain of Thought Evaluation
# First, let's create a detailed evaluation prompt that encourages step-by-step reasoning

# Build the "Response N (model): ..." section before the f-string. Backslashes
# inside an f-string replacement field are a SyntaxError before Python 3.12,
# and on 3.12+ the doubled backslashes ('\\n') would insert the two literal
# characters "\n" instead of real line breaks. Joining here with real newlines
# works on every Python 3 version and produces the intended layout.
# Responses are numbered from 1 so the numbers line up with the ranking the
# evaluator is asked to return.
responses_text = "\n\n".join(
    f"Response {number} ({name}):\n{text}"
    for number, (name, text) in enumerate(zip(competitors, answers), start=1)
)

evaluation_prompt = f"""You are an expert evaluator of AI responses. Your task is to analyze and rank the following responses to this question:

{question}

Please follow these steps in your evaluation:

1. For each response:
 - Identify the main arguments presented
 - Evaluate the clarity and coherence of the reasoning
 - Assess the depth and breadth of the analysis
 - Note any unique insights or perspectives

2. Compare the responses:
 - How do they differ in their approach?
 - Which response demonstrates the most sophisticated understanding?
 - Which response provides the most practical and actionable insights?

3. Provide your final ranking with detailed justification for each position.

Here are the responses:

{responses_text}

Please provide your evaluation in JSON format with the following structure:
{{
 "detailed_analysis": [
 {{"competitor": "name", "strengths": [], "weaknesses": [], "unique_aspects": []}},
 ...
 ],
 "comparative_analysis": "detailed comparison of responses",
 "final_ranking": ["ranked competitor numbers"],
 "justification": "detailed explanation of the ranking"
}}"""


In [None]:
# Get the detailed evaluation
evaluation_messages = [{"role": "user", "content": evaluation_prompt}]

# The reply is later parsed with json.loads, so ask the API for JSON mode:
# the model is then constrained to return a single valid JSON object rather
# than free text or a fenced code block. JSON mode requires the prompt itself
# to mention JSON, which evaluation_prompt does ("... in JSON format ...").
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=evaluation_messages,
    response_format={"type": "json_object"},
)

# Raw JSON text of the evaluator's verdict; printed for inspection.
detailed_evaluation = response.choices[0].message.content
print(detailed_evaluation)


In [None]:
# Parse and display the results in a more readable format


def _extract_json_object(text):
    """Parse the JSON object embedded in ``text``.

    The evaluator may wrap its JSON in Markdown code fences or surround it with
    prose, so everything outside the outermost ``{ ... }`` pair is discarded
    before parsing.

    Raises:
        ValueError: if ``text`` contains no ``{ ... }`` span, or if the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("No JSON object found in the evaluation response")
    return json.loads(text[start:end + 1])


def _resolve_competitor(rank, names):
    """Translate one ``final_ranking`` entry into a competitor name.

    The prompt asks for 1-based response numbers, but models sometimes answer
    with the model name instead. Both forms are accepted; anything that cannot
    be resolved is returned unchanged (as text) so the report still prints
    instead of crashing.
    """
    label = str(rank).strip()
    if label in names:
        return label
    try:
        position = int(rank)
    except (TypeError, ValueError):
        position = None
    # Explicit range check: a rank of 0 or a negative number must not wrap
    # around to the end of the list through negative indexing.
    if position is not None and 1 <= position <= len(names):
        return names[position - 1]
    # Last resort: a label that merely mentions a known name, e.g. "Response 1 (model)".
    for name in names:
        if name in label:
            return name
    return label


evaluation_dict = _extract_json_object(detailed_evaluation)

# Heading text paired with the JSON key holding that list for each competitor.
analysis_sections = (
    ("Strengths:", "strengths"),
    ("\nWeaknesses:", "weaknesses"),
    ("\nUnique Aspects:", "unique_aspects"),
)

print("Detailed Analysis:")
for analysis in evaluation_dict["detailed_analysis"]:
    print(f"\nCompetitor: {analysis['competitor']}")
    for heading, key in analysis_sections:
        print(heading)
        # A missing list is treated as empty rather than aborting the report.
        for item in analysis.get(key, []):
            print(f"- {item}")

print("\nComparative Analysis:")
print(evaluation_dict["comparative_analysis"])

print("\nFinal Ranking:")
for i, rank in enumerate(evaluation_dict["final_ranking"]):
    print(f"{i+1}. {_resolve_competitor(rank, competitors)}")

print("\nJustification:")
print(evaluation_dict["justification"])
