{ "cells": [ { "cell_type": "raw", "metadata": { "vscode": { "languageId": "raw" } }, "source": [ "# Lab 2 Exercise - Extending the Patterns\n", "\n", "This notebook extends the original lab by adding the Chain of Thought pattern to enhance the evaluation process.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import required packages\n", "import os\n", "import json\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "from anthropic import Anthropic\n", "from IPython.display import Markdown, display\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load environment variables\n", "load_dotenv(override=True)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Initialize API clients\n", "openai = OpenAI()\n", "claude = Anthropic()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Original question generation\n", "request = \"Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. \"\n", "request += \"Answer only with the question, no explanation.\"\n", "messages = [{\"role\": \"user\", \"content\": request}]\n", "\n", "response = openai.chat.completions.create(\n", " model=\"gpt-4o-mini\",\n", " messages=messages,\n", ")\n", "question = response.choices[0].message.content\n", "print(question)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get responses from multiple models\n", "competitors = []\n", "answers = []\n", "messages = [{\"role\": \"user\", \"content\": question}]\n", "\n", "# OpenAI\n", "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n", "answer = response.choices[0].message.content\n", "competitors.append(\"gpt-4o-mini\")\n", "answers.append(answer)\n", "display(Markdown(answer))\n", "\n", "# Claude\n", "response = claude.messages.create(model=\"claude-3-7-sonnet-latest\", messages=messages, max_tokens=1000)\n", "answer = response.content[0].text\n", "competitors.append(\"claude-3-7-sonnet-latest\")\n", "answers.append(answer)\n", "display(Markdown(answer))\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# NEW: Chain of Thought Evaluation\n", "# First, let's create a detailed evaluation prompt that encourages step-by-step reasoning\n", "\n", "evaluation_prompt = f\"\"\"You are an expert evaluator of AI responses. Your task is to analyze and rank the following responses to this question:\n", "\n", "{question}\n", "\n", "Please follow these steps in your evaluation:\n", "\n", "1. For each response:\n", " - Identify the main arguments presented\n", " - Evaluate the clarity and coherence of the reasoning\n", " - Assess the depth and breadth of the analysis\n", " - Note any unique insights or perspectives\n", "\n", "2. Compare the responses:\n", " - How do they differ in their approach?\n", " - Which response demonstrates the most sophisticated understanding?\n", " - Which response provides the most practical and actionable insights?\n", "\n", "3. 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NEW: Chain of Thought Evaluation\n",
    "# First, let's create a detailed evaluation prompt that encourages step-by-step reasoning\n",
    "\n",
    "# Join the competitor responses outside the f-string so real newlines separate them\n",
    "responses_block = \"\\n\\n\".join(\n",
    "    f\"Response {i+1} ({competitors[i]}):\\n{answer}\"\n",
    "    for i, answer in enumerate(answers)\n",
    ")\n",
    "\n",
    "evaluation_prompt = f\"\"\"You are an expert evaluator of AI responses. Your task is to analyze and rank the following responses to this question:\n",
    "\n",
    "{question}\n",
    "\n",
    "Please follow these steps in your evaluation:\n",
    "\n",
    "1. For each response:\n",
    "   - Identify the main arguments presented\n",
    "   - Evaluate the clarity and coherence of the reasoning\n",
    "   - Assess the depth and breadth of the analysis\n",
    "   - Note any unique insights or perspectives\n",
    "\n",
    "2. Compare the responses:\n",
    "   - How do they differ in their approach?\n",
    "   - Which response demonstrates the most sophisticated understanding?\n",
    "   - Which response provides the most practical and actionable insights?\n",
    "\n",
    "3. Provide your final ranking with detailed justification for each position.\n",
    "\n",
    "Here are the responses:\n",
    "\n",
    "{responses_block}\n",
    "\n",
    "Please provide your evaluation in JSON format with the following structure:\n",
    "{{\n",
    "    \"detailed_analysis\": [\n",
    "        {{\"competitor\": \"name\", \"strengths\": [], \"weaknesses\": [], \"unique_aspects\": []}},\n",
    "        ...\n",
    "    ],\n",
    "    \"comparative_analysis\": \"detailed comparison of responses\",\n",
    "    \"final_ranking\": [\"response numbers as strings, best first\"],\n",
    "    \"justification\": \"detailed explanation of the ranking\"\n",
    "}}\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the detailed evaluation\n",
    "evaluation_messages = [{\"role\": \"user\", \"content\": evaluation_prompt}]\n",
    "\n",
    "response = openai.chat.completions.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    messages=evaluation_messages,\n",
    ")\n",
    "detailed_evaluation = response.choices[0].message.content\n",
    "print(detailed_evaluation)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse and display the results in a more readable format\n",
    "\n",
    "# Clean up the JSON string by removing markdown code block markers\n",
    "json_str = detailed_evaluation.replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n",
    "\n",
    "evaluation_dict = json.loads(json_str)\n",
    "\n",
    "print(\"Detailed Analysis:\")\n",
    "for analysis in evaluation_dict[\"detailed_analysis\"]:\n",
    "    print(f\"\\nCompetitor: {analysis['competitor']}\")\n",
    "    print(\"Strengths:\")\n",
    "    for strength in analysis['strengths']:\n",
    "        print(f\"- {strength}\")\n",
    "    print(\"\\nWeaknesses:\")\n",
    "    for weakness in analysis['weaknesses']:\n",
    "        print(f\"- {weakness}\")\n",
    "    print(\"\\nUnique Aspects:\")\n",
    "    for aspect in analysis['unique_aspects']:\n",
    "        print(f\"- {aspect}\")\n",
    "\n",
    "print(\"\\nComparative Analysis:\")\n",
    "print(evaluation_dict[\"comparative_analysis\"])\n",
    "\n",
    "print(\"\\nFinal Ranking:\")\n",
    "# final_ranking entries are 1-based response numbers; map them back to model names\n",
    "for i, rank in enumerate(evaluation_dict[\"final_ranking\"]):\n",
    "    print(f\"{i+1}. {competitors[int(rank)-1]}\")\n",
    "\n",
    "print(\"\\nJustification:\")\n",
    "print(evaluation_dict[\"justification\"])\n"
   ]
  },
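  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The string cleanup above can still fail if the evaluator wraps its JSON in extra prose. As a sketch, assuming a recent `openai` SDK, JSON mode (`response_format={\"type\": \"json_object\"}`) constrains the reply to a single parseable JSON object; it requires the prompt to mention JSON, which `evaluation_prompt` already does.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: re-run the evaluation with JSON mode enabled so the reply is a\n",
    "# single JSON object with no ``` fences to strip.\n",
    "# Assumes a recent openai SDK that supports response_format.\n",
    "response = openai.chat.completions.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    messages=evaluation_messages,\n",
    "    response_format={\"type\": \"json_object\"},\n",
    ")\n",
    "strict_evaluation = json.loads(response.choices[0].message.content)\n",
    "print(list(strict_evaluation.keys()))\n"
   ]
  },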
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pattern Analysis\n",
    "\n",
    "This enhanced version uses several agentic design patterns:\n",
    "\n",
    "1. **Multi-agent Collaboration**: Sending the same question to multiple LLMs\n",
    "2. **Evaluation/Judgment Pattern**: Using one LLM to evaluate responses from others\n",
    "3. **Parallel Processing**: The model calls are independent, so they can run concurrently (see the `ThreadPoolExecutor` sketch above), even though the main flow issues them sequentially\n",
    "4. **Chain of Thought**: Added a structured, step-by-step evaluation process that breaks down the analysis into clear stages\n",
    "\n",
    "The Chain of Thought pattern is particularly valuable here because it:\n",
    "\n",
    "- Forces the evaluator to consider multiple aspects of each response\n",
    "- Provides more detailed and structured feedback\n",
    "- Makes the evaluation process more transparent and explainable\n",
    "- Helps identify specific strengths and weaknesses in each response\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}