Spaces:

Observer04
/

career_convo

Sleeping

File size: 7,769 Bytes

6a9c9f9

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "vscode": {
     "languageId": "raw"
    }
   },
   "source": [
    "# Lab 2 Exercise - Extending the Patterns\n",
    "\n",
    "This notebook extends the original lab by adding the Chain of Thought pattern to enhance the evaluation process.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required packages\n",
    "import os\n",
    "import json\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "from anthropic import Anthropic\n",
    "from IPython.display import Markdown, display\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables from a .env file.\n",
    "# override=True: values from .env take precedence over variables that are\n",
    "# already set in the process environment.\n",
    "load_dotenv(override=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize API clients.\n",
    "# No credentials are passed explicitly, so each SDK falls back to its default\n",
    "# lookup (presumably OPENAI_API_KEY / ANTHROPIC_API_KEY from the environment\n",
    "# populated by load_dotenv - confirm your .env defines both).\n",
    "openai = OpenAI()\n",
    "claude = Anthropic()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask gpt-4o-mini to invent the question that every competitor model will answer\n",
    "request = (\n",
    "    \"Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. \"\n",
    "    \"Answer only with the question, no explanation.\"\n",
    ")\n",
    "messages = [{\"role\": \"user\", \"content\": request}]\n",
    "\n",
    "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
    "question = response.choices[0].message.content\n",
    "print(question)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the generated question to each competitor model and collect the answers.\n",
    "# competitors[i] names the model that produced answers[i].\n",
    "messages = [{\"role\": \"user\", \"content\": question}]\n",
    "competitors, answers = [], []\n",
    "\n",
    "# OpenAI\n",
    "model_name = \"gpt-4o-mini\"\n",
    "response = openai.chat.completions.create(model=model_name, messages=messages)\n",
    "answer = response.choices[0].message.content\n",
    "competitors.append(model_name)\n",
    "answers.append(answer)\n",
    "display(Markdown(answer))\n",
    "\n",
    "# Claude\n",
    "model_name = \"claude-3-7-sonnet-latest\"\n",
    "response = claude.messages.create(model=model_name, messages=messages, max_tokens=1000)\n",
    "answer = response.content[0].text\n",
    "competitors.append(model_name)\n",
    "answers.append(answer)\n",
    "display(Markdown(answer))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NEW: Chain of Thought Evaluation\n",
    "# Build a detailed evaluation prompt that encourages step-by-step reasoning.\n",
    "\n",
    "# Join the answers with real newlines *outside* the f-string. A join written inside a\n",
    "# replacement field needs backslashes there, which is a SyntaxError before Python 3.12,\n",
    "# and a doubled backslash in that position puts a literal \"\\n\" into the prompt\n",
    "# instead of a line break.\n",
    "responses_text = \"\\n\\n\".join(\n",
    "    f\"Response {number} ({competitor}):\\n{answer_text}\"\n",
    "    for number, (competitor, answer_text) in enumerate(zip(competitors, answers), start=1)\n",
    ")\n",
    "\n",
    "# Doubled braces ({{ }}) below are f-string escapes that render as single braces,\n",
    "# so the JSON skeleton reaches the model verbatim.\n",
    "evaluation_prompt = f\"\"\"You are an expert evaluator of AI responses. Your task is to analyze and rank the following responses to this question:\n",
    "\n",
    "{question}\n",
    "\n",
    "Please follow these steps in your evaluation:\n",
    "\n",
    "1. For each response:\n",
    "   - Identify the main arguments presented\n",
    "   - Evaluate the clarity and coherence of the reasoning\n",
    "   - Assess the depth and breadth of the analysis\n",
    "   - Note any unique insights or perspectives\n",
    "\n",
    "2. Compare the responses:\n",
    "   - How do they differ in their approach?\n",
    "   - Which response demonstrates the most sophisticated understanding?\n",
    "   - Which response provides the most practical and actionable insights?\n",
    "\n",
    "3. Provide your final ranking with detailed justification for each position.\n",
    "\n",
    "Here are the responses:\n",
    "\n",
    "{responses_text}\n",
    "\n",
    "Please provide your evaluation in JSON format with the following structure:\n",
    "{{\n",
    "    \"detailed_analysis\": [\n",
    "        {{\"competitor\": \"name\", \"strengths\": [], \"weaknesses\": [], \"unique_aspects\": []}},\n",
    "        ...\n",
    "    ],\n",
    "    \"comparative_analysis\": \"detailed comparison of responses\",\n",
    "    \"final_ranking\": [\"ranked competitor numbers\"],\n",
    "    \"justification\": \"detailed explanation of the ranking\"\n",
    "}}\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the detailed evaluation.\n",
    "# The reply is parsed with json.loads afterwards, so request JSON mode: the model then\n",
    "# returns a single JSON object instead of free text or a fenced Markdown block.\n",
    "# (JSON mode requires the word \"JSON\" to appear in the messages; the prompt contains it.)\n",
    "evaluation_messages = [{\"role\": \"user\", \"content\": evaluation_prompt}]\n",
    "\n",
    "response = openai.chat.completions.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    messages=evaluation_messages,\n",
    "    response_format={\"type\": \"json_object\"},\n",
    ")\n",
    "detailed_evaluation = response.choices[0].message.content\n",
    "print(detailed_evaluation)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse and display the results in a more readable format\n",
    "\n",
    "# Strip Markdown code-fence markers, then keep only the outermost {...} span so that\n",
    "# any prose the model adds before or after the JSON object does not break parsing.\n",
    "json_str = detailed_evaluation.replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n",
    "object_start, object_end = json_str.find(\"{\"), json_str.rfind(\"}\")\n",
    "if object_start != -1 and object_end > object_start:\n",
    "    json_str = json_str[object_start:object_end + 1]\n",
    "\n",
    "evaluation_dict = json.loads(json_str)\n",
    "\n",
    "\n",
    "def resolve_competitor(entry):\n",
    "    \"\"\"Return the competitor name for one entry of the final ranking.\n",
    "\n",
    "    The prompt asks for 1-based competitor numbers, but the evaluator may answer\n",
    "    with model names instead. Numbers inside the valid range are mapped to the\n",
    "    matching name; anything else is returned as text so it stays visible.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        position = int(entry)\n",
    "    except (TypeError, ValueError):\n",
    "        return str(entry)\n",
    "    # Range-check explicitly: competitors[position - 1] alone would silently wrap\n",
    "    # around to the end of the list for 0 or negative numbers.\n",
    "    if 1 <= position <= len(competitors):\n",
    "        return competitors[position - 1]\n",
    "    return str(entry)\n",
    "\n",
    "\n",
    "print(\"Detailed Analysis:\")\n",
    "for analysis in evaluation_dict[\"detailed_analysis\"]:\n",
    "    print(f\"\\nCompetitor: {analysis['competitor']}\")\n",
    "    # .get(..., []) tolerates an evaluator that omits one of the list fields.\n",
    "    print(\"Strengths:\")\n",
    "    for strength in analysis.get(\"strengths\", []):\n",
    "        print(f\"- {strength}\")\n",
    "    print(\"\\nWeaknesses:\")\n",
    "    for weakness in analysis.get(\"weaknesses\", []):\n",
    "        print(f\"- {weakness}\")\n",
    "    print(\"\\nUnique Aspects:\")\n",
    "    for aspect in analysis.get(\"unique_aspects\", []):\n",
    "        print(f\"- {aspect}\")\n",
    "\n",
    "print(\"\\nComparative Analysis:\")\n",
    "print(evaluation_dict[\"comparative_analysis\"])\n",
    "\n",
    "print(\"\\nFinal Ranking:\")\n",
    "for place, entry in enumerate(evaluation_dict[\"final_ranking\"], start=1):\n",
    "    print(f\"{place}. {resolve_competitor(entry)}\")\n",
    "\n",
    "print(\"\\nJustification:\")\n",
    "print(evaluation_dict[\"justification\"])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "vscode": {
     "languageId": "raw"
    }
   },
   "source": [
    "## Pattern Analysis\n",
    "\n",
    "This enhanced version uses several agentic design patterns:\n",
    "\n",
    "1. **Multi-agent Collaboration**: Sending the same question to multiple LLMs\n",
    "2. **Evaluation/Judgment Pattern**: Using one LLM to evaluate responses from others\n",
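    "3. **Parallel Processing**: Querying multiple models independently with the same prompt (this notebook issues the calls one after another, but they do not depend on each other and could run concurrently)\n",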
    "4. **Chain of Thought**: Added a structured, step-by-step evaluation process that breaks down the analysis into clear stages\n",
    "\n",
    "The Chain of Thought pattern is particularly valuable here because it:\n",
    "- Forces the evaluator to consider multiple aspects of each response\n",
    "- Provides more detailed and structured feedback\n",
    "- Makes the evaluation process more transparent and explainable\n",
    "- Helps identify specific strengths and weaknesses in each response\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}