Spaces:

brickfrog
/

ankigen

Running

File size: 27,290 Bytes

# Specialized judge agents for card quality assessment

import json
import asyncio
from typing import List, Dict, Any, Tuple, Optional
from datetime import datetime
from dataclasses import dataclass

from openai import AsyncOpenAI

from ankigen_core.logging import logger
from ankigen_core.models import Card
from .base import BaseAgentWrapper, AgentConfig
from .config import get_config_manager
from .schemas import JudgeDecisionSchema


@dataclass
class JudgeDecision:
    """Verdict produced by one judge agent for one card.

    ``improvements`` and ``metadata`` are optional at construction time.
    Whether they are omitted or passed explicitly as ``None``, they are
    replaced with a fresh empty list / dict once the instance is built, so
    consumers never have to guard against ``None``.
    """

    approved: bool  # True when the judge accepts the card
    score: float  # numeric score reported by the judge
    feedback: str  # free-text explanation of the verdict
    judge_name: str  # identifier of the judge that produced this decision
    improvements: Optional[List[str]] = None  # suggested changes, if any
    metadata: Optional[Dict[str, Any]] = None  # judge-specific extra details

    def __post_init__(self):
        # A None placeholder (rather than a mutable default) keeps instances
        # from sharing one container; each gets its own empty one here.
        for attr_name, factory in (("metadata", dict), ("improvements", list)):
            if getattr(self, attr_name) is None:
                setattr(self, attr_name, factory())


class ContentAccuracyJudge(BaseAgentWrapper):
    """Judge for factual accuracy and content correctness.

    This judge requires a ``content_accuracy_judge`` entry in the agent
    configuration (there is no built-in fallback config) and re-raises any
    failure from ``judge_card`` instead of returning a default decision.
    """

    def __init__(self, openai_client: AsyncOpenAI):
        """Load the judge configuration and enable structured output.

        Raises:
            ValueError: if no ``content_accuracy_judge`` config is registered.
        """
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("content_accuracy_judge")

        if not base_config:
            raise ValueError(
                "content_accuracy_judge configuration not found - agent system not properly initialized"
            )

        # Enable structured output for judge decisions.
        # NOTE(review): this mutates the object returned by the config manager;
        # if the manager hands out a shared instance the change is visible to
        # other users of that config - confirm this is intended.
        base_config.response_format = JudgeDecisionSchema

        super().__init__(base_config, openai_client)

    async def judge_card(
        self, card: Card, context: Optional[Dict[str, Any]] = None
    ) -> JudgeDecision:
        """Judge a card for content accuracy.

        Args:
            card: The card whose front question and back answer are evaluated.
            context: Accepted for interface compatibility; this implementation
                does not read it.

        Returns:
            The parsed ``JudgeDecision``.

        Raises:
            Exception: any error from execution or parsing is logged and
                re-raised unchanged.
        """
        try:
            user_input = f"""Evaluate this flashcard for factual accuracy:

Front: {card.front.question}
Back: {card.back.answer}

Assess:
1. Factual correctness
2. Completeness of information
3. Clarity and precision
4. Potential misconceptions

Provide a score (0-1) and detailed feedback."""

            response, usage = await self.execute(user_input)

            # Log usage information. Lookups are tolerant so that a usage dict
            # with missing or None counters can never fail the judgment itself.
            if usage and (usage.get("total_tokens") or 0) > 0:
                logger.info(
                    f"💰 Token Usage: {usage['total_tokens']} tokens (Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')})"
                )

            return self._parse_judge_response(response, "ContentAccuracyJudge")

        except Exception as e:
            logger.error(f"Content accuracy judgment failed: {e}")
            raise

    def _parse_judge_response(self, response: Any, judge_name: str) -> JudgeDecision:
        """Parse the judge response into a JudgeDecision and log it.

        Args:
            response: Either a JSON string (decoded with ``json.loads``) or an
                already-decoded mapping.
            judge_name: Label used only for the log header; the returned
                decision's ``judge_name`` comes from ``self.config.name``.

        Returns:
            The decision built by ``_parse_decision``.
        """
        decision_data = json.loads(response) if isinstance(response, str) else response
        decision = self._parse_decision(decision_data)

        # Enhanced logging for judge decisions
        logger.info(f"🎯 {judge_name.upper()} DECISION:")
        logger.info("   Card: [Card content]")
        logger.info(f"   ✅ Approved: {decision.approved}")
        logger.info(f"   📊 Score: {decision.score:.2f}")
        logger.info(f"   💭 Feedback: {decision.feedback}")

        # Optional detail lines are emitted only when the list is non-empty.
        if decision.metadata.get("factual_errors"):
            logger.info(f"   ❌ Factual Errors: {decision.metadata['factual_errors']}")
        if decision.metadata.get("terminology_issues"):
            logger.info(
                f"   ⚠️ Terminology Issues: {decision.metadata['terminology_issues']}"
            )
        if decision.improvements:
            logger.info(f"   🔧 Suggested Improvements: {decision.improvements}")

        logger.info(
            f"   🎯 Judge Confidence: {decision.metadata.get('confidence', 'N/A')}"
        )

        return decision

    def _parse_decision(self, decision_data: Dict[str, Any]) -> JudgeDecision:
        """Map the decoded response fields onto a JudgeDecision.

        Missing keys fall back to permissive defaults: approved, score 0.5,
        a placeholder feedback string and empty lists.
        """
        return JudgeDecision(
            approved=decision_data.get("approved", True),
            score=decision_data.get("accuracy_score", 0.5),
            feedback=decision_data.get("detailed_feedback", "No feedback provided"),
            improvements=decision_data.get("suggestions", []),
            judge_name=self.config.name,
            metadata={
                "factual_errors": decision_data.get("factual_errors", []),
                "terminology_issues": decision_data.get("terminology_issues", []),
                "misconceptions": decision_data.get("misconceptions", []),
                "confidence": decision_data.get("confidence", 0.5),
            },
        )


class PedagogicalJudge(BaseAgentWrapper):
    """Judge for educational effectiveness and pedagogical principles."""

    def __init__(self, openai_client: AsyncOpenAI):
        """Use the registered ``pedagogical_judge`` config, or a built-in default."""
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("pedagogical_judge")

        if not base_config:
            base_config = AgentConfig(
                name="pedagogical_judge",
                instructions="""You are an educational assessment specialist.
Evaluate flashcards for pedagogical effectiveness, learning objectives,
cognitive levels, and educational best practices.""",
                model="gpt-4.1",
                temperature=0.4,
            )

        super().__init__(base_config, openai_client)

    async def judge_card(self, card: Card) -> JudgeDecision:
        """Judge a single card for pedagogical effectiveness.

        Args:
            card: The card to evaluate.

        Returns:
            The parsed decision. This method does not raise: on any failure it
            logs the error and returns a fallback decision with
            ``approved=True`` and ``score=0.5``, so a failed judgment counts as
            an approving vote for anything that tallies ``approved``.
        """
        try:
            user_input = self._build_judgment_prompt(card)
            # Token usage is returned alongside the response but not used here.
            response, _usage = await self.execute(user_input)

            # The response may arrive as a JSON string or as a decoded mapping.
            decision_data = (
                json.loads(response) if isinstance(response, str) else response
            )
            decision = self._parse_decision(decision_data)

            # Enhanced logging for pedagogical judge decisions
            logger.info(f"🎓 {self.config.name.upper()} DECISION:")
            logger.info(f"   Card: {card.front.question[:80]}...")
            logger.info(f"   ✅ Approved: {decision.approved}")
            logger.info(f"   📊 Score: {decision.score:.2f}")
            logger.info(f"   💭 Feedback: {decision.feedback}")

            if decision.metadata and decision.metadata.get("cognitive_level"):
                logger.info(
                    f"   🧠 Cognitive Level: {decision.metadata['cognitive_level']}"
                )
            if decision.metadata and decision.metadata.get("pedagogical_issues"):
                logger.info(
                    f"   ⚠️ Pedagogical Issues: {decision.metadata['pedagogical_issues']}"
                )
            if decision.improvements:
                logger.info(f"   🔧 Suggested Improvements: {decision.improvements}")

            return decision

        except Exception as e:
            # Deliberate best-effort: never let one judge abort the pipeline.
            logger.error(f"PedagogicalJudge failed: {e}")
            return JudgeDecision(
                approved=True,
                score=0.5,
                feedback=f"Judgment failed: {str(e)}",
                judge_name=self.config.name,
            )

    def _build_judgment_prompt(self, card: Card) -> str:
        """Build the judgment prompt for pedagogical effectiveness.

        The prompt embeds the card's question, answer, explanation, example and
        ``metadata['difficulty']`` (``'Unknown'`` when absent) and asks for a
        JSON reply whose keys match those read by ``_parse_decision``.
        """
        return f"""Evaluate this flashcard for pedagogical effectiveness:

Card:
Question: {card.front.question}
Answer: {card.back.answer}
Explanation: {card.back.explanation}
Example: {card.back.example}
Difficulty: {card.metadata.get('difficulty', 'Unknown')}

Evaluate based on:
1. Learning Objectives: Clear, measurable learning goals?
2. Bloom's Taxonomy: Appropriate cognitive level?
3. Cognitive Load: Manageable information load?
4. Motivation: Engaging and relevant content?
5. Assessment: Valid testing of understanding vs memorization?

Return your assessment as JSON:
{{
    "approved": true/false,
    "pedagogical_score": 0.0-1.0,
    "cognitive_level": "remember|understand|apply|analyze|evaluate|create",
    "cognitive_load": "low|medium|high",
    "learning_objectives": ["objective1", "objective2"],
    "engagement_factors": ["factor1", "factor2"],
    "pedagogical_issues": ["issue1", "issue2"],
    "improvement_suggestions": ["suggestion1", "suggestion2"],
    "detailed_feedback": "Comprehensive pedagogical assessment"
}}"""

    def _parse_decision(self, decision_data: Dict[str, Any]) -> JudgeDecision:
        """Map the decoded response fields onto a JudgeDecision.

        Missing keys fall back to permissive defaults: approved, score 0.5,
        a placeholder feedback string and empty lists.
        """
        return JudgeDecision(
            approved=decision_data.get("approved", True),
            score=decision_data.get("pedagogical_score", 0.5),
            feedback=decision_data.get("detailed_feedback", "No feedback provided"),
            improvements=decision_data.get("improvement_suggestions", []),
            judge_name=self.config.name,
            metadata={
                "cognitive_level": decision_data.get("cognitive_level", "unknown"),
                "cognitive_load": decision_data.get("cognitive_load", "medium"),
                "learning_objectives": decision_data.get("learning_objectives", []),
                "engagement_factors": decision_data.get("engagement_factors", []),
                "pedagogical_issues": decision_data.get("pedagogical_issues", []),
            },
        )


class ClarityJudge(BaseAgentWrapper):
    """Judge for clarity, readability, and communication effectiveness."""

    def __init__(self, openai_client: AsyncOpenAI):
        """Use the registered ``clarity_judge`` config, or a built-in default."""
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("clarity_judge")

        if not base_config:
            base_config = AgentConfig(
                name="clarity_judge",
                instructions="""You are a communication and clarity specialist.
Ensure flashcards are clear, unambiguous, well-written, and accessible
to the target audience.""",
                model="gpt-4.1-mini",
                temperature=0.3,
            )

        super().__init__(base_config, openai_client)

    async def judge_card(self, card: Card) -> JudgeDecision:
        """Judge a single card for clarity and communication.

        Args:
            card: The card to evaluate.

        Returns:
            The parsed decision. This method does not raise: on any failure it
            logs the error and returns a fallback decision with
            ``approved=True`` and ``score=0.5``, so a failed judgment counts as
            an approving vote for anything that tallies ``approved``.
        """
        try:
            user_input = self._build_judgment_prompt(card)
            # Token usage is returned alongside the response but not used here.
            response, _usage = await self.execute(user_input)

            # The response may arrive as a JSON string or as a decoded mapping.
            decision_data = (
                json.loads(response) if isinstance(response, str) else response
            )
            decision = self._parse_decision(decision_data)

            # Enhanced logging for clarity judge decisions
            logger.info(f"✨ {self.config.name.upper()} DECISION:")
            logger.info(f"   Card: {card.front.question[:80]}...")
            logger.info(f"   ✅ Approved: {decision.approved}")
            logger.info(f"   📊 Score: {decision.score:.2f}")
            logger.info(f"   💭 Feedback: {decision.feedback}")

            if decision.metadata and decision.metadata.get("readability_level"):
                logger.info(
                    f"   📚 Readability: {decision.metadata['readability_level']}"
                )
            if decision.metadata and decision.metadata.get("ambiguities"):
                logger.info(f"   ❓ Ambiguities: {decision.metadata['ambiguities']}")
            if decision.improvements:
                logger.info(f"   🔧 Suggested Improvements: {decision.improvements}")

            return decision

        except Exception as e:
            # Deliberate best-effort: never let one judge abort the pipeline.
            logger.error(f"ClarityJudge failed: {e}")
            return JudgeDecision(
                approved=True,
                score=0.5,
                feedback=f"Judgment failed: {str(e)}",
                judge_name=self.config.name,
            )

    def _build_judgment_prompt(self, card: Card) -> str:
        """Build the judgment prompt for clarity assessment.

        The prompt embeds the card's question, answer, explanation and example
        and asks for a JSON reply whose keys match those read by
        ``_parse_decision``.
        """
        return f"""Evaluate this flashcard for clarity and communication effectiveness:

Card:
Question: {card.front.question}
Answer: {card.back.answer}
Explanation: {card.back.explanation}
Example: {card.back.example}

Evaluate for:
1. Question Clarity: Is the question clear and unambiguous?
2. Answer Completeness: Is the answer complete and coherent?
3. Language Level: Appropriate for target audience?
4. Readability: Easy to read and understand?
5. Structure: Well-organized and logical flow?

Return your assessment as JSON:
{{
    "approved": true/false,
    "clarity_score": 0.0-1.0,
    "question_clarity": 0.0-1.0,
    "answer_completeness": 0.0-1.0,
    "readability_level": "elementary|middle|high|college",
    "ambiguities": ["ambiguity1", "ambiguity2"],
    "clarity_issues": ["issue1", "issue2"],
    "improvement_suggestions": ["suggestion1", "suggestion2"],
    "detailed_feedback": "Comprehensive clarity assessment"
}}"""

    def _parse_decision(self, decision_data: Dict[str, Any]) -> JudgeDecision:
        """Map the decoded response fields onto a JudgeDecision.

        Missing keys fall back to permissive defaults: approved, score 0.5,
        a placeholder feedback string and empty lists.
        """
        return JudgeDecision(
            approved=decision_data.get("approved", True),
            score=decision_data.get("clarity_score", 0.5),
            feedback=decision_data.get("detailed_feedback", "No feedback provided"),
            improvements=decision_data.get("improvement_suggestions", []),
            judge_name=self.config.name,
            metadata={
                "question_clarity": decision_data.get("question_clarity", 0.5),
                "answer_completeness": decision_data.get("answer_completeness", 0.5),
                "readability_level": decision_data.get("readability_level", "unknown"),
                "ambiguities": decision_data.get("ambiguities", []),
                "clarity_issues": decision_data.get("clarity_issues", []),
            },
        )


class TechnicalJudge(BaseAgentWrapper):
    """Judge for technical accuracy in programming and technical content."""

    # Keywords are matched as substrings of lower-cased card text, so every
    # entry here MUST be lower case ("API"/"SQL" in upper case never matched).
    # NOTE: substring matching is intentionally loose; short entries such as
    # "api" or "java" also hit inside longer words (e.g. "javascript").
    _TECHNICAL_KEYWORDS = (
        "code",
        "programming",
        "algorithm",
        "function",
        "class",
        "method",
        "syntax",
        "api",
        "database",
        "sql",
        "python",
        "javascript",
        "java",
        "framework",
        "library",
        "development",
        "software",
        "technical",
    )

    def __init__(self, openai_client: AsyncOpenAI):
        """Use the registered ``technical_judge`` config, or a built-in default."""
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("technical_judge")

        if not base_config:
            base_config = AgentConfig(
                name="technical_judge",
                instructions="""You are a technical accuracy specialist for programming and technical content.
Verify code syntax, best practices, security considerations, and technical correctness.""",
                model="gpt-4.1",
                temperature=0.2,
            )

        super().__init__(base_config, openai_client)

    async def judge_card(self, card: Card) -> JudgeDecision:
        """Judge a single card for technical accuracy.

        Args:
            card: The card to evaluate.

        Returns:
            For cards that ``_is_technical_content`` rejects, an immediate
            approval with score 1.0 (no model call). Otherwise the parsed
            decision. This method does not raise: on any failure it logs the
            error and returns a fallback decision with ``approved=True`` and
            ``score=0.5``.
        """
        try:
            # Only judge technical content
            if not self._is_technical_content(card):
                return JudgeDecision(
                    approved=True,
                    score=1.0,
                    feedback="Non-technical content - no technical review needed",
                    judge_name=self.config.name,
                )

            user_input = self._build_judgment_prompt(card)
            # Token usage is returned alongside the response but not used here.
            response, _usage = await self.execute(user_input)

            # The response may arrive as a JSON string or as a decoded mapping.
            decision_data = (
                json.loads(response) if isinstance(response, str) else response
            )
            return self._parse_decision(decision_data)

        except Exception as e:
            # Deliberate best-effort: never let one judge abort the pipeline.
            logger.error(f"TechnicalJudge failed: {e}")
            return JudgeDecision(
                approved=True,
                score=0.5,
                feedback=f"Technical judgment failed: {str(e)}",
                judge_name=self.config.name,
            )

    def _is_technical_content(self, card: Card) -> bool:
        """Determine if card contains technical content requiring technical review.

        The check is a case-insensitive substring search for any entry of
        ``_TECHNICAL_KEYWORDS`` in the card's question, answer and explanation,
        or in ``metadata['subject']``. A missing or ``None`` metadata/subject is
        treated as an empty subject instead of raising.
        """
        content = (
            f"{card.front.question} {card.back.answer} {card.back.explanation}".lower()
        )
        # This method is also called from outside judge_card's try/except, so
        # it must not blow up on absent metadata or a None subject.
        metadata = card.metadata or {}
        subject = str(metadata.get("subject") or "").lower()

        return any(
            keyword in content or keyword in subject
            for keyword in self._TECHNICAL_KEYWORDS
        )

    def _build_judgment_prompt(self, card: Card) -> str:
        """Build the judgment prompt for technical accuracy.

        The prompt embeds the card's question, answer, explanation, example and
        ``metadata['subject']`` (``'Unknown'`` when absent) and asks for a JSON
        reply whose keys match those read by ``_parse_decision``.
        """
        return f"""Evaluate this technical flashcard for accuracy and best practices:

Card:
Question: {card.front.question}
Answer: {card.back.answer}
Explanation: {card.back.explanation}
Example: {card.back.example}
Subject: {card.metadata.get('subject', 'Unknown')}

Evaluate for:
1. Code Syntax: Is any code syntactically correct?
2. Best Practices: Does it follow established best practices?
3. Security: Are there security considerations addressed?
4. Performance: Are performance implications mentioned where relevant?
5. Tool Accuracy: Are tool/framework references accurate?

Return your assessment as JSON:
{{
    "approved": true/false,
    "technical_score": 0.0-1.0,
    "syntax_errors": ["error1", "error2"],
    "best_practice_violations": ["violation1", "violation2"],
    "security_issues": ["issue1", "issue2"],
    "performance_concerns": ["concern1", "concern2"],
    "tool_inaccuracies": ["inaccuracy1", "inaccuracy2"],
    "improvement_suggestions": ["suggestion1", "suggestion2"],
    "detailed_feedback": "Comprehensive technical assessment"
}}"""

    def _parse_decision(self, decision_data: Dict[str, Any]) -> JudgeDecision:
        """Map the decoded response fields onto a JudgeDecision.

        Missing keys fall back to permissive defaults: approved, score 0.5,
        a placeholder feedback string and empty lists.
        """
        return JudgeDecision(
            approved=decision_data.get("approved", True),
            score=decision_data.get("technical_score", 0.5),
            feedback=decision_data.get("detailed_feedback", "No feedback provided"),
            improvements=decision_data.get("improvement_suggestions", []),
            judge_name=self.config.name,
            metadata={
                "syntax_errors": decision_data.get("syntax_errors", []),
                "best_practice_violations": decision_data.get(
                    "best_practice_violations", []
                ),
                "security_issues": decision_data.get("security_issues", []),
                "performance_concerns": decision_data.get("performance_concerns", []),
                "tool_inaccuracies": decision_data.get("tool_inaccuracies", []),
            },
        )


class CompletenessJudge(BaseAgentWrapper):
    """Judge for completeness and quality standards."""

    def __init__(self, openai_client: AsyncOpenAI):
        """Use the registered ``completeness_judge`` config, or a built-in default."""
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("completeness_judge")

        if not base_config:
            base_config = AgentConfig(
                name="completeness_judge",
                instructions="""You are a completeness and quality assurance specialist.
Ensure flashcards meet all requirements, have complete information,
and maintain consistent quality standards.""",
                model="gpt-4.1-mini",
                temperature=0.3,
            )

        super().__init__(base_config, openai_client)

    async def judge_card(self, card: Card) -> JudgeDecision:
        """Judge a single card for completeness.

        Args:
            card: The card to evaluate.

        Returns:
            The parsed decision. This method does not raise: on any failure it
            logs the error and returns a fallback decision with
            ``approved=True`` and ``score=0.5``, so a failed judgment counts as
            an approving vote for anything that tallies ``approved``.
        """
        try:
            user_input = self._build_judgment_prompt(card)
            # Token usage is returned alongside the response but not used here.
            response, _usage = await self.execute(user_input)

            # The response may arrive as a JSON string or as a decoded mapping.
            decision_data = (
                json.loads(response) if isinstance(response, str) else response
            )
            return self._parse_decision(decision_data)

        except Exception as e:
            # Deliberate best-effort: never let one judge abort the pipeline.
            logger.error(f"CompletenessJudge failed: {e}")
            return JudgeDecision(
                approved=True,
                score=0.5,
                feedback=f"Completeness judgment failed: {str(e)}",
                judge_name=self.config.name,
            )

    def _build_judgment_prompt(self, card: Card) -> str:
        """Build the judgment prompt for completeness assessment.

        The prompt embeds the card's question, answer, explanation, example,
        card type and a JSON dump of its metadata, and asks for a JSON reply
        whose keys match those read by ``_parse_decision``.
        """
        # default=str is deliberate: the dump only feeds a prompt, and a
        # non-JSON value in the metadata (e.g. a datetime) should be shown as
        # text rather than raise and turn the judgment into a fallback approval.
        return f"""Evaluate this flashcard for completeness and quality standards:

Card:
Question: {card.front.question}
Answer: {card.back.answer}
Explanation: {card.back.explanation}
Example: {card.back.example}
Type: {card.card_type}
Metadata: {json.dumps(card.metadata, indent=2, default=str)}

Check for:
1. Required Fields: All necessary fields present and filled?
2. Metadata Completeness: Appropriate tags, categorization, difficulty?
3. Content Completeness: Answer, explanation, example present and sufficient?
4. Quality Standards: Consistent formatting and professional quality?
5. Example Relevance: Examples relevant and helpful?

Return your assessment as JSON:
{{
    "approved": true/false,
    "completeness_score": 0.0-1.0,
    "missing_fields": ["field1", "field2"],
    "incomplete_sections": ["section1", "section2"],
    "metadata_issues": ["issue1", "issue2"],
    "quality_concerns": ["concern1", "concern2"],
    "improvement_suggestions": ["suggestion1", "suggestion2"],
    "detailed_feedback": "Comprehensive completeness assessment"
}}"""

    def _parse_decision(self, decision_data: Dict[str, Any]) -> JudgeDecision:
        """Map the decoded response fields onto a JudgeDecision.

        Missing keys fall back to permissive defaults: approved, score 0.5,
        a placeholder feedback string and empty lists.
        """
        return JudgeDecision(
            approved=decision_data.get("approved", True),
            score=decision_data.get("completeness_score", 0.5),
            feedback=decision_data.get("detailed_feedback", "No feedback provided"),
            improvements=decision_data.get("improvement_suggestions", []),
            judge_name=self.config.name,
            metadata={
                "missing_fields": decision_data.get("missing_fields", []),
                "incomplete_sections": decision_data.get("incomplete_sections", []),
                "metadata_issues": decision_data.get("metadata_issues", []),
                "quality_concerns": decision_data.get("quality_concerns", []),
            },
        )


class JudgeCoordinator(BaseAgentWrapper):
    """Coordinates multiple judges and synthesizes their decisions."""

    def __init__(self, openai_client: AsyncOpenAI):
        """Load the coordinator config (or a default) and build every judge.

        All specialist judges are constructed with the same ``openai_client``.
        """
        config_manager = get_config_manager()
        base_config = config_manager.get_agent_config("judge_coordinator")

        if not base_config:
            base_config = AgentConfig(
                name="judge_coordinator",
                instructions="""You are the quality assurance coordinator.
Orchestrate the judging process and synthesize feedback from specialist judges.
Balance speed with thoroughness in quality assessment.""",
                model="gpt-4.1-mini",
                temperature=0.3,
            )

        super().__init__(base_config, openai_client)

        # Initialize specialist judges
        self.content_accuracy = ContentAccuracyJudge(openai_client)
        self.pedagogical = PedagogicalJudge(openai_client)
        self.clarity = ClarityJudge(openai_client)
        self.technical = TechnicalJudge(openai_client)
        self.completeness = CompletenessJudge(openai_client)

    async def coordinate_judgment(
        self,
        cards: List[Card],
        enable_parallel: bool = True,
        min_consensus: float = 0.6,
    ) -> List[Tuple[Card, List[JudgeDecision], bool]]:
        """Coordinate judgment of multiple cards.

        Args:
            cards: Cards to judge.
            enable_parallel: When true, all cards are judged concurrently with
                ``asyncio.gather``; otherwise one after another.
            min_consensus: Minimum fraction of approving judges required for a
                card to be approved.

        Returns:
            One ``(card, decisions, approved)`` tuple per input card, in input
            order. A card whose judgment failed is reported as
            ``(card, [], False)``.

        Raises:
            Exception: unexpected errors outside the per-card handling are
                logged and re-raised.
        """
        try:
            results: List[Tuple[Card, List[JudgeDecision], bool]] = []

            if enable_parallel:
                # Process all cards in parallel
                tasks = [self._judge_single_card(card, min_consensus) for card in cards]
                card_results = await asyncio.gather(*tasks, return_exceptions=True)

                for card, result in zip(cards, card_results):
                    # BaseException, not Exception: gather can hand back
                    # asyncio.CancelledError, which does not derive from
                    # Exception and would otherwise be stored as a "result".
                    if isinstance(result, BaseException):
                        logger.error(f"Parallel judgment failed for card: {result}")
                        results.append((card, [], False))
                    else:
                        results.append(result)
            else:
                # Process cards sequentially
                for card in cards:
                    try:
                        result = await self._judge_single_card(card, min_consensus)
                        results.append(result)
                    except Exception as e:
                        logger.error(f"Sequential judgment failed for card: {e}")
                        results.append((card, [], False))

            # Calculate summary statistics
            total_cards = len(cards)
            approved_cards = sum(1 for _, _, approved in results if approved)

            logger.info(
                f"Judge coordination complete: {approved_cards}/{total_cards} cards approved"
            )
            return results

        except Exception as e:
            logger.error(f"Judge coordination failed: {e}")
            raise

    async def _judge_single_card(
        self, card: Card, min_consensus: float
    ) -> Tuple[Card, List[JudgeDecision], bool]:
        """Judge a single card with all relevant judges.

        The accuracy, pedagogical, clarity and completeness judges always run;
        the technical judge is added only when it classifies the card as
        technical. All selected judges run concurrently.

        Returns:
            ``(card, valid_decisions, final_approval)`` where
            ``final_approval`` is true when the share of approving decisions is
            at least ``min_consensus``. If no judge produced a decision the
            result is ``(card, [], False)``.
        """

        # Determine which judges to use based on card content
        judges = [
            self.content_accuracy,
            self.pedagogical,
            self.clarity,
            self.completeness,
        ]

        # Add technical judge only for technical content
        if self.technical._is_technical_content(card):
            judges.append(self.technical)

        # Execute all judges in parallel
        judge_tasks = [judge.judge_card(card) for judge in judges]
        decisions = await asyncio.gather(*judge_tasks, return_exceptions=True)

        # Filter out failed decisions (anything that is not a JudgeDecision,
        # i.e. an exception object returned by gather).
        valid_decisions = []
        for decision in decisions:
            if isinstance(decision, JudgeDecision):
                valid_decisions.append(decision)
            else:
                logger.warning(f"Judge decision failed: {decision}")

        # Calculate consensus
        if not valid_decisions:
            return (card, [], False)

        approval_votes = sum(1 for d in valid_decisions if d.approved)
        consensus_score = approval_votes / len(valid_decisions)

        # Determine final approval based on consensus
        final_approval = consensus_score >= min_consensus

        # Enhanced logging for judge coordination
        logger.info("🏛️ JUDGE COORDINATION RESULT:")
        logger.info(f"   Card: {card.front.question[:80]}...")
        logger.info(f"   👥 Judges Consulted: {len(valid_decisions)}")
        logger.info(f"   ✅ Approval Votes: {approval_votes}/{len(valid_decisions)}")
        logger.info(
            f"   📊 Consensus Score: {consensus_score:.2f} (min: {min_consensus:.2f})"
        )
        logger.info(
            f"   🏆 Final Decision: {'APPROVED' if final_approval else 'REJECTED'}"
        )

        if not final_approval:
            logger.info("   📝 Rejection Reasons:")
            for decision in valid_decisions:
                if not decision.approved:
                    logger.info(
                        f"     • {decision.judge_name}: {decision.feedback[:100]}..."
                    )

        return (card, valid_decisions, final_approval)