Spaces:

zwt963
/

paperindex

Running

App Files Files Community

DVampire committed 1 day ago

Commit

d3ff7fa

1 Parent(s): 26884bd

update database

Browse files

Files changed (6) hide show

app.py +12 -12
frontend/paper.html +1 -1
frontend/paper.js +1 -1
src/agents/evaluator.py +35 -35
src/agents/prompt.py +270 -133
workdir/paper_agent/papers_cache.db +3 -0

app.py CHANGED Viewed

@@ -428,19 +428,19 @@ async def get_paper_score(paper_id: str) -> Dict[str, Any]:
         evaluation_content = paper.get('evaluation_content')
         if evaluation_content:
             evaluation_json = json.loads(evaluation_content)
-            if 'scorecard' in evaluation_json:
-                scorecard = evaluation_json['scorecard']
                 values = [
-                    scorecard.get('task_formalization', 0),
-                    scorecard.get('data_resource_availability', 0),
-                    scorecard.get('input_output_complexity', 0),
-                    scorecard.get('real_world_interaction', 0),
-                    scorecard.get('existing_ai_coverage', 0),
-                    scorecard.get('human_originality', 0),
-                    scorecard.get('safety_ethics', 0),
-                    scorecard.get('technical_maturity_needed', 0),
-                    scorecard.get('three_year_feasibility_pct', 0) / 25,  # Convert percentage to 0-4 scale
-                    scorecard.get('overall_automatability', 0)
                 ]
                 valid_scores = [v for v in values if v > 0]
                 overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0

         evaluation_content = paper.get('evaluation_content')
         if evaluation_content:
             evaluation_json = json.loads(evaluation_content)
+            if 'scores' in evaluation_json:
+                scores = evaluation_json['scores']
                 values = [
+                    scores.get('task_formalization', 0),
+                    scores.get('data_resource_availability', 0),
+                    scores.get('input_output_complexity', 0),
+                    scores.get('real_world_interaction', 0),
+                    scores.get('existing_ai_coverage', 0),
+                    scores.get('human_originality', 0),
+                    scores.get('safety_ethics', 0),
+                    scores.get('technical_maturity_needed', 0),
+                    scores.get('three_year_feasibility_pct', 0) / 25,  # Convert percentage to 0-4 scale
+                    scores.get('overall_automatability', 0)
                 ]
                 valid_scores = [v for v in values if v > 0]
                 overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0

frontend/paper.html CHANGED Viewed

@@ -64,7 +64,7 @@
           <aside class="sidebar">
             <div class="scorecard-panel">
               <div class="panel-header">
-                <h2><i class="fas fa-radar"></i> Scorecard</h2>
                 <div class="overall-score" id="overallScore">
                   <span class="score-number">-</span>
                   <span class="score-label">Overall</span>

           <aside class="sidebar">
             <div class="scorecard-panel">
               <div class="panel-header">
+                <h2><i class="fas fa-radar"></i> Scores</h2>
                 <div class="overall-score" id="overallScore">
                   <span class="score-number">-</span>
                   <span class="score-label">Overall</span>

frontend/paper.js CHANGED Viewed

@@ -292,7 +292,7 @@ class PaperEvaluationRenderer {
     if (!radarEl) return;
     try {
-      const score = json.scorecard || {};
       const d = parseMaybeJSON(json.dimensions) || {};
       const labels = [

     if (!radarEl) return;
     try {
+      const score = json.scores || {};
       const d = parseMaybeJSON(json.dimensions) || {};
       const labels = [

src/agents/evaluator.py CHANGED Viewed

@@ -94,7 +94,7 @@ class Evaluator:
             # Call Anthropic API with tools (async)
             response = await self.client.messages.create(
                 model=config.model_id,
-                max_tokens=4000,
                 system=self.system_prompt,
                 messages=messages,
                 tools=TOOLS,
@@ -158,26 +158,26 @@ async def save_node(state: ConversationState) -> ConversationState:
         # Try to extract score and tags from tool_result if available
         if state.tool_result:
             try:
-                # Extract overall automatability score from scorecard
-                if 'scorecard' in state.tool_result and 'overall_automatability' in state.tool_result['scorecard']:
-                    evaluation_score = state.tool_result['scorecard']['overall_automatability']
-                # Extract overall score from scorecard
-                if 'scorecard' in state.tool_result and 'overall_automatability' in state.tool_result['scorecard']:
-                    overall_score = state.tool_result['scorecard']['overall_automatability']
-                # Create tags from key dimensions in scorecard
                 tags = []
-                if 'scorecard' in state.tool_result:
-                    scorecard = state.tool_result['scorecard']
-                    if 'three_year_feasibility_pct' in scorecard:
-                        tags.append(f"3yr_feasibility:{scorecard['three_year_feasibility_pct']}%")
-                    if 'task_formalization' in scorecard:
-                        tags.append(f"task_formalization:{scorecard['task_formalization']}/4")
-                    if 'data_resource_availability' in scorecard:
-                        tags.append(f"data_availability:{scorecard['data_resource_availability']}/4")
-                evaluation_tags = ",".join(tags) if tags else None
             except Exception as e:
                 logger.warning(f"Warning: Could not extract structured data from tool_result: {e}")
@@ -185,26 +185,26 @@ async def save_node(state: ConversationState) -> ConversationState:
             # Try to parse evaluation_content as JSON to extract structured data
             try:
                 evaluation_json = json.loads(evaluation_content)
-                # Extract overall automatability score from scorecard
-                if 'scorecard' in evaluation_json and 'overall_automatability' in evaluation_json['scorecard']:
-                    evaluation_score = evaluation_json['scorecard']['overall_automatability']
-                # Extract overall score from scorecard
-                if 'scorecard' in evaluation_json and 'overall_automatability' in evaluation_json['scorecard']:
-                    overall_score = evaluation_json['scorecard']['overall_automatability']
-                # Create tags from key dimensions in scorecard
                 tags = []
-                if 'scorecard' in evaluation_json:
-                    scorecard = evaluation_json['scorecard']
-                    if 'three_year_feasibility_pct' in scorecard:
-                        tags.append(f"3yr_feasibility:{scorecard['three_year_feasibility_pct']}%")
-                    if 'task_formalization' in scorecard:
-                        tags.append(f"task_formalization:{scorecard['task_formalization']}/4")
-                    if 'data_resource_availability' in scorecard:
-                        tags.append(f"data_availability:{scorecard['data_resource_availability']}/4")
-                evaluation_tags = ",".join(tags) if tags else None
             except Exception as e:
                 logger.warning(f"Warning: Could not parse evaluation_content as JSON: {e}")

             # Call Anthropic API with tools (async)
             response = await self.client.messages.create(
                 model=config.model_id,
+                max_tokens=10000,
                 system=self.system_prompt,
                 messages=messages,
                 tools=TOOLS,
         # Try to extract score and tags from tool_result if available
         if state.tool_result:
             try:
+                # Extract overall automatability score from scores
+                if 'scores' in state.tool_result and 'overall_automatability' in state.tool_result['scores']:
+                    evaluation_score = state.tool_result['scores']['overall_automatability']
+                # Extract overall score from scores
+                if 'scores' in state.tool_result and 'overall_automatability' in state.tool_result['scores']:
+                    overall_score = state.tool_result['scores']['overall_automatability']
+                # Create tags from key dimensions in scores
                 tags = []
+                if 'scores' in state.tool_result:
+                    scores = state.tool_result['scores']
+                    if 'three_year_feasibility_pct' in scores:
+                        tags.append(f"3yr_feasibility:{scores['three_year_feasibility_pct']}%")
+                    if 'task_formalization' in scores:
+                        tags.append(f"task_formalization:{scores['task_formalization']}/4")
+                    if 'data_resource_availability' in scores:
+                        tags.append(f"data_availability:{scores['data_resource_availability']}/4")
+                    evaluation_tags = ",".join(tags) if tags else None
             except Exception as e:
                 logger.warning(f"Warning: Could not extract structured data from tool_result: {e}")
             # Try to parse evaluation_content as JSON to extract structured data
             try:
                 evaluation_json = json.loads(evaluation_content)
+                # Extract overall automatability score from scores
+                if 'scores' in evaluation_json and 'overall_automatability' in evaluation_json['scores']:
+                    evaluation_score = evaluation_json['scores']['overall_automatability']
+                # Extract overall score from scores
+                if 'scores' in evaluation_json and 'overall_automatability' in evaluation_json['scores']:
+                    overall_score = evaluation_json['scores']['overall_automatability']
+                # Create tags from key dimensions in scores
                 tags = []
+                if 'scores' in evaluation_json:
+                    scores = evaluation_json['scores']
+                    if 'three_year_feasibility_pct' in scores:
+                        tags.append(f"3yr_feasibility:{scores['three_year_feasibility_pct']}%")
+                    if 'task_formalization' in scores:
+                        tags.append(f"task_formalization:{scores['task_formalization']}/4")
+                    if 'data_resource_availability' in scores:
+                        tags.append(f"data_availability:{scores['data_resource_availability']}/4")
+                    evaluation_tags = ",".join(tags) if tags else None
             except Exception as e:
                 logger.warning(f"Warning: Could not parse evaluation_content as JSON: {e}")

src/agents/prompt.py CHANGED Viewed

@@ -11,9 +11,15 @@ Maintain critical thinking and provide detailed justifications for each score. Y
 EVALUATION_PROMPT_TEMPLATE = """
 # Systematic AI Automation Assessment Framework
-Please conduct a comprehensive evaluation of the provided academic work using the following 12-dimensional framework. For each dimension, provide detailed analysis and justification for your scoring.
-## 12-Dimensional Evaluation Framework
 ### 1. **Task Formalization** (Score: 0-4)
 **What to Evaluate**: Whether the task has clear rules/mathematical objectives
@@ -141,91 +147,19 @@ Please conduct a comprehensive evaluation of the provided academic work using th
 **Analysis Required**: Synthesize all dimensions into overall assessment.
-## Output Format Requirements
-Please structure your response as follows:
-# AI Automation Assessment Report
-## Executive Summary
-[Provide a concise 150-word summary of key findings and overall assessment]
-## Detailed Dimensional Analysis
-### 1. Task Formalization
-**Score: X/4**
-[Detailed analysis and justification]
-### 2. Data & Resource Availability
-**Score: X/4**
-[Detailed analysis and justification]
-### 3. Input-Output Complexity
-**Score: X/4**
-[Detailed analysis and justification]
-### 4. Real-World Interaction
-**Score: X/4**
-[Detailed analysis and justification]
-### 5. Existing AI Coverage
-**Score: X/4**
-[Detailed analysis with specific tools/models and coverage percentage]
-### 6. Automation Barriers
-[Comprehensive list and explanation of key barriers]
-### 7. Human Originality/Irreplaceability
-**Score: X/4**
-[Detailed analysis and justification]
-### 8. Safety & Ethical Criticality
-**Score: X/4**
-[Detailed risk analysis and justification]
-### 9. Societal/Economic Impact
-[Comprehensive impact analysis]
-### 10. Technical Maturity Needed
-**Score: X/4**
-[Detailed analysis of required advances]
-### 11. 3-Year Feasibility
-**Probability: X%**
-[Detailed probability assessment with reasoning]
-### 12. Overall Automatability
-**Score: X/4**
-[Synthesis of all dimensions with final assessment]
-## Summary Scorecard
-| Dimension | Score | Key Insight |
-|-----------|-------|-------------|
-| Task Formalization | X/4 | [Brief insight] |
-| Data & Resource Availability | X/4 | [Brief insight] |
-| Input-Output Complexity | X/4 | [Brief insight] |
-| Real-World Interaction | X/4 | [Brief insight] |
-| Existing AI Coverage | X/4 | [Brief insight] |
-| Human Originality | X/4 | [Brief insight] |
-| Safety & Ethics | X/4 | [Brief insight] |
-| Technical Maturity | X/4 | [Brief insight] |
-| 3-Year Feasibility | X% | [Brief insight] |
-| **Overall Automatability** | **X/4** | **[Key conclusion]** |
-## Strategic Recommendations
 ### For Researchers
-[Specific recommendations for researchers in this field]
 ### For Institutions
-[Recommendations for research institutions and funding bodies]
 ### For AI Development
-[Recommendations for AI researchers and developers]
 ## Assessment Limitations and Uncertainties
-[List key limitations, assumptions, and areas of uncertainty in the assessment]
 ---
@@ -235,6 +169,8 @@ Please structure your response as follows:
 - Consider both current capabilities and realistic near-term developments
 - Justify all numerical scores with detailed reasoning
 - For qualitative dimensions, provide comprehensive analysis
 Now please begin the systematic evaluation of the provided academic work.
 """
@@ -244,79 +180,218 @@ Now please begin the systematic evaluation of the provided academic work.
 TOOLS = [
     {
         "name": "return_assessment",
-        "description": "Return the complete 12D AI automation assessment as a single JSON object.",
         "input_schema": {
             "type": "object",
             "properties": {
-                "executive_summary": {"type": "string"},
                 "dimensions": {
                     "type": "object",
                     "properties": {
                         "task_formalization": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "data_resource_availability": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "input_output_complexity": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "real_world_interaction": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "existing_ai_coverage": {
                             "type": "object",
                             "properties": {
-                                "score": {"type": "number"},
-                                "analysis": {"type": "string"},
-                                "tools_models": {"type": "array", "items": {"type": "string"}},
-                                "coverage_pct_estimate": {"type": "number"},
                             },
-                            "required": ["score", "analysis"],
                         },
                         "automation_barriers": {
                             "type": "object",
-                            "properties": {"analysis": {"type": "string"}},
-                            "required": ["analysis"],
                         },
                         "human_originality": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "safety_ethics": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "societal_economic_impact": {
                             "type": "object",
-                            "properties": {"analysis": {"type": "string"}},
-                            "required": ["analysis"],
                         },
                         "technical_maturity_needed": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
                         },
                         "three_year_feasibility": {
                             "type": "object",
-                            "properties": {"probability_pct": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["probability_pct", "analysis"],
                         },
                         "overall_automatability": {
                             "type": "object",
-                            "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
-                            "required": ["score", "analysis"],
-                        },
                     },
                     "required": [
                         "task_formalization",
@@ -330,22 +405,52 @@ TOOLS = [
                         "societal_economic_impact",
                         "technical_maturity_needed",
                         "three_year_feasibility",
-                        "overall_automatability",
-                    ],
                 },
-                "scorecard": {
                     "type": "object",
                     "properties": {
-                        "task_formalization": {"type": "number"},
-                        "data_resource_availability": {"type": "number"},
-                        "input_output_complexity": {"type": "number"},
-                        "real_world_interaction": {"type": "number"},
-                        "existing_ai_coverage": {"type": "number"},
-                        "human_originality": {"type": "number"},
-                        "safety_ethics": {"type": "number"},
-                        "technical_maturity_needed": {"type": "number"},
-                        "three_year_feasibility_pct": {"type": "number"},
-                        "overall_automatability": {"type": "number"},
                     },
                     "required": [
                         "task_formalization",
@@ -357,30 +462,62 @@ TOOLS = [
                         "safety_ethics",
                         "technical_maturity_needed",
                         "three_year_feasibility_pct",
-                        "overall_automatability",
-                    ],
                 },
                 "recommendations": {
                     "type": "object",
                     "properties": {
-                        "for_researchers": {"type": "array", "items": {"type": "string"}},
-                        "for_institutions": {"type": "array", "items": {"type": "string"}},
-                        "for_ai_development": {"type": "array", "items": {"type": "string"}},
                     },
-                    "required": ["for_researchers", "for_institutions", "for_ai_development"],
                 },
-                "limitations_uncertainties": {"type": "array", "items": {"type": "string"}},
             },
             "required": [
                 "executive_summary",
                 "dimensions",
-                "scorecard",
                 "recommendations",
-                "limitations_uncertainties",
             ],
             "additionalProperties": False,
-        },
     }
 ]
-TOOL_CHOICE = {"type": "tool", "name": "return_assessment"}

 EVALUATION_PROMPT_TEMPLATE = """
 # Systematic AI Automation Assessment Framework
+Please conduct a comprehensive evaluation of the provided academic work using the following 12-dimensional framework. Your output should be organized into five sections: executive_summary, dimensions, scores, recommendations, and limitations_uncertainties.
+IMPORTANT: Follow the exact JSON schema structure provided. The 'dimensions' section should contain detailed analysis objects with 'score' and 'analysis' fields. The 'scores' section should contain only the numerical scores as a flat object. Do not include dimension scores as top-level fields.
+## Executive Summary
+Please provide a concise 150-word summary of key findings and overall assessment.
+## 12-Dimensional Evaluation
 ### 1. **Task Formalization** (Score: 0-4)
 **What to Evaluate**: Whether the task has clear rules/mathematical objectives
 **Analysis Required**: Synthesize all dimensions into overall assessment.
+## Recommendations
 ### For Researchers
+Please provide specific recommendations for researchers in this field.
 ### For Institutions
+Please provide recommendations for research institutions and funding bodies.
 ### For AI Development
+Please provide recommendations for AI researchers and developers.
 ## Assessment Limitations and Uncertainties
+Please list any limitations or uncertainties in your assessment.
 ---
 - Consider both current capabilities and realistic near-term developments
 - Justify all numerical scores with detailed reasoning
 - For qualitative dimensions, provide comprehensive analysis
+- Please use `return_assessment` tool to return the complete AI automation assessment as a single JSON object.
+- Do not mention the tool in your response in order to avoid model hallucination.
 Now please begin the systematic evaluation of the provided academic work.
 """
 TOOLS = [
     {
         "name": "return_assessment",
+        "description": "Return the complete AI automation assessment as a single JSON object.",
         "input_schema": {
             "type": "object",
             "properties": {
+                "executive_summary": {
+                    "type": "string",
+                    "description": "A concise 150-word summary of key findings and overall assessment."
+                },
                 "dimensions": {
                     "type": "object",
+                    "description": "Detailed analysis of each dimension with scores and justifications.",
                     "properties": {
                         "task_formalization": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the task formalization dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the task formalization dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "data_resource_availability": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the data resource availability dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the data resource availability dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "input_output_complexity": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the input output complexity dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the input output complexity dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "real_world_interaction": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the real world interaction dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the real world interaction dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "existing_ai_coverage": {
                             "type": "object",
                             "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the existing AI coverage dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the existing AI coverage dimension, including the score and the justification for the score."
+                                },
+                                "tools_models": {
+                                    "type": "array",
+                                    "items": {
+                                        "type": "string"
+                                    }
+                                },
+                                "coverage_pct_estimate": {
+                                    "type": "number"
+                                }
                             },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "automation_barriers": {
                             "type": "object",
+                            "properties": {
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the automation barriers dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "analysis"
+                            ]
                         },
                         "human_originality": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the human originality dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the human originality dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "safety_ethics": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the safety and ethics dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the safety and ethics dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "societal_economic_impact": {
                             "type": "object",
+                            "properties": {
+                                "analysis": {
+                                    "type": "string"
+                                }
+                            },
+                            "required": [
+                                "analysis"
+                            ]
                         },
                         "technical_maturity_needed": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number"
+                                },
+                                "analysis": {
+                                    "type": "string"
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
                         },
                         "three_year_feasibility": {
                             "type": "object",
+                            "properties": {
+                                "probability_pct": {
+                                    "type": "number",
+                                    "description": "The probability of AI reaching expert level within 3 years, on a scale of 0-100%."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the three year feasibility dimension, including the probability and the justification for the probability."
+                                }
+                            },
+                            "required": [
+                                "probability_pct",
+                                "analysis"
+                            ]
                         },
                         "overall_automatability": {
                             "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "description": "The score for the overall automatability dimension, on a scale of 0-4."
+                                },
+                                "analysis": {
+                                    "type": "string",
+                                    "description": "A detailed analysis of the overall automatability dimension, including the score and the justification for the score."
+                                }
+                            },
+                            "required": [
+                                "score",
+                                "analysis"
+                            ]
+                        }
                     },
                     "required": [
                         "task_formalization",
                         "societal_economic_impact",
                         "technical_maturity_needed",
                         "three_year_feasibility",
+                        "overall_automatability"
+                    ]
                 },
+                "scores": {
                     "type": "object",
                     "properties": {
+                        "task_formalization": {
+                            "type": "number",
+                            "description": "The score for the task formalization dimension, on a scale of 0-4."
+                        },
+                        "data_resource_availability": {
+                            "type": "number",
+                            "description": "The score for the data resource availability dimension, on a scale of 0-4."
+                        },
+                        "input_output_complexity": {
+                            "type": "number",
+                            "description": "The score for the input output complexity dimension, on a scale of 0-4."
+                        },
+                        "real_world_interaction": {
+                            "type": "number",
+                            "description": "The score for the real world interaction dimension, on a scale of 0-4."
+                        },
+                        "existing_ai_coverage": {
+                            "type": "number",
+                            "description": "The score for the existing AI coverage dimension, on a scale of 0-4."
+                        },
+                        "human_originality": {
+                            "type": "number",
+                            "description": "The score for the human originality dimension, on a scale of 0-4."
+                        },
+                        "safety_ethics": {
+                            "type": "number",
+                            "description": "The score for the safety and ethics dimension, on a scale of 0-4."
+                        },
+                        "technical_maturity_needed": {
+                            "type": "number",
+                            "description": "The score for the technical maturity needed dimension, on a scale of 0-4."
+                        },
+                        "three_year_feasibility_pct": {
+                            "type": "number",
+                            "description": "The probability of AI reaching expert level within 3 years, on a scale of 0-100%."
+                        },
+                        "overall_automatability": {
+                            "type": "number",
+                            "description": "The score for the overall automatability dimension, on a scale of 0-4."
+                        }
                     },
                     "required": [
                         "task_formalization",
                         "safety_ethics",
                         "technical_maturity_needed",
                         "three_year_feasibility_pct",
+                        "overall_automatability"
+                    ]
                 },
                 "recommendations": {
                     "type": "object",
                     "properties": {
+                        "for_researchers": {
+                            "type": "array",
+                            "items": {
+                                "type": "string",
+                                "description": "A specific recommendation for researchers in this field."
+                            }
+                        },
+                        "for_institutions": {
+                            "type": "array",
+                            "items": {
+                                "type": "string",
+                                "description": "A recommendation for research institutions and funding bodies."
+                            }
+                        },
+                        "for_ai_development": {
+                            "type": "array",
+                            "items": {
+                                "type": "string",
+                                "description": "A recommendation for AI researchers and developers."
+                            }
+                        }
                     },
+                    "required": [
+                        "for_researchers",
+                        "for_institutions",
+                        "for_ai_development"
+                    ]
                 },
+                "limitations_uncertainties": {
+                    "type": "array",
+                    "items": {
+                        "type": "string",
+                        "description": "A limitation or uncertainty in the assessment."
+                    }
+                }
             },
             "required": [
                 "executive_summary",
                 "dimensions",
+                "scores",
                 "recommendations",
+                "limitations_uncertainties"
             ],
             "additionalProperties": False,
+            "description": "Complete evaluation output with executive summary, detailed dimensions analysis, numerical scores, recommendations, and limitations."
+        }
     }
 ]
+TOOL_CHOICE = {  # Tool-selection setting: a dict that names a single tool via "type" and "name".
+    "type": "tool",  # NOTE(review): presumably sent as the request's tool-choice option to force a tool call — verify against the caller.
+    "name": "return_assessment"  # NOTE(review): presumably must equal the "name" of the tool declared in the list above — confirm.
+}

workdir/paper_agent/papers_cache.db ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63089ac0211f69a8086daf1e54751e1a9cb67ca01a5a81b6765beb8dae6fe818
+size 282624