DVampire committed on
Commit
d3ff7fa
·
1 Parent(s): 26884bd

update database

Browse files
app.py CHANGED
@@ -428,19 +428,19 @@ async def get_paper_score(paper_id: str) -> Dict[str, Any]:
428
  evaluation_content = paper.get('evaluation_content')
429
  if evaluation_content:
430
  evaluation_json = json.loads(evaluation_content)
431
- if 'scorecard' in evaluation_json:
432
- scorecard = evaluation_json['scorecard']
433
  values = [
434
- scorecard.get('task_formalization', 0),
435
- scorecard.get('data_resource_availability', 0),
436
- scorecard.get('input_output_complexity', 0),
437
- scorecard.get('real_world_interaction', 0),
438
- scorecard.get('existing_ai_coverage', 0),
439
- scorecard.get('human_originality', 0),
440
- scorecard.get('safety_ethics', 0),
441
- scorecard.get('technical_maturity_needed', 0),
442
- scorecard.get('three_year_feasibility_pct', 0) / 25, # Convert percentage to 0-4 scale
443
- scorecard.get('overall_automatability', 0)
444
  ]
445
  valid_scores = [v for v in values if v > 0]
446
  overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0
 
428
  evaluation_content = paper.get('evaluation_content')
429
  if evaluation_content:
430
  evaluation_json = json.loads(evaluation_content)
431
+ if 'scores' in evaluation_json:
432
+ scores = evaluation_json['scores']
433
  values = [
434
+ scores.get('task_formalization', 0),
435
+ scores.get('data_resource_availability', 0),
436
+ scores.get('input_output_complexity', 0),
437
+ scores.get('real_world_interaction', 0),
438
+ scores.get('existing_ai_coverage', 0),
439
+ scores.get('human_originality', 0),
440
+ scores.get('safety_ethics', 0),
441
+ scores.get('technical_maturity_needed', 0),
442
+ scores.get('three_year_feasibility_pct', 0) / 25, # Convert percentage to 0-4 scale
443
+ scores.get('overall_automatability', 0)
444
  ]
445
  valid_scores = [v for v in values if v > 0]
446
  overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0
frontend/paper.html CHANGED
@@ -64,7 +64,7 @@
64
  <aside class="sidebar">
65
  <div class="scorecard-panel">
66
  <div class="panel-header">
67
- <h2><i class="fas fa-radar"></i> Scorecard</h2>
68
  <div class="overall-score" id="overallScore">
69
  <span class="score-number">-</span>
70
  <span class="score-label">Overall</span>
 
64
  <aside class="sidebar">
65
  <div class="scorecard-panel">
66
  <div class="panel-header">
67
+ <h2><i class="fas fa-radar"></i> Scores</h2>
68
  <div class="overall-score" id="overallScore">
69
  <span class="score-number">-</span>
70
  <span class="score-label">Overall</span>
frontend/paper.js CHANGED
@@ -292,7 +292,7 @@ class PaperEvaluationRenderer {
292
  if (!radarEl) return;
293
 
294
  try {
295
- const score = json.scorecard || {};
296
  const d = parseMaybeJSON(json.dimensions) || {};
297
 
298
  const labels = [
 
292
  if (!radarEl) return;
293
 
294
  try {
295
+ const score = json.scores || {};
296
  const d = parseMaybeJSON(json.dimensions) || {};
297
 
298
  const labels = [
src/agents/evaluator.py CHANGED
@@ -94,7 +94,7 @@ class Evaluator:
94
  # Call Anthropic API with tools (async)
95
  response = await self.client.messages.create(
96
  model=config.model_id,
97
- max_tokens=4000,
98
  system=self.system_prompt,
99
  messages=messages,
100
  tools=TOOLS,
@@ -158,26 +158,26 @@ async def save_node(state: ConversationState) -> ConversationState:
158
  # Try to extract score and tags from tool_result if available
159
  if state.tool_result:
160
  try:
161
- # Extract overall automatability score from scorecard
162
- if 'scorecard' in state.tool_result and 'overall_automatability' in state.tool_result['scorecard']:
163
- evaluation_score = state.tool_result['scorecard']['overall_automatability']
164
 
165
- # Extract overall score from scorecard
166
- if 'scorecard' in state.tool_result and 'overall_automatability' in state.tool_result['scorecard']:
167
- overall_score = state.tool_result['scorecard']['overall_automatability']
168
 
169
- # Create tags from key dimensions in scorecard
170
  tags = []
171
- if 'scorecard' in state.tool_result:
172
- scorecard = state.tool_result['scorecard']
173
- if 'three_year_feasibility_pct' in scorecard:
174
- tags.append(f"3yr_feasibility:{scorecard['three_year_feasibility_pct']}%")
175
- if 'task_formalization' in scorecard:
176
- tags.append(f"task_formalization:{scorecard['task_formalization']}/4")
177
- if 'data_resource_availability' in scorecard:
178
- tags.append(f"data_availability:{scorecard['data_resource_availability']}/4")
179
-
180
- evaluation_tags = ",".join(tags) if tags else None
181
 
182
  except Exception as e:
183
  logger.warning(f"Warning: Could not extract structured data from tool_result: {e}")
@@ -185,26 +185,26 @@ async def save_node(state: ConversationState) -> ConversationState:
185
  # Try to parse evaluation_content as JSON to extract structured data
186
  try:
187
  evaluation_json = json.loads(evaluation_content)
188
- # Extract overall automatability score from scorecard
189
- if 'scorecard' in evaluation_json and 'overall_automatability' in evaluation_json['scorecard']:
190
- evaluation_score = evaluation_json['scorecard']['overall_automatability']
191
 
192
- # Extract overall score from scorecard
193
- if 'scorecard' in evaluation_json and 'overall_automatability' in evaluation_json['scorecard']:
194
- overall_score = evaluation_json['scorecard']['overall_automatability']
195
 
196
- # Create tags from key dimensions in scorecard
197
  tags = []
198
- if 'scorecard' in evaluation_json:
199
- scorecard = evaluation_json['scorecard']
200
- if 'three_year_feasibility_pct' in scorecard:
201
- tags.append(f"3yr_feasibility:{scorecard['three_year_feasibility_pct']}%")
202
- if 'task_formalization' in scorecard:
203
- tags.append(f"task_formalization:{scorecard['task_formalization']}/4")
204
- if 'data_resource_availability' in scorecard:
205
- tags.append(f"data_availability:{scorecard['data_resource_availability']}/4")
206
-
207
- evaluation_tags = ",".join(tags) if tags else None
208
 
209
  except Exception as e:
210
  logger.warning(f"Warning: Could not parse evaluation_content as JSON: {e}")
 
94
  # Call Anthropic API with tools (async)
95
  response = await self.client.messages.create(
96
  model=config.model_id,
97
+ max_tokens=10000,
98
  system=self.system_prompt,
99
  messages=messages,
100
  tools=TOOLS,
 
158
  # Try to extract score and tags from tool_result if available
159
  if state.tool_result:
160
  try:
161
+ # Extract overall automatability score from scores
162
+ if 'scores' in state.tool_result and 'overall_automatability' in state.tool_result['scores']:
163
+ evaluation_score = state.tool_result['scores']['overall_automatability']
164
 
165
+ # Extract overall score from scores
166
+ if 'scores' in state.tool_result and 'overall_automatability' in state.tool_result['scores']:
167
+ overall_score = state.tool_result['scores']['overall_automatability']
168
 
169
+ # Create tags from key dimensions in scores
170
  tags = []
171
+ if 'scores' in state.tool_result:
172
+ scores = state.tool_result['scores']
173
+ if 'three_year_feasibility_pct' in scores:
174
+ tags.append(f"3yr_feasibility:{scores['three_year_feasibility_pct']}%")
175
+ if 'task_formalization' in scores:
176
+ tags.append(f"task_formalization:{scores['task_formalization']}/4")
177
+ if 'data_resource_availability' in scores:
178
+ tags.append(f"data_availability:{scores['data_resource_availability']}/4")
179
+
180
+ evaluation_tags = ",".join(tags) if tags else None
181
 
182
  except Exception as e:
183
  logger.warning(f"Warning: Could not extract structured data from tool_result: {e}")
 
185
  # Try to parse evaluation_content as JSON to extract structured data
186
  try:
187
  evaluation_json = json.loads(evaluation_content)
188
+ # Extract overall automatability score from scores
189
+ if 'scores' in evaluation_json and 'overall_automatability' in evaluation_json['scores']:
190
+ evaluation_score = evaluation_json['scores']['overall_automatability']
191
 
192
+ # Extract overall score from scores
193
+ if 'scores' in evaluation_json and 'overall_automatability' in evaluation_json['scores']:
194
+ overall_score = evaluation_json['scores']['overall_automatability']
195
 
196
+ # Create tags from key dimensions in scores
197
  tags = []
198
+ if 'scores' in evaluation_json:
199
+ scores = evaluation_json['scores']
200
+ if 'three_year_feasibility_pct' in scores:
201
+ tags.append(f"3yr_feasibility:{scores['three_year_feasibility_pct']}%")
202
+ if 'task_formalization' in scores:
203
+ tags.append(f"task_formalization:{scores['task_formalization']}/4")
204
+ if 'data_resource_availability' in scores:
205
+ tags.append(f"data_availability:{scores['data_resource_availability']}/4")
206
+
207
+ evaluation_tags = ",".join(tags) if tags else None
208
 
209
  except Exception as e:
210
  logger.warning(f"Warning: Could not parse evaluation_content as JSON: {e}")
src/agents/prompt.py CHANGED
@@ -11,9 +11,15 @@ Maintain critical thinking and provide detailed justifications for each score. Y
11
  EVALUATION_PROMPT_TEMPLATE = """
12
  # Systematic AI Automation Assessment Framework
13
 
14
- Please conduct a comprehensive evaluation of the provided academic work using the following 12-dimensional framework. For each dimension, provide detailed analysis and justification for your scoring.
15
 
16
- ## 12-Dimensional Evaluation Framework
 
 
 
 
 
 
17
 
18
  ### 1. **Task Formalization** (Score: 0-4)
19
  **What to Evaluate**: Whether the task has clear rules/mathematical objectives
@@ -141,91 +147,19 @@ Please conduct a comprehensive evaluation of the provided academic work using th
141
 
142
  **Analysis Required**: Synthesize all dimensions into overall assessment.
143
 
144
- ## Output Format Requirements
145
-
146
- Please structure your response as follows:
147
-
148
- # AI Automation Assessment Report
149
-
150
- ## Executive Summary
151
- [Provide a concise 150-word summary of key findings and overall assessment]
152
-
153
- ## Detailed Dimensional Analysis
154
-
155
- ### 1. Task Formalization
156
- **Score: X/4**
157
- [Detailed analysis and justification]
158
-
159
- ### 2. Data & Resource Availability
160
- **Score: X/4**
161
- [Detailed analysis and justification]
162
-
163
- ### 3. Input-Output Complexity
164
- **Score: X/4**
165
- [Detailed analysis and justification]
166
-
167
- ### 4. Real-World Interaction
168
- **Score: X/4**
169
- [Detailed analysis and justification]
170
-
171
- ### 5. Existing AI Coverage
172
- **Score: X/4**
173
- [Detailed analysis with specific tools/models and coverage percentage]
174
-
175
- ### 6. Automation Barriers
176
- [Comprehensive list and explanation of key barriers]
177
-
178
- ### 7. Human Originality/Irreplaceability
179
- **Score: X/4**
180
- [Detailed analysis and justification]
181
-
182
- ### 8. Safety & Ethical Criticality
183
- **Score: X/4**
184
- [Detailed risk analysis and justification]
185
-
186
- ### 9. Societal/Economic Impact
187
- [Comprehensive impact analysis]
188
-
189
- ### 10. Technical Maturity Needed
190
- **Score: X/4**
191
- [Detailed analysis of required advances]
192
-
193
- ### 11. 3-Year Feasibility
194
- **Probability: X%**
195
- [Detailed probability assessment with reasoning]
196
-
197
- ### 12. Overall Automatability
198
- **Score: X/4**
199
- [Synthesis of all dimensions with final assessment]
200
-
201
- ## Summary Scorecard
202
-
203
- | Dimension | Score | Key Insight |
204
- |-----------|-------|-------------|
205
- | Task Formalization | X/4 | [Brief insight] |
206
- | Data & Resource Availability | X/4 | [Brief insight] |
207
- | Input-Output Complexity | X/4 | [Brief insight] |
208
- | Real-World Interaction | X/4 | [Brief insight] |
209
- | Existing AI Coverage | X/4 | [Brief insight] |
210
- | Human Originality | X/4 | [Brief insight] |
211
- | Safety & Ethics | X/4 | [Brief insight] |
212
- | Technical Maturity | X/4 | [Brief insight] |
213
- | 3-Year Feasibility | X% | [Brief insight] |
214
- | **Overall Automatability** | **X/4** | **[Key conclusion]** |
215
-
216
- ## Strategic Recommendations
217
 
218
  ### For Researchers
219
- [Specific recommendations for researchers in this field]
220
 
221
  ### For Institutions
222
- [Recommendations for research institutions and funding bodies]
223
 
224
  ### For AI Development
225
- [Recommendations for AI researchers and developers]
226
 
227
  ## Assessment Limitations and Uncertainties
228
- [List key limitations, assumptions, and areas of uncertainty in the assessment]
229
 
230
  ---
231
 
@@ -235,6 +169,8 @@ Please structure your response as follows:
235
  - Consider both current capabilities and realistic near-term developments
236
  - Justify all numerical scores with detailed reasoning
237
  - For qualitative dimensions, provide comprehensive analysis
 
 
238
 
239
  Now please begin the systematic evaluation of the provided academic work.
240
  """
@@ -244,79 +180,218 @@ Now please begin the systematic evaluation of the provided academic work.
244
  TOOLS = [
245
  {
246
  "name": "return_assessment",
247
- "description": "Return the complete 12D AI automation assessment as a single JSON object.",
248
  "input_schema": {
249
  "type": "object",
250
  "properties": {
251
- "executive_summary": {"type": "string"},
 
 
 
252
  "dimensions": {
253
  "type": "object",
 
254
  "properties": {
255
  "task_formalization": {
256
  "type": "object",
257
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
258
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
259
  },
260
  "data_resource_availability": {
261
  "type": "object",
262
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
263
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
264
  },
265
  "input_output_complexity": {
266
  "type": "object",
267
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
268
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
269
  },
270
  "real_world_interaction": {
271
  "type": "object",
272
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
273
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
274
  },
275
  "existing_ai_coverage": {
276
  "type": "object",
277
  "properties": {
278
- "score": {"type": "number"},
279
- "analysis": {"type": "string"},
280
- "tools_models": {"type": "array", "items": {"type": "string"}},
281
- "coverage_pct_estimate": {"type": "number"},
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  },
283
- "required": ["score", "analysis"],
 
 
 
284
  },
285
  "automation_barriers": {
286
  "type": "object",
287
- "properties": {"analysis": {"type": "string"}},
288
- "required": ["analysis"],
 
 
 
 
 
 
 
289
  },
290
  "human_originality": {
291
  "type": "object",
292
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
293
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
294
  },
295
  "safety_ethics": {
296
  "type": "object",
297
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
298
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
299
  },
300
  "societal_economic_impact": {
301
  "type": "object",
302
- "properties": {"analysis": {"type": "string"}},
303
- "required": ["analysis"],
 
 
 
 
 
 
304
  },
305
  "technical_maturity_needed": {
306
  "type": "object",
307
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
308
- "required": ["score", "analysis"],
 
 
 
 
 
 
 
 
 
 
309
  },
310
  "three_year_feasibility": {
311
  "type": "object",
312
- "properties": {"probability_pct": {"type": "number"}, "analysis": {"type": "string"}},
313
- "required": ["probability_pct", "analysis"],
 
 
 
 
 
 
 
 
 
 
 
 
314
  },
315
  "overall_automatability": {
316
  "type": "object",
317
- "properties": {"score": {"type": "number"}, "analysis": {"type": "string"}},
318
- "required": ["score", "analysis"],
319
- },
 
 
 
 
 
 
 
 
 
 
 
 
320
  },
321
  "required": [
322
  "task_formalization",
@@ -330,22 +405,52 @@ TOOLS = [
330
  "societal_economic_impact",
331
  "technical_maturity_needed",
332
  "three_year_feasibility",
333
- "overall_automatability",
334
- ],
335
  },
336
- "scorecard": {
337
  "type": "object",
338
  "properties": {
339
- "task_formalization": {"type": "number"},
340
- "data_resource_availability": {"type": "number"},
341
- "input_output_complexity": {"type": "number"},
342
- "real_world_interaction": {"type": "number"},
343
- "existing_ai_coverage": {"type": "number"},
344
- "human_originality": {"type": "number"},
345
- "safety_ethics": {"type": "number"},
346
- "technical_maturity_needed": {"type": "number"},
347
- "three_year_feasibility_pct": {"type": "number"},
348
- "overall_automatability": {"type": "number"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  },
350
  "required": [
351
  "task_formalization",
@@ -357,30 +462,62 @@ TOOLS = [
357
  "safety_ethics",
358
  "technical_maturity_needed",
359
  "three_year_feasibility_pct",
360
- "overall_automatability",
361
- ],
362
  },
363
  "recommendations": {
364
  "type": "object",
365
  "properties": {
366
- "for_researchers": {"type": "array", "items": {"type": "string"}},
367
- "for_institutions": {"type": "array", "items": {"type": "string"}},
368
- "for_ai_development": {"type": "array", "items": {"type": "string"}},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  },
370
- "required": ["for_researchers", "for_institutions", "for_ai_development"],
 
 
 
 
371
  },
372
- "limitations_uncertainties": {"type": "array", "items": {"type": "string"}},
 
 
 
 
 
 
373
  },
374
  "required": [
375
  "executive_summary",
376
  "dimensions",
377
- "scorecard",
378
  "recommendations",
379
- "limitations_uncertainties",
380
  ],
381
  "additionalProperties": False,
382
- },
 
383
  }
384
  ]
385
 
386
- TOOL_CHOICE = {"type": "tool", "name": "return_assessment"}
 
 
 
 
11
  EVALUATION_PROMPT_TEMPLATE = """
12
  # Systematic AI Automation Assessment Framework
13
 
14
+ Please conduct a comprehensive evaluation of the provided academic work using the following 12-dimensional framework. Your output should be organized into five sections: executive_summary, dimensions, scores, recommendations, and limitations_uncertainties.
15
 
16
+ IMPORTANT: Follow the exact JSON schema structure provided. The 'dimensions' section should contain detailed analysis objects with 'score' and 'analysis' fields. The 'scores' section should contain only the numerical scores as a flat object. Do not include dimension scores as top-level fields.
17
+
18
+ ## Executive Summary
19
+
20
+ Please provide a concise 150-word summary of key findings and overall assessment.
21
+
22
+ ## 12-Dimensional Evaluation
23
 
24
  ### 1. **Task Formalization** (Score: 0-4)
25
  **What to Evaluate**: Whether the task has clear rules/mathematical objectives
 
147
 
148
  **Analysis Required**: Synthesize all dimensions into overall assessment.
149
 
150
+ ## Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  ### For Researchers
153
+ Please provide specific recommendations for researchers in this field.
154
 
155
  ### For Institutions
156
+ Please provide recommendations for research institutions and funding bodies.
157
 
158
  ### For AI Development
159
+ Please provide recommendations for AI researchers and developers.
160
 
161
  ## Assessment Limitations and Uncertainties
162
+ Please list any limitations or uncertainties in your assessment.
163
 
164
  ---
165
 
 
169
  - Consider both current capabilities and realistic near-term developments
170
  - Justify all numerical scores with detailed reasoning
171
  - For qualitative dimensions, provide comprehensive analysis
172
+ - Please use `return_assessment` tool to return the complete AI automation assessment as a single JSON object.
173
+ - Do not mention the tool in your response in order to avoid model hallucination.
174
 
175
  Now please begin the systematic evaluation of the provided academic work.
176
  """
 
180
  TOOLS = [
181
  {
182
  "name": "return_assessment",
183
+ "description": "Return the complete AI automation assessment as a single JSON object.",
184
  "input_schema": {
185
  "type": "object",
186
  "properties": {
187
+ "executive_summary": {
188
+ "type": "string",
189
+ "description": "A concise 150-word summary of key findings and overall assessment."
190
+ },
191
  "dimensions": {
192
  "type": "object",
193
+ "description": "Detailed analysis of each dimension with scores and justifications.",
194
  "properties": {
195
  "task_formalization": {
196
  "type": "object",
197
+ "properties": {
198
+ "score": {
199
+ "type": "number",
200
+ "description": "The score for the task formalization dimension, on a scale of 0-4."
201
+ },
202
+ "analysis": {
203
+ "type": "string",
204
+ "description": "A detailed analysis of the task formalization dimension, including the score and the justification for the score."
205
+ }
206
+ },
207
+ "required": [
208
+ "score",
209
+ "analysis"
210
+ ]
211
  },
212
  "data_resource_availability": {
213
  "type": "object",
214
+ "properties": {
215
+ "score": {
216
+ "type": "number",
217
+ "description": "The score for the data resource availability dimension, on a scale of 0-4."
218
+ },
219
+ "analysis": {
220
+ "type": "string",
221
+ "description": "A detailed analysis of the data resource availability dimension, including the score and the justification for the score."
222
+ }
223
+ },
224
+ "required": [
225
+ "score",
226
+ "analysis"
227
+ ]
228
  },
229
  "input_output_complexity": {
230
  "type": "object",
231
+ "properties": {
232
+ "score": {
233
+ "type": "number",
234
+ "description": "The score for the input output complexity dimension, on a scale of 0-4."
235
+ },
236
+ "analysis": {
237
+ "type": "string",
238
+ "description": "A detailed analysis of the input output complexity dimension, including the score and the justification for the score."
239
+ }
240
+ },
241
+ "required": [
242
+ "score",
243
+ "analysis"
244
+ ]
245
  },
246
  "real_world_interaction": {
247
  "type": "object",
248
+ "properties": {
249
+ "score": {
250
+ "type": "number",
251
+ "description": "The score for the real world interaction dimension, on a scale of 0-4."
252
+ },
253
+ "analysis": {
254
+ "type": "string",
255
+ "description": "A detailed analysis of the real world interaction dimension, including the score and the justification for the score."
256
+ }
257
+ },
258
+ "required": [
259
+ "score",
260
+ "analysis"
261
+ ]
262
  },
263
  "existing_ai_coverage": {
264
  "type": "object",
265
  "properties": {
266
+ "score": {
267
+ "type": "number",
268
+ "description": "The score for the existing AI coverage dimension, on a scale of 0-4."
269
+ },
270
+ "analysis": {
271
+ "type": "string",
272
+ "description": "A detailed analysis of the existing AI coverage dimension, including the score and the justification for the score."
273
+ },
274
+ "tools_models": {
275
+ "type": "array",
276
+ "items": {
277
+ "type": "string"
278
+ }
279
+ },
280
+ "coverage_pct_estimate": {
281
+ "type": "number"
282
+ }
283
  },
284
+ "required": [
285
+ "score",
286
+ "analysis"
287
+ ]
288
  },
289
  "automation_barriers": {
290
  "type": "object",
291
+ "properties": {
292
+ "analysis": {
293
+ "type": "string",
294
+ "description": "A detailed analysis of the automation barriers dimension, listing and explaining the key barriers."
295
+ }
296
+ },
297
+ "required": [
298
+ "analysis"
299
+ ]
300
  },
301
  "human_originality": {
302
  "type": "object",
303
+ "properties": {
304
+ "score": {
305
+ "type": "number",
306
+ "description": "The score for the human originality dimension, on a scale of 0-4."
307
+ },
308
+ "analysis": {
309
+ "type": "string",
310
+ "description": "A detailed analysis of the human originality dimension, including the score and the justification for the score."
311
+ }
312
+ },
313
+ "required": [
314
+ "score",
315
+ "analysis"
316
+ ]
317
  },
318
  "safety_ethics": {
319
  "type": "object",
320
+ "properties": {
321
+ "score": {
322
+ "type": "number",
323
+ "description": "The score for the safety and ethics dimension, on a scale of 0-4."
324
+ },
325
+ "analysis": {
326
+ "type": "string",
327
+ "description": "A detailed analysis of the safety and ethics dimension, including the score and the justification for the score."
328
+ }
329
+ },
330
+ "required": [
331
+ "score",
332
+ "analysis"
333
+ ]
334
  },
335
  "societal_economic_impact": {
336
  "type": "object",
337
+ "properties": {
338
+ "analysis": {
339
+ "type": "string"
340
+ }
341
+ },
342
+ "required": [
343
+ "analysis"
344
+ ]
345
  },
346
  "technical_maturity_needed": {
347
  "type": "object",
348
+ "properties": {
349
+ "score": {
350
+ "type": "number"
351
+ },
352
+ "analysis": {
353
+ "type": "string"
354
+ }
355
+ },
356
+ "required": [
357
+ "score",
358
+ "analysis"
359
+ ]
360
  },
361
  "three_year_feasibility": {
362
  "type": "object",
363
+ "properties": {
364
+ "probability_pct": {
365
+ "type": "number",
366
+ "description": "The probability of AI reaching expert level within 3 years, on a scale of 0-100%."
367
+ },
368
+ "analysis": {
369
+ "type": "string",
370
+ "description": "A detailed analysis of the three year feasibility dimension, including the probability and the justification for the probability."
371
+ }
372
+ },
373
+ "required": [
374
+ "probability_pct",
375
+ "analysis"
376
+ ]
377
  },
378
  "overall_automatability": {
379
  "type": "object",
380
+ "properties": {
381
+ "score": {
382
+ "type": "number",
383
+ "description": "The score for the overall automatability dimension, on a scale of 0-4."
384
+ },
385
+ "analysis": {
386
+ "type": "string",
387
+ "description": "A detailed analysis of the overall automatability dimension, including the score and the justification for the score."
388
+ }
389
+ },
390
+ "required": [
391
+ "score",
392
+ "analysis"
393
+ ]
394
+ }
395
  },
396
  "required": [
397
  "task_formalization",
 
405
  "societal_economic_impact",
406
  "technical_maturity_needed",
407
  "three_year_feasibility",
408
+ "overall_automatability"
409
+ ]
410
  },
411
+ "scores": {
412
  "type": "object",
413
  "properties": {
414
+ "task_formalization": {
415
+ "type": "number",
416
+ "description": "The score for the task formalization dimension, on a scale of 0-4."
417
+ },
418
+ "data_resource_availability": {
419
+ "type": "number",
420
+ "description": "The score for the data resource availability dimension, on a scale of 0-4."
421
+ },
422
+ "input_output_complexity": {
423
+ "type": "number",
424
+ "description": "The score for the input output complexity dimension, on a scale of 0-4."
425
+ },
426
+ "real_world_interaction": {
427
+ "type": "number",
428
+ "description": "The score for the real world interaction dimension, on a scale of 0-4."
429
+ },
430
+ "existing_ai_coverage": {
431
+ "type": "number",
432
+ "description": "The score for the existing AI coverage dimension, on a scale of 0-4."
433
+ },
434
+ "human_originality": {
435
+ "type": "number",
436
+ "description": "The score for the human originality dimension, on a scale of 0-4."
437
+ },
438
+ "safety_ethics": {
439
+ "type": "number",
440
+ "description": "The score for the safety and ethics dimension, on a scale of 0-4."
441
+ },
442
+ "technical_maturity_needed": {
443
+ "type": "number",
444
+ "description": "The score for the technical maturity needed dimension, on a scale of 0-4."
445
+ },
446
+ "three_year_feasibility_pct": {
447
+ "type": "number",
448
+ "description": "The probability of AI reaching expert level within 3 years, on a scale of 0-100%."
449
+ },
450
+ "overall_automatability": {
451
+ "type": "number",
452
+ "description": "The score for the overall automatability dimension, on a scale of 0-4."
453
+ }
454
  },
455
  "required": [
456
  "task_formalization",
 
462
  "safety_ethics",
463
  "technical_maturity_needed",
464
  "three_year_feasibility_pct",
465
+ "overall_automatability"
466
+ ]
467
  },
468
  "recommendations": {
469
  "type": "object",
470
  "properties": {
471
+ "for_researchers": {
472
+ "type": "array",
473
+ "items": {
474
+ "type": "string",
475
+ "description": "A specific recommendation for researchers in this field."
476
+ }
477
+ },
478
+ "for_institutions": {
479
+ "type": "array",
480
+ "items": {
481
+ "type": "string",
482
+ "description": "A recommendation for research institutions and funding bodies."
483
+ }
484
+ },
485
+ "for_ai_development": {
486
+ "type": "array",
487
+ "items": {
488
+ "type": "string",
489
+ "description": "A recommendation for AI researchers and developers."
490
+ }
491
+ }
492
  },
493
+ "required": [
494
+ "for_researchers",
495
+ "for_institutions",
496
+ "for_ai_development"
497
+ ]
498
  },
499
+ "limitations_uncertainties": {
500
+ "type": "array",
501
+ "items": {
502
+ "type": "string",
503
+ "description": "A limitation or uncertainty in the assessment."
504
+ }
505
+ }
506
  },
507
  "required": [
508
  "executive_summary",
509
  "dimensions",
510
+ "scores",
511
  "recommendations",
512
+ "limitations_uncertainties"
513
  ],
514
  "additionalProperties": False,
515
+ "description": "Complete evaluation output with executive summary, detailed dimensions analysis, numerical scores, recommendations, and limitations."
516
+ }
517
  }
518
  ]
519
 
520
+ TOOL_CHOICE = {
521
+ "type": "tool",
522
+ "name": "return_assessment"
523
+ }
workdir/paper_agent/papers_cache.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63089ac0211f69a8086daf1e54751e1a9cb67ca01a5a81b6765beb8dae6fe818
3
+ size 282624