import {z} from 'zod';
import {GenerateObjectResult} from 'ai';
import {TokenTracker} from "../utils/token-tracker";
import {AnswerAction, EvaluationCriteria, EvaluationResponse, EvaluationType} from '../types';
import {readUrl, removeAllLineBreaks} from "./read";
import {ObjectGeneratorSafe} from "../utils/safe-generator";

const baseSchema = {
  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
  think: z.string().describe('Explanation of the thought process behind why the answer does or does not pass the evaluation criteria')
};

const definitiveSchema = z.object({
  ...baseSchema,
  type: z.literal('definitive')
});

const freshnessSchema = z.object({
  ...baseSchema,
  type: z.literal('freshness'),
  freshness_analysis: z.object({
    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
    current_time: z.string().describe('Current system time when evaluation was performed'),
    max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
  })
});

const pluralitySchema = z.object({
  ...baseSchema,
  type: z.literal('plurality'),
  plurality_analysis: z.object({
    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
    count_expected: z.number().optional().describe('Number of items expected if specified in question'),
    count_provided: z.number().describe('Number of items provided in answer')
  })
});

const attributionSchema = z.object({
  ...baseSchema,
  type: z.literal('attribution'),
  attribution_analysis: z.object({
    sources_provided: z.boolean().describe('Whether the answer provides source references'),
    sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'),
    quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content')
  })
});

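// Illustrative only (not produced anywhere in this module): a model response
// that satisfies attributionSchema would look roughly like the following.
// The field values here are hypothetical.
//
//   {
//     pass: false,
//     think: "The cited pages do not mention the quoted figure.",
//     type: "attribution",
//     attribution_analysis: {
//       sources_provided: true,
//       sources_verified: false,
//       quotes_accurate: false
//     }
//   }
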
function getAttributionPrompt(question: string, answer: string, sourceContent: string): string {
  return `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided sources.

<rules>
1. Source Verification:
   - Check if answer claims are supported by the provided source content
   - Verify that quotes are accurate and in proper context
   - Ensure numerical data and statistics match the source
   - Flag any claims that go beyond what the sources support

2. Attribution Analysis:
   - Check if answer properly references its sources
   - Verify that important claims have clear source attribution
   - Ensure quotes are properly marked and cited
   - Check for any unsupported generalizations

3. Accuracy Requirements:
   - Direct quotes must match source exactly
   - Paraphrasing must maintain original meaning
   - Statistics and numbers must be precise
   - Context must be preserved
</rules>

<examples>
Question: "What are Jina AI's main products?"
Answer: "According to Jina AI's website, their main products are DocArray and Jina Framework."
Source Content: "Jina AI's flagship products include DocArray, Jina Framework, and JCloud, offering a complete ecosystem for neural search applications."
Evaluation: {
  "pass": false,
  "think": "The answer omits JCloud which is mentioned as a main product in the source. The information provided is incomplete and potentially misleading as it fails to mention a significant product from the company's ecosystem.",
  "attribution_analysis": {
    "sources_provided": true,
    "sources_verified": false,
    "quotes_accurate": false
  }
}

Question: "When was Python first released?"
Answer: "Python was first released in 1991 by Guido van Rossum."
Source Content: "Python was first released in 1991 by Guido van Rossum while working at CWI."
Evaluation: {
  "pass": true,
  "think": "The answer accurately reflects the core information from the source about Python's release date and creator, though it omits the additional context about CWI which isn't essential to the question.",
  "attribution_analysis": {
    "sources_provided": true,
    "sources_verified": true,
    "quotes_accurate": true
  }
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}
Source Content: ${JSON.stringify(sourceContent)}`;
}

function getDefinitivePrompt(question: string, answer: string): string {
  return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.

<rules>
First, if the answer is not a direct response to the question, it must return false.
Definitiveness is king! The following types of responses are NOT definitive and must return false:
1. Expressions of uncertainty: "I don't know", "not sure", "might be", "probably"
2. Lack of information statements: "doesn't exist", "lack of information", "could not find"
3. Inability statements: "I cannot provide", "I am unable to", "we cannot"
4. Negative statements that redirect: "However, you can...", "Instead, try..."
5. Non-answers that suggest alternatives
</rules>

<examples>
Question: "What are the system requirements for running Python 3.9?"
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
Evaluation: {
  "pass": false,
  "think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
}

Question: "What are the system requirements for running Python 3.9?"
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
Evaluation: {
  "pass": true,
  "think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
}

Question: "Who will be the president of the United States in 2032?"
Answer: "I cannot predict the future, it depends on the election results."
Evaluation: {
  "pass": false,
  "think": "The answer contains a statement of inability to predict the future, making it non-definitive."
}

Question: "Who is the sales director at Company X?"
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at [email protected]"
Evaluation: {
  "pass": false,
  "think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
}

Question: "what is the twitter account of jina ai's founder?"
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
Evaluation: {
  "pass": false,
  "think": "The answer indicates a lack of information rather than providing a definitive response."
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}

function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.

<rules>
1. Date Analysis:
   - Extract all dates mentioned in the answer
   - Compare against current system time: ${currentTime}
   - Consider content outdated if:
     * It refers to a "latest" or "current" state from more than 30 days ago
     * It mentions specific dates/events that have been superseded
     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
   - For product versions, releases, or announcements, max age is 30 days
   - For company positions, leadership, or general facts, max age is 60 days

2. Context Hints:
   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
   - Time-sensitive terms: "CEO", "price", "version", "release"
   - Future dates should be ignored in the outdated calculation
</rules>

<examples>
Question: "What is Jina AI's latest embedding model?"
Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
Current Time: "2024-10-06T00:00:00Z"
Evaluation: {
  "pass": false,
  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
  "freshness_analysis": {
    "likely_outdated": true,
    "dates_mentioned": ["2024-03-15"],
    "current_time": "2024-10-06T00:00:00Z",
    "max_age_days": 30
  }
}

Question: "Who is OpenAI's CEO?"
Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
Current Time: "2024-02-06T00:00:00Z"
Evaluation: {
  "pass": true,
  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
  "freshness_analysis": {
    "likely_outdated": false,
    "dates_mentioned": ["2023-12"],
    "current_time": "2024-02-06T00:00:00Z",
    "max_age_days": 60
  }
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}

function getPluralityPrompt(question: string, answer: string): string {
  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.

<rules>
1. Question Analysis:
   - Check if question asks for multiple items using indicators like:
     * Plural nouns: "companies", "people", "names"
     * Quantifiers: "all", "many", "several", "various", "multiple"
     * List requests: "list", "enumerate", "name all", "give me all"
     * Numbers: "5 examples", "top 10"
   - Otherwise, skip the analysis and set pass to true

2. Answer Analysis:
   - Count distinct items provided in the answer
   - Check if answer uses limiting words like "only", "just", "single"
   - Identify if answer acknowledges there are more items but only provides some

3. Definitiveness Rules:
   - If question asks for multiple items but answer provides only one → NOT definitive
   - If question asks for a specific number (e.g., "top 5") but answer provides fewer → NOT definitive
   - If answer clearly states it's providing a partial list → NOT definitive
   - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
</rules>

<examples>
Question: "Who works in Jina AI's sales team?"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
  "pass": true,
  "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
  "plurality_analysis": {
    "expects_multiple": false,
    "provides_multiple": false,
    "count_provided": 1
  }
}

Question: "List all the salespeople who work at Jina AI"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
  "pass": false,
  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": false,
    "count_provided": 1
  }
}

Question: "Name the top 3 products sold by Jina AI"
Answer: "Jina AI's product lineup includes DocArray and Jina."
Evaluation: {
  "pass": false,
  "think": "The question asks for the top 3 products but only 2 are provided.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_expected": 3,
    "count_provided": 2
  }
}

Question: "List as many AI companies in Berlin as you can find"
Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
Evaluation: {
  "pass": false,
  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_provided": 5
  }
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}

const questionEvaluationSchema = z.object({
  needsFreshness: z.boolean().describe('Whether the question requires freshness check'),
  needsPlurality: z.boolean().describe('Whether the question requires plurality check'),
  reasoning: z.string().describe('Explanation of why these checks are needed or not needed'),
  languageStyle: z.string().describe('The language being used and the overall vibe/mood of the question'),
});

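// Illustrative only: for a question like "What are the latest top 3 embedding
// models?", the evaluator model would be expected to fill this schema roughly
// as follows (values are hypothetical):
//
//   {
//     needsFreshness: true,
//     needsPlurality: true,
//     reasoning: "Asks for the latest releases (freshness) and a top-3 list (plurality)",
//     languageStyle: "casual tech English"
//   }
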
function getQuestionEvaluationPrompt(question: string): string {
  return `You are an evaluator that determines if a question requires freshness and/or plurality checks in addition to the required definitiveness check.

<evaluation_types>
1. freshness - Checks if the question is time-sensitive or requires very recent information
2. plurality - Checks if the question asks for multiple items or a specific count or enumeration
3. language style - Identifies both the language used and the overall vibe of the question
</evaluation_types>

<rules>
If question is a simple greeting, chit-chat, or general knowledge, provide the answer directly.

1. Freshness Evaluation:
   - Required for questions about current state, recent events, or time-sensitive information
   - Required for: prices, versions, leadership positions, status updates
   - Look for terms: "current", "latest", "recent", "now", "today", "new"
   - Consider company positions, product versions, market data time-sensitive

2. Plurality Evaluation:
   - Required when question asks for multiple items or specific counts
   - Check for: numbers ("5 examples"), plural nouns, list requests
   - Look for: "all", "list", "enumerate", "examples", plural forms
   - Required when question implies completeness ("all the reasons", "every factor")

3. Language Style Analysis:
   Combine both language and emotional vibe in a descriptive phrase, considering:
   - Language: The primary language or mix of languages used
   - Emotional tone: panic, excitement, frustration, curiosity, etc.
   - Formality level: academic, casual, professional, etc.
   - Domain context: technical, academic, social, etc.
</rules>

<examples>
Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭"
Evaluation: {
  "needsFreshness": false,
  "needsPlurality": true,
  "reasoning": "Multiple eigenvalues needed but no time-sensitive information required",
  "languageStyle": "panicked student English with math jargon"
}

Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦♂️ #MonacoGP"
Evaluation: {
  "needsFreshness": true,
  "needsPlurality": true,
  "reasoning": "Refers to recent race event and requires analysis of multiple strategic decisions",
  "languageStyle": "frustrated fan English with F1 terminology"
}

Question: "肖老师您好,请您介绍一下最近量子计算领域的三个重大突破,特别是它们在密码学领域的应用价值吗?🤔"
Evaluation: {
  "needsFreshness": true,
  "needsPlurality": true,
  "reasoning": "Asks for recent breakthroughs (freshness) and specifically requests three examples (plurality)",
  "languageStyle": "formal technical Chinese with academic undertones"
}

Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤"
Evaluation: {
  "needsFreshness": false,
  "needsPlurality": true,
  "reasoning": "Requires comprehensive debugging analysis of multiple potential issues",
  "languageStyle": "frustrated German-English tech slang"
}

Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis."
Evaluation: {
  "needsFreshness": true,
  "needsPlurality": true,
  "reasoning": "Requires analysis of current impacts (freshness) across multiple dimensions: sociopolitical, cultural, and linguistic (plurality)",
  "languageStyle": "formal academic English with sociological terminology"
}
</examples>

Now evaluate this question:
Question: ${JSON.stringify(question)}`;
}

const TOOL_NAME = 'evaluator';

export async function evaluateQuestion(
  question: string,
  tracker?: TokenTracker
): Promise<EvaluationCriteria> {
  try {
    const generator = new ObjectGeneratorSafe(tracker);

    const result = await generator.generateObject({
      model: TOOL_NAME,
      schema: questionEvaluationSchema,
      prompt: getQuestionEvaluationPrompt(question),
    });

    console.log('Question Evaluation:', result.object);

    // Always include definitive in types
    const types: EvaluationType[] = ['definitive'];
    if (result.object.needsFreshness) types.push('freshness');
    if (result.object.needsPlurality) types.push('plurality');

    console.log('Question Metrics:', types);

    // Always evaluate definitive first, then freshness (if needed), then plurality (if needed)
    return {types, languageStyle: result.object.languageStyle};
  } catch (error) {
    console.error('Error in question evaluation:', error);
    // Default to all evaluation types in case of error
    return {types: ['definitive', 'freshness', 'plurality'], languageStyle: 'plain English'};
  }
}

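// Example usage (sketch; the call site and question string are hypothetical):
// decide which checks a question needs before generating an answer.
//
//   const tracker = new TokenTracker();
//   const criteria = await evaluateQuestion(
//     'List the latest top 3 embedding models', tracker);
//   // criteria.types         -> e.g. ['definitive', 'freshness', 'plurality']
//   // criteria.languageStyle -> e.g. 'casual tech English'
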
async function performEvaluation<T>(
  evaluationType: EvaluationType,
  params: {
    schema: z.ZodType<T>;
    prompt: string;
  },
  tracker?: TokenTracker
): Promise<GenerateObjectResult<T>> {
  const generator = new ObjectGeneratorSafe(tracker);

  const result = await generator.generateObject({
    model: TOOL_NAME,
    schema: params.schema,
    prompt: params.prompt,
  });

  console.log(`${evaluationType} ${TOOL_NAME}`, result.object);

  return result as GenerateObjectResult<any>;
}

// Main evaluation function
export async function evaluateAnswer(
  question: string,
  action: AnswerAction,
  evaluationCri: EvaluationCriteria,
  tracker?: TokenTracker
): Promise<{ response: EvaluationResponse }> {
  let result;

  // Only add attribution if we have valid references
  if (action.references && action.references.length > 0) {
    evaluationCri.types = ['attribution', ...evaluationCri.types];
  }

  for (const evaluationType of evaluationCri.types) {
    switch (evaluationType) {
      case 'attribution': {
        // Safely handle references and ensure we have content
        const urls = action.references?.map(ref => ref.url) ?? [];
        const uniqueURLs = [...new Set(urls)];
        const allKnowledge = await fetchSourceContent(uniqueURLs, tracker);

        if (!allKnowledge.trim()) {
          return {
            response: {
              pass: false,
              think: "The answer does not provide any valid attribution references that could be verified. No accessible source content was found to validate the claims made in the answer.",
              type: 'attribution',
            }
          };
        }

        result = await performEvaluation(
          'attribution',
          {
            schema: attributionSchema,
            prompt: getAttributionPrompt(question, action.answer, allKnowledge),
          },
          tracker
        );
        break;
      }

      case 'definitive':
        result = await performEvaluation(
          'definitive',
          {
            schema: definitiveSchema,
            prompt: getDefinitivePrompt(question, action.answer),
          },
          tracker
        );
        break;

      case 'freshness':
        result = await performEvaluation(
          'freshness',
          {
            schema: freshnessSchema,
            prompt: getFreshnessPrompt(question, action.answer, new Date().toISOString()),
          },
          tracker
        );
        break;

      case 'plurality':
        result = await performEvaluation(
          'plurality',
          {
            schema: pluralitySchema,
            prompt: getPluralityPrompt(question, action.answer),
          },
          tracker
        );
        break;
    }
    if (result && !result.object.pass) {
      return {response: result.object};
    }
  }

  return {response: result!.object};
}

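// Example usage (sketch; the call site and values are hypothetical): run the
// checks chosen by evaluateQuestion against a candidate answer. Only the
// fields this module reads (answer, references[].url) are shown; the full
// AnswerAction shape is defined in ../types and may require more properties.
//
//   const action = {
//     answer: 'Example answer text to be evaluated.',
//     references: [{url: 'https://example.com/source-page'}],
//   } as AnswerAction;
//   const {response} = await evaluateAnswer(question, action, criteria, tracker);
//   if (!response.pass) console.log('Evaluation failed:', response.think);
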
// Helper function to fetch and combine source content
async function fetchSourceContent(urls: string[], tracker?: TokenTracker): Promise<string> {
  if (!urls.length) return '';
  try {
    const results = await Promise.all(
      urls.map(async (url) => {
        try {
          const {response} = await readUrl(url, tracker);
          const content = response?.data?.content || '';
          return removeAllLineBreaks(content);
        } catch (error) {
          console.error('Error reading URL:', error);
          return '';
        }
      })
    );

    // Filter out empty results and join with proper separation
    return results
      .filter(content => content.trim())
      .join('\n\n');
  } catch (error) {
    console.error('Error fetching source content:', error);
    return '';
  }
}