Spaces:
Sleeping
Sleeping
| import {z} from 'zod'; | |
| import {TokenTracker} from "../utils/token-tracker"; | |
| import {ObjectGeneratorSafe} from "../utils/safe-generator"; | |
| const responseSchema = z.object({ | |
| think: z.string().describe('Strategic reasoning about the overall deduplication approach'), | |
| unique_queries: z.array(z.string().describe('Unique query that passed the deduplication process, must be less than 30 characters')) | |
| .describe('Array of semantically unique queries').max(3) | |
| }); | |
| function getPrompt(newQueries: string[], existingQueries: string[]): string { | |
| return `You are an expert in semantic similarity analysis. Given a set of queries (setA) and a set of queries (setB) | |
| <rules> | |
| Function FilterSetA(setA, setB, threshold): | |
| filteredA = empty set | |
| for each candidateQuery in setA: | |
| isValid = true | |
| // Check similarity with already accepted queries in filteredA | |
| for each acceptedQuery in filteredA: | |
| similarity = calculateSimilarity(candidateQuery, acceptedQuery) | |
| if similarity >= threshold: | |
| isValid = false | |
| break | |
| // If passed first check, compare with set B | |
| if isValid: | |
| for each queryB in setB: | |
| similarity = calculateSimilarity(candidateQuery, queryB) | |
| if similarity >= threshold: | |
| isValid = false | |
| break | |
| // If passed all checks, add to filtered set | |
| if isValid: | |
| add candidateQuery to filteredA | |
| return filteredA | |
| </rules> | |
| <similarity-definition> | |
| 1. Consider semantic meaning and query intent, not just lexical similarity | |
| 2. Account for different phrasings of the same information need | |
| 3. Queries with same base keywords but different operators are NOT duplicates | |
| 4. Different aspects or perspectives of the same topic are not duplicates | |
| 5. Consider query specificity - a more specific query is not a duplicate of a general one | |
| 6. Search operators that make queries behave differently: | |
| - Different site: filters (e.g., site:youtube.com vs site:github.com) | |
| - Different file types (e.g., filetype:pdf vs filetype:doc) | |
| - Different language/location filters (e.g., lang:en vs lang:es) | |
| - Different exact match phrases (e.g., "exact phrase" vs no quotes) | |
| - Different inclusion/exclusion (+/- operators) | |
| - Different title/body filters (intitle: vs inbody:) | |
| </similarity-definition> | |
| Now with threshold set to 0.2; run FilterSetA on the following: | |
| SetA: ${JSON.stringify(newQueries)} | |
| SetB: ${JSON.stringify(existingQueries)}`; | |
| } | |
| const TOOL_NAME = 'dedup'; | |
| export async function dedupQueries( | |
| newQueries: string[], | |
| existingQueries: string[], | |
| tracker?: TokenTracker | |
| ): Promise<{ unique_queries: string[] }> { | |
| try { | |
| const generator = new ObjectGeneratorSafe(tracker); | |
| const prompt = getPrompt(newQueries, existingQueries); | |
| const result = await generator.generateObject({ | |
| model: TOOL_NAME, | |
| schema: responseSchema, | |
| prompt, | |
| }); | |
| console.log(TOOL_NAME, result.object.unique_queries); | |
| return {unique_queries: result.object.unique_queries}; | |
| } catch (error) { | |
| console.error(`Error in ${TOOL_NAME}`, error); | |
| throw error; | |
| } | |
| } |