"""Evaluation script for the SHL Assessment Recommendation System.""" | |
import os | |
import json | |
import numpy as np | |
from typing import List, Dict, Any | |
import pandas as pd | |
import importlib | |
from app import RecommendationSystem | |
# Path to the processed data file, resolved relative to the repository root so
# the script is portable across machines
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                        "data", "processed")
ASSESSMENTS_PATH = os.path.join(DATA_DIR, "shl_test_solutions.csv")

# Test queries with ground-truth relevant assessments.
# In a real scenario, you would have a proper evaluation dataset with human-labeled relevance.
TEST_QUERIES = [
    {
        "query": "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.",
        "relevant_assessments": ["Java", "Core Java", "Java Spring Boot", "Java Programming", "Collaboration Skills"],
        "time_constraint": 40
    },
    {
        "query": "Looking to hire mid-level professionals who are proficient in Python, SQL and Java Script. Need an assessment package that can test all skills with max duration of 60 minutes.",
        "relevant_assessments": ["Python", "SQL", "JavaScript", "Full Stack Developer", "Web Development"],
        "time_constraint": 60
    },
    {
        "query": "I am hiring for an analyst and wants applications to screen using Cognitive and personality tests, what options are available within 45 mins.",
        "relevant_assessments": ["Analytical Thinking", "Cognitive Ability", "Personality", "Decision Making", "Data Analysis"],
        "time_constraint": 45
    }
]


def is_relevant(assessment: Dict[str, Any], relevant_keywords: List[str]) -> bool:
    """
    Check whether an assessment is relevant based on keywords in its name or description.

    Args:
        assessment: Assessment dictionary with 'name' and optionally 'description'
        relevant_keywords: List of keywords to match against

    Returns:
        Boolean indicating relevance
    """
    assessment_name = assessment["name"].lower()
    description = assessment.get("description", "")
    assessment_desc = description.lower() if isinstance(description, str) else ""

    # Special case for cognitive/personality assessments
    if any(kw.lower() in ["cognitive ability", "personality", "analytical thinking"] for kw in relevant_keywords):
        cognitive_keywords = ["reasoning", "cognitive", "numerical", "verbal", "inductive", "deductive", "verify"]
        personality_keywords = ["personality", "trait", "behavior", "opq"]
        analytical_keywords = ["analytical", "analysis", "problem solving", "critical thinking"]
        # Check whether the assessment name or description contains any cognitive/personality keyword
        if any(kw in assessment_name for kw in cognitive_keywords + personality_keywords + analytical_keywords):
            return True
        if assessment_desc and any(kw in assessment_desc for kw in cognitive_keywords + personality_keywords + analytical_keywords):
            return True

    # General keyword matching
    for keyword in relevant_keywords:
        keyword_lower = keyword.lower()
        # Direct match in name or description
        if keyword_lower in assessment_name or (assessment_desc and keyword_lower in assessment_desc):
            return True
        # Word-level matching to avoid partial-word false positives on short tokens
        for word in assessment_name.split():
            # Allow stemming-like matching (e.g. 'Python' matches 'Python-based')
            if (keyword_lower in word or word in keyword_lower) and len(word) >= 4 and len(keyword_lower) >= 4:
                return True
        # Try matching in the description
        if assessment_desc:
            for word in assessment_desc.split():
                if (keyword_lower in word or word in keyword_lower) and len(word) >= 4 and len(keyword_lower) >= 4:
                    return True
    return False
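
# A quick sketch of how the matcher behaves. The assessment names below are
# hypothetical examples chosen for illustration, not entries from the real catalog:
#
#   is_relevant({"name": "Core Java (Entry Level)"}, ["Java"])
#       -> True   ("java" is a direct substring of the lowercased name)
#   is_relevant({"name": "Verify Numerical Ability"}, ["Cognitive Ability"])
#       -> True   (the cognitive/personality special case matches "verify"/"numerical")
#   is_relevant({"name": "Sales Aptitude Test"}, ["Java", "SQL"])
#       -> False  (no direct or word-level match survives the length checks)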


def precision_at_k(recommended: List[Dict[str, Any]], relevant_keywords: List[str], k: int) -> float:
    """Calculate precision@k"""
    if k == 0 or not recommended:
        return 0.0
    hits = sum(1 for item in recommended[:k] if is_relevant(item, relevant_keywords))
    return hits / k


def recall_at_k(recommended: List[Dict[str, Any]], relevant_keywords: List[str], k: int) -> float:
    """Calculate recall@k"""
    if not relevant_keywords or not recommended:
        return 0.0
    hits = sum(1 for item in recommended[:k] if is_relevant(item, relevant_keywords))
    return hits / len(relevant_keywords)


def average_precision(recommended: List[Dict[str, Any]], relevant_keywords: List[str], k: int) -> float:
    """Calculate average precision@k, normalized by min(len(relevant_keywords), k)."""
    if not recommended or not relevant_keywords:
        return 0.0
    precisions = []
    num_relevant_found = 0
    for i in range(min(k, len(recommended))):
        if is_relevant(recommended[i], relevant_keywords):
            num_relevant_found += 1
            precisions.append(num_relevant_found / (i + 1))
    if not precisions:
        return 0.0
    return sum(precisions) / min(len(relevant_keywords), k)
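
# Worked example for the three metrics, assuming k = 3, five relevant keywords, and a
# ranking whose 1st and 3rd items are relevant:
#
#   precision@3 = 2 / 3                    ~= 0.67
#   recall@3    = 2 / 5                     = 0.40
#   AP@3        = (1/1 + 2/3) / min(5, 3)  ~= 0.56
#
# The minimal self-check below encodes that example with made-up assessment names;
# it is a sketch for verifying the helpers and is not called during evaluation:


def _sanity_check_metrics() -> None:
    """Check the metric helpers against the hand-computed toy example above."""
    toy_ranking = [{"name": "Java Test"}, {"name": "Sales Aptitude"}, {"name": "SQL Test"}]
    keywords = ["Java", "SQL", "Python", "Unix", "Perl"]
    assert abs(precision_at_k(toy_ranking, keywords, 3) - 2 / 3) < 1e-9
    assert abs(recall_at_k(toy_ranking, keywords, 3) - 2 / 5) < 1e-9
    assert abs(average_precision(toy_ranking, keywords, 3) - 5 / 9) < 1e-9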


def evaluate_system():
    """Evaluate the recommendation system using the test queries."""
    print("Initializing recommendation system...")
    recommender = RecommendationSystem(ASSESSMENTS_PATH)
    precisions = []
    recalls = []
    avg_precisions = []
    print("\n=== Evaluation Results ===")
    for i, test_case in enumerate(TEST_QUERIES):
        query = test_case["query"]
        relevant_keywords = test_case["relevant_assessments"]
        print(f"\nQuery {i+1}: {query}")
        print(f"Relevant assessment keywords: {relevant_keywords}")
        # Get recommendations
        recommendations = recommender.recommend(query, max_results=10)
        # Display the enhanced query if the recommender exposes one
        if hasattr(recommender, 'enhanced_query') and recommender.enhanced_query:
            print(f"Original query: {query}")
            print(f"Enhanced query: {recommender.enhanced_query}")
        print("\nTop 3 Recommendations:")
        for j, rec in enumerate(recommendations[:3]):
            # Avoid printing "minutes minutes": only append the unit if the
            # duration string does not already contain it
            duration_str = str(rec['duration'])
            if "minute" not in duration_str.lower():
                duration_display = f"{duration_str} minutes"
            else:
                duration_display = duration_str
            relevance_marker = "✓" if is_relevant(rec, relevant_keywords) else " "
            print(f"{j+1}. {rec['name']} (Duration: {duration_display}, Score: {rec['similarity_score']:.2f}) {relevance_marker}")
        # Calculate metrics at k=3
        k = 3
        precision = precision_at_k(recommendations, relevant_keywords, k)
        recall = recall_at_k(recommendations, relevant_keywords, k)
        ap = average_precision(recommendations, relevant_keywords, k)
        precisions.append(precision)
        recalls.append(recall)
        avg_precisions.append(ap)
        print(f"\nMetrics at k={k}:")
        print(f"Precision@{k}: {precision:.2f}")
        print(f"Recall@{k}: {recall:.2f}")
        print(f"AP@{k}: {ap:.2f}")
        # Debug information about relevance matching
        print("\nRelevance details:")
        for rec in recommendations[:k]:
            is_rel = is_relevant(rec, relevant_keywords)
            print(f"- {rec['name']}: {'Relevant' if is_rel else 'Not relevant'}")
    # Calculate mean metrics across all test queries
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_ap = np.mean(avg_precisions)
    print("\n=== Overall Performance ===")
    print(f"Mean Precision@3: {mean_precision:.4f}")
    print(f"Mean Recall@3: {mean_recall:.4f}")
    print(f"MAP@3: {mean_ap:.4f}")


# Run the evaluation only when this script is executed directly, so importing
# the module does not trigger a full evaluation run
if __name__ == "__main__":
    evaluate_system()
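
# Usage (a sketch; the path below is hypothetical, assuming this script lives one
# directory below the repository root, as implied by the DATA_DIR computation, and
# that data/processed/shl_test_solutions.csv exists):
#   python scripts/evaluate.py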