#!/usr/bin/env python3
"""
Score Utilizer - Extract and utilize the highest-scoring pages from retrieval logs.

This module provides utilities to parse log output and retrieve the best pages
based on their scores.
"""
import re
import json
from datetime import datetime
from typing import Dict, List
class ScoreUtilizer:
    """
    Utility class to extract and utilize the highest-scoring pages from
    retrieval logs.
    """

    def __init__(self):
        self.score_patterns = {
            'page_score': r'Page\s+(\d+)\s+\(doc_id:\s*(\d+)\)\s*\|\s*Score:\s*([\d.]+)',
            'highest_scoring': r'(\d+)\.\s*Page\s+(\d+)\s+-\s*Score:\s*([\d.]+)',
            'relevance_level': r'([🟢🟡🟠🔵🟣🔴])\s+([A-Z\s]+)\s+-\s+(.+)'
        }
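        # Example lines each pattern is meant to match (taken from the
        # sample log in demonstrate_score_utilization below):
        #   page_score:      "📄 Page 1 (doc_id: 0) | Score: 0.9234 | 🟢 EXCELLENT - Highly relevant"
        #   highest_scoring: "1. Page 1 - Score: 0.9234"
        #   relevance_level: "🟢 EXCELLENT - Highly relevant"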
    def parse_log_output(self, log_text: str) -> Dict:
        """
        Parse log output to extract page scores and relevance information.

        Args:
            log_text: Raw log output from the retrieval system

        Returns:
            Dictionary containing parsed page scores and metadata
        """
        print("🔍 PARSING LOG OUTPUT FOR HIGHEST-SCORING PAGES")
        print("=" * 60)

        # Extract individual page scores
        page_scores = self._extract_page_scores(log_text)

        # Extract the ranked list of highest-scoring pages
        top_pages = self._extract_top_pages(log_text)

        # Extract the relevance distribution
        relevance_dist = self._extract_relevance_distribution(log_text)

        # Extract summary statistics
        stats = self._extract_statistics(log_text)

        result = {
            'page_scores': page_scores,
            'top_pages': top_pages,
            'relevance_distribution': relevance_dist,
            'statistics': stats,
            'parsed_at': self._get_timestamp()
        }

        print(f"✅ Successfully parsed {len(page_scores)} page scores")
        print(f"🏆 Found {len(top_pages)} top-scoring pages")
        print("=" * 60)
        return result
    def _extract_page_scores(self, log_text: str) -> List[Dict]:
        """Extract individual page scores from log text."""
        page_scores = []

        # Pattern: "Page 1 (doc_id: 0) | Score: 0.9234 | 🟢 EXCELLENT - Highly relevant"
        pattern = self.score_patterns['page_score']
        matches = re.findall(pattern, log_text)

        for match in matches:
            page_num, doc_id, score = match
            page_scores.append({
                'page_number': int(page_num),
                'doc_id': int(doc_id),
                'score': float(score),
                'relevance_level': self._get_relevance_level(float(score))
            })

        # Sort by score (highest first)
        page_scores.sort(key=lambda x: x['score'], reverse=True)
        return page_scores
    def _extract_top_pages(self, log_text: str) -> List[Dict]:
        """Extract the ranked list of top-scoring pages from log text."""
        top_pages = []

        # Pattern: "1. Page 1 - Score: 0.9234"
        pattern = self.score_patterns['highest_scoring']
        matches = re.findall(pattern, log_text)

        for match in matches:
            rank, page_num, score = match
            top_pages.append({
                'rank': int(rank),
                'page_number': int(page_num),
                'score': float(score),
                'relevance_level': self._get_relevance_level(float(score))
            })
        return top_pages
    def _extract_relevance_distribution(self, log_text: str) -> Dict:
        """Extract the relevance distribution from log text."""
        distribution = {
            'excellent': 0,
            'very_good': 0,
            'good': 0,
            'moderate': 0,
            'basic': 0,
            'poor': 0
        }

        # Look for distribution lines like "🟢 Excellent (≥0.90): 2 pages"
        patterns = {
            'excellent': r'🟢\s+Excellent.*?(\d+)\s+pages?',
            'very_good': r'🟡\s+Very Good.*?(\d+)\s+pages?',
            'good': r'🟠\s+Good.*?(\d+)\s+pages?',
            'moderate': r'🔵\s+Moderate.*?(\d+)\s+pages?',
            'basic': r'🟣\s+Basic.*?(\d+)\s+pages?',
            'poor': r'🔴\s+Poor.*?(\d+)\s+pages?'
        }

        for level, pattern in patterns.items():
            match = re.search(pattern, log_text)
            if match:
                distribution[level] = int(match.group(1))
        return distribution
    def _extract_statistics(self, log_text: str) -> Dict:
        """Extract statistical information from log text."""
        stats = {}

        # Extract average score
        avg_match = re.search(r'Average.*?Score:\s*([\d.]+)', log_text)
        if avg_match:
            stats['average_score'] = float(avg_match.group(1))

        # Extract highest score
        high_match = re.search(r'Highest.*?Score:\s*([\d.]+)', log_text)
        if high_match:
            stats['highest_score'] = float(high_match.group(1))

        # Extract lowest score
        low_match = re.search(r'Lowest.*?Score:\s*([\d.]+)', log_text)
        if low_match:
            stats['lowest_score'] = float(low_match.group(1))

        # Extract total pages; accept either "Total documents found: N"
        # (as emitted in the sample log below) or "Total ... N result(s)"
        total_match = re.search(
            r'Total\s+documents\s+found:\s*(\d+)|Total.*?(\d+).*?results?',
            log_text
        )
        if total_match:
            stats['total_pages'] = int(total_match.group(1) or total_match.group(2))
        return stats
    def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 5) -> List[Dict]:
        """
        Get the highest-scoring pages from parsed data.

        Args:
            parsed_data: Parsed log data from parse_log_output()
            count: Number of top pages to return

        Returns:
            List of highest-scoring pages
        """
        if 'page_scores' not in parsed_data:
            return []
        return parsed_data['page_scores'][:count]

    def get_pages_by_threshold(self, parsed_data: Dict, threshold: float = 0.80) -> List[Dict]:
        """
        Get pages that meet or exceed a score threshold.

        Args:
            parsed_data: Parsed log data from parse_log_output()
            threshold: Minimum score threshold

        Returns:
            List of pages meeting the threshold
        """
        if 'page_scores' not in parsed_data:
            return []
        return [page for page in parsed_data['page_scores'] if page['score'] >= threshold]

    def get_pages_by_relevance_level(self, parsed_data: Dict, level: str = 'excellent') -> List[Dict]:
        """
        Get pages with a specific relevance level.

        Args:
            parsed_data: Parsed log data from parse_log_output()
            level: Relevance level ('excellent', 'very_good', 'good', 'moderate', 'basic', 'poor')

        Returns:
            List of pages with the specified relevance level
        """
        if 'page_scores' not in parsed_data:
            return []

        level_mapping = {
            'excellent': '🟢 EXCELLENT',
            'very_good': '🟡 VERY GOOD',
            'good': '🟠 GOOD',
            'moderate': '🔵 MODERATE',
            'basic': '🟣 BASIC',
            'poor': '🔴 POOR'
        }
        target_level = level_mapping.get(level, '🟢 EXCELLENT')
        return [page for page in parsed_data['page_scores'] if target_level in page['relevance_level']]
    def generate_utilization_report(self, parsed_data: Dict) -> str:
        """
        Generate a comprehensive report on how to utilize the highest-scoring pages.

        Args:
            parsed_data: Parsed log data from parse_log_output()

        Returns:
            Formatted report string
        """
        report = []
        report.append("📊 HIGHEST-SCORING PAGES UTILIZATION REPORT")
        report.append("=" * 60)

        # Top pages summary
        top_pages = self.get_highest_scoring_pages(parsed_data, 5)
        report.append("\n🏆 TOP 5 HIGHEST-SCORING PAGES:")
        for i, page in enumerate(top_pages, 1):
            report.append(f"   {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")

        # Threshold-based recommendations
        excellent_pages = self.get_pages_by_threshold(parsed_data, 0.90)
        very_good_pages = self.get_pages_by_threshold(parsed_data, 0.80)
        report.append("\n🎯 UTILIZATION RECOMMENDATIONS:")
        report.append(f"   🟢 Excellent pages (≥0.90): {len(excellent_pages)} pages - Use for primary context")
        report.append(f"   🟡 Very Good pages (≥0.80): {len(very_good_pages)} pages - Use for comprehensive coverage")

        # Statistics
        if parsed_data.get('statistics'):
            stats = parsed_data['statistics']
            report.append("\n📊 QUALITY METRICS:")
            if 'average_score' in stats:
                report.append(f"   Average Score: {stats['average_score']:.4f}")
            if 'highest_score' in stats:
                report.append(f"   Highest Score: {stats['highest_score']:.4f}")
            if 'total_pages' in stats:
                report.append(f"   Total Pages Analyzed: {stats['total_pages']}")

        # Usage suggestions
        report.append("\n💡 USAGE SUGGESTIONS:")
        report.append("   1. Feed the top 3 pages to a language model for focused responses")
        report.append("   2. Use excellent pages for critical information extraction")
        report.append("   3. Include very good pages for comprehensive analysis")
        report.append("   4. Consider page diversity for balanced coverage")
        report.append("=" * 60)
        return "\n".join(report)
    def _get_relevance_level(self, score: float) -> str:
        """Map a numeric score to its relevance level label."""
        if score >= 0.90:
            return "🟢 EXCELLENT - Highly relevant"
        elif score >= 0.80:
            return "🟡 VERY GOOD - Very relevant"
        elif score >= 0.70:
            return "🟠 GOOD - Relevant"
        elif score >= 0.60:
            return "🔵 MODERATE - Somewhat relevant"
        elif score >= 0.50:
            return "🟣 BASIC - Minimally relevant"
        else:
            return "🔴 POOR - Not relevant"

    def _get_timestamp(self) -> str:
        """Get the current timestamp as a formatted string."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
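

# A minimal sketch of wiring ScoreUtilizer to a saved log file, assuming the
# retrieval system writes its output to disk. The path argument is purely
# illustrative; point it at wherever your system stores its logs.
def top_pages_from_log_file(path: str, count: int = 3) -> List[Dict]:
    """Parse a retrieval log file and return its highest-scoring pages."""
    utilizer = ScoreUtilizer()
    with open(path, "r", encoding="utf-8") as f:
        parsed_data = utilizer.parse_log_output(f.read())
    return utilizer.get_highest_scoring_pages(parsed_data, count)
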
# Example usage function
def demonstrate_score_utilization():
    """
    Demonstrate how to use ScoreUtilizer to extract and utilize the
    highest-scoring pages.
    """
    print("🧪 DEMONSTRATING SCORE UTILIZATION")
    print("=" * 60)

    # Example log output (this would come from your actual retrieval system)
    example_log = """
================================================================================
📊 RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES
================================================================================
📁 Collection: documents_20250101_120000
📄 Total documents found: 15
🎯 Requested top-k: 5
--------------------------------------------------------------------------------
📄 Page 1 (doc_id: 0) | Score: 0.9234 | 🟢 EXCELLENT - Highly relevant
📄 Page 3 (doc_id: 2) | Score: 0.8756 | 🟡 VERY GOOD - Very relevant
📄 Page 7 (doc_id: 6) | Score: 0.8123 | 🟡 VERY GOOD - Very relevant
📄 Page 2 (doc_id: 1) | Score: 0.7890 | 🟠 GOOD - Relevant
📄 Page 5 (doc_id: 4) | Score: 0.7456 | 🟠 GOOD - Relevant
--------------------------------------------------------------------------------
🏆 HIGHEST SCORING PAGES:
   1. Page 1 - Score: 0.9234
   2. Page 3 - Score: 0.8756
   3. Page 7 - Score: 0.8123
================================================================================
"""
    # Initialize the utilizer
    utilizer = ScoreUtilizer()

    # Parse the log output
    parsed_data = utilizer.parse_log_output(example_log)

    # Get the highest-scoring pages
    top_pages = utilizer.get_highest_scoring_pages(parsed_data, 3)
    print("\n🏆 TOP 3 HIGHEST-SCORING PAGES:")
    for page in top_pages:
        print(f"   Page {page['page_number']} - Score: {page['score']:.4f}")

    # Get pages by threshold
    excellent_pages = utilizer.get_pages_by_threshold(parsed_data, 0.90)
    print(f"\n🟢 EXCELLENT PAGES (≥0.90): {len(excellent_pages)} pages")

    # Generate the utilization report
    report = utilizer.generate_utilization_report(parsed_data)
    print(f"\n{report}")

    print("\n✅ Score utilization demonstration completed!")
if __name__ == "__main__":
    demonstrate_score_utilization()