```python
from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer


class DatasetAnalyzer:
    """Computes per-sample and aggregate statistics for a dataset of dicts."""

    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        # Tokenize the serialized sample to estimate its token footprint.
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown"),
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        stats = []
        domains = Counter()
        sections = Counter()
        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            # Tally domain and section labels, defaulting to "unknown".
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1
        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections),
        }
```
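A minimal usage sketch follows; the two sample records are illustrative placeholders, not real data, and assume the dict fields `analyze_sample` probes (`abstract`, `full_text`/`excerpt`, `section_type`, `domain_tag`):

```python
# Minimal usage sketch; the sample records below are made up for illustration.
analyzer = DatasetAnalyzer()  # downloads the facebook/opt-350m tokenizer on first use

samples = [
    {"abstract": "A study of X.", "full_text": "Body text...",
     "section_type": "introduction", "domain_tag": "biology"},
    {"excerpt": "Short excerpt only.", "section_type": "methods"},
]

stats = analyzer.get_dataset_stats(samples)
print(stats["total_samples"])        # 2
print(stats["domain_distribution"])  # {'biology': 1, 'unknown': 1}
```

Note that `analyze_sample` tokenizes `str(sample)`, i.e. the whole dict including keys and punctuation, so `token_count` slightly overstates the token count of the text fields alone.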