from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer

class DatasetAnalyzer:
    """Computes token/word statistics and field coverage for dataset samples."""

    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        """Return per-sample statistics: token/word counts and field presence."""
        # Tokenize the stringified record to estimate its token footprint.
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown"),
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        """Aggregate per-sample statistics across the whole dataset."""
        stats = []
        domains = Counter()
        sections = Counter()
        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            # Tally domain and section labels for distribution summaries.
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1
        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections),
        }
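

# Minimal usage sketch: the two toy samples below are hypothetical records,
# shaped only to exercise the fields analyze_sample() inspects (abstract,
# full_text/excerpt, section_type, domain_tag). Instantiating DatasetAnalyzer
# downloads the facebook/opt-350m tokenizer on first run.
if __name__ == "__main__":
    analyzer = DatasetAnalyzer()
    samples = [
        {
            "abstract": "A short abstract.",
            "full_text": "The full body of the paper.",
            "section_type": "introduction",
            "domain_tag": "physics",
        },
        {
            "excerpt": "An excerpt with no abstract.",
            "domain_tag": "biology",
        },
    ]
    print(analyzer.get_dataset_stats(samples))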