# Tokenization/label_tokens.py
"""Label tokens and corpus size limits used when tagging training samples.

The module exposes lookup tables that map human-readable names to bracketed
tag tokens, a few size limits, and ``build_tag_string`` which concatenates the
tags that apply to a single sample.
"""

from typing import Optional

# Domain tags
# NOTE(review): "education" maps to "[GEN]", the same token as
# ROUTING_TAGS["general"], so an education sample with general routing
# carries "[GEN]" twice -- confirm this is intended.
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
# "method" and "experiment" reuse the same tokens as TASK_TAGS.
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
# 327.68M tokens, i.e. 320_000 * MAX_TOKENS. Note this is slightly above
# TOKEN_TARGETS["default"] (325M).
MAX_TOTAL_TOKENS = 327_680_000

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}


def build_tag_string(
    domain: str,
    task: Optional[str] = None,
    section: Optional[str] = None,
    routing: str = "general",
    subdomain: Optional[str] = None,
) -> str:
    """Build the concatenated tag string for a sample.

    Tags are emitted in the fixed order domain, task, section, routing, e.g.
    ``[PHYS][HYP][GEN]`` or ``[BIO][MTH][SPEC:Genomics]``. A name that is not
    a key of its lookup table contributes nothing; no error is raised.

    Args:
        domain: Key into ``DOMAIN_TAGS``.
        task: Optional key into ``TASK_TAGS``.
        section: Optional key into ``SECTION_TAGS``.
        routing: ``"general"`` or ``"specific"``. Any other value adds no
            routing tag.
        subdomain: Free-form label embedded in the routing tag when
            ``routing`` is ``"specific"``.

    Returns:
        The tags joined with no separator; ``""`` when nothing matched.
    """
    tags = []

    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        if subdomain:
            tags.append(f"[SPEC:{subdomain}]")
        else:
            # Without a subdomain, fall back to the plain "[SPEC]" tag so a
            # specifically-routed sample is never left without a routing tag.
            tags.append(ROUTING_TAGS["specific"])

    return "".join(tags)