# Page header captured alongside the file (not Python code), kept as a comment:
# Spaces: Allanatrix / Nexa_Data_Studio · Running · File size: 1,723 Bytes · ef4c8c3

# Tokenization/label_tokens.py

# Domain tags: one bracketed token per supported domain key.
# NOTE(review): "education" maps to "[GEN]", the same string as
# ROUTING_TAGS["general"] below -- confirm the shared token is intentional.
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags: one bracketed token per supported task key.
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (finer granularity, e.g. for long-context or future models).
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    # These two sections use exactly the same tokens as the matching tasks.
    "method": TASK_TAGS["method"],
    "experiment": TASK_TAGS["experiment"],
}

# Routing tags: general vs. specific routing.
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Word/token limits used for validation and filtering.
MIN_WORDS = 8
MAX_TOKENS = 1024
# 327.68M tokens in total.
# NOTE(review): this is not the 325M of TOKEN_TARGETS["default"] -- confirm
# which of the two figures is the intended overall budget.
MAX_TOTAL_TOKENS = 327_680_000

_MILLION = 1_000_000

# Token targets per corpus type, expressed in millions of tokens.
TOKEN_TARGETS = {
    "warm_start": 100 * _MILLION,
    "scientific": 225 * _MILLION,
    "instruction": 30 * _MILLION,
    "default": 325 * _MILLION,
}

def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None,
) -> str:
    """Build the concatenated tag prefix for a sample.

    Tags are emitted in a fixed order: domain, task, section, routing.
    A ``domain``, ``task`` or ``section`` that is not a key of its lookup
    table is skipped silently rather than raising.

    Examples: ``[PHYS][HYP][GEN]`` or ``[BIO][MTH][SPEC:Genomics]``.

    Args:
        domain: Key into ``DOMAIN_TAGS``.
        task: Optional key into ``TASK_TAGS``.
        section: Optional key into ``SECTION_TAGS``.
        routing: ``"general"`` or ``"specific"``; any other value adds no
            routing tag.
        subdomain: Used only when ``routing == "specific"``; embedded as
            ``[SPEC:<subdomain>]`` (no space after the colon).

    Returns:
        The selected tags joined without a separator; may be empty.
    """
    lookups = (
        (DOMAIN_TAGS, domain),
        (TASK_TAGS, task),
        (SECTION_TAGS, section),
    )
    tags = [table[key] for table, key in lookups if key in table]

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        # Without a subdomain, fall back to the plain "[SPEC]" tag instead of
        # silently dropping the routing information.
        tags.append(f"[SPEC:{subdomain}]" if subdomain else ROUTING_TAGS["specific"])
    return "".join(tags)