File size: 1,723 Bytes
ef4c8c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Tokenization/label_tokens.py

# Domain tags: subject-area key -> bracketed tag token.
DOMAIN_TAGS = dict(
    physics="[PHYS]",
    biology="[BIO]",
    materials="[MAT]",
    education="[GEN]",  # same token as ROUTING_TAGS["general"]
)

# Task tags: what kind of content the sample carries.
TASK_TAGS = dict(
    hypothesis="[HYP]",
    method="[MTH]",
    experiment="[EXP]",
)

# Section tags: finer granularity (e.g. for long-context or future models).
# "method" and "experiment" map to the same tokens as in TASK_TAGS.
SECTION_TAGS = dict(
    abstract="[ABSTRACT]",
    introduction="[INTRO]",
    results="[RESULTS]",
    discussion="[DISCUSSION]",
    conclusion="[CONCLUSION]",
    method="[MTH]",
    experiment="[EXP]",
)

# Routing tags: general vs. specific routing.
ROUTING_TAGS = dict(
    general="[GEN]",
    specific="[SPEC]",
)

# Size limits for validation and filtering.
MIN_WORDS = 8
MAX_TOKENS = 1_024
# 327.68M tokens (numerically 320,000 * 1,024).
# NOTE(review): the previous comment described this as "325M", which matches
# TOKEN_TARGETS["default"] rather than this value -- confirm the intended cap.
MAX_TOTAL_TOKENS = 327_680_000

# Token-count targets keyed by corpus type.
TOKEN_TARGETS = dict(
    warm_start=100_000_000,
    scientific=225_000_000,
    instruction=30_000_000,
    default=325_000_000,
)

def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None,
) -> str:
    """Build the concatenated tag prefix for a sample.

    Tags are emitted in a fixed order (domain, task, section, routing) and
    joined without any separator, e.g. ``[PHYS][HYP][GEN]`` or
    ``[BIO][MTH][SPEC:Genomics]``.

    Args:
        domain: Key into ``DOMAIN_TAGS``; an unknown key contributes no tag.
        task: Optional key into ``TASK_TAGS``; ``None`` or an unknown key
            contributes no tag.
        section: Optional key into ``SECTION_TAGS``; ``None`` or an unknown
            key contributes no tag.
        routing: ``"general"`` appends ``ROUTING_TAGS["general"]``.
            ``"specific"`` appends ``[SPEC:<subdomain>]`` when ``subdomain``
            is non-empty, otherwise the bare ``ROUTING_TAGS["specific"]``.
            Any other value contributes no routing tag.
        subdomain: Free-form label embedded in the specific routing tag.
            Only consulted when ``routing == "specific"``.

    Returns:
        The concatenated tag string; empty if nothing matched.
    """
    # Table/key pairs in output order; a None key is simply a lookup miss.
    lookups = (
        (DOMAIN_TAGS, domain),
        (TASK_TAGS, task),
        (SECTION_TAGS, section),
    )
    tags = [table[key] for table, key in lookups if key in table]

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        # Without a subdomain, fall back to the plain specific tag instead of
        # silently dropping the routing tag altogether.
        if subdomain:
            tags.append(f"[SPEC:{subdomain}]")
        else:
            tags.append(ROUTING_TAGS["specific"])
    return "".join(tags)