Upload 50 files
- Tokenization/Build_tokenizer.py +89 -0
- Tokenization/Cleanser.py +102 -0
- Tokenization/Entropy_ranker.py +59 -0
- Tokenization/Label_tokens.py +69 -0
- Tokenization/Logs/corpus_builder.log +0 -0
- Tokenization/Logs/debug_upload.log +4 -0
- Tokenization/Main_2.py +922 -0
- Tokenization/__init__.py +21 -0
- Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Label_tokens.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Main_2.cpython-310.pyc +0 -0
- Tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/__pycache__/generate_dataset.cpython-310.pyc +0 -0
- Tokenization/__pycache__/hf_upload.cpython-310.pyc +0 -0
- Tokenization/app.py +147 -0
- Tokenization/app/Api.py +75 -0
- Tokenization/app/Config.py +25 -0
- Tokenization/app/Core.py +155 -0
- Tokenization/app/Payment.py +27 -0
- Tokenization/app/Progress.py +37 -0
- Tokenization/app/__init__.py +15 -0
- Tokenization/app/__pycache__/Api.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Config.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Core.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Payment.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Progress.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/combined_scientific_papers.json +0 -0
- Tokenization/combined_scientific_papers.jsonl +0 -0
- Tokenization/corpus_builder.log +0 -0
- Tokenization/debug_upload.log +198 -0
- Tokenization/generate_dataset.py +77 -0
- Tokenization/hf_upload.py +163 -0
- Tokenization/preprocessing/Clean_text.py +16 -0
- Tokenization/preprocessing/Preprocess_sample.py +31 -0
- Tokenization/preprocessing/Segment_paragraphs.py +19 -0
- Tokenization/preprocessing/__init__.py +9 -0
- Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/pretraining/Dataset_stats.py +40 -0
- Tokenization/pretraining/Instruction_formatter.py +18 -0
- Tokenization/pretraining/__init__.py +3 -0
- Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/requirements.txt +11 -0
- Tokenization/run_backend.py +12 -0
Tokenization/Build_tokenizer.py
ADDED
@@ -0,0 +1,89 @@
import json
from pathlib import Path
from typing import Dict

from transformers import AutoTokenizer

from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.Label_tokens import MIN_WORDS, MAX_TOKENS, MAX_TOTAL_TOKENS, TOKEN_TARGETS
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter


class QLoRAPreprocessor:
    def __init__(self, model_name: str = "facebook/opt-350m", corpus_type: str = "warm_start"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.analyzer = DatasetAnalyzer(model_name)
        self.formatter = InstructionFormatter()
        self.ranker = EntropyRanker()
        self.token_target = TOKEN_TARGETS[corpus_type]
        self.current_tokens = 0

    def track_tokens(self, text: str) -> bool:
        tokens = self.tokenizer.encode(text)
        self.current_tokens += len(tokens)
        return self.current_tokens <= self.token_target

    def validate_sample(self, sample: Dict) -> bool:
        if not all(k in sample for k in ["instruction", "input", "output"]):
            return False
        total_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
        tokens = self.tokenizer.encode(total_text)
        words = total_text.split()
        return (len(words) >= MIN_WORDS and
                len(tokens) <= MAX_TOKENS and
                len(tokens) <= MAX_TOTAL_TOKENS)

    def process_dataset(self, input_path: str, output_path: str):
        # Load data, skipping blank lines and malformed JSON
        data = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping line {i}: {e}")

        # Analyze dataset
        stats = self.analyzer.get_dataset_stats(data)
        print(f"Dataset stats: {stats}")

        # Format samples
        formatted_samples = [
            self.formatter.format_sample(sample)
            for sample in data
        ]

        # Rank and filter samples
        ranked_samples = self.ranker.rank_samples(formatted_samples)

        # Track token count while processing
        valid_samples = []
        for sample in ranked_samples:
            if not self.validate_sample(sample):
                continue

            sample_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
            if not self.track_tokens(sample_text):
                break

            valid_samples.append(sample)

        # Save to JSONL
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            for sample in valid_samples:
                f.write(json.dumps(sample) + '\n')

        print(f"Processed {len(valid_samples)} samples saved to {output_path}")


if __name__ == "__main__":
    preprocessor = QLoRAPreprocessor()
    preprocessor.process_dataset(
        "C:/Users/kunya/PycharmProjects/DataVolt/Tokenizers/combined_scientific_papers.json",
        "nexa_scientific_instruction_300k.jsonl"
    )
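
A minimal usage sketch of QLoRAPreprocessor outside process_dataset, not part of the committed file; the sample record below is hypothetical, and it assumes the Tokenization package is importable and the tokenizer model can be downloaded.

# Hypothetical usage sketch (illustration only, not in the commit).
from Tokenization.Build_tokenizer import QLoRAPreprocessor

preprocessor = QLoRAPreprocessor(corpus_type="instruction")
sample = {
    "instruction": "Summarize the abstract.",
    "input": "We study the thermal conductivity of graphene at low temperatures and report new measurements.",
    "output": "The paper measures graphene's thermal conductivity at low temperature.",
}
if preprocessor.validate_sample(sample):              # checks required keys plus MIN_WORDS/MAX_TOKENS limits
    text = f"{sample['instruction']} {sample['input']} {sample['output']}"
    within_budget = preprocessor.track_tokens(text)   # False once the corpus-type token target is exceeded
    print(within_budget)
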
Tokenization/Cleanser.py
ADDED
@@ -0,0 +1,102 @@
import json
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset

# Tag dictionaries
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SRC_PATH = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
CLEANED_JSONL_PATH = Path("scientific_corpus_325M.cleaned.jsonl")
CLEANED_ARROW_PATH = Path("scientific_corpus_325M.cleaned.arrow")
CHUNK_SIZE = 10000
MAX_WORKERS = os.cpu_count() or 4


def tag_record(record):
    # Tagging logic: add tags to text fields if domain/task/section present
    # You may need to adjust keys based on your schema
    domain = record.get("domain", "").lower()
    task = record.get("task", "").lower()
    section = record.get("section", "").lower()
    text = record.get("full_text", "")

    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])

    # Prepend tags to text
    record["tagged_text"] = " ".join(tags) + " " + text if tags else text
    return record


def process_chunk(lines):
    cleaned = []
    for line in lines:
        try:
            record = json.loads(line)
            cleaned.append(tag_record(record))
        except Exception:
            continue  # skip malformed lines
    return cleaned


def chunked_file_reader(path, chunk_size):
    with open(path, "r", encoding="utf-8") as f:
        chunk = []
        for line in f:
            chunk.append(line)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk


def main():
    print("Starting cleaning process...")
    # Write cleaned records to a new JSONL file in chunks
    with open(CLEANED_JSONL_PATH, "w", encoding="utf-8") as out_f:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for chunk in chunked_file_reader(SRC_PATH, CHUNK_SIZE):
                futures.append(executor.submit(process_chunk, chunk))
            for fut in as_completed(futures):
                for record in fut.result():
                    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Cleaned JSONL written to {CLEANED_JSONL_PATH}")

    # Convert cleaned JSONL to Arrow using datasets (handles chunking internally)
    print("Saving cleaned dataset to Arrow format...")
    ds = Dataset.from_json(str(CLEANED_JSONL_PATH))
    ds.save_to_disk(str(CLEANED_ARROW_PATH))
    print(f"Saved cleaned Arrow dataset at: {CLEANED_ARROW_PATH}")

    # Optionally, call hf_upload.py asynchronously
    print("Uploading to HuggingFace using hf_upload.py ...")
    os.system("python hf_upload.py")


if __name__ == "__main__":
    main()
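
For clarity, a small sketch of what tag_record produces for one record; the record below is made up, but the field names follow the schema this script expects.

# Illustrative only: tag_record prepends domain/task/section tags to full_text.
record = {
    "domain": "physics",
    "task": "hypothesis",
    "section": "abstract",
    "full_text": "We propose that the observed anomaly arises from lattice defects.",
}
tagged = tag_record(record)
print(tagged["tagged_text"])
# -> "[PHYS] [HYP] [ABSTRACT] We propose that the observed anomaly arises from lattice defects."
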
Tokenization/Entropy_ranker.py
ADDED
@@ -0,0 +1,59 @@
import math
from typing import List, Dict, Optional, Callable


class EntropyRanker:
    """
    Scores and filters text samples by Shannon entropy of their token distribution.
    Used to remove low-information or repetitive samples from scientific corpora.
    """

    def __init__(self, entropy_threshold: float = 3.5, tokenizer: Optional[Callable[[str], List[str]]] = None):
        """
        Args:
            entropy_threshold: Minimum entropy required to keep a sample.
            tokenizer: Function to tokenize text. Defaults to whitespace split.
        """
        self.entropy_threshold = entropy_threshold
        self.tokenizer = tokenizer or (lambda x: x.split())

    @staticmethod
    def shannon_entropy(tokens: List[str]) -> float:
        """Compute Shannon entropy for a list of tokens."""
        if not tokens:
            return 0.0
        freq = {}
        for t in tokens:
            freq[t] = freq.get(t, 0) + 1
        total = len(tokens)
        entropy = 0.0
        for count in freq.values():
            p = count / total
            entropy -= p * math.log(p, 2)
        return entropy

    def score_sample(self, text: str) -> float:
        """Tokenize and score a text sample by entropy."""
        tokens = self.tokenizer(text)
        return self.shannon_entropy(tokens)

    def is_explanatory(self, text: str) -> bool:
        """Return True if sample passes the entropy threshold."""
        return self.score_sample(text) >= self.entropy_threshold

    def filter_samples(self, samples: List[Dict], text_key: str = "text") -> List[Dict]:
        """Filter a list of dict samples, keeping only those above the threshold."""
        return [s for s in samples if self.is_explanatory(s.get(text_key, ""))]

    def rank_samples(self, samples: List[Dict], text_key: str = "text", top_k: Optional[int] = None) -> List[Dict]:
        """
        Rank samples by entropy, descending. Optionally return only top_k.
        """
        scored = [
            (self.score_sample(s.get(text_key, "")), s)
            for s in samples
        ]
        scored.sort(reverse=True, key=lambda x: x[0])
        ranked = [s for score, s in scored if score >= self.entropy_threshold]
        if top_k is not None:
            ranked = ranked[:top_k]
        return ranked
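
A short, self-contained sketch of how the ranker behaves; the threshold and texts below are illustrative, not from the commit.

# Illustrative only: repetitive text scores low entropy, varied text scores higher.
ranker = EntropyRanker(entropy_threshold=3.5)
low = "data data data data data data data data"
high = "entropy measures how evenly tokens are distributed across a sample of text"
print(ranker.score_sample(low))    # 0.0 (a single repeated token carries no information)
print(ranker.is_explanatory(high)) # True: 12 distinct tokens -> entropy = log2(12) ~ 3.58 >= 3.5
samples = [{"text": low}, {"text": high}]
kept = ranker.rank_samples(samples)  # sorted by entropy, below-threshold samples dropped
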
Tokenization/Label_tokens.py
ADDED
@@ -0,0 +1,69 @@
# Tokenization/label_tokens.py

# Domain tags
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
MAX_TOTAL_TOKENS = 327_680_000  # Example: 325M tokens

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}


def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None
) -> str:
    """
    Build a tag string for a sample, e.g. [PHYS][HYP][GEN] or [BIO][MTH][SPEC:Genomics].
    """
    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])
    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific" and subdomain:
        tags.append(f"[SPEC:{subdomain}]")
    return "".join(tags)
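
A quick sketch of the tag strings this module produces; the calls below follow directly from the dictionaries above and are not part of the committed file.

# Illustrative calls to build_tag_string.
print(build_tag_string("physics", task="hypothesis"))
# -> "[PHYS][HYP][GEN]"  (routing defaults to "general")
print(build_tag_string("biology", task="method", routing="specific", subdomain="Genomics"))
# -> "[BIO][MTH][SPEC:Genomics]"
print(build_tag_string("materials", section="results"))
# -> "[MAT][RESULTS][GEN]"
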
Tokenization/Logs/corpus_builder.log
ADDED
The diff for this file is too large to render.
Tokenization/Logs/debug_upload.log
ADDED
@@ -0,0 +1,4 @@
2025-06-07 20:23:13,293 - INFO - Converting C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl to Arrow format at scientific_corpus_325M.arrow ...
2025-06-07 20:23:36,951 - ERROR - An error occurred while generating the dataset: An error occurred while generating the dataset
2025-06-07 20:23:36,951 - ERROR - Process failed: An error occurred while generating the dataset
2025-06-07 20:23:36,952 - INFO - Cleaned up local files.
Tokenization/Main_2.py
ADDED
@@ -0,0 +1,922 @@
# python
"""
The Main pipeline for building a scientific corpus from multiple sources.

Responsibilities:
- Orchestrates collection, processing, ranking, and deduplication of papers from arXiv, PubMed, and FineWeb-Edu.
- Handles error logging, checkpointing, and metrics for observability.
- Modular design for extensibility and maintainability.

Usage:
    python Main_2.py

Classes:
    - SourceMetrics: Tracks per-source metrics.
    - CorpusConfig: Configuration for corpus building.
    - ScientificCorpusBuilder: Main pipeline class.

Functions:
    - main: Entry point for running the pipeline.

Environment:
    - Requires ENTREZ_EMAIL for PubMed API.
    - Outputs logs and intermediate checkpoints to ./scientific_corpus_data.
"""

import concurrent.futures
import json
import logging
import os
import signal
import time
from dataclasses import dataclass
from pathlib import Path
from types import FrameType
from typing import List, Dict, Set, Optional, Callable, Any
from urllib.error import URLError, HTTPError
from xml.parsers.expat import ExpatError

import arxiv
from Bio import Entrez
from datasets import load_dataset
from tqdm import tqdm

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.hf_upload import upload_to_huggingface
from Tokenization.Label_tokens import TASK_TAGS, ROUTING_TAGS
from Tokenization.preprocessing import clean_text, segment_paragraphs
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.app.Config import PLAN_LIMITS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("corpus_builder.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


is_shutdown = False
"""Global flag indicating whether a shutdown signal has been received.

This flag is set to True by the signal handler to allow for graceful shutdown
of long-running operations throughout the pipeline.
"""


def signal_handler(sig: int, frame: FrameType) -> None:
    """Handle shutdown signals gracefully and set shutdown flag."""
    global is_shutdown
    logger.info(f"Received signal {sig}, shutting down gracefully. Frame: {frame}")
    is_shutdown = True


# Register signal handlers for graceful shutdown
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)


def retry(max_retries: int = 3, backoff_factor: float = 1.0,
          exceptions: tuple = (Exception,)) -> Callable:
    """
    Decorator for retrying a function with exponential backoff.

    Args:
        max_retries: Maximum number of retries.
        backoff_factor: Multiplier for exponential backoff.
        exceptions: Exception types to catch and retry.

    Returns:
        Decorated function with retry logic.
    """
    def decorator(func: Callable) -> Callable:
        def wrapper(*args, **kwargs) -> Any:
            retries = 0
            while retries < max_retries:
                if is_shutdown:
                    logger.info("Shutdown in progress, aborting retries.")
                    raise KeyboardInterrupt("Shutdown requested")
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    wait = backoff_factor * (2 ** retries)
                    logger.warning(f"Error in {func.__name__}: {e}. Retrying in {wait:.1f}s...")
                    time.sleep(wait)
                    retries += 1
            logger.error(f"Function {func.__name__} failed after {max_retries} attempts.")
            raise RuntimeError(f"{func.__name__} failed after {max_retries} attempts")
        return wrapper
    return decorator


@dataclass
class SourceMetrics:
    """Metrics for tracking source performance."""
    papers: int = 0
    tokens: int = 0
    time: float = 0.0
    errors: int = 0


@dataclass
class CorpusConfig:
    """
    Configuration for corpus building parameters.

    Attributes:
        max_arxiv_papers: Maximum number of arXiv papers to fetch.
        max_pubmed_papers: Maximum number of PubMed papers to fetch.
        max_fineweb_samples: Maximum number of FineWeb-Edu samples to fetch.
        max_workers: Number of workers for parallel processing.
        timeout: Timeout for API requests.
        chunk_size: Chunk size for batch processing.
    """
    max_arxiv_papers: int = 9000
    max_pubmed_papers: int = 3000
    max_fineweb_samples: int = 30000
    max_workers: int = 8
    timeout: int = 30
    chunk_size: int = 1000


class ScientificCorpusBuilder:
    """
    Main class for building a scientific corpus from multiple sources.

    Methods:
        fetch_arxiv_papers: Collects papers from arXiv.
        fetch_pubmed_papers: Collects papers from PubMed.
        fetch_fineweb_edu: Collects educational content from FineWeb-Edu.
        preprocess_sample: Cleans and segments a paper into samples.
        process_papers: Tags, filters, and preprocesses papers.
        build_corpus: Orchestrates the full pipeline and builds the corpus.
        print_report: Prints a summary report of the build process.
    """

    def __init__(self, config: Optional[CorpusConfig] = None):
        """
        Initialize the corpus builder with configuration and dependencies.

        Args:
            config: Optional CorpusConfig object.
        """
        self.config = config or CorpusConfig()
        self.preprocessor = QLoRAPreprocessor(corpus_type="scientific")
        self.analyzer = DatasetAnalyzer()
        self.ranker = EntropyRanker()
        self.data_dir = Path("scientific_corpus_data")
        self.data_dir.mkdir(exist_ok=True)
        self._setup_apis()
        self.seen_titles: Set[str] = set()
        self.metrics = {
            "arxiv": SourceMetrics(),
            "pubmed": SourceMetrics(),
            "fineweb_edu": SourceMetrics(),
            "total_tokens": 0,
            "total_time": 0.0
        }

    @staticmethod
    def _setup_apis() -> None:
        """
        Setup API configurations for external data sources.
        """
        Entrez.email = os.getenv("ENTREZ_EMAIL", "[email protected]")
        if Entrez.email == "[email protected]":
            logger.warning("Using default email for Entrez. Set ENTREZ_EMAIL environment variable.")

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(arxiv.ArxivError, HTTPError, URLError, ConnectionError))
    def _fetch_arxiv_search(self, query: str, max_results: int) -> List[Any]:
        """
        Fetch arXiv search results with error handling and exponential backoff.

        Args:
            query: arXiv API query string.
            max_results: Maximum number of results to fetch.

        Returns:
            List of arXiv result objects.
        """
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.SubmittedDate,
            )
            client = arxiv.Client()
            results = list(client.results(search))
            if not results:
                logger.warning(f"Empty page returned for query '{query}'")
            return results
        except (arxiv.UnexpectedEmptyPageError, arxiv.HTTPError) as e:
            logger.warning(f"Empty page returned for query '{query}': {e}")
            return []
        except Exception as e:
            logger.error(f"Error in _fetch_arxiv_search for query '{query}': {e}")
            raise

    def fetch_arxiv_papers(self) -> List[Dict]:
        """
        Fetch papers from arXiv across multiple domains with verification and checkpoint saving.

        Returns:
            List of arXiv paper dictionaries.
        """
        logger.info("Starting arXiv paper collection...")
        start_time = time.time()
        papers = []
        queries = [
            ("physics", "cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph"),
            ("biology", "cat:q-bio*"),
            ("materials", "cat:cond-mat.mtrl-sci OR cat:materials*")
        ]
        for domain, query in queries:
            if is_shutdown:
                break
            try:
                results = self._fetch_arxiv_search(query, self.config.max_arxiv_papers // 3)
                for result in tqdm(results, desc=f"arXiv {domain}"):
                    if is_shutdown:
                        break
                    try:
                        paper = {
                            "title": result.title.strip() if result.title else "",
                            "abstract": result.summary.strip() if result.summary else "",
                            "full_text": "",
                            "domain": domain,
                            "section": "abstract",
                            "source": "arxiv",
                            "authors": [str(a) for a in result.authors] if result.authors else [],
                            "published": result.published.isoformat() if result.published else None,
                            "provenance": {"arxiv_id": result.get_short_id()},
                            "categories": [c for c in getattr(result, "categories", [])] if hasattr(result, "categories") else [],
                            "text": result.summary.strip() if result.summary else ""
                        }
                        if paper["title"] and paper["title"] not in self.seen_titles:
                            papers.append(paper)
                            self.seen_titles.add(paper["title"])
                    except Exception as e:
                        logger.warning(f"Error processing arXiv result: {e}")
                        self.metrics["arxiv"].errors += 1
                        continue
            except Exception as e:
                logger.error(f"arXiv {domain} search failed: {e}")
                self.metrics["arxiv"].errors += 1
        self._save_intermediate(papers, "arxiv_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["arxiv"].papers = len(papers)
        self.metrics["arxiv"].time = elapsed
        logger.info(f"Collected {len(papers)} arXiv papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(HTTPError, URLError, ConnectionError, ExpatError))
    def _fetch_pubmed_batch(self, chunk_pmids: List[str]) -> Dict:
        """
        Fetch a batch of PubMed records with error handling.

        Args:
            chunk_pmids: List of PubMed IDs.

        Returns:
            Dictionary of PubMed records.
        """
        try:
            fetch_handle = Entrez.efetch(
                db="pubmed",
                id=",".join(chunk_pmids),
                rettype="medline",
                retmode="xml"
            )
            records = Entrez.read(fetch_handle)
            fetch_handle.close()
            return records
        except ExpatError as e:
            logger.error(f"XML parsing error in PubMed batch: {e}")
            raise
        except (HTTPError, URLError) as e:
            logger.error(f"Network error fetching PubMed batch: {e}")
            raise

    def fetch_pubmed_papers(self) -> List[Dict]:
        """
        Fetch papers from PubMed with biology focus.

        Returns:
            List of PubMed paper dictionaries.
        """
        logger.info("Starting PubMed paper collection...")
        start_time = time.time()
        papers = []

        search_terms = [
            "(methods[Title/Abstract]) AND (biology[MeSH Terms])",
            "(computational biology[MeSH Terms]) AND (methods[Title/Abstract])",
            "(bioinformatics[MeSH Terms]) AND (algorithm[Title/Abstract])",
            "(molecular biology[MeSH Terms]) AND (technique[Title/Abstract])"
        ]

        for search_term in search_terms:
            if is_shutdown:
                break

            try:
                handle = Entrez.esearch(
                    db="pubmed",
                    term=search_term,
                    retmax=self.config.max_pubmed_papers // len(search_terms),
                    sort="relevance"
                )
                record = Entrez.read(handle)
                handle.close()
                pmids = record.get("IdList", [])

                for i in tqdm(range(0, len(pmids), self.config.chunk_size), desc="PubMed batch"):
                    if is_shutdown:
                        break

                    chunk_pmids = pmids[i:i + self.config.chunk_size]
                    try:
                        records = self._fetch_pubmed_batch(chunk_pmids)

                        for rec in records.get("PubmedArticle", []):
                            try:
                                medline_citation = rec.get("MedlineCitation", {})
                                article = medline_citation.get("Article", {})

                                title = article.get("ArticleTitle", "")
                                abstract_list = article.get("Abstract", {}).get("AbstractText", [""])
                                abstract = abstract_list[0] if abstract_list else ""

                                if title and isinstance(title, str) and title not in self.seen_titles:
                                    paper = {
                                        "title": title.strip(),
                                        "abstract": abstract.strip() if isinstance(abstract, str) else "",
                                        "full_text": "",
                                        "domain": "biology",
                                        "section": "abstract",
                                        "source": "pubmed",
                                        "authors": [],
                                        "published": None,
                                        "provenance": {"pubmed_id": str(medline_citation.get("PMID", ""))},
                                        "categories": ["biology"],
                                        "text": abstract.strip() if isinstance(abstract, str) else ""
                                    }
                                    papers.append(paper)
                                    self.seen_titles.add(title)

                            except (KeyError, TypeError, AttributeError) as e:
                                logger.warning(f"Error processing PubMed record: {e}")
                                self.metrics["pubmed"].errors += 1
                                continue

                    except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                        self.metrics["pubmed"].errors += 1
                        logger.warning(f"Failed to fetch PubMed batch: {e}")
                        continue

            except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                self.metrics["pubmed"].errors += 1
                logger.error(f"PubMed search failed for {search_term}: {e}")
            except KeyboardInterrupt:
                logger.info("PubMed collection interrupted by user")
                break

        self._save_intermediate(papers, "pubmed_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["pubmed"].papers = len(papers)
        self.metrics["pubmed"].time = elapsed
        logger.info(f"Collected {len(papers)} PubMed papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(ConnectionError, HTTPError, URLError, OSError))
    def fetch_fineweb_edu(self) -> List[Dict]:
        """
        Fetch educational content from FineWeb-Edu dataset.

        Returns:
            List of FineWeb-Edu paper dictionaries.
        """
        logger.info("Starting FineWeb-Edu collection...")
        start_time = time.time()
        papers = []

        try:
            ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT",
                              split="train", streaming=True)
            samples = []

            for i, sample in enumerate(ds):
                if is_shutdown:
                    break
                if i >= self.config.max_fineweb_samples:
                    break

                if not isinstance(sample, dict) or "text" not in sample:
                    logger.warning(f"Invalid sample structure at index {i}")
                    continue

                samples.append(sample)
                if (i + 1) % 10000 == 0:
                    logger.info(f"Collected {i + 1} FineWeb samples")

            logger.info(f"Processing {len(samples)} FineWeb samples")

            def is_educational_content(sample: Dict) -> bool:
                """Check if content is educational and suitable."""
                try:
                    text = sample.get("text", "")
                    if not isinstance(text, str) or len(text) < 500:
                        return False
                    return self.ranker.is_explanatory(text)
                except (AttributeError, TypeError, ValueError) as e:
                    logger.debug(f"Error evaluating educational content: {e}")
                    return False

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                filtered_results = list(tqdm(
                    executor.map(is_educational_content, samples),
                    total=len(samples),
                    desc="Filtering FineWeb content"
                ))

            for sample, is_good in zip(samples, filtered_results):
                if is_shutdown:
                    break
                if is_good:
                    try:
                        url = sample.get("url", "")
                        meta = sample.get("meta", {})
                        title = meta.get("title", "") if isinstance(meta, dict) else ""
                        title = title or url or f"Document_{len(papers)}"

                        if title not in self.seen_titles:
                            paper = {
                                "title": title,
                                "abstract": "",
                                "full_text": sample.get("text", ""),
                                "domain": "education",
                                "section": "full_text",
                                "source": "fineweb_edu",
                                "authors": [],
                                "published": None,
                                "provenance": {"url": url},
                                "categories": ["education"],
                                "text": sample.get("text", "")
                            }
                            papers.append(paper)
                            self.seen_titles.add(title)
                    except (KeyError, TypeError, AttributeError) as e:
                        logger.warning(f"Error processing FineWeb sample: {e}")
                        self.metrics["fineweb_edu"].errors += 1
                        continue

        except (ConnectionError, HTTPError, URLError, OSError) as e:
            logger.error(f"FineWeb-Edu fetch failed: {e}")
            self.metrics["fineweb_edu"].errors += 1
        except KeyboardInterrupt:
            logger.info("FineWeb-Edu collection interrupted by user")
        except ImportError as e:
            logger.error(f"Failed to import required dataset library: {e}")
            self.metrics["fineweb_edu"].errors += 1

        self._save_intermediate(papers, "fineweb_edu.jsonl")
        elapsed = time.time() - start_time
        self.metrics["fineweb_edu"].papers = len(papers)
        self.metrics["fineweb_edu"].time = elapsed
        logger.info(f"Collected {len(papers)} FineWeb-Edu papers in {elapsed:.2f}s")
        return papers

    @staticmethod
    def preprocess_sample(paper: Dict) -> List[Dict]:
        """
        Preprocess a paper sample into multiple training samples.

        Args:
            paper: Dictionary representing a paper.

        Returns:
            List of processed sample dictionaries.
        """
        try:
            title = clean_text(paper.get("title", "")) if paper.get("title") else ""
            abstract = clean_text(paper.get("abstract", "")) if paper.get("abstract") else ""
            full_text = clean_text(paper.get("full_text", "")) if paper.get("full_text") else ""

            paragraphs = segment_paragraphs(full_text) if full_text else []
            samples = []

            if title or abstract:
                sample = dict(paper)
                sample["title"] = title
                sample["abstract"] = abstract
                sample["full_text"] = ""
                sample["section"] = "abstract"
                samples.append(sample)

            for para in paragraphs:
                if para.strip():
                    sample = dict(paper)
                    sample["title"] = title
                    sample["abstract"] = ""
                    sample["full_text"] = para
                    sample["section"] = "paragraph"
                    samples.append(sample)

            return samples

        except (AttributeError, TypeError, ValueError) as e:
            logger.warning(f"Error preprocessing sample: {e}")
            return []

    def process_papers(self, papers: List[Dict], domain: str) -> List[Dict]:
        """
        Process papers with domain-specific tagging and filtering.

        Args:
            papers: List of paper dictionaries.
            domain: Domain string for tagging.

        Returns:
            List of processed and filtered sample dictionaries.
        """
        logger.info(f"Processing {len(papers)} {domain} papers...")
        processed = []
        unknown_domains = 0
        unknown_sections = 0

        def label_domain(paper):
            cats = paper.get('categories', [])
            if not cats:
                return 'unknown'
            cats_str = " ".join(cats).lower()
            if 'bio' in cats_str:
                return '[BIO]'
            if 'gen' in cats_str:
                return '[GEN]'
            if 'phys' in cats_str:
                return '[PHY]'
            if 'math' in cats_str:
                return '[MATH]'
            if 'mat' in cats_str or 'materials' in cats_str:
                return '[MAT]'
            if 'astro' in cats_str:
                return '[ASTRO]'
            if 'cs' in cats_str:
                return '[CS]'
            return 'unknown'

        def label_section(paper):
            text = paper.get('text', '') or paper.get('abstract', '') or ''
            text_lower = text.lower()
            if not text_lower:
                return 'unknown'
            if 'abstract' in text_lower:
                return '[ABSTRACT]'
            if 'introduction' in text_lower:
                return '[INTRO]'
            if 'methods' in text_lower:
                return '[METHODS]'
            if 'results' in text_lower:
                return '[RESULTS]'
            if 'discussion' in text_lower:
                return '[DISCUSSION]'
            if 'conclusion' in text_lower:
                return '[CONCLUSION]'
            return 'unknown'

        for paper in tqdm(papers, desc=f"Processing {domain} papers"):
            try:
                domain_tag = label_domain(paper)
                section_tag = label_section(paper)
                paper["domain_tag"] = domain_tag
                paper["section_tag"] = section_tag
                if domain_tag == 'unknown':
                    unknown_domains += 1
                if section_tag == 'unknown':
                    unknown_sections += 1

                task = paper.get("task", None)
                if task and task in TASK_TAGS:
                    paper["task_tag"] = TASK_TAGS[task]

                routing = paper.get("routing", "general")
                paper["routing_tag"] = ROUTING_TAGS.get(routing, ROUTING_TAGS["general"])

                samples = self.preprocess_sample(paper)

                for sample in samples:
                    try:
                        content_parts = []
                        if sample.get("title"):
                            content_parts.append(str(sample["title"]))
                        if sample.get("abstract"):
                            content_parts.append(str(sample["abstract"]))
                        if sample.get("full_text"):
                            content_parts.append(str(sample["full_text"])[:1000])
                        content = " ".join(content_parts)
                        if content.strip() and self.ranker.is_explanatory(content):
                            sample["domain_tag"] = paper["domain_tag"]
                            sample["section_tag"] = paper["section_tag"]
                            sample["routing_tag"] = paper["routing_tag"]
                            if "task_tag" in paper:
                                sample["task_tag"] = paper["task_tag"]
                            processed.append(sample)
                    except Exception as e:
                        logger.debug(f"Error evaluating sample content: {e}")
                        continue

            except Exception as e:
                logger.warning(f"Paper processing error: {e}")
                continue

        logger.info(f"Processed {len(processed)}/{len(papers)} {domain} papers")
        logger.info(f"Unknown domains: {unknown_domains}, Unknown sections: {unknown_sections}")
        return processed

    def _save_intermediate(self, papers: List[Dict], filename: str) -> None:
        """
        Save intermediate results to disk as JSONL.

        Args:
            papers: List of paper/sample dictionaries.
            filename: Output filename.
        """
        path = self.data_dir / filename
        try:
            with open(path, "w", encoding="utf-8") as f:
                for paper in papers:
                    f.write(json.dumps(paper, ensure_ascii=False) + "\n")
            logger.info(f"Saved checkpoint to {path}")
        except (OSError, IOError, PermissionError) as e:
            logger.error(f"Failed to save intermediate file {filename}: {e}")
        except (TypeError, ValueError) as e:
            logger.error(f"JSON serialization error for {filename}: {e}")

    def build_corpus(self, output_path: str, verify_only: bool = False) -> None:
        """
        Build the complete scientific corpus with checkpoint verification.

        Args:
            output_path: Path to save the final corpus.
            verify_only: If True, only verify checkpoints and skip merging.
        """
        logger.info("Starting scientific corpus build...")
        total_start = time.time()
        all_papers = []

        sources = [
            ("arXiv", self.fetch_arxiv_papers, None),
            ("PubMed", self.fetch_pubmed_papers, "biology"),
            ("FineWeb-Edu", self.fetch_fineweb_edu, "education")
        ]
        for source_name, fetch_func, domain in sources:
            if is_shutdown:
                break
            logger.info(f"Fetching {source_name} papers...")
            try:
                papers = fetch_func()
                if domain:
                    processed = []
                    for i in range(0, len(papers), self.config.chunk_size):
                        chunk = papers[i:i + self.config.chunk_size]
                        processed.extend(self.process_papers(chunk, domain))
                    papers = processed
                chkpt_filename = f"{source_name.lower()}_papers.jsonl"
                self._save_intermediate(papers, chkpt_filename)
                if not papers:
                    logger.error(f"{source_name} checkpoint {chkpt_filename} is empty!")
                all_papers.extend(papers)
                logger.info(f"Added {len(papers)} papers from {source_name}")
            except Exception as e:
                logger.error(f"Critical error fetching from {source_name}: {e}")
                continue

        logger.info(f"Total papers collected: {len(all_papers)}")
        if verify_only:
            logger.info("Verification flag enabled; skipping merge and build.")
            self.print_report({})
            return

        if not all_papers:
            logger.error("No papers collected. Cannot build corpus.")
            self.print_report({})
            return

        logger.info("Ranking and deduplicating papers...")
        try:
            ranked_papers = self.ranker.rank_samples(all_papers)
            if not ranked_papers:
                logger.error("Final corpus is empty after ranking. Using unranked papers as fallback.")
                ranked_papers = all_papers
            logger.info(f"Final corpus size: {len(ranked_papers)} papers")
        except Exception as e:
            logger.error(f"Error ranking papers: {e}")
            ranked_papers = all_papers

        if not ranked_papers:
            logger.error("Final corpus is empty. No data to process or save.")
            self.print_report({})
            return

        self._save_intermediate(ranked_papers, "ranked_papers.jsonl")
        try:
            stats = self.analyzer.get_dataset_stats(ranked_papers)
            self.metrics["total_tokens"] = int(stats.get("avg_tokens", 0) * stats.get("total_samples", 0))
        except Exception as e:
            logger.error(f"Error generating dataset statistics: {e}")
            stats = {}

        self.metrics["total_time"] = time.time() - total_start
        logger.info("Processing final dataset in batches...")
        try:
            with open(output_path, "w", encoding="utf-8") as out_f:
                for i in range(0, len(ranked_papers), self.config.chunk_size):
                    chunk = ranked_papers[i:i + self.config.chunk_size]
                    for paper in chunk:
                        out_f.write(json.dumps(paper, ensure_ascii=False) + "\n")
        except Exception as e:
            logger.error(f"Error processing final dataset: {e}")

        # HuggingFace upload: warn if a file is too large
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10 * 1024 * 1024:
            logger.warning(
                f"{output_path} is larger than 10 MiB. HuggingFace will reject files >10 MiB unless you use Git LFS. "
                "See https://hf.co/docs/hub/repositories-getting-started#terminal"
            )
            logger.warning(
                "To fix: install git-lfs and run 'git lfs track \"*.jsonl\"' before pushing, or split your file."
            )

        self.print_report(stats)
        logger.info(f"Scientific corpus successfully built: {output_path}")

    def build_corpus_scoped(self, plan: str, token_budget: int) -> (list, dict):
        """
        Build a scientific corpus, limiting the total number of tokens to the plan's budget.
        Returns the corpus and stats.
        """
        logger.info(f"Building corpus for plan '{plan}' with token budget {token_budget}")
        all_papers = []
        all_papers.extend(self.process_papers(self.fetch_arxiv_papers(), "arxiv"))
        all_papers.extend(self.process_papers(self.fetch_pubmed_papers(), "biology"))
        all_papers.extend(self.process_papers(self.fetch_fineweb_edu(), "education"))

        # Rank and deduplicate
        ranked_papers = self.ranker.rank_samples(all_papers)
        corpus = []
        total_tokens = 0
        for paper in ranked_papers:
            tokens = paper.get("text", "").split()
            if total_tokens + len(tokens) > token_budget:
                break
            corpus.append(paper)
            total_tokens += len(tokens)
        stats = self.analyzer.get_dataset_stats(corpus)
        stats["total_tokens"] = total_tokens
        logger.info(f"Corpus built: {len(corpus)} samples, {total_tokens} tokens")
        return corpus, stats

    def print_report(self, stats: Dict) -> None:
        """
        Print a comprehensive build report.

        Args:
            stats: Dictionary of dataset statistics.
        """
        print("\n" + "=" * 67)
        print(" SCIENTIFIC CORPUS BUILD REPORT")
        print("=" * 67)
        print("\nSOURCE METRICS:")
        print("-" * 40)
        for source_name, label in zip(["arxiv", "pubmed", "fineweb_edu"],
                                      ["ARXIV", "PUBMED", "FINEWEB_EDU"]):
            metrics = self.metrics[source_name]
            print(f"{label:15}: {metrics.papers:6d} papers | {metrics.errors:3d} errors | {metrics.time:9.2f}s")
        print("\nOVERALL METRICS:")
        print("-" * 40)
        total_papers = sum(self.metrics[src].papers for src in ["arxiv", "pubmed", "fineweb_edu"])
        total_errors = sum(self.metrics[src].errors for src in ["arxiv", "pubmed", "fineweb_edu"])
        print(f"Total Papers: {total_papers:,}")
        print(f"Total Tokens: {self.metrics['total_tokens']:,}")
        print(f"Total Time: {self.metrics['total_time']:.2f}s")
        print(f"Total Errors: {total_errors}")
        success_rate = (1 - total_errors / max(total_papers + total_errors, 1)) * 100
        print(f"Success Rate: {success_rate:.2f}%")
        if stats:
            print("\nDATASET STATISTICS:")
            print("-" * 40)
            for key, value in stats.items():
                print(f"{key:20}: {value}")
        print("=" * 67)
        print()


def main() -> None:
    """
    Main entry point for the corpus builder.
    """
    try:
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        output_path = "scientific_corpus_325M.jsonl"
        builder.build_corpus(output_path)

        # --- Hugging Face upload with improved error handling ---
        try:
            # Split large files if needed
            file_size = os.path.getsize(output_path)
            if file_size > 10 * 1024 * 1024:  # 10 MB
                logger.info("Large file detected, splitting into chunks...")
                chunk_size = 10 * 1024 * 1024  # 10 MB chunks
                base_path = os.path.splitext(output_path)[0]

                with open(output_path, 'r', encoding='utf-8') as f:
                    chunk_num = 0
                    chunk = []
                    current_size = 0

                    for line in f:
                        line_size = len(line.encode('utf-8'))
                        if current_size + line_size > chunk_size and chunk:
                            chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                            with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                                chunk_file.writelines(chunk)
                            logger.info(f"Created chunk {chunk_num}: {chunk_path}")
                            chunk = []
                            current_size = 0
                            chunk_num += 1

                        chunk.append(line)
                        current_size += line_size

                    # Write final chunk
                    if chunk:
                        chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                        with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                            chunk_file.writelines(chunk)
                        logger.info(f"Created final chunk {chunk_num}: {chunk_path}")

                # Upload each chunk
                for i in range(chunk_num + 1):
                    chunk_path = f"{base_path}_part{i}.jsonl"
                    logger.info(f"Uploading chunk {i}...")
                    upload_to_huggingface(
                        dataset_path=chunk_path,
                        repo_id="Allanatrix/Scientific_Research_Tokenized",
                        auto_generate_readme=(i == 0),  # Only generate README for first chunk
                        compress=True,
                        keep_local=True  # Keep files until all uploads complete
                    )
            else:
                # Upload single file
                upload_to_huggingface(
                    dataset_path=output_path,
                    repo_id="Allanatrix/Scientific_Research_Tokenized",
                    auto_generate_readme=True,
                    compress=True
                )

        except ImportError:
            logger.error("Hugging Face upload module not found. Please ensure hf_upload.py exists.")
        except Exception as e:
            logger.error(f"Error during Hugging Face upload: {e}")
            if "EOF" in str(e) or "timeout" in str(e):
                logger.warning("Upload interrupted. Try using smaller chunks or increasing timeout.")
        finally:
            # Cleanup temporary files
            if 'chunk_num' in locals():
                for i in range(chunk_num + 1):
                    try:
                        os.remove(f"{base_path}_part{i}.jsonl")
                    except OSError:
                        pass

    except KeyboardInterrupt:
        logger.info("Build process interrupted by user")
    except Exception as e:
        logger.error(f"Unexpected error in main: {e}")
        raise


# Optionally, you can add a CLI entry point for testing:
def main_scoped(plan: str = "free"):
    config = CorpusConfig()
    builder = ScientificCorpusBuilder(config)
    token_budget = PLAN_LIMITS.get(plan, 1000)
    corpus, stats = builder.build_corpus_scoped(plan, token_budget)
    output_path = f"scientific_corpus_{plan}_{token_budget}.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for paper in corpus:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")
    print(f"Saved {len(corpus)} samples ({stats['total_tokens']} tokens) to {output_path}")


if __name__ == "__main__":
    # main()  # old entry point
    main_scoped("free")  # new entry point for plan-scoped corpus
Tokenization/__init__.py
ADDED
@@ -0,0 +1,21 @@
# Tokenization/__init__.py

from .Entropy_ranker import EntropyRanker
from .Label_tokens import DOMAIN_TAGS, TASK_TAGS, SECTION_TAGS, ROUTING_TAGS, build_tag_string
from .preprocessing import clean_text, segment_paragraphs, preprocess_sample

# Expose the main dataset generation pipeline for external use
from .generate_dataset import generate_dataset

__all__ = [
    "EntropyRanker",
    "DOMAIN_TAGS",
    "TASK_TAGS",
    "SECTION_TAGS",
    "ROUTING_TAGS",
    "build_tag_string",
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
    "generate_dataset",
]
Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc
ADDED
Binary file (3.54 kB).
Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc
ADDED
Binary file (3.39 kB).
Tokenization/__pycache__/Label_tokens.cpython-310.pyc
ADDED
Binary file (1.35 kB).
Tokenization/__pycache__/Main_2.cpython-310.pyc
ADDED
Binary file (26.8 kB).
Tokenization/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (565 Bytes).
Tokenization/__pycache__/generate_dataset.cpython-310.pyc
ADDED
Binary file (3.14 kB).
Tokenization/__pycache__/hf_upload.cpython-310.pyc
ADDED
Binary file (5.56 kB).
Tokenization/app.py
ADDED
@@ -0,0 +1,147 @@
import gradio as gr
import time

def calculate_price(payment_mode, tokens, plan, custom_price, file):
    if payment_mode == "Pay as you go":
        price = round(tokens * 0.01, 2)  # Example: $0.01 per token
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    elif payment_mode == "Plan":
        if plan == "Free":
            return "0 tokens\nPrice: $0", 0
        elif plan == "Starter":
            return "100,000 tokens\nPrice: $15", 15
        elif plan == "Pro":
            return "500,000 tokens\nPrice: $30", 30
        elif plan == "Custom":
            return f"Custom plan\nPrice: ${custom_price}", float(custom_price or 0)
    elif file is not None:
        # Simulate token count from file size
        tokens = 1000  # Replace it with real calculation
        price = round(tokens * 0.01, 2)
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    return "", 0

def generate_dataset(*args, **kwargs):
    for i in range(5):
        yield f"Generating... ({(i+1)*20}%)", None, (i+1)/5
        time.sleep(0.3)
    yield "Ready! Please pay to download.", "dataset.jsonl", 1.0

with gr.Blocks(
    title="Nexa Data Studio",
    css="""
    body, .gradio-container {
        min-height: 100vh;
        background: #111 !important;
        color: #fff !important;
    }
    .gradio-container {
        max-width: 900px !important;
        margin: 40px auto !important;
        box-shadow: 0 2px 16px #0008;
        border-radius: 16px;
        padding: 32px 32px 24px 32px !important;
        background: #111 !important;
        color: #fff !important;
        display: flex;
        flex-direction: column;
        align-items: center;
    }
    .footer {margin-top: 2em; color: #bbb; font-size: 0.9em; text-align: center;}
    #header {text-align: center;}
    """
) as demo:
    gr.Markdown(
        """
        <div style="display:flex;align-items:center;gap:16px;justify-content:center;">
            <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" height="40"/>
            <h1 style="margin-bottom:0;">Nexa Data Studio</h1>
        </div>
        <p style="text-align:center;">
            <b>Generate or label scientific datasets for ML research.</b>
        </p>
        """,
        elem_id="header"
    )

    payment_mode = gr.Radio(
        ["Pay as you go", "Plan"],
        label="Payment Mode",
        value="Pay as you go"
    )

    with gr.Row() as payg_row:
        tokens = gr.Slider(100, 100000, value=1000, step=100, label="Tokens Requested")
    with gr.Row(visible=False) as plan_row:
        plan = gr.Dropdown(
            ["Free", "Starter", "Pro", "Custom"],
            label="Plan",
            value="Free"
        )
        custom_price = gr.Number(label="Custom Price ($)", visible=False)

    job_type = gr.Radio(
        ["Generate Dataset", "Label Uploaded Data"],
        label="Job Type",
        value="Generate Dataset"
    )

    with gr.Column(visible=False) as label_col:
        file = gr.File(label="Upload Dataset (.txt or .jsonl)")

    price_info = gr.Textbox(label="Summary", interactive=False)
    download = gr.File(label="Download")
    progress = gr.Slider(0, 1, value=0, step=0.01, label="Progress", interactive=False)
    status = gr.Text(label="Status", interactive=False)

    def update_payment_ui(payment_mode_val, plan_val):
        return (
            gr.update(visible=payment_mode_val == "Pay as you go"),
            gr.update(visible=payment_mode_val == "Plan"),
            gr.update(visible=payment_mode_val == "Plan" and plan_val == "Custom")
        )

    payment_mode.change(
        update_payment_ui,
        inputs=[payment_mode, plan],
        outputs=[payg_row, plan_row, custom_price]
    )
    plan.change(
        lambda p: gr.update(visible=p == "Custom"),
        inputs=plan,
        outputs=custom_price
    )

    def update_label_ui(job_type_val):
        return gr.update(visible=job_type_val == "Label Uploaded Data")
    job_type.change(update_label_ui, inputs=job_type, outputs=label_col)

    def update_summary(payment_mode, tokens, plan, custom_price, file, job_type):
        if job_type == "Label Uploaded Data" and file is not None:
            return calculate_price("Label", tokens, plan, custom_price, file)[0]
        return calculate_price(payment_mode, tokens, plan, custom_price, file)[0]

    inputs = [payment_mode, tokens, plan, custom_price, file, job_type]
    gr.Button("Generate", elem_id="generate-btn", variant="primary").click(
        generate_dataset,
        inputs=inputs,
        outputs=[status, download, progress]
    )
    gr.Button("Update Summary").click(
        update_summary,
        inputs=inputs,
        outputs=price_info
    )

    gr.Markdown(
        f"""
        <div class="footer">
            © {time.strftime("%Y")} Nexa Data Studio — Powered by Hugging Face Spaces<br>
            For support, contact <a href="mailto:[email protected]">[email protected]</a>
        </div>
        """
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
    print("Nexa Data Studio is running at http://localhost:7860")
Tokenization/app/Api.py
ADDED
@@ -0,0 +1,75 @@
"""
Api.py: FastAPI endpoints for dataset generation, progress polling, and download.
"""
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager
import io

app = FastAPI()

@app.post("/generate-dataset")
async def generate_dataset(request: Request):
    user_input = await request.json()
    job_id, error = job_manager.start_job(user_input)
    if error:
        return JSONResponse({"error": error}, status_code=400)
    return {"job_id": job_id}

@app.get("/progress/{job_id}")
def get_progress(job_id: str):
    progress = progress_tracker.get(job_id)
    if not progress:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return progress

@app.get("/download/{job_id}")
def download(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    # Payment check
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    # In production, use FileResponse to serve the file
    return {
        "download_url": job["result_path"],
        "stats": job.get("stats", {})
    }

@app.get("/download-corpus/{job_id}")
def download_corpus(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    if job.get("job_type") != "corpus":
        return JSONResponse({"error": "Not a corpus job"}, status_code=400)
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    jsonl_lines = job.get("jsonl_lines", [])
    stats = job.get("stats", {})
    # Stream the JSONL as a file
    file_like = io.StringIO("\n".join(jsonl_lines))
    headers = {
        "Content-Disposition": f"attachment; filename=scientific_corpus_{job_id}.jsonl"
    }
    return StreamingResponse(file_like, media_type="application/jsonl", headers=headers)

@app.get("/job-stats/{job_id}")
def job_stats(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return {"stats": job.get("stats", {})}

@app.get("/price/{plan}")
def get_price(plan: str):
    price = payment_manager.get_price(plan)
    return {"plan": plan, "price": price}
Tokenization/app/Config.py
ADDED
@@ -0,0 +1,25 @@
"""
Config.py: Configuration for plan limits, pricing, and app constants.
"""

# Plan limits (tokens per plan)
PLAN_LIMITS = {
    "free": 1000,
    "starter": 5000,
    "pro": 10000,
    "enterprise": 100000,
}

# Pricing per plan (USD)
PLAN_PRICING = {
    "free": 0,
    "starter": 15,
    "pro": 30,
    "enterprise": "custom",
}

# Other app-wide constants
tmp_dir = "./tmp_datasets"

# Stripe keys, etc. (to be set via environment variables in production)
STRIPE_API_KEY = None
Tokenization/app/Core.py
ADDED
@@ -0,0 +1,155 @@
"""
Core.py: Orchestrates dataset generation jobs, plan enforcement, and background processing.
"""
import threading
import uuid
import os
import json
from .Config import PLAN_LIMITS, tmp_dir
from .Progress import progress_tracker
from .Payment import payment_manager

# Import your tokenizer module here (example)
from Tokenization.generate_dataset import generate_dataset
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
from Tokenization.Build_tokenizer import QLoRAPreprocessor
import nltk

class JobManager:
    def __init__(self):
        self.jobs = {}
        self.lock = threading.Lock()

    def start_job(self, user_input):
        plan = user_input.get("plan")
        token_budget = user_input.get("token_budget")
        job_type = user_input.get("job_type", "tokenize")  # "tokenize", "corpus", or "label"
        # For label jobs, token_budget is determined after upload
        if job_type != "label" and not payment_manager.check_plan_limit(plan, token_budget):
            return None, "Plan limit exceeded"
        job_id = str(uuid.uuid4())
        with self.lock:
            self.jobs[job_id] = {
                "status": "pending",
                "plan": plan,
                "token_budget": token_budget,
                "job_type": job_type,
                "user_input": user_input
            }
        if job_type == "corpus":
            thread = threading.Thread(target=self._run_corpus_pipeline, args=(job_id,))
        elif job_type == "label":
            thread = threading.Thread(target=self._run_label_pipeline, args=(job_id,))
        else:
            thread = threading.Thread(target=self._run_job, args=(job_id, user_input))
        thread.start()
        return job_id, None

    def _run_job(self, job_id, user_input):
        try:
            progress_tracker.start_job(job_id, total_steps=6)
            # Step 1: Data retrieval
            progress_tracker.update(job_id, 1, "Retrieving data from sources...")
            domain = user_input.get("domain")
            token_budget = user_input.get("token_budget")
            plan = user_input.get("plan")
            custom_seed = user_input.get("custom_seed", None)
            # Step 2: Preprocessing
            progress_tracker.update(job_id, 2, "Preprocessing and cleaning data...")
            # Step 3: Tokenization & Labeling
            progress_tracker.update(job_id, 3, "Tokenizing and labeling samples...")
            # Step 4: Validation & Stats
            progress_tracker.update(job_id, 4, "Validating and computing statistics...")
            # Step 5: Formatting output
            progress_tracker.update(job_id, 5, "Formatting dataset as JSONL...")
            # Call tokenizer pipeline (implement in tokenization/tokenizer.py)
            result = generate_dataset(
                domain=domain,
                token_budget=token_budget,
                plan=plan,
                custom_seed=custom_seed,
                progress_callback=lambda step, msg: progress_tracker.update(job_id, step, msg)
            )
            # Step 6: Save output
            os.makedirs(tmp_dir, exist_ok=True)
            output_path = os.path.join(tmp_dir, f"{domain}_{token_budget}_tokens_{job_id}.jsonl")
            with open(output_path, "w", encoding="utf-8") as f:
                for line in result["jsonl_lines"]:
                    f.write(line + "\n")
            progress_tracker.update(job_id, 6, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["result_path"] = output_path
                self.jobs[job_id]["stats"] = result.get("stats", {})
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_corpus_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = user_input.get("plan")
                token_budget = user_input.get("token_budget")
            progress_tracker.start_job(job_id, total_steps=5)
            progress_tracker.update(job_id, 1, "Building scientific corpus...")
            config = CorpusConfig()
            builder = ScientificCorpusBuilder(config)
            corpus, stats = builder.build_corpus_scoped(plan, token_budget)
            progress_tracker.update(job_id, 2, "Formatting dataset as JSONL...")
            jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in corpus]
            progress_tracker.update(job_id, 3, "Finalizing output...")
            progress_tracker.update(job_id, 4, "Corpus ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
                self.jobs[job_id]["actual_tokens"] = stats.get("total_tokens", 0)
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_label_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = self.jobs[job_id]["plan"]
            progress_tracker.start_job(job_id, total_steps=4)
            progress_tracker.update(job_id, 1, "Loading and preprocessing dataset...")
            dataset_text = user_input.get("dataset_text", "")
            if not dataset_text:
                raise ValueError("No dataset text provided.")
            tokens = nltk.word_tokenize(dataset_text)
            num_tokens = len(tokens)
            with self.lock:
                self.jobs[job_id]["actual_tokens"] = num_tokens
            if not payment_manager.check_plan_limit(plan, num_tokens):
                raise ValueError("Plan limit exceeded.")
            progress_tracker.update(job_id, 2, "Tokenizing and labeling dataset...")
            preprocessor = QLoRAPreprocessor()
            labeled_data = preprocessor.preprocess_function(dataset_text)
            jsonl_lines = [json.dumps({"text": item}, ensure_ascii=False) for item in labeled_data]
            stats = {"token_count": num_tokens, "sample_count": len(labeled_data)}
            progress_tracker.update(job_id, 3, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def get_job_status(self, job_id):
        with self.lock:
            return self.jobs.get(job_id, None)

job_manager = JobManager()
Tokenization/app/Payment.py
ADDED
@@ -0,0 +1,27 @@
"""
Payment.py: Plan enforcement and payment logic (Stripe stub).
"""
import os
from .Config import PLAN_LIMITS, PLAN_PRICING

class PaymentManager:
    def __init__(self):
        self.stripe_api_key = os.getenv("STRIPE_API_KEY")

    def check_plan_limit(self, plan, requested_tokens):
        limit = PLAN_LIMITS.get(plan, 0)
        return requested_tokens <= limit

    def get_price(self, plan):
        return PLAN_PRICING.get(plan, 0)

    def requires_payment(self, plan, requested_tokens):
        if plan == "free":
            return requested_tokens > PLAN_LIMITS["free"]
        return plan not in PLAN_LIMITS

    def create_checkout_session(self, plan, job_id):
        # Stub: Integrate with Stripe API in production
        return f"https://checkout.stripe.com/pay/{plan}/{job_id}"

payment_manager = PaymentManager()
Tokenization/app/Progress.py
ADDED
@@ -0,0 +1,37 @@
"""
Progress.py: Thread-safe progress tracking for dataset generation jobs.
"""
import threading

class ProgressTracker:
    def __init__(self):
        self._progress = {}
        self._lock = threading.Lock()

    def start_job(self, job_id, total_steps):
        with self._lock:
            self._progress[job_id] = {
                "current": 0,
                "total": total_steps,
                "status": "started",
                "message": "Job started"
            }

    def update(self, job_id, current, message=None):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["current"] = current
                if message:
                    self._progress[job_id]["message"] = message  # No emoji, just message

    def complete(self, job_id):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["status"] = "complete"
                self._progress[job_id]["message"] = "Job complete"

    def get(self, job_id):
        with self._lock:
            return self._progress.get(job_id, None)

progress_tracker = ProgressTracker()
Tokenization/app/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""
app/__init__.py: Exposes main backend components for reuse.
"""

from .Api import app as fastapi_app
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager

__all__ = [
    "fastapi_app",
    "job_manager",
    "progress_tracker",
    "payment_manager",
]
Tokenization/app/__pycache__/Api.cpython-310.pyc
ADDED
Binary file (2.81 kB).
Tokenization/app/__pycache__/Config.cpython-310.pyc
ADDED
Binary file (444 Bytes).
Tokenization/app/__pycache__/Core.cpython-310.pyc
ADDED
Binary file (4.86 kB).
Tokenization/app/__pycache__/Payment.cpython-310.pyc
ADDED
Binary file (1.45 kB).
Tokenization/app/__pycache__/Progress.cpython-310.pyc
ADDED
Binary file (1.66 kB).
Tokenization/app/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (460 Bytes).
Tokenization/combined_scientific_papers.json
ADDED
The diff for this file is too large to render.
Tokenization/combined_scientific_papers.jsonl
ADDED
The diff for this file is too large to render.
Tokenization/corpus_builder.log
ADDED
File without changes
Tokenization/debug_upload.log
ADDED
@@ -0,0 +1,198 @@
1 |
+
2025-06-12 18:18:01,037 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
|
2 |
+
2025-06-12 18:18:01,037 - INFO - Starting arXiv paper collection...
|
3 |
+
2025-06-12 18:18:01,038 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
4 |
+
2025-06-12 18:18:03,165 - INFO - Got first page: 100 of 1236760 total results
|
5 |
+
2025-06-12 18:18:03,172 - INFO - Sleeping: 2.828948 seconds
|
6 |
+
2025-06-12 18:18:06,004 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
7 |
+
2025-06-12 18:18:06,953 - INFO - Sleeping: 2.866122 seconds
|
8 |
+
2025-06-12 18:18:09,824 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
9 |
+
2025-06-12 18:18:11,783 - INFO - Sleeping: 2.823819 seconds
|
10 |
+
2025-06-12 18:18:14,608 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
11 |
+
2025-06-12 18:18:16,436 - INFO - Sleeping: 2.857095 seconds
|
12 |
+
2025-06-12 18:18:19,301 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
13 |
+
2025-06-12 18:18:22,022 - INFO - Sleeping: 2.790207 seconds
|
14 |
+
2025-06-12 18:18:24,820 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
15 |
+
2025-06-12 18:18:25,173 - INFO - Sleeping: 2.998001 seconds
|
16 |
+
2025-06-12 18:18:28,181 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
17 |
+
2025-06-12 18:18:28,988 - INFO - Sleeping: 2.999010 seconds
|
18 |
+
2025-06-12 18:18:32,000 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
19 |
+
2025-06-12 18:18:32,507 - INFO - Sleeping: 2.998957 seconds
|
20 |
+
2025-06-12 18:18:35,519 - INFO - Requesting page (first: False, try: 3): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
21 |
+
2025-06-12 18:18:36,061 - WARNING - Empty page returned for query 'cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph': Page of results was unexpectedly empty (https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100)
|
22 |
+
2025-06-12 18:18:36,065 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
23 |
+
2025-06-12 18:18:36,888 - INFO - Got first page: 100 of 50293 total results
|
24 |
+
2025-06-12 18:18:36,896 - INFO - Sleeping: 2.871087 seconds
|
25 |
+
2025-06-12 18:18:39,783 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
26 |
+
2025-06-12 18:18:40,466 - INFO - Sleeping: 2.870444 seconds
|
27 |
+
2025-06-12 18:18:43,339 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
28 |
+
2025-06-12 18:18:44,012 - INFO - Sleeping: 2.874603 seconds
|
29 |
+
2025-06-12 18:18:46,893 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
30 |
+
2025-06-12 18:18:47,688 - INFO - Sleeping: 2.858048 seconds
|
31 |
+
2025-06-12 18:18:50,552 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
32 |
+
2025-06-12 18:18:51,370 - INFO - Sleeping: 2.870823 seconds
|
33 |
+
2025-06-12 18:18:54,246 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
34 |
+
2025-06-12 18:18:54,960 - INFO - Sleeping: 2.886596 seconds
|
35 |
+
2025-06-12 18:18:57,856 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
36 |
+
2025-06-12 18:18:58,568 - INFO - Sleeping: 2.886486 seconds
|
37 |
+
2025-06-12 18:19:01,466 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
38 |
+
2025-06-12 18:19:02,219 - INFO - Sleeping: 2.867826 seconds
|
39 |
+
2025-06-12 18:19:05,103 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
40 |
+
2025-06-12 18:19:06,346 - INFO - Sleeping: 2.766637 seconds
|
41 |
+
2025-06-12 18:19:09,120 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
42 |
+
2025-06-12 18:19:10,043 - INFO - Sleeping: 2.877552 seconds
|
43 |
+
2025-06-12 18:19:12,929 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
44 |
+
2025-06-12 18:19:13,641 - INFO - Sleeping: 2.873434 seconds
|
45 |
+
2025-06-12 18:19:16,525 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
46 |
+
2025-06-12 18:19:17,281 - INFO - Sleeping: 2.871482 seconds
|
47 |
+
2025-06-12 18:19:20,161 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
48 |
+
2025-06-12 18:19:20,990 - INFO - Sleeping: 2.872492 seconds
|
49 |
+
2025-06-12 18:19:23,876 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
50 |
+
2025-06-12 18:19:24,633 - INFO - Sleeping: 2.873157 seconds
|
51 |
+
2025-06-12 18:19:27,510 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
52 |
+
2025-06-12 18:19:28,249 - INFO - Sleeping: 2.872219 seconds
|
53 |
+
2025-06-12 18:19:31,132 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
54 |
+
2025-06-12 18:19:31,787 - INFO - Sleeping: 2.871294 seconds
|
55 |
+
2025-06-12 18:19:34,660 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
|
56 |
+
2025-06-12 18:19:35,423 - INFO - Sleeping: 2.864608 seconds
|
57 |
+
2025-06-12 18:19:38,291 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
58 |
+
2025-06-12 18:19:38,496 - INFO - Sleeping: 2.998046 seconds
|
59 |
+
2025-06-12 18:19:41,498 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
60 |
+
2025-06-12 18:19:41,682 - INFO - Sleeping: 2.998049 seconds
|
61 |
+
2025-06-12 18:19:44,693 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
62 |
+
2025-06-12 18:19:45,568 - INFO - Sleeping: 2.874692 seconds
|
63 |
+
2025-06-12 18:19:48,448 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
64 |
+
2025-06-12 18:19:48,654 - INFO - Sleeping: 2.998000 seconds
|
65 |
+
2025-06-12 18:19:51,668 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
66 |
+
2025-06-12 18:19:52,436 - INFO - Sleeping: 2.877867 seconds
|
67 |
+
2025-06-12 18:19:55,323 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
|
68 |
+
2025-06-12 18:19:56,074 - INFO - Sleeping: 2.878102 seconds
|
69 |
+
2025-06-12 18:19:58,961 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
|
70 |
+
2025-06-12 18:19:59,730 - INFO - Sleeping: 2.846435 seconds
|
71 |
+
2025-06-12 18:20:02,587 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
72 |
+
2025-06-12 18:20:02,802 - INFO - Sleeping: 2.997978 seconds
|
73 |
+
2025-06-12 18:20:05,801 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
74 |
+
2025-06-12 18:20:06,645 - INFO - Sleeping: 2.882026 seconds
|
75 |
+
2025-06-12 18:20:09,537 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
|
76 |
+
2025-06-12 18:20:10,681 - INFO - Sleeping: 2.867912 seconds
|
77 |
+
2025-06-12 18:20:13,558 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
|
78 |
+
2025-06-12 18:20:15,163 - INFO - Sleeping: 2.874383 seconds
|
79 |
+
2025-06-12 18:20:18,052 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
|
80 |
+
2025-06-12 18:20:19,022 - INFO - Sleeping: 2.885731 seconds
|
81 |
+
2025-06-12 18:20:21,916 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
|
82 |
+
2025-06-12 18:20:22,743 - INFO - Sleeping: 2.880111 seconds
|
83 |
+
2025-06-12 18:20:25,633 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
|
84 |
+
2025-06-12 18:20:26,848 - INFO - Sleeping: 2.877337 seconds
|
85 |
+
2025-06-12 18:20:29,728 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
86 |
+
2025-06-12 18:20:29,961 - INFO - Sleeping: 2.999086 seconds
|
87 |
+
2025-06-12 18:20:32,973 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
88 |
+
2025-06-12 18:20:33,783 - INFO - Sleeping: 2.870358 seconds
|
89 |
+
2025-06-12 18:20:36,664 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
90 |
+
2025-06-12 18:20:36,929 - INFO - Sleeping: 2.997254 seconds
|
91 |
+
2025-06-12 18:20:39,936 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
92 |
+
2025-06-12 18:20:40,834 - INFO - Sleeping: 2.876953 seconds
|
93 |
+
2025-06-12 18:20:43,716 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
94 |
+
2025-06-12 18:20:44,816 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
95 |
+
2025-06-12 18:20:46,192 - INFO - Got first page: 100 of 100310 total results
|
96 |
+
2025-06-12 18:20:46,198 - INFO - Sleeping: 2.859482 seconds
|
97 |
+
2025-06-12 18:20:49,073 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
98 |
+
2025-06-12 18:20:49,789 - INFO - Sleeping: 2.869352 seconds
|
99 |
+
2025-06-12 18:20:52,669 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
100 |
+
2025-06-12 18:20:53,467 - INFO - Sleeping: 2.862511 seconds
|
101 |
+
2025-06-12 18:20:56,338 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
102 |
+
2025-06-12 18:20:57,071 - INFO - Sleeping: 2.870255 seconds
|
103 |
+
2025-06-12 18:20:59,951 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
104 |
+
2025-06-12 18:21:00,728 - INFO - Sleeping: 2.869636 seconds
|
105 |
+
2025-06-12 18:21:03,604 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
106 |
+
2025-06-12 18:21:04,393 - INFO - Sleeping: 2.865000 seconds
|
107 |
+
2025-06-12 18:21:07,272 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
108 |
+
2025-06-12 18:21:08,029 - INFO - Sleeping: 2.858943 seconds
|
109 |
+
2025-06-12 18:21:10,895 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
110 |
+
2025-06-12 18:21:11,768 - INFO - Sleeping: 2.866744 seconds
|
111 |
+
2025-06-12 18:21:14,640 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
112 |
+
2025-06-12 18:21:15,488 - INFO - Sleeping: 2.720050 seconds
|
113 |
+
2025-06-12 18:21:18,211 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
114 |
+
2025-06-12 18:21:19,122 - INFO - Sleeping: 2.844511 seconds
|
115 |
+
2025-06-12 18:21:21,982 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
116 |
+
2025-06-12 18:21:22,772 - INFO - Sleeping: 2.871176 seconds
|
117 |
+
2025-06-12 18:21:25,647 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
118 |
+
2025-06-12 18:21:25,925 - INFO - Sleeping: 2.997949 seconds
|
119 |
+
2025-06-12 18:21:28,932 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
120 |
+
2025-06-12 18:21:29,774 - INFO - Sleeping: 2.864288 seconds
|
121 |
+
2025-06-12 18:21:32,644 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
122 |
+
2025-06-12 18:21:33,454 - INFO - Sleeping: 2.860076 seconds
|
123 |
+
2025-06-12 18:21:36,317 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
124 |
+
2025-06-12 18:21:36,605 - INFO - Sleeping: 2.997453 seconds
|
125 |
+
2025-06-12 18:21:39,607 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
126 |
+
2025-06-12 18:21:40,404 - INFO - Sleeping: 2.856277 seconds
|
127 |
+
2025-06-12 18:21:43,276 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
128 |
+
2025-06-12 18:21:44,085 - INFO - Sleeping: 2.862912 seconds
|
129 |
+
2025-06-12 18:21:46,964 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
130 |
+
2025-06-12 18:21:47,858 - INFO - Sleeping: 2.860433 seconds
|
131 |
+
2025-06-12 18:21:50,732 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
|
132 |
+
2025-06-12 18:21:51,504 - INFO - Sleeping: 2.874451 seconds
|
133 |
+
2025-06-12 18:21:54,387 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
134 |
+
2025-06-12 18:21:55,722 - INFO - Sleeping: 2.859315 seconds
|
135 |
+
2025-06-12 18:21:58,585 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
136 |
+
2025-06-12 18:21:59,503 - INFO - Sleeping: 2.863854 seconds
|
137 |
+
2025-06-12 18:22:02,377 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
|
138 |
+
2025-06-12 18:22:02,618 - INFO - Sleeping: 2.997967 seconds
|
139 |
+
2025-06-12 18:22:05,628 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
|
140 |
+
2025-06-12 18:22:06,677 - INFO - Sleeping: 2.844775 seconds
|
141 |
+
2025-06-12 18:22:09,533 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
|
142 |
+
2025-06-12 18:22:09,792 - INFO - Sleeping: 2.998977 seconds
|
143 |
+
2025-06-12 18:22:12,797 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
|
144 |
+
2025-06-12 18:22:13,677 - INFO - Sleeping: 2.860952 seconds
|
145 |
+
2025-06-12 18:22:16,551 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
146 |
+
2025-06-12 18:22:17,381 - INFO - Sleeping: 2.862895 seconds
|
147 |
+
2025-06-12 18:22:20,259 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
|
148 |
+
2025-06-12 18:22:21,092 - INFO - Sleeping: 2.865440 seconds
|
149 |
+
2025-06-12 18:22:23,963 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
|
150 |
+
2025-06-12 18:22:24,738 - INFO - Sleeping: 2.854685 seconds
|
151 |
+
2025-06-12 18:22:27,605 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
|
152 |
+
2025-06-12 18:22:28,443 - INFO - Sleeping: 2.866245 seconds
|
153 |
+
2025-06-12 18:22:31,321 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
|
154 |
+
2025-06-12 18:22:32,401 - INFO - Sleeping: 2.857156 seconds
|
155 |
+
2025-06-12 18:22:35,269 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
|
156 |
+
2025-06-12 18:22:35,481 - INFO - Sleeping: 2.997016 seconds
|
157 |
+
2025-06-12 18:22:38,486 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
|
158 |
+
2025-06-12 18:22:39,346 - INFO - Sleeping: 2.856990 seconds
|
159 |
+
2025-06-12 18:22:42,208 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
160 |
+
2025-06-12 18:22:43,031 - INFO - Sleeping: 2.852790 seconds
|
161 |
+
2025-06-12 18:22:45,889 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
162 |
+
2025-06-12 18:22:46,748 - INFO - Sleeping: 2.858054 seconds
|
163 |
+
2025-06-12 18:22:49,610 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
164 |
+
2025-06-12 18:22:49,923 - INFO - Sleeping: 2.997999 seconds
|
165 |
+
2025-06-12 18:22:52,927 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
166 |
+
2025-06-12 18:22:53,180 - INFO - Sleeping: 2.998443 seconds
|
167 |
+
2025-06-12 18:22:56,182 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
168 |
+
2025-06-12 18:22:57,297 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
|
169 |
+
2025-06-12 18:22:57,297 - INFO - Collected 5989 arXiv papers in 296.26s
|
170 |
+
2025-06-12 18:22:57,310 - INFO - Starting PubMed paper collection...
|
171 |
+
2025-06-12 18:23:14,143 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
|
172 |
+
2025-06-12 18:23:14,143 - INFO - Collected 2671 PubMed papers in 16.83s
|
173 |
+
2025-06-12 18:23:14,143 - INFO - Starting FineWeb-Edu collection...
|
174 |
+
2025-06-12 18:23:34,470 - INFO - Collected 10000 FineWeb samples
|
175 |
+
2025-06-12 18:23:38,652 - INFO - Collected 20000 FineWeb samples
|
176 |
+
2025-06-12 18:23:43,218 - INFO - Collected 30000 FineWeb samples
|
177 |
+
2025-06-12 18:23:43,221 - INFO - Processing 30000 FineWeb samples
|
178 |
+
2025-06-12 18:24:03,830 - INFO - Saved checkpoint to scientific_corpus_data\fineweb_edu.jsonl
|
179 |
+
2025-06-12 18:24:03,831 - INFO - Collected 29616 FineWeb-Edu papers in 49.69s
|
180 |
+
2025-06-12 18:24:03,873 - INFO - Processing 5989 arxiv papers...
|
181 |
+
2025-06-12 18:24:05,244 - INFO - Processed 5989/5989 arxiv papers
|
182 |
+
2025-06-12 18:24:05,244 - INFO - Unknown domains: 0, Unknown sections: 3349
|
183 |
+
2025-06-12 18:24:05,244 - INFO - Processing 2671 biology papers...
|
184 |
+
2025-06-12 18:24:05,765 - INFO - Processed 2605/2671 biology papers
|
185 |
+
2025-06-12 18:24:05,765 - INFO - Unknown domains: 0, Unknown sections: 1015
|
186 |
+
2025-06-12 18:24:05,765 - INFO - Processing 29616 education papers...
|
187 |
+
2025-06-12 18:24:39,231 - INFO - Processed 159402/29616 education papers
|
188 |
+
2025-06-12 18:24:39,231 - INFO - Unknown domains: 29616, Unknown sections: 21161
|
189 |
+
2025-06-12 19:06:41,335 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E5AF0BBC0, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\threading.py', line 320, code wait>
|
190 |
+
2025-06-12 19:06:43,708 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
|
191 |
+
2025-06-12 19:06:43,710 - INFO - Starting arXiv paper collection...
|
192 |
+
2025-06-12 19:06:43,711 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
|
193 |
+
2025-06-12 19:06:43,712 - INFO - Collected 0 arXiv papers in 0.00s
|
194 |
+
2025-06-12 19:06:43,713 - INFO - Starting PubMed paper collection...
|
195 |
+
2025-06-12 19:06:43,715 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
|
196 |
+
2025-06-12 19:06:43,715 - INFO - Collected 0 PubMed papers in 0.00s
|
197 |
+
2025-06-12 19:06:43,716 - INFO - Shutdown in progress, aborting retries.
|
198 |
+
2025-06-12 19:16:11,718 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E7696F880, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\selectors.py', line 315, code _select>
|
Tokenization/generate_dataset.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
from typing import Optional, Callable, Dict, Any

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig

def generate_dataset(
    domain: Optional[str] = None,
    token_budget: int = 1000,
    plan: str = "free",
    custom_seed: Optional[str] = None,
    job_type: str = "tokenize",
    progress_callback: Optional[Callable[[int, str], None]] = None
) -> Dict[str, Any]:
    """
    Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.

    Args:
        domain (str): Domain for the dataset.
        token_budget (int): Token budget per chunk.
        plan (str): Plan type.
        custom_seed (str): Optional seed data.
        job_type (str): "tokenize" or "corpus".
        progress_callback (callable): Progress update callback.

    Returns:
        dict: {"jsonl_lines": [...], "stats": {...}}
    """
    if job_type == "corpus":
        # Use the Main_2 pipeline
        if progress_callback:
            progress_callback(1, "Initializing scientific corpus builder...")
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        if progress_callback:
            progress_callback(2, "Fetching arXiv papers...")
        arxiv_papers = builder.fetch_arxiv_papers()
        if progress_callback:
            progress_callback(3, "Fetching PubMed papers...")
        pubmed_papers = builder.fetch_pubmed_papers()
        if progress_callback:
            progress_callback(4, "Fetching FineWeb-Edu samples...")
        fineweb_papers = builder.fetch_fineweb_edu()
        if progress_callback:
            progress_callback(5, "Processing and tagging papers...")
        all_papers = []
        all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
        all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
        all_papers.extend(builder.process_papers(fineweb_papers, "education"))
        if progress_callback:
            progress_callback(6, "Ranking and deduplicating...")
        ranked_papers = builder.ranker.rank_samples(all_papers)
        if progress_callback:
            progress_callback(7, "Preparing dataset for download...")
        jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
        stats = builder.analyzer.get_dataset_stats(ranked_papers)
        if progress_callback:
            progress_callback(8, "Dataset ready for download.")
        return {"jsonl_lines": jsonl_lines, "stats": stats}

    # Standard "tokenize" job
    if progress_callback:
        progress_callback(1, "Cleaning input text...")
    cleaned_text = clean_text(custom_seed or "")
    if progress_callback:
        progress_callback(2, "Tokenizing input...")
    preprocessor = QLoRAPreprocessor()  # instantiated but not used by the placeholder chunking below
    # For demonstration, split cleaned_text into fixed-size character chunks (replace with real tokenization logic)
    tokens = [cleaned_text[i:i + token_budget] for i in range(0, len(cleaned_text), token_budget)]
    if progress_callback:
        progress_callback(3, "Formatting samples...")
    jsonl_lines = [json.dumps({"text": t}) for t in tokens]
    stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}
    if progress_callback:
        progress_callback(4, "Dataset ready for download.")
    return {"jsonl_lines": jsonl_lines, "stats": stats}
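A minimal usage sketch for the "tokenize" path of generate_dataset above, assuming the Tokenization package is importable from the repository root; the seed text and callback are illustrative only.

import json
from Tokenization.generate_dataset import generate_dataset

def print_progress(step: int, message: str) -> None:
    # Matches the (int, str) callback signature expected by generate_dataset.
    print(f"[{step}] {message}")

result = generate_dataset(
    custom_seed="Gravitational waves are ripples in spacetime.",
    token_budget=64,
    job_type="tokenize",
    progress_callback=print_progress,
)
print(result["stats"])                       # {'token_count': ..., 'total_samples': ...}
print(json.loads(result["jsonl_lines"][0]))  # first record as a dict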
Tokenization/hf_upload.py
ADDED
@@ -0,0 +1,163 @@
import logging
import os
import sys
from datetime import datetime
from pathlib import Path

from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")

def debug_jsonl_head(jsonl_path, n=5):
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i in range(n):
                line = f.readline()
                if not line:
                    break
                logging.info(f"Line {i+1}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")

def infer_features_from_sample(jsonl_path, n=100):
    import json
    from collections import defaultdict
    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")

def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)
        # Try loading a small sample first for debugging
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # Try to load with explicit features if possible
            # Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # Uncomment and adjust the following lines if you know the schema:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # try:
            #     sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            #     logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
            # except Exception as e2:
            #     logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
            raise
        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
        dataset.to_file(str(arrow_path))
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise

def create_readme(dataset):
    content = f"""# Scientific Research Tokenized Dataset

- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""
    with open(README_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("README.md created.")

def upload_to_hf():
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")

def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match expected import in Main_2.py"""
    return upload_to_hf(*args, **kwargs)

def cleanup():
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")

def main():
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()

if __name__ == "__main__":
    main()
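A post-upload sanity check that mirrors the usage snippet hf_upload.py writes into the dataset README; it assumes the repo is public or that you are already logged in (e.g. via `huggingface-cli login`).

from datasets import load_dataset

ds = load_dataset("Allanatrix/Scientific_Research_Tokenized")
print(ds)  # split names and row counts, if the Hub resolved the uploaded Arrow file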
Tokenization/preprocessing/Clean_text.py
ADDED
@@ -0,0 +1,16 @@
import re
import unicodedata

def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion."""
    if not isinstance(text, str):
        return ""
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Remove control characters
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text
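A quick illustration of what clean_text does to a messy string; the sample input is made up.

from Tokenization.preprocessing.Clean_text import clean_text

raw = "Naïve\u00a0 spacing\t and\n\ncontrol\x07chars "
print(clean_text(raw))  # -> "Naïve spacing and control chars"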
Tokenization/preprocessing/Preprocess_sample.py
ADDED
@@ -0,0 +1,31 @@
from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

def preprocess_sample(paper: Dict) -> List[Dict]:
    """
    Clean and segment a paper into samples for LLM ingestion.
    Returns a list of dicts: one for title+abstract, and one per paragraph.
    """
    title = clean_text(paper.get("title", ""))
    abstract = clean_text(paper.get("abstract", ""))
    full_text = clean_text(paper.get("full_text", ""))
    paragraphs = segment_paragraphs(full_text) if full_text else []
    samples = []
    # Title + abstract sample
    if title or abstract:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = abstract
        sample["full_text"] = ""
        sample["section"] = "abstract"
        samples.append(sample)
    # Paragraph samples
    for para in paragraphs:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = ""
        sample["full_text"] = para
        sample["section"] = "paragraph"
        samples.append(sample)
    return samples
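An illustrative call with a made-up paper record; the field names follow the .get() keys used in preprocess_sample above.

from Tokenization.preprocessing.Preprocess_sample import preprocess_sample

paper = {
    "title": "A Toy Paper",
    "abstract": "Short abstract.",
    "full_text": "First paragraph.\n\nSecond paragraph.",
    "domain_tag": "physics",
}
for sample in preprocess_sample(paper):
    print(sample["section"], "|", (sample["abstract"] or sample["full_text"])[:40])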
Tokenization/preprocessing/Segment_paragraphs.py
ADDED
@@ -0,0 +1,19 @@
import re

def segment_paragraphs(text: str) -> list:
    """Segment text into paragraphs using double newlines or similar heuristics."""
    if not isinstance(text, str):
        return []
    # Split on two or more consecutive newlines
    paras = re.split(r"\n{2,}", text)
    # Fallback: split overly long paragraphs into 1000-character chunks
    result = []
    for para in paras:
        para = para.strip()
        if len(para) > 1000:
            # Split further if too long
            chunks = [para[i:i+1000] for i in range(0, len(para), 1000)]
            result.extend(chunks)
        elif para:
            result.append(para)
    return [p for p in result if p]
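A small check of the splitting behaviour; paragraphs longer than 1000 characters are further chunked. The input text is illustrative.

from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

text = "Intro paragraph.\n\nMethods paragraph.\n\n" + "x" * 2500
parts = segment_paragraphs(text)
print(len(parts))              # 5: two short paragraphs plus three chunks of the long one
print([len(p) for p in parts])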
Tokenization/preprocessing/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .Clean_text import clean_text
from .Segment_paragraphs import segment_paragraphs
from .Preprocess_sample import preprocess_sample

__all__ = [
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
]
Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc
ADDED
Binary file (544 Bytes).
Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc
ADDED
Binary file (1.03 kB).
Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc
ADDED
Binary file (932 Bytes).
Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (372 Bytes).
Tokenization/pretraining/Dataset_stats.py
ADDED
@@ -0,0 +1,40 @@
from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer


class DatasetAnalyzer:
    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown")
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        stats = []
        domains = Counter()
        sections = Counter()

        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1

        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections)
        }
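A sketch of DatasetAnalyzer on two made-up samples; instantiating it downloads the facebook/opt-350m tokenizer on first use.

from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer

samples = [
    {"abstract": "Protein folding study.", "domain_tag": "biology", "section_type": "abstract"},
    {"full_text": "Detailed methods text.", "domain_tag": "physics", "section_type": "paragraph"},
]
analyzer = DatasetAnalyzer()
stats = analyzer.get_dataset_stats(samples)
print(stats["total_samples"], stats["domain_distribution"], round(stats["avg_tokens"], 1))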
Tokenization/pretraining/Instruction_formatter.py
ADDED
@@ -0,0 +1,18 @@
# Tokenization/pretraining/instruction_formatter.py

class InstructionFormatter:
    @staticmethod
    def format_sample(sample):
        """
        Formats a sample dict with 'instruction', 'input', and 'output' fields.
        This is a placeholder; customize as needed for your data.
        """
        # Ensure required fields exist
        instruction = sample.get("instruction", "")
        input_ = sample.get("input", "")
        output = sample.get("output", "")
        return {
            "instruction": instruction.strip(),
            "input": input_.strip(),
            "output": output.strip(),
        }
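The formatter is a whitespace-stripping pass-through; shown here on a made-up record.

from Tokenization.pretraining.Instruction_formatter import InstructionFormatter

record = {"instruction": " Summarize the abstract. ", "input": " Some abstract text. ", "output": " A summary. "}
print(InstructionFormatter.format_sample(record))
# {'instruction': 'Summarize the abstract.', 'input': 'Some abstract text.', 'output': 'A summary.'}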
Tokenization/pretraining/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .Dataset_stats import DatasetAnalyzer

__all__ = ["DatasetAnalyzer"]
Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc
ADDED
Binary file (1.97 kB).
Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc
ADDED
Binary file (806 Bytes).
Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (244 Bytes).
Tokenization/requirements.txt
ADDED
@@ -0,0 +1,11 @@
fastapi
uvicorn
gradio
requests
nltk
scikit-learn
beautifulsoup4
arxiv
huggingface_hub
python-dotenv
stripe
Tokenization/run_backend.py
ADDED
@@ -0,0 +1,12 @@
import uvicorn
import os

if __name__ == "__main__":
    os.makedirs("tmp", exist_ok=True)
    print("Starting FastAPI backend at http://localhost:8000 ...")
    uvicorn.run(
        "Tokenization.app:fastapi_app",
        host="0.0.0.0",
        port=8000,
        reload=True
    )
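A quick smoke test against the locally running backend; it assumes FastAPI's default /docs route has not been disabled in Tokenization.app.

import requests

resp = requests.get("http://localhost:8000/docs", timeout=5)
print(resp.status_code)  # expect 200 once the server from run_backend.py is up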