Upload 50 files
- Tokenization/Build_tokenizer.py +89 -0
- Tokenization/Cleanser.py +102 -0
- Tokenization/Entropy_ranker.py +59 -0
- Tokenization/Label_tokens.py +69 -0
- Tokenization/Logs/corpus_builder.log +0 -0
- Tokenization/Logs/debug_upload.log +4 -0
- Tokenization/Main_2.py +922 -0
- Tokenization/__init__.py +21 -0
- Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Label_tokens.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Main_2.cpython-310.pyc +0 -0
- Tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/__pycache__/generate_dataset.cpython-310.pyc +0 -0
- Tokenization/__pycache__/hf_upload.cpython-310.pyc +0 -0
- Tokenization/app.py +147 -0
- Tokenization/app/Api.py +75 -0
- Tokenization/app/Config.py +25 -0
- Tokenization/app/Core.py +155 -0
- Tokenization/app/Payment.py +27 -0
- Tokenization/app/Progress.py +37 -0
- Tokenization/app/__init__.py +15 -0
- Tokenization/app/__pycache__/Api.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Config.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Core.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Payment.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Progress.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/combined_scientific_papers.json +0 -0
- Tokenization/combined_scientific_papers.jsonl +0 -0
- Tokenization/corpus_builder.log +0 -0
- Tokenization/debug_upload.log +198 -0
- Tokenization/generate_dataset.py +77 -0
- Tokenization/hf_upload.py +163 -0
- Tokenization/preprocessing/Clean_text.py +16 -0
- Tokenization/preprocessing/Preprocess_sample.py +31 -0
- Tokenization/preprocessing/Segment_paragraphs.py +19 -0
- Tokenization/preprocessing/__init__.py +9 -0
- Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/pretraining/Dataset_stats.py +40 -0
- Tokenization/pretraining/Instruction_formatter.py +18 -0
- Tokenization/pretraining/__init__.py +3 -0
- Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/requirements.txt +11 -0
- Tokenization/run_backend.py +12 -0
Tokenization/Build_tokenizer.py
ADDED
@@ -0,0 +1,89 @@
import json
from pathlib import Path
from typing import Dict

from transformers import AutoTokenizer

from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.Label_tokens import MIN_WORDS, MAX_TOKENS, MAX_TOTAL_TOKENS, TOKEN_TARGETS
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter


class QLoRAPreprocessor:
    def __init__(self, model_name: str = "facebook/opt-350m", corpus_type: str = "warm_start"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.analyzer = DatasetAnalyzer(model_name)
        self.formatter = InstructionFormatter()
        self.ranker = EntropyRanker()
        self.token_target = TOKEN_TARGETS[corpus_type]
        self.current_tokens = 0

    def track_tokens(self, text: str) -> bool:
        tokens = self.tokenizer.encode(text)
        self.current_tokens += len(tokens)
        return self.current_tokens <= self.token_target

    def validate_sample(self, sample: Dict) -> bool:
        if not all(k in sample for k in ["instruction", "input", "output"]):
            return False
        total_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
        tokens = self.tokenizer.encode(total_text)
        words = total_text.split()
        return (len(words) >= MIN_WORDS and
                len(tokens) <= MAX_TOKENS and
                len(tokens) <= MAX_TOTAL_TOKENS)

    def process_dataset(self, input_path: str, output_path: str):
        # Load data, skipping blank lines and malformed JSON
        data = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping line {i}: {e}")

        # Analyze dataset
        stats = self.analyzer.get_dataset_stats(data)
        print(f"Dataset stats: {stats}")

        # Format samples
        formatted_samples = [
            self.formatter.format_sample(sample)
            for sample in data
        ]

        # Rank and filter samples
        ranked_samples = self.ranker.rank_samples(formatted_samples)

        # Track token count while processing
        valid_samples = []
        for sample in ranked_samples:
            if not self.validate_sample(sample):
                continue

            sample_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
            if not self.track_tokens(sample_text):
                break

            valid_samples.append(sample)

        # Save to JSONL
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            for sample in valid_samples:
                f.write(json.dumps(sample) + '\n')

        print(f"Processed {len(valid_samples)} samples saved to {output_path}")


if __name__ == "__main__":
    preprocessor = QLoRAPreprocessor()
    preprocessor.process_dataset(
        "C:/Users/kunya/PycharmProjects/DataVolt/Tokenizers/combined_scientific_papers.json",
        "nexa_scientific_instruction_300k.jsonl"
    )
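Note on inputs: process_dataset expects a JSONL file with one object per line containing "instruction", "input", and "output" keys; anything else is dropped by validate_sample. A minimal smoke-test sketch follows; the file names and the record text are illustrative placeholders, not part of the pipeline above.

# Illustrative only: write a tiny instruction-format JSONL and run the preprocessor on it.
import json
from Tokenization.Build_tokenizer import QLoRAPreprocessor

records = [
    {
        "instruction": "Summarize the abstract in one sentence.",
        "input": "We study charge transport in disordered two-dimensional lattices under weak driving.",
        "output": "The paper analyzes how disorder limits charge transport in driven 2D lattices.",
    },
]
with open("sample_input.jsonl", "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r) + "\n")

# Downloads facebook/opt-350m tokenizer on first use; counts tokens against the warm_start budget.
QLoRAPreprocessor(corpus_type="warm_start").process_dataset("sample_input.jsonl", "sample_output.jsonl")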
Tokenization/Cleanser.py
ADDED
@@ -0,0 +1,102 @@
import json
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset

# Tag dictionaries
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SRC_PATH = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
CLEANED_JSONL_PATH = Path("scientific_corpus_325M.cleaned.jsonl")
CLEANED_ARROW_PATH = Path("scientific_corpus_325M.cleaned.arrow")
CHUNK_SIZE = 10000
MAX_WORKERS = os.cpu_count() or 4


def tag_record(record):
    # Tagging logic: add tags to text fields if domain/task/section present
    # You may need to adjust keys based on your schema
    domain = record.get("domain", "").lower()
    task = record.get("task", "").lower()
    section = record.get("section", "").lower()
    text = record.get("full_text", "")

    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])

    # Prepend tags to text
    record["tagged_text"] = " ".join(tags) + " " + text if tags else text
    return record


def process_chunk(lines):
    cleaned = []
    for line in lines:
        try:
            record = json.loads(line)
            cleaned.append(tag_record(record))
        except Exception:
            continue  # skip malformed lines
    return cleaned


def chunked_file_reader(path, chunk_size):
    with open(path, "r", encoding="utf-8") as f:
        chunk = []
        for line in f:
            chunk.append(line)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk


def main():
    print("Starting cleaning process...")
    # Write cleaned records to a new JSONL file in chunks
    with open(CLEANED_JSONL_PATH, "w", encoding="utf-8") as out_f:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for chunk in chunked_file_reader(SRC_PATH, CHUNK_SIZE):
                futures.append(executor.submit(process_chunk, chunk))
            for fut in as_completed(futures):
                for record in fut.result():
                    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Cleaned JSONL written to {CLEANED_JSONL_PATH}")

    # Convert cleaned JSONL to Arrow using datasets (handles chunking internally)
    print("Saving cleaned dataset to Arrow format...")
    ds = Dataset.from_json(str(CLEANED_JSONL_PATH))
    ds.save_to_disk(str(CLEANED_ARROW_PATH))
    print(f"Saved cleaned Arrow dataset at: {CLEANED_ARROW_PATH}")

    # Optionally, call hf_upload.py asynchronously
    print("Uploading to HuggingFace using hf_upload.py ...")
    os.system("python hf_upload.py")


if __name__ == "__main__":
    main()
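For reference, a small illustration of what tag_record produces; the record below is made up and only the keys matter.

# Illustrative only: domain/task/section tags are prepended to full_text as tagged_text.
from Tokenization.Cleanser import tag_record

record = {
    "domain": "physics",
    "task": "hypothesis",
    "section": "abstract",
    "full_text": "We propose a new mechanism for anomalous diffusion.",
}
tagged = tag_record(dict(record))
print(tagged["tagged_text"])
# -> "[PHYS] [HYP] [ABSTRACT] We propose a new mechanism for anomalous diffusion."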
Tokenization/Entropy_ranker.py
ADDED
@@ -0,0 +1,59 @@
import math
from typing import List, Dict, Optional, Callable


class EntropyRanker:
    """
    Scores and filters text samples by Shannon entropy of their token distribution.
    Used to remove low-information or repetitive samples from scientific corpora.
    """

    def __init__(self, entropy_threshold: float = 3.5, tokenizer: Optional[Callable[[str], List[str]]] = None):
        """
        Args:
            entropy_threshold: Minimum entropy required to keep a sample.
            tokenizer: Function to tokenize text. Defaults to whitespace split.
        """
        self.entropy_threshold = entropy_threshold
        self.tokenizer = tokenizer or (lambda x: x.split())

    @staticmethod
    def shannon_entropy(tokens: List[str]) -> float:
        """Compute Shannon entropy for a list of tokens."""
        if not tokens:
            return 0.0
        freq = {}
        for t in tokens:
            freq[t] = freq.get(t, 0) + 1
        total = len(tokens)
        entropy = 0.0
        for count in freq.values():
            p = count / total
            entropy -= p * math.log(p, 2)
        return entropy

    def score_sample(self, text: str) -> float:
        """Tokenize and score a text sample by entropy."""
        tokens = self.tokenizer(text)
        return self.shannon_entropy(tokens)

    def is_explanatory(self, text: str) -> bool:
        """Return True if sample passes an entropy threshold."""
        return self.score_sample(text) >= self.entropy_threshold

    def filter_samples(self, samples: List[Dict], text_key: str = "text") -> List[Dict]:
        """Filter a list of dict samples, keeping only those above a threshold."""
        return [s for s in samples if self.is_explanatory(s.get(text_key, ""))]

    def rank_samples(self, samples: List[Dict], text_key: str = "text", top_k: Optional[int] = None) -> List[Dict]:
        """
        Rank samples by entropy, descending. Optionally return only top_k.
        """
        scored = [
            (self.score_sample(s.get(text_key, "")), s)
            for s in samples
        ]
        scored.sort(reverse=True, key=lambda x: x[0])
        ranked = [s for score, s in scored if score >= self.entropy_threshold]
        if top_k is not None:
            ranked = ranked[:top_k]
        return ranked
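A short usage sketch for EntropyRanker; the sample strings and the default 3.5-bit threshold are arbitrary choices for illustration.

# Illustrative usage of EntropyRanker with the whitespace tokenizer.
from Tokenization.Entropy_ranker import EntropyRanker

ranker = EntropyRanker(entropy_threshold=3.5)
samples = [
    {"text": "spam spam spam spam spam spam spam spam"},  # one unique token -> 0 bits
    {"text": "Entropy measures how evenly tokens are distributed across a passage of text."},
]
print(ranker.score_sample(samples[0]["text"]))  # 0.0, repetitive text is discarded
print(ranker.filter_samples(samples))           # keeps only the higher-entropy sample
print(ranker.rank_samples(samples, top_k=1))    # highest-entropy sample above threshold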
Tokenization/Label_tokens.py
ADDED
@@ -0,0 +1,69 @@
# Tokenization/label_tokens.py

# Domain tags
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
MAX_TOTAL_TOKENS = 327680000  # Example: 325M tokens

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}


def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None
) -> str:
    """
    Build a tag string for a sample, e.g. [PHYS][HYP][GEN] or [BIO][MTH][SPEC:Genomics]
    """
    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])
    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific" and subdomain:
        tags.append(f"[SPEC:{subdomain}]")
    return "".join(tags)
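A quick illustration of build_tag_string; the "Genomics" subdomain value is made up for the example.

# Illustrative only: tag strings for a general physics hypothesis and a genomics-specific method.
from Tokenization.Label_tokens import build_tag_string

print(build_tag_string("physics", task="hypothesis"))
# -> "[PHYS][HYP][GEN]"
print(build_tag_string("biology", task="method", routing="specific", subdomain="Genomics"))
# -> "[BIO][MTH][SPEC:Genomics]"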
Tokenization/Logs/corpus_builder.log
ADDED
The diff for this file is too large to render.
Tokenization/Logs/debug_upload.log
ADDED
@@ -0,0 +1,4 @@
2025-06-07 20:23:13,293 - INFO - Converting C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl to Arrow format at scientific_corpus_325M.arrow ...
2025-06-07 20:23:36,951 - ERROR - An error occurred while generating the dataset: An error occurred while generating the dataset
2025-06-07 20:23:36,951 - ERROR - Process failed: An error occurred while generating the dataset
2025-06-07 20:23:36,952 - INFO - Cleaned up local files.
Tokenization/Main_2.py
ADDED
@@ -0,0 +1,922 @@
# python
"""
The Main pipeline for building a scientific corpus from multiple sources.

Responsibilities:
- Orchestrates collection, processing, ranking, and deduplication of papers from arXiv, PubMed, and FineWeb-Edu.
- Handles error logging, checkpointing, and metrics for observability.
- Modular design for extensibility and maintainability.

Usage:
    python Main_2.py

Classes:
    - SourceMetrics: Tracks per-source metrics.
    - CorpusConfig: Configuration for corpus building.
    - ScientificCorpusBuilder: Main pipeline class.

Functions:
    - main: Entry point for running the pipeline.

Environment:
    - Requires ENTREZ_EMAIL for PubMed API.
    - Outputs logs and intermediate checkpoints to ./scientific_corpus_data.
"""

import concurrent.futures
import json
import logging
import os
import signal
import time
from dataclasses import dataclass
from pathlib import Path
from types import FrameType
from typing import List, Dict, Set, Optional, Callable, Any
from urllib.error import URLError, HTTPError
from xml.parsers.expat import ExpatError

import arxiv
from Bio import Entrez
from datasets import load_dataset
from tqdm import tqdm

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.hf_upload import upload_to_huggingface
from Tokenization.Label_tokens import TASK_TAGS, ROUTING_TAGS
from Tokenization.preprocessing import clean_text, segment_paragraphs
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.app.Config import PLAN_LIMITS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("corpus_builder.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


is_shutdown = False
"""Global flag indicating whether a shutdown signal has been received.

This flag is set to True by the signal handler to allow for graceful shutdown
of long-running operations throughout the pipeline.
"""


def signal_handler(sig: int, frame: FrameType) -> None:
    """Handle shutdown signals gracefully and set shutdown flag."""
    global is_shutdown
    logger.info(f"Received signal {sig}, shutting down gracefully. Frame: {frame}")
    is_shutdown = True


# Register signal handlers for graceful shutdown
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)


def retry(max_retries: int = 3, backoff_factor: float = 1.0,
          exceptions: tuple = (Exception,)) -> Callable:
    """
    Decorator for retrying a function with exponential backoff.

    Args:
        max_retries: Maximum number of retries.
        backoff_factor: Multiplier for exponential backoff.
        exceptions: Exception types to catch and retry.

    Returns:
        Decorated function with retry logic.
    """
    def decorator(func: Callable) -> Callable:
        def wrapper(*args, **kwargs) -> Any:
            retries = 0
            while retries < max_retries:
                if is_shutdown:
                    logger.info("Shutdown in progress, aborting retries.")
                    raise KeyboardInterrupt("Shutdown requested")
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    wait = backoff_factor * (2 ** retries)
                    logger.warning(f"Error in {func.__name__}: {e}. Retrying in {wait:.1f}s...")
                    time.sleep(wait)
                    retries += 1
            logger.error(f"Function {func.__name__} failed after {max_retries} attempts.")
            raise RuntimeError(f"{func.__name__} failed after {max_retries} attempts")
        return wrapper
    return decorator


@dataclass
class SourceMetrics:
    """Metrics for tracking source performance."""
    papers: int = 0
    tokens: int = 0
    time: float = 0.0
    errors: int = 0


@dataclass
class CorpusConfig:
    """
    Configuration for corpus building parameters.

    Attributes:
        max_arxiv_papers: Maximum number of arXiv papers to fetch.
        max_pubmed_papers: Maximum number of PubMed papers to fetch.
        max_fineweb_samples: Maximum number of FineWeb-Edu samples to fetch.
        max_workers: Number of workers for parallel processing.
        timeout: Timeout for API requests.
        chunk_size: Chunk size for batch processing.
    """
    max_arxiv_papers: int = 9000
    max_pubmed_papers: int = 3000
    max_fineweb_samples: int = 30000
    max_workers: int = 8
    timeout: int = 30
    chunk_size: int = 1000


class ScientificCorpusBuilder:
    """
    Main class for building a scientific corpus from multiple sources.

    Methods:
        fetch_arxiv_papers: Collects papers from arXiv.
        fetch_pubmed_papers: Collects papers from PubMed.
        fetch_fineweb_edu: Collects educational content from FineWeb-Edu.
        preprocess_sample: Cleans and segments a paper into samples.
        process_papers: Tags, filters, and preprocesses papers.
        build_corpus: Orchestrates the full pipeline and builds the corpus.
        print_report: Prints a summary report of the build process.
    """

    def __init__(self, config: Optional[CorpusConfig] = None):
        """
        Initialize the corpus builder with configuration and dependencies.

        Args:
            config: Optional CorpusConfig object.
        """
        self.config = config or CorpusConfig()
        self.preprocessor = QLoRAPreprocessor(corpus_type="scientific")
        self.analyzer = DatasetAnalyzer()
        self.ranker = EntropyRanker()
        self.data_dir = Path("scientific_corpus_data")
        self.data_dir.mkdir(exist_ok=True)
        self._setup_apis()
        self.seen_titles: Set[str] = set()
        self.metrics = {
            "arxiv": SourceMetrics(),
            "pubmed": SourceMetrics(),
            "fineweb_edu": SourceMetrics(),
            "total_tokens": 0,
            "total_time": 0.0
        }

    @staticmethod
    def _setup_apis() -> None:
        """
        Setup API configurations for external data sources.
        """
        Entrez.email = os.getenv("ENTREZ_EMAIL", "[email protected]")
        if Entrez.email == "[email protected]":
            logger.warning("Using default email for Entrez. Set ENTREZ_EMAIL environment variable.")

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(arxiv.ArxivError, HTTPError, URLError, ConnectionError))
    def _fetch_arxiv_search(self, query: str, max_results: int) -> List[Any]:
        """
        Fetch arXiv search results with error handling and exponential backoff.

        Args:
            query: arXiv API query string.
            max_results: Maximum number of results to fetch.

        Returns:
            List of arXiv result objects.
        """
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.SubmittedDate,
            )
            client = arxiv.Client()
            results = list(client.results(search))
            if not results:
                logger.warning(f"Empty page returned for query '{query}'")
            return results
        except (arxiv.UnexpectedEmptyPageError, arxiv.HTTPError) as e:
            logger.warning(f"Empty page returned for query '{query}': {e}")
            return []
        except Exception as e:
            logger.error(f"Error in _fetch_arxiv_search for query '{query}': {e}")
            raise

    def fetch_arxiv_papers(self) -> List[Dict]:
        """
        Fetch papers from arXiv across multiple domains with verification and checkpoint saving.

        Returns:
            List of arXiv paper dictionaries.
        """
        logger.info("Starting arXiv paper collection...")
        start_time = time.time()
        papers = []
        queries = [
            ("physics", "cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph"),
            ("biology", "cat:q-bio*"),
            ("materials", "cat:cond-mat.mtrl-sci OR cat:materials*")
        ]
        for domain, query in queries:
            if is_shutdown:
                break
            try:
                results = self._fetch_arxiv_search(query, self.config.max_arxiv_papers // 3)
                for result in tqdm(results, desc=f"arXiv {domain}"):
                    if is_shutdown:
                        break
                    try:
                        paper = {
                            "title": result.title.strip() if result.title else "",
                            "abstract": result.summary.strip() if result.summary else "",
                            "full_text": "",
                            "domain": domain,
                            "section": "abstract",
                            "source": "arxiv",
                            "authors": [str(a) for a in result.authors] if result.authors else [],
                            "published": result.published.isoformat() if result.published else None,
                            "provenance": {"arxiv_id": result.get_short_id()},
                            "categories": [c for c in getattr(result, "categories", [])] if hasattr(result, "categories") else [],
                            "text": result.summary.strip() if result.summary else ""
                        }
                        if paper["title"] and paper["title"] not in self.seen_titles:
                            papers.append(paper)
                            self.seen_titles.add(paper["title"])
                    except Exception as e:
                        logger.warning(f"Error processing arXiv result: {e}")
                        self.metrics["arxiv"].errors += 1
                        continue
            except Exception as e:
                logger.error(f"arXiv {domain} search failed: {e}")
                self.metrics["arxiv"].errors += 1
        self._save_intermediate(papers, "arxiv_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["arxiv"].papers = len(papers)
        self.metrics["arxiv"].time = elapsed
        logger.info(f"Collected {len(papers)} arXiv papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(HTTPError, URLError, ConnectionError, ExpatError))
    def _fetch_pubmed_batch(self, chunk_pmids: List[str]) -> Dict:
        """
        Fetch a batch of PubMed records with error handling.

        Args:
            chunk_pmids: List of PubMed IDs.

        Returns:
            Dictionary of PubMed records.
        """
        try:
            fetch_handle = Entrez.efetch(
                db="pubmed",
                id=",".join(chunk_pmids),
                rettype="medline",
                retmode="xml"
            )
            records = Entrez.read(fetch_handle)
            fetch_handle.close()
            return records
        except ExpatError as e:
            logger.error(f"XML parsing error in PubMed batch: {e}")
            raise
        except (HTTPError, URLError) as e:
            logger.error(f"Network error fetching PubMed batch: {e}")
            raise

    def fetch_pubmed_papers(self) -> List[Dict]:
        """
        Fetch papers from PubMed with biology focus.

        Returns:
            List of PubMed paper dictionaries.
        """
        logger.info("Starting PubMed paper collection...")
        start_time = time.time()
        papers = []

        search_terms = [
            "(methods[Title/Abstract]) AND (biology[MeSH Terms])",
            "(computational biology[MeSH Terms]) AND (methods[Title/Abstract])",
            "(bioinformatics[MeSH Terms]) AND (algorithm[Title/Abstract])",
            "(molecular biology[MeSH Terms]) AND (technique[Title/Abstract])"
        ]

        for search_term in search_terms:
            if is_shutdown:
                break

            try:
                handle = Entrez.esearch(
                    db="pubmed",
                    term=search_term,
                    retmax=self.config.max_pubmed_papers // len(search_terms),
                    sort="relevance"
                )
                record = Entrez.read(handle)
                handle.close()
                pmids = record.get("IdList", [])

                for i in tqdm(range(0, len(pmids), self.config.chunk_size), desc="PubMed batch"):
                    if is_shutdown:
                        break

                    chunk_pmids = pmids[i:i + self.config.chunk_size]
                    try:
                        records = self._fetch_pubmed_batch(chunk_pmids)

                        for rec in records.get("PubmedArticle", []):
                            try:
                                medline_citation = rec.get("MedlineCitation", {})
                                article = medline_citation.get("Article", {})

                                title = article.get("ArticleTitle", "")
                                abstract_list = article.get("Abstract", {}).get("AbstractText", [""])
                                abstract = abstract_list[0] if abstract_list else ""

                                if title and isinstance(title, str) and title not in self.seen_titles:
                                    paper = {
                                        "title": title.strip(),
                                        "abstract": abstract.strip() if isinstance(abstract, str) else "",
                                        "full_text": "",
                                        "domain": "biology",
                                        "section": "abstract",
                                        "source": "pubmed",
                                        "authors": [],
                                        "published": None,
                                        "provenance": {"pubmed_id": str(medline_citation.get("PMID", ""))},
                                        "categories": ["biology"],
                                        "text": abstract.strip() if isinstance(abstract, str) else ""
                                    }
                                    papers.append(paper)
                                    self.seen_titles.add(title)

                            except (KeyError, TypeError, AttributeError) as e:
                                logger.warning(f"Error processing PubMed record: {e}")
                                self.metrics["pubmed"].errors += 1
                                continue

                    except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                        self.metrics["pubmed"].errors += 1
                        logger.warning(f"Failed to fetch PubMed batch: {e}")
                        continue

            except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                self.metrics["pubmed"].errors += 1
                logger.error(f"PubMed search failed for {search_term}: {e}")
            except KeyboardInterrupt:
                logger.info("PubMed collection interrupted by user")
                break

        self._save_intermediate(papers, "pubmed_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["pubmed"].papers = len(papers)
        self.metrics["pubmed"].time = elapsed
        logger.info(f"Collected {len(papers)} PubMed papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(ConnectionError, HTTPError, URLError, OSError))
    def fetch_fineweb_edu(self) -> List[Dict]:
        """
        Fetch educational content from FineWeb-Edu dataset.

        Returns:
            List of FineWeb-Edu paper dictionaries.
        """
        logger.info("Starting FineWeb-Edu collection...")
        start_time = time.time()
        papers = []

        try:
            ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT",
                              split="train", streaming=True)
            samples = []

            for i, sample in enumerate(ds):
                if is_shutdown:
                    break
                if i >= self.config.max_fineweb_samples:
                    break

                if not isinstance(sample, dict) or "text" not in sample:
                    logger.warning(f"Invalid sample structure at index {i}")
                    continue

                samples.append(sample)
                if (i + 1) % 10000 == 0:
                    logger.info(f"Collected {i + 1} FineWeb samples")

            logger.info(f"Processing {len(samples)} FineWeb samples")

            def is_educational_content(sample: Dict) -> bool:
                """Check if content is educational and suitable."""
                try:
                    text = sample.get("text", "")
                    if not isinstance(text, str) or len(text) < 500:
                        return False
                    return self.ranker.is_explanatory(text)
                except (AttributeError, TypeError, ValueError) as e:
                    logger.debug(f"Error evaluating educational content: {e}")
                    return False

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                filtered_results = list(tqdm(
                    executor.map(is_educational_content, samples),
                    total=len(samples),
                    desc="Filtering FineWeb content"
                ))

            for sample, is_good in zip(samples, filtered_results):
                if is_shutdown:
                    break
                if is_good:
                    try:
                        url = sample.get("url", "")
                        meta = sample.get("meta", {})
                        title = meta.get("title", "") if isinstance(meta, dict) else ""
                        title = title or url or f"Document_{len(papers)}"

                        if title not in self.seen_titles:
                            paper = {
                                "title": title,
                                "abstract": "",
                                "full_text": sample.get("text", ""),
                                "domain": "education",
                                "section": "full_text",
                                "source": "fineweb_edu",
                                "authors": [],
                                "published": None,
                                "provenance": {"url": url},
                                "categories": ["education"],
                                "text": sample.get("text", "")
                            }
                            papers.append(paper)
                            self.seen_titles.add(title)
                    except (KeyError, TypeError, AttributeError) as e:
                        logger.warning(f"Error processing FineWeb sample: {e}")
                        self.metrics["fineweb_edu"].errors += 1
                        continue

        except (ConnectionError, HTTPError, URLError, OSError) as e:
            logger.error(f"FineWeb-Edu fetch failed: {e}")
            self.metrics["fineweb_edu"].errors += 1
        except KeyboardInterrupt:
            logger.info("FineWeb-Edu collection interrupted by user")
        except ImportError as e:
            logger.error(f"Failed to import required dataset library: {e}")
            self.metrics["fineweb_edu"].errors += 1

        self._save_intermediate(papers, "fineweb_edu.jsonl")
        elapsed = time.time() - start_time
        self.metrics["fineweb_edu"].papers = len(papers)
        self.metrics["fineweb_edu"].time = elapsed
        logger.info(f"Collected {len(papers)} FineWeb-Edu papers in {elapsed:.2f}s")
        return papers

    @staticmethod
    def preprocess_sample(paper: Dict) -> List[Dict]:
        """
        Preprocess a paper sample into multiple training samples.

        Args:
            paper: Dictionary representing a paper.

        Returns:
            List of processed sample dictionaries.
        """
        try:
            title = clean_text(paper.get("title", "")) if paper.get("title") else ""
            abstract = clean_text(paper.get("abstract", "")) if paper.get("abstract") else ""
            full_text = clean_text(paper.get("full_text", "")) if paper.get("full_text") else ""

            paragraphs = segment_paragraphs(full_text) if full_text else []
            samples = []

            if title or abstract:
                sample = dict(paper)
                sample["title"] = title
                sample["abstract"] = abstract
                sample["full_text"] = ""
                sample["section"] = "abstract"
                samples.append(sample)

            for para in paragraphs:
                if para.strip():
                    sample = dict(paper)
                    sample["title"] = title
                    sample["abstract"] = ""
                    sample["full_text"] = para
                    sample["section"] = "paragraph"
                    samples.append(sample)

            return samples

        except (AttributeError, TypeError, ValueError) as e:
            logger.warning(f"Error preprocessing sample: {e}")
            return []

    def process_papers(self, papers: List[Dict], domain: str) -> List[Dict]:
        """
        Process papers with domain-specific tagging and filtering.

        Args:
            papers: List of paper dictionaries.
            domain: Domain string for tagging.

        Returns:
            List of processed and filtered sample dictionaries.
        """
        logger.info(f"Processing {len(papers)} {domain} papers...")
        processed = []
        unknown_domains = 0
        unknown_sections = 0

        def label_domain(paper):
            cats = paper.get('categories', [])
            if not cats:
                return 'unknown'
            cats_str = " ".join(cats).lower()
            if 'bio' in cats_str:
                return '[BIO]'
            if 'gen' in cats_str:
                return '[GEN]'
            if 'phys' in cats_str:
                return '[PHY]'
            if 'math' in cats_str:
                return '[MATH]'
            if 'mat' in cats_str or 'materials' in cats_str:
                return '[MAT]'
            if 'astro' in cats_str:
                return '[ASTRO]'
            if 'cs' in cats_str:
                return '[CS]'
            return 'unknown'

        def label_section(paper):
            text = paper.get('text', '') or paper.get('abstract', '') or ''
            text_lower = text.lower()
            if not text_lower:
                return 'unknown'
            if 'abstract' in text_lower:
                return '[ABSTRACT]'
            if 'introduction' in text_lower:
                return '[INTRO]'
            if 'methods' in text_lower:
                return '[METHODS]'
            if 'results' in text_lower:
                return '[RESULTS]'
            if 'discussion' in text_lower:
                return '[DISCUSSION]'
            if 'conclusion' in text_lower:
                return '[CONCLUSION]'
            return 'unknown'

        for paper in tqdm(papers, desc=f"Processing {domain} papers"):
            try:
                domain_tag = label_domain(paper)
                section_tag = label_section(paper)
                paper["domain_tag"] = domain_tag
                paper["section_tag"] = section_tag
                if domain_tag == 'unknown':
                    unknown_domains += 1
                if section_tag == 'unknown':
                    unknown_sections += 1

                task = paper.get("task", None)
                if task and task in TASK_TAGS:
                    paper["task_tag"] = TASK_TAGS[task]

                routing = paper.get("routing", "general")
                paper["routing_tag"] = ROUTING_TAGS.get(routing, ROUTING_TAGS["general"])

                samples = self.preprocess_sample(paper)

                for sample in samples:
                    try:
                        content_parts = []
                        if sample.get("title"):
                            content_parts.append(str(sample["title"]))
                        if sample.get("abstract"):
                            content_parts.append(str(sample["abstract"]))
                        if sample.get("full_text"):
                            content_parts.append(str(sample["full_text"])[:1000])
                        content = " ".join(content_parts)
                        if content.strip() and self.ranker.is_explanatory(content):
                            sample["domain_tag"] = paper["domain_tag"]
                            sample["section_tag"] = paper["section_tag"]
                            sample["routing_tag"] = paper["routing_tag"]
                            if "task_tag" in paper:
                                sample["task_tag"] = paper["task_tag"]
                            processed.append(sample)
                    except Exception as e:
                        logger.debug(f"Error evaluating sample content: {e}")
                        continue

            except Exception as e:
                logger.warning(f"Paper processing error: {e}")
                continue

        logger.info(f"Processed {len(processed)}/{len(papers)} {domain} papers")
        logger.info(f"Unknown domains: {unknown_domains}, Unknown sections: {unknown_sections}")
        return processed

    def _save_intermediate(self, papers: List[Dict], filename: str) -> None:
        """
        Save intermediate results to disk as JSONL.

        Args:
            papers: List of paper/sample dictionaries.
            filename: Output filename.
        """
        path = self.data_dir / filename
        try:
            with open(path, "w", encoding="utf-8") as f:
                for paper in papers:
                    f.write(json.dumps(paper, ensure_ascii=False) + "\n")
            logger.info(f"Saved checkpoint to {path}")
        except (OSError, IOError, PermissionError) as e:
            logger.error(f"Failed to save intermediate file {filename}: {e}")
        except (TypeError, ValueError) as e:
            logger.error(f"JSON serialization error for {filename}: {e}")

    def build_corpus(self, output_path: str, verify_only: bool = False) -> None:
        """
        Build the complete scientific corpus with checkpoint verification.

        Args:
            output_path: Path to save the final corpus.
            verify_only: If True, only verify checkpoints and skip merging.
        """
        logger.info("Starting scientific corpus build...")
        total_start = time.time()
        all_papers = []

        sources = [
            ("arXiv", self.fetch_arxiv_papers, None),
            ("PubMed", self.fetch_pubmed_papers, "biology"),
            ("FineWeb-Edu", self.fetch_fineweb_edu, "education")
        ]
        for source_name, fetch_func, domain in sources:
            if is_shutdown:
                break
            logger.info(f"Fetching {source_name} papers...")
            try:
                papers = fetch_func()
                if domain:
                    processed = []
                    for i in range(0, len(papers), self.config.chunk_size):
                        chunk = papers[i:i + self.config.chunk_size]
                        processed.extend(self.process_papers(chunk, domain))
                    papers = processed
                chkpt_filename = f"{source_name.lower()}_papers.jsonl"
                self._save_intermediate(papers, chkpt_filename)
                if not papers:
                    logger.error(f"{source_name} checkpoint {chkpt_filename} is empty!")
                all_papers.extend(papers)
                logger.info(f"Added {len(papers)} papers from {source_name}")
            except Exception as e:
                logger.error(f"Critical error fetching from {source_name}: {e}")
                continue

        logger.info(f"Total papers collected: {len(all_papers)}")
        if verify_only:
            logger.info("Verification flag enabled; skipping merge and build.")
            self.print_report({})
            return

        if not all_papers:
            logger.error("No papers collected. Cannot build corpus.")
            self.print_report({})
            return

        logger.info("Ranking and deduplicating papers...")
        try:
            ranked_papers = self.ranker.rank_samples(all_papers)
            if not ranked_papers:
                logger.error("Final corpus is empty after ranking. Using unranked papers as fallback.")
                ranked_papers = all_papers
            logger.info(f"Final corpus size: {len(ranked_papers)} papers")
        except Exception as e:
            logger.error(f"Error ranking papers: {e}")
            ranked_papers = all_papers

        if not ranked_papers:
            logger.error("Final corpus is empty. No data to process or save.")
            self.print_report({})
            return

        self._save_intermediate(ranked_papers, "ranked_papers.jsonl")
        try:
            stats = self.analyzer.get_dataset_stats(ranked_papers)
            self.metrics["total_tokens"] = int(stats.get("avg_tokens", 0) * stats.get("total_samples", 0))
        except Exception as e:
            logger.error(f"Error generating dataset statistics: {e}")
            stats = {}

        self.metrics["total_time"] = time.time() - total_start
        logger.info("Processing final dataset in batches...")
        try:
            with open(output_path, "w", encoding="utf-8") as out_f:
                for i in range(0, len(ranked_papers), self.config.chunk_size):
                    chunk = ranked_papers[i:i + self.config.chunk_size]
                    for paper in chunk:
                        out_f.write(json.dumps(paper, ensure_ascii=False) + "\n")
        except Exception as e:
            logger.error(f"Error processing final dataset: {e}")

        # HuggingFace upload: warn if a file is too large
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10 * 1024 * 1024:
            logger.warning(
                f"{output_path} is larger than 10 MiB. HuggingFace will reject files >10 MiB unless you use Git LFS. "
                "See https://hf.co/docs/hub/repositories-getting-started#terminal"
            )
            logger.warning(
                "To fix: install git-lfs and run 'git lfs track \"*.jsonl\"' before pushing, or split your file."
            )

        self.print_report(stats)
        logger.info(f"Scientific corpus successfully built: {output_path}")

    def build_corpus_scoped(self, plan: str, token_budget: int) -> (list, dict):
        """
        Build a scientific corpus, limiting the total number of tokens to the plan's budget.
        Returns the corpus and stats.
        """
        logger.info(f"Building corpus for plan '{plan}' with token budget {token_budget}")
        all_papers = []
        all_papers.extend(self.process_papers(self.fetch_arxiv_papers(), "arxiv"))
        all_papers.extend(self.process_papers(self.fetch_pubmed_papers(), "biology"))
        all_papers.extend(self.process_papers(self.fetch_fineweb_edu(), "education"))

        # Rank and deduplicate
        ranked_papers = self.ranker.rank_samples(all_papers)
        corpus = []
        total_tokens = 0
        for paper in ranked_papers:
            tokens = paper.get("text", "").split()
            if total_tokens + len(tokens) > token_budget:
                break
            corpus.append(paper)
            total_tokens += len(tokens)
        stats = self.analyzer.get_dataset_stats(corpus)
        stats["total_tokens"] = total_tokens
        logger.info(f"Corpus built: {len(corpus)} samples, {total_tokens} tokens")
        return corpus, stats

    def print_report(self, stats: Dict) -> None:
        """
        Print a comprehensive build report.

        Args:
            stats: Dictionary of dataset statistics.
        """
        print("\n" + "=" * 67)
        print(" SCIENTIFIC CORPUS BUILD REPORT")
        print("=" * 67)
        print("\nSOURCE METRICS:")
        print("-" * 40)
        for source_name, label in zip(["arxiv", "pubmed", "fineweb_edu"],
                                      ["ARXIV", "PUBMED", "FINEWEB_EDU"]):
            metrics = self.metrics[source_name]
            print(f"{label:15}: {metrics.papers:6d} papers | {metrics.errors:3d} errors | {metrics.time:9.2f}s")
        print("\nOVERALL METRICS:")
        print("-" * 40)
        total_papers = sum(self.metrics[src].papers for src in ["arxiv", "pubmed", "fineweb_edu"])
        total_errors = sum(self.metrics[src].errors for src in ["arxiv", "pubmed", "fineweb_edu"])
        print(f"Total Papers: {total_papers:,}")
        print(f"Total Tokens: {self.metrics['total_tokens']:,}")
        print(f"Total Time: {self.metrics['total_time']:.2f}s")
        print(f"Total Errors: {total_errors}")
        success_rate = (1 - total_errors / max(total_papers + total_errors, 1)) * 100
        print(f"Success Rate: {success_rate:.2f}%")
        if stats:
            print("\nDATASET STATISTICS:")
            print("-" * 40)
            for key, value in stats.items():
                print(f"{key:20}: {value}")
        print("=" * 67)
        print()


def main() -> None:
    """
    Main entry point for the corpus builder.
    """
    try:
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        output_path = "scientific_corpus_325M.jsonl"
        builder.build_corpus(output_path)

        # --- Hugging Face upload with improved error handling ---
        try:
            # Split large files if needed
            file_size = os.path.getsize(output_path)
            if file_size > 10 * 1024 * 1024:  # 10 MB
                logger.info("Large file detected, splitting into chunks...")
                chunk_size = 10 * 1024 * 1024  # 10 MB chunks
                base_path = os.path.splitext(output_path)[0]

                with open(output_path, 'r', encoding='utf-8') as f:
                    chunk_num = 0
                    chunk = []
                    current_size = 0

                    for line in f:
                        line_size = len(line.encode('utf-8'))
                        if current_size + line_size > chunk_size and chunk:
                            chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                            with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                                chunk_file.writelines(chunk)
                            logger.info(f"Created chunk {chunk_num}: {chunk_path}")
                            chunk = []
                            current_size = 0
                            chunk_num += 1

                        chunk.append(line)
                        current_size += line_size

                    # Write final chunk
                    if chunk:
                        chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                        with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                            chunk_file.writelines(chunk)
                        logger.info(f"Created final chunk {chunk_num}: {chunk_path}")

                # Upload each chunk
                for i in range(chunk_num + 1):
                    chunk_path = f"{base_path}_part{i}.jsonl"
                    logger.info(f"Uploading chunk {i}...")
                    upload_to_huggingface(
                        dataset_path=chunk_path,
                        repo_id="Allanatrix/Scientific_Research_Tokenized",
                        auto_generate_readme=(i == 0),  # Only generate README for first chunk
                        compress=True,
                        keep_local=True  # Keep files until all uploads complete
                    )
            else:
                # Upload single file
                upload_to_huggingface(
                    dataset_path=output_path,
                    repo_id="Allanatrix/Scientific_Research_Tokenized",
                    auto_generate_readme=True,
                    compress=True
                )

        except ImportError:
            logger.error("Hugging Face upload module not found. Please ensure hf_upload.py exists.")
        except Exception as e:
            logger.error(f"Error during Hugging Face upload: {e}")
+
if "EOF" in str(e) or "timeout" in str(e):
|
| 892 |
+
logger.warning("Upload interrupted. Try using smaller chunks or increasing timeout.")
|
| 893 |
+
finally:
|
| 894 |
+
# Cleanup temporary files
|
| 895 |
+
if 'chunk_num' in locals():
|
| 896 |
+
for i in range(chunk_num + 1):
|
| 897 |
+
try:
|
| 898 |
+
os.remove(f"{base_path}_part{i}.jsonl")
|
| 899 |
+
except OSError:
|
| 900 |
+
pass
|
| 901 |
+
|
| 902 |
+
except KeyboardInterrupt:
|
| 903 |
+
logger.info("Build process interrupted by user")
|
| 904 |
+
except Exception as e:
|
| 905 |
+
logger.error(f"Unexpected error in main: {e}")
|
| 906 |
+
raise
|
| 907 |
+
|
| 908 |
+
# Optionally, you can add a CLI entry point for testing:
|
| 909 |
+
def main_scoped(plan: str = "free"):
|
| 910 |
+
config = CorpusConfig()
|
| 911 |
+
builder = ScientificCorpusBuilder(config)
|
| 912 |
+
token_budget = PLAN_LIMITS.get(plan, 1000)
|
| 913 |
+
corpus, stats = builder.build_corpus_scoped(plan, token_budget)
|
| 914 |
+
output_path = f"scientific_corpus_{plan}_{token_budget}.jsonl"
|
| 915 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 916 |
+
for paper in corpus:
|
| 917 |
+
f.write(json.dumps(paper, ensure_ascii=False) + "\n")
|
| 918 |
+
print(f"Saved {len(corpus)} samples ({stats['total_tokens']} tokens) to {output_path}")
|
| 919 |
+
|
| 920 |
+
if __name__ == "__main__":
|
| 921 |
+
# main() # old entry point
|
| 922 |
+
main_scoped("free") # new entry point for plan-scoped corpus
|
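The 10 MB splitting logic inside main() can be lifted into a standalone helper for testing. The sketch below mirrors the same byte-budget loop; it is an illustration only, and the function name split_jsonl is hypothetical rather than part of the repository.

import os

def split_jsonl(path: str, max_bytes: int = 10 * 1024 * 1024) -> list:
    """Split a JSONL file into parts no larger than max_bytes (sketch of the loop used in main())."""
    base, _ = os.path.splitext(path)
    parts, chunk, size, part_num = [], [], 0, 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line_size = len(line.encode("utf-8"))
            if size + line_size > max_bytes and chunk:
                part_path = f"{base}_part{part_num}.jsonl"
                with open(part_path, "w", encoding="utf-8") as out:
                    out.writelines(chunk)
                parts.append(part_path)
                chunk, size, part_num = [], 0, part_num + 1
            chunk.append(line)
            size += line_size
    if chunk:  # flush the final, possibly smaller, part
        part_path = f"{base}_part{part_num}.jsonl"
        with open(part_path, "w", encoding="utf-8") as out:
            out.writelines(chunk)
        parts.append(part_path)
    return parts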
Tokenization/__init__.py
ADDED
@@ -0,0 +1,21 @@
# Tokenization/__init__.py

from .Entropy_ranker import EntropyRanker
from .Label_tokens import DOMAIN_TAGS, TASK_TAGS, SECTION_TAGS, ROUTING_TAGS, build_tag_string
from .preprocessing import clean_text, segment_paragraphs, preprocess_sample

# Expose the main dataset generation pipeline for external use
from .generate_dataset import generate_dataset

__all__ = [
    "EntropyRanker",
    "DOMAIN_TAGS",
    "TASK_TAGS",
    "SECTION_TAGS",
    "ROUTING_TAGS",
    "build_tag_string",
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
    "generate_dataset",
]
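A minimal usage sketch of the exports above, assuming rank_samples accepts a list of dicts with a "text" field, as it is called in Main_2.py. The sample records and the no-argument EntropyRanker construction are assumptions for illustration.

from Tokenization import EntropyRanker

# Hypothetical in-memory samples; in the pipeline these come from the arXiv/PubMed/FineWeb fetchers.
papers = [
    {"text": "Quantum error correction protects logical qubits from decoherence."},
    {"text": "CRISPR-Cas9 enables targeted genome editing in mammalian cells."},
]

ranker = EntropyRanker()           # assumes a no-argument constructor
ranked = ranker.rank_samples(papers)  # same call the corpus builder makes before truncating to a token budget
print(len(ranked))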
Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc
ADDED
Binary file (3.54 kB).
Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc
ADDED
Binary file (3.39 kB).
Tokenization/__pycache__/Label_tokens.cpython-310.pyc
ADDED
Binary file (1.35 kB).
Tokenization/__pycache__/Main_2.cpython-310.pyc
ADDED
Binary file (26.8 kB).
Tokenization/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (565 Bytes).
Tokenization/__pycache__/generate_dataset.cpython-310.pyc
ADDED
Binary file (3.14 kB).
Tokenization/__pycache__/hf_upload.cpython-310.pyc
ADDED
Binary file (5.56 kB).
Tokenization/app.py
ADDED
@@ -0,0 +1,147 @@
import gradio as gr
import time

def calculate_price(payment_mode, tokens, plan, custom_price, file):
    if payment_mode == "Pay as you go":
        price = round(tokens * 0.01, 2)  # Example: $0.01 per token
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    elif payment_mode == "Plan":
        if plan == "Free":
            return "0 tokens\nPrice: $0", 0
        elif plan == "Starter":
            return "100,000 tokens\nPrice: $15", 15
        elif plan == "Pro":
            return "500,000 tokens\nPrice: $30", 30
        elif plan == "Custom":
            return f"Custom plan\nPrice: ${custom_price}", float(custom_price or 0)
    elif file is not None:
        # Simulate a token count from the uploaded file; replace with a real calculation
        tokens = 1000
        price = round(tokens * 0.01, 2)
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    return "", 0

def generate_dataset(*args, **kwargs):
    for i in range(5):
        yield f"Generating... ({(i+1)*20}%)", None, (i+1)/5
        time.sleep(0.3)
    yield "Ready! Please pay to download.", "dataset.jsonl", 1.0

with gr.Blocks(
    title="Nexa Data Studio",
    css="""
    body, .gradio-container {
        min-height: 100vh;
        background: #111 !important;
        color: #fff !important;
    }
    .gradio-container {
        max-width: 900px !important;
        margin: 40px auto !important;
        box-shadow: 0 2px 16px #0008;
        border-radius: 16px;
        padding: 32px 32px 24px 32px !important;
        background: #111 !important;
        color: #fff !important;
        display: flex;
        flex-direction: column;
        align-items: center;
    }
    .footer {margin-top: 2em; color: #bbb; font-size: 0.9em; text-align: center;}
    #header {text-align: center;}
    """
) as demo:
    gr.Markdown(
        """
        <div style="display:flex;align-items:center;gap:16px;justify-content:center;">
            <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" height="40"/>
            <h1 style="margin-bottom:0;">Nexa Data Studio</h1>
        </div>
        <p style="text-align:center;">
            <b>Generate or label scientific datasets for ML research.</b>
        </p>
        """,
        elem_id="header"
    )

    payment_mode = gr.Radio(
        ["Pay as you go", "Plan"],
        label="Payment Mode",
        value="Pay as you go"
    )

    with gr.Row() as payg_row:
        tokens = gr.Slider(100, 100000, value=1000, step=100, label="Tokens Requested")
    with gr.Row(visible=False) as plan_row:
        plan = gr.Dropdown(
            ["Free", "Starter", "Pro", "Custom"],
            label="Plan",
            value="Free"
        )
        custom_price = gr.Number(label="Custom Price ($)", visible=False)

    job_type = gr.Radio(
        ["Generate Dataset", "Label Uploaded Data"],
        label="Job Type",
        value="Generate Dataset"
    )

    with gr.Column(visible=False) as label_col:
        file = gr.File(label="Upload Dataset (.txt or .jsonl)")

    price_info = gr.Textbox(label="Summary", interactive=False)
    download = gr.File(label="Download")
    progress = gr.Slider(0, 1, value=0, step=0.01, label="Progress", interactive=False)
    status = gr.Text(label="Status", interactive=False)

    def update_payment_ui(payment_mode_val, plan_val):
        return (
            gr.update(visible=payment_mode_val == "Pay as you go"),
            gr.update(visible=payment_mode_val == "Plan"),
            gr.update(visible=payment_mode_val == "Plan" and plan_val == "Custom")
        )

    payment_mode.change(
        update_payment_ui,
        inputs=[payment_mode, plan],
        outputs=[payg_row, plan_row, custom_price]
    )
    plan.change(
        lambda p: gr.update(visible=p == "Custom"),
        inputs=plan,
        outputs=custom_price
    )

    def update_label_ui(job_type_val):
        return gr.update(visible=job_type_val == "Label Uploaded Data")
    job_type.change(update_label_ui, inputs=job_type, outputs=label_col)

    def update_summary(payment_mode, tokens, plan, custom_price, file, job_type):
        if job_type == "Label Uploaded Data" and file is not None:
            return calculate_price("Label", tokens, plan, custom_price, file)[0]
        return calculate_price(payment_mode, tokens, plan, custom_price, file)[0]

    inputs = [payment_mode, tokens, plan, custom_price, file, job_type]
    gr.Button("Generate", elem_id="generate-btn", variant="primary").click(
        generate_dataset,
        inputs=inputs,
        outputs=[status, download, progress]
    )
    gr.Button("Update Summary").click(
        update_summary,
        inputs=inputs,
        outputs=price_info
    )

    gr.Markdown(
        f"""
        <div class="footer">
            © {time.strftime("%Y")} Nexa Data Studio — Powered by Hugging Face Spaces<br>
            For support, contact <a href="mailto:[email protected]">[email protected]</a>
        </div>
        """
    )

if __name__ == "__main__":
    print("Nexa Data Studio is running at http://localhost:7860")
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
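For reference, the pay-as-you-go summary above is a linear rule of $0.01 per token rounded to cents. A small standalone sketch of the same arithmetic (payg_price is a hypothetical helper name, not part of the repository):

PRICE_PER_TOKEN = 0.01  # same example rate used in calculate_price above

def payg_price(tokens: int) -> float:
    """Pay-as-you-go price in dollars, rounded to cents."""
    return round(tokens * PRICE_PER_TOKEN, 2)

assert payg_price(1000) == 10.0     # the slider default of 1,000 tokens costs $10
assert payg_price(12345) == 123.45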
Tokenization/app/Api.py
ADDED
@@ -0,0 +1,75 @@
"""
Api.py: FastAPI endpoints for dataset generation, progress polling, and download.
"""
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager
import io

app = FastAPI()

@app.post("/generate-dataset")
async def generate_dataset(request: Request):
    user_input = await request.json()
    job_id, error = job_manager.start_job(user_input)
    if error:
        return JSONResponse({"error": error}, status_code=400)
    return {"job_id": job_id}

@app.get("/progress/{job_id}")
def get_progress(job_id: str):
    progress = progress_tracker.get(job_id)
    if not progress:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return progress

@app.get("/download/{job_id}")
def download(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    # Payment check
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    # In production, use FileResponse to serve the file
    return {
        "download_url": job["result_path"],
        "stats": job.get("stats", {})
    }

@app.get("/download-corpus/{job_id}")
def download_corpus(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    if job.get("job_type") != "corpus":
        return JSONResponse({"error": "Not a corpus job"}, status_code=400)
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    jsonl_lines = job.get("jsonl_lines", [])
    stats = job.get("stats", {})
    # Stream the JSONL as a file
    file_like = io.StringIO("\n".join(jsonl_lines))
    headers = {
        "Content-Disposition": f"attachment; filename=scientific_corpus_{job_id}.jsonl"
    }
    return StreamingResponse(file_like, media_type="application/jsonl", headers=headers)

@app.get("/job-stats/{job_id}")
def job_stats(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return {"stats": job.get("stats", {})}

@app.get("/price/{plan}")
def get_price(plan: str):
    price = payment_manager.get_price(plan)
    return {"plan": plan, "price": price}
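A client-side sketch of how these endpoints are meant to be chained: submit a job, poll its progress, then download the corpus. Only the endpoint paths and payload fields come from Api.py; the base URL, port, sleep interval, and use of the requests library are assumptions for illustration (and the 402 payment branch is not handled here).

import time
import requests

BASE = "http://localhost:8000"  # assumed local server address

resp = requests.post(f"{BASE}/generate-dataset",
                     json={"plan": "free", "token_budget": 500, "job_type": "corpus"})
job_id = resp.json()["job_id"]

while True:
    progress = requests.get(f"{BASE}/progress/{job_id}").json()
    print(progress.get("message"))
    if progress.get("status") == "complete":
        break
    time.sleep(2)

corpus = requests.get(f"{BASE}/download-corpus/{job_id}")
with open("corpus.jsonl", "wb") as out:
    out.write(corpus.content)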
Tokenization/app/Config.py
ADDED
@@ -0,0 +1,25 @@
"""
Config.py: Configuration for plan limits, pricing, and app constants.
"""

# Plan limits (tokens per plan)
PLAN_LIMITS = {
    "free": 1000,
    "starter": 5000,
    "pro": 10000,
    "enterprise": 100000,
}

# Pricing per plan (USD)
PLAN_PRICING = {
    "free": 0,
    "starter": 15,
    "pro": 30,
    "enterprise": "custom",
}

# Other app-wide constants
tmp_dir = "./tmp_datasets"

# Stripe keys, etc. (to be set via environment variables in production)
STRIPE_API_KEY = None
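As an illustration only, these constants imply an effective per-token price for the numerically priced plans ("enterprise" is priced as "custom" and is skipped); a short sketch:

from Tokenization.app.Config import PLAN_LIMITS, PLAN_PRICING

for plan, limit in PLAN_LIMITS.items():
    price = PLAN_PRICING.get(plan)
    if isinstance(price, (int, float)) and limit:
        # e.g. "starter: 5,000 tokens for $15 ($0.0030/token)"
        print(f"{plan}: {limit:,} tokens for ${price} (${price / limit:.4f}/token)")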
Tokenization/app/Core.py
ADDED
@@ -0,0 +1,155 @@
"""
Core.py: Orchestrates dataset generation jobs, plan enforcement, and background processing.
"""
import threading
import uuid
import os
import json
from .Config import PLAN_LIMITS, tmp_dir
from .Progress import progress_tracker
from .Payment import payment_manager

# Import your tokenizer module here (example)
from Tokenization.generate_dataset import generate_dataset
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
from Tokenization.Build_tokenizer import QLoRAPreprocessor
import nltk

class JobManager:
    def __init__(self):
        self.jobs = {}
        self.lock = threading.Lock()

    def start_job(self, user_input):
        plan = user_input.get("plan")
        token_budget = user_input.get("token_budget")
        job_type = user_input.get("job_type", "tokenize")  # "tokenize", "corpus", or "label"
        # For label jobs, token_budget is determined after upload
        if job_type != "label" and not payment_manager.check_plan_limit(plan, token_budget):
            return None, "Plan limit exceeded"
        job_id = str(uuid.uuid4())
        with self.lock:
            self.jobs[job_id] = {
                "status": "pending",
                "plan": plan,
                "token_budget": token_budget,
                "job_type": job_type,
                "user_input": user_input
            }
        if job_type == "corpus":
            thread = threading.Thread(target=self._run_corpus_pipeline, args=(job_id,))
        elif job_type == "label":
            thread = threading.Thread(target=self._run_label_pipeline, args=(job_id,))
        else:
            thread = threading.Thread(target=self._run_job, args=(job_id, user_input))
        thread.start()
        return job_id, None

    def _run_job(self, job_id, user_input):
        try:
            progress_tracker.start_job(job_id, total_steps=6)
            # Step 1: Data retrieval
            progress_tracker.update(job_id, 1, "Retrieving data from sources...")
            domain = user_input.get("domain")
            token_budget = user_input.get("token_budget")
            plan = user_input.get("plan")
            custom_seed = user_input.get("custom_seed", None)
            # Step 2: Preprocessing
            progress_tracker.update(job_id, 2, "Preprocessing and cleaning data...")
            # Step 3: Tokenization & Labeling
            progress_tracker.update(job_id, 3, "Tokenizing and labeling samples...")
            # Step 4: Validation & Stats
            progress_tracker.update(job_id, 4, "Validating and computing statistics...")
            # Step 5: Formatting output
            progress_tracker.update(job_id, 5, "Formatting dataset as JSONL...")
            # Call tokenizer pipeline (implement in tokenization/tokenizer.py)
            result = generate_dataset(
                domain=domain,
                token_budget=token_budget,
                plan=plan,
                custom_seed=custom_seed,
                progress_callback=lambda step, msg: progress_tracker.update(job_id, step, msg)
            )
            # Step 6: Save output
            os.makedirs(tmp_dir, exist_ok=True)
            output_path = os.path.join(tmp_dir, f"{domain}_{token_budget}_tokens_{job_id}.jsonl")
            with open(output_path, "w", encoding="utf-8") as f:
                for line in result["jsonl_lines"]:
                    f.write(line + "\n")
            progress_tracker.update(job_id, 6, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["result_path"] = output_path
                self.jobs[job_id]["stats"] = result.get("stats", {})
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_corpus_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = user_input.get("plan")
                token_budget = user_input.get("token_budget")
            progress_tracker.start_job(job_id, total_steps=5)
            progress_tracker.update(job_id, 1, "Building scientific corpus...")
            config = CorpusConfig()
            builder = ScientificCorpusBuilder(config)
            corpus, stats = builder.build_corpus_scoped(plan, token_budget)
            progress_tracker.update(job_id, 2, "Formatting dataset as JSONL...")
            jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in corpus]
            progress_tracker.update(job_id, 3, "Finalizing output...")
            progress_tracker.update(job_id, 4, "Corpus ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
                self.jobs[job_id]["actual_tokens"] = stats.get("total_tokens", 0)
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_label_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = self.jobs[job_id]["plan"]
            progress_tracker.start_job(job_id, total_steps=4)
            progress_tracker.update(job_id, 1, "Loading and preprocessing dataset...")
            dataset_text = user_input.get("dataset_text", "")
            if not dataset_text:
                raise ValueError("No dataset text provided.")
            tokens = nltk.word_tokenize(dataset_text)
            num_tokens = len(tokens)
            with self.lock:
                self.jobs[job_id]["actual_tokens"] = num_tokens
            if not payment_manager.check_plan_limit(plan, num_tokens):
                raise ValueError("Plan limit exceeded.")
            progress_tracker.update(job_id, 2, "Tokenizing and labeling dataset...")
            preprocessor = QLoRAPreprocessor()
            labeled_data = preprocessor.preprocess_function(dataset_text)
            jsonl_lines = [json.dumps({"text": item}, ensure_ascii=False) for item in labeled_data]
            stats = {"token_count": num_tokens, "sample_count": len(labeled_data)}
            progress_tracker.update(job_id, 3, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def get_job_status(self, job_id):
        with self.lock:
            return self.jobs.get(job_id, None)

job_manager = JobManager()
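An in-process sketch of driving JobManager directly, without the HTTP layer. The payload fields mirror what Api.py forwards to start_job; the polling loop and sleep interval are assumptions for illustration.

import time
from Tokenization.app.Core import job_manager
from Tokenization.app.Progress import progress_tracker

job_id, error = job_manager.start_job({"plan": "free", "token_budget": 500, "job_type": "corpus"})
if error:
    raise SystemExit(error)

while True:
    state = progress_tracker.get(job_id)
    if state:
        print(f"[{state['current']}/{state['total']}] {state['message']}")
        if state["status"] == "complete":
            break
    time.sleep(2)

job = job_manager.get_job_status(job_id)
print(job["stats"])  # e.g. includes "total_tokens" for corpus jobs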
Tokenization/app/Payment.py
ADDED
@@ -0,0 +1,27 @@
"""
Payment.py: Plan enforcement and payment logic (Stripe stub).
"""
import os
from .Config import PLAN_LIMITS, PLAN_PRICING

class PaymentManager:
    def __init__(self):
        self.stripe_api_key = os.getenv("STRIPE_API_KEY")

    def check_plan_limit(self, plan, requested_tokens):
        limit = PLAN_LIMITS.get(plan, 0)
        return requested_tokens <= limit

    def get_price(self, plan):
        return PLAN_PRICING.get(plan, 0)

    def requires_payment(self, plan, requested_tokens):
        if plan == "free":
            return requested_tokens > PLAN_LIMITS["free"]
        return plan not in PLAN_LIMITS

    def create_checkout_session(self, plan, job_id):
        # Stub: Integrate with Stripe API in production
        return f"https://checkout.stripe.com/pay/{plan}/{job_id}"

payment_manager = PaymentManager()
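Under the rule above, only over-budget free requests and unrecognised plan names are gated; a quick sketch of the resulting behaviour:

from Tokenization.app.Payment import payment_manager

assert payment_manager.requires_payment("free", 500) is False    # within the free 1,000-token limit
assert payment_manager.requires_payment("free", 5000) is True    # over the free limit
assert payment_manager.requires_payment("pro", 5000) is False    # known paid plan, no extra gate here
assert payment_manager.requires_payment("unknown", 10) is True   # unrecognised plan names are treated as payable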
Tokenization/app/Progress.py
ADDED
@@ -0,0 +1,37 @@
"""
Progress.py: Thread-safe progress tracking for dataset generation jobs.
"""
import threading

class ProgressTracker:
    def __init__(self):
        self._progress = {}
        self._lock = threading.Lock()

    def start_job(self, job_id, total_steps):
        with self._lock:
            self._progress[job_id] = {
                "current": 0,
                "total": total_steps,
                "status": "started",
                "message": "Job started"
            }

    def update(self, job_id, current, message=None):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["current"] = current
                if message:
                    self._progress[job_id]["message"] = message  # No emoji, just message

    def complete(self, job_id):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["status"] = "complete"
                self._progress[job_id]["message"] = "Job complete"

    def get(self, job_id):
        with self._lock:
            return self._progress.get(job_id, None)

progress_tracker = ProgressTracker()
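A minimal standalone exercise of the tracker (the job id and step count here are arbitrary):

from Tokenization.app.Progress import progress_tracker

progress_tracker.start_job("demo-job", total_steps=3)
progress_tracker.update("demo-job", 1, "Fetching data...")
progress_tracker.update("demo-job", 2, "Tokenizing...")
progress_tracker.complete("demo-job")

print(progress_tracker.get("demo-job"))
# {'current': 2, 'total': 3, 'status': 'complete', 'message': 'Job complete'}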
Tokenization/app/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""
app/__init__.py: Exposes main backend components for reuse.
"""

from .Api import app as fastapi_app
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager

__all__ = [
    "fastapi_app",
    "job_manager",
    "progress_tracker",
    "payment_manager",
]
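A sketch of serving the exported FastAPI app locally. The use of uvicorn, the host, and the port are assumptions for illustration; the repository also ships a Tokenization/run_backend.py that presumably plays this role.

import uvicorn
from Tokenization.app import fastapi_app

if __name__ == "__main__":
    # Serve the API locally; host and port here are illustrative defaults.
    uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)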
Tokenization/app/__pycache__/Api.cpython-310.pyc
ADDED
Binary file (2.81 kB).
Tokenization/app/__pycache__/Config.cpython-310.pyc
ADDED
Binary file (444 Bytes).
Tokenization/app/__pycache__/Core.cpython-310.pyc
ADDED
Binary file (4.86 kB).
Tokenization/app/__pycache__/Payment.cpython-310.pyc
ADDED
Binary file (1.45 kB).
Tokenization/app/__pycache__/Progress.cpython-310.pyc
ADDED
Binary file (1.66 kB).
Tokenization/app/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (460 Bytes).
Tokenization/combined_scientific_papers.json
ADDED
The diff for this file is too large to render. See raw diff.
Tokenization/combined_scientific_papers.jsonl
ADDED
The diff for this file is too large to render. See raw diff.
Tokenization/corpus_builder.log
ADDED
File without changes
Tokenization/debug_upload.log
ADDED
@@ -0,0 +1,198 @@
2025-06-12 18:18:01,037 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
|
| 2 |
+
2025-06-12 18:18:01,037 - INFO - Starting arXiv paper collection...
|
| 3 |
+
2025-06-12 18:18:01,038 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 4 |
+
2025-06-12 18:18:03,165 - INFO - Got first page: 100 of 1236760 total results
|
| 5 |
+
2025-06-12 18:18:03,172 - INFO - Sleeping: 2.828948 seconds
|
| 6 |
+
2025-06-12 18:18:06,004 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 7 |
+
2025-06-12 18:18:06,953 - INFO - Sleeping: 2.866122 seconds
|
| 8 |
+
2025-06-12 18:18:09,824 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 9 |
+
2025-06-12 18:18:11,783 - INFO - Sleeping: 2.823819 seconds
|
| 10 |
+
2025-06-12 18:18:14,608 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 11 |
+
2025-06-12 18:18:16,436 - INFO - Sleeping: 2.857095 seconds
|
| 12 |
+
2025-06-12 18:18:19,301 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 13 |
+
2025-06-12 18:18:22,022 - INFO - Sleeping: 2.790207 seconds
|
| 14 |
+
2025-06-12 18:18:24,820 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 15 |
+
2025-06-12 18:18:25,173 - INFO - Sleeping: 2.998001 seconds
|
| 16 |
+
2025-06-12 18:18:28,181 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 17 |
+
2025-06-12 18:18:28,988 - INFO - Sleeping: 2.999010 seconds
|
| 18 |
+
2025-06-12 18:18:32,000 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 19 |
+
2025-06-12 18:18:32,507 - INFO - Sleeping: 2.998957 seconds
|
| 20 |
+
2025-06-12 18:18:35,519 - INFO - Requesting page (first: False, try: 3): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 21 |
+
2025-06-12 18:18:36,061 - WARNING - Empty page returned for query 'cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph': Page of results was unexpectedly empty (https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100)
|
| 22 |
+
2025-06-12 18:18:36,065 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 23 |
+
2025-06-12 18:18:36,888 - INFO - Got first page: 100 of 50293 total results
|
| 24 |
+
2025-06-12 18:18:36,896 - INFO - Sleeping: 2.871087 seconds
|
| 25 |
+
2025-06-12 18:18:39,783 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 26 |
+
2025-06-12 18:18:40,466 - INFO - Sleeping: 2.870444 seconds
|
| 27 |
+
2025-06-12 18:18:43,339 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 28 |
+
2025-06-12 18:18:44,012 - INFO - Sleeping: 2.874603 seconds
|
| 29 |
+
2025-06-12 18:18:46,893 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 30 |
+
2025-06-12 18:18:47,688 - INFO - Sleeping: 2.858048 seconds
|
| 31 |
+
2025-06-12 18:18:50,552 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 32 |
+
2025-06-12 18:18:51,370 - INFO - Sleeping: 2.870823 seconds
|
| 33 |
+
2025-06-12 18:18:54,246 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 34 |
+
2025-06-12 18:18:54,960 - INFO - Sleeping: 2.886596 seconds
|
| 35 |
+
2025-06-12 18:18:57,856 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
| 36 |
+
2025-06-12 18:18:58,568 - INFO - Sleeping: 2.886486 seconds
|
| 37 |
+
2025-06-12 18:19:01,466 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
| 38 |
+
2025-06-12 18:19:02,219 - INFO - Sleeping: 2.867826 seconds
|
| 39 |
+
2025-06-12 18:19:05,103 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
| 40 |
+
2025-06-12 18:19:06,346 - INFO - Sleeping: 2.766637 seconds
|
| 41 |
+
2025-06-12 18:19:09,120 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
| 42 |
+
2025-06-12 18:19:10,043 - INFO - Sleeping: 2.877552 seconds
|
| 43 |
+
2025-06-12 18:19:12,929 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
| 44 |
+
2025-06-12 18:19:13,641 - INFO - Sleeping: 2.873434 seconds
|
| 45 |
+
2025-06-12 18:19:16,525 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 46 |
+
2025-06-12 18:19:17,281 - INFO - Sleeping: 2.871482 seconds
|
| 47 |
+
2025-06-12 18:19:20,161 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
| 48 |
+
2025-06-12 18:19:20,990 - INFO - Sleeping: 2.872492 seconds
|
| 49 |
+
2025-06-12 18:19:23,876 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 50 |
+
2025-06-12 18:19:24,633 - INFO - Sleeping: 2.873157 seconds
|
| 51 |
+
2025-06-12 18:19:27,510 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
| 52 |
+
2025-06-12 18:19:28,249 - INFO - Sleeping: 2.872219 seconds
|
| 53 |
+
2025-06-12 18:19:31,132 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
| 54 |
+
2025-06-12 18:19:31,787 - INFO - Sleeping: 2.871294 seconds
|
| 55 |
+
2025-06-12 18:19:34,660 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
|
| 56 |
+
2025-06-12 18:19:35,423 - INFO - Sleeping: 2.864608 seconds
|
| 57 |
+
2025-06-12 18:19:38,291 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 58 |
+
2025-06-12 18:19:38,496 - INFO - Sleeping: 2.998046 seconds
|
| 59 |
+
2025-06-12 18:19:41,498 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 60 |
+
2025-06-12 18:19:41,682 - INFO - Sleeping: 2.998049 seconds
|
| 61 |
+
2025-06-12 18:19:44,693 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 62 |
+
2025-06-12 18:19:45,568 - INFO - Sleeping: 2.874692 seconds
|
| 63 |
+
2025-06-12 18:19:48,448 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
| 64 |
+
2025-06-12 18:19:48,654 - INFO - Sleeping: 2.998000 seconds
|
| 65 |
+
2025-06-12 18:19:51,668 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
| 66 |
+
2025-06-12 18:19:52,436 - INFO - Sleeping: 2.877867 seconds
|
| 67 |
+
2025-06-12 18:19:55,323 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
|
| 68 |
+
2025-06-12 18:19:56,074 - INFO - Sleeping: 2.878102 seconds
|
| 69 |
+
2025-06-12 18:19:58,961 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
|
| 70 |
+
2025-06-12 18:19:59,730 - INFO - Sleeping: 2.846435 seconds
|
| 71 |
+
2025-06-12 18:20:02,587 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
| 72 |
+
2025-06-12 18:20:02,802 - INFO - Sleeping: 2.997978 seconds
|
| 73 |
+
2025-06-12 18:20:05,801 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
| 74 |
+
2025-06-12 18:20:06,645 - INFO - Sleeping: 2.882026 seconds
|
| 75 |
+
2025-06-12 18:20:09,537 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
|
| 76 |
+
2025-06-12 18:20:10,681 - INFO - Sleeping: 2.867912 seconds
|
| 77 |
+
2025-06-12 18:20:13,558 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
|
| 78 |
+
2025-06-12 18:20:15,163 - INFO - Sleeping: 2.874383 seconds
|
| 79 |
+
2025-06-12 18:20:18,052 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
|
| 80 |
+
2025-06-12 18:20:19,022 - INFO - Sleeping: 2.885731 seconds
|
| 81 |
+
2025-06-12 18:20:21,916 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
|
| 82 |
+
2025-06-12 18:20:22,743 - INFO - Sleeping: 2.880111 seconds
|
| 83 |
+
2025-06-12 18:20:25,633 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
|
| 84 |
+
2025-06-12 18:20:26,848 - INFO - Sleeping: 2.877337 seconds
|
| 85 |
+
2025-06-12 18:20:29,728 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
| 86 |
+
2025-06-12 18:20:29,961 - INFO - Sleeping: 2.999086 seconds
|
| 87 |
+
2025-06-12 18:20:32,973 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
| 88 |
+
2025-06-12 18:20:33,783 - INFO - Sleeping: 2.870358 seconds
|
| 89 |
+
2025-06-12 18:20:36,664 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
| 90 |
+
2025-06-12 18:20:36,929 - INFO - Sleeping: 2.997254 seconds
|
| 91 |
+
2025-06-12 18:20:39,936 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
| 92 |
+
2025-06-12 18:20:40,834 - INFO - Sleeping: 2.876953 seconds
|
| 93 |
+
2025-06-12 18:20:43,716 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
| 94 |
+
2025-06-12 18:20:44,816 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 95 |
+
2025-06-12 18:20:46,192 - INFO - Got first page: 100 of 100310 total results
|
| 96 |
+
2025-06-12 18:20:46,198 - INFO - Sleeping: 2.859482 seconds
|
| 97 |
+
2025-06-12 18:20:49,073 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 98 |
+
2025-06-12 18:20:49,789 - INFO - Sleeping: 2.869352 seconds
|
| 99 |
+
2025-06-12 18:20:52,669 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 100 |
+
2025-06-12 18:20:53,467 - INFO - Sleeping: 2.862511 seconds
|
| 101 |
+
2025-06-12 18:20:56,338 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 102 |
+
2025-06-12 18:20:57,071 - INFO - Sleeping: 2.870255 seconds
|
| 103 |
+
2025-06-12 18:20:59,951 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 104 |
+
2025-06-12 18:21:00,728 - INFO - Sleeping: 2.869636 seconds
|
| 105 |
+
2025-06-12 18:21:03,604 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 106 |
+
2025-06-12 18:21:04,393 - INFO - Sleeping: 2.865000 seconds
|
| 107 |
+
2025-06-12 18:21:07,272 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
| 108 |
+
2025-06-12 18:21:08,029 - INFO - Sleeping: 2.858943 seconds
|
| 109 |
+
2025-06-12 18:21:10,895 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
| 110 |
+
2025-06-12 18:21:11,768 - INFO - Sleeping: 2.866744 seconds
|
| 111 |
+
2025-06-12 18:21:14,640 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
| 112 |
+
2025-06-12 18:21:15,488 - INFO - Sleeping: 2.720050 seconds
|
| 113 |
+
2025-06-12 18:21:18,211 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
| 114 |
+
2025-06-12 18:21:19,122 - INFO - Sleeping: 2.844511 seconds
|
| 115 |
+
2025-06-12 18:21:21,982 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
| 116 |
+
2025-06-12 18:21:22,772 - INFO - Sleeping: 2.871176 seconds
|
| 117 |
+
2025-06-12 18:21:25,647 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 118 |
+
2025-06-12 18:21:25,925 - INFO - Sleeping: 2.997949 seconds
|
| 119 |
+
2025-06-12 18:21:28,932 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 120 |
+
2025-06-12 18:21:29,774 - INFO - Sleeping: 2.864288 seconds
|
| 121 |
+
2025-06-12 18:21:32,644 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
| 122 |
+
2025-06-12 18:21:33,454 - INFO - Sleeping: 2.860076 seconds
|
| 123 |
+
2025-06-12 18:21:36,317 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 124 |
+
2025-06-12 18:21:36,605 - INFO - Sleeping: 2.997453 seconds
|
| 125 |
+
2025-06-12 18:21:39,607 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 126 |
+
2025-06-12 18:21:40,404 - INFO - Sleeping: 2.856277 seconds
|
| 127 |
+
2025-06-12 18:21:43,276 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
| 128 |
+
2025-06-12 18:21:44,085 - INFO - Sleeping: 2.862912 seconds
|
| 129 |
+
2025-06-12 18:21:46,964 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
| 130 |
+
2025-06-12 18:21:47,858 - INFO - Sleeping: 2.860433 seconds
|
| 131 |
+
2025-06-12 18:21:50,732 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
2025-06-12 18:21:51,504 - INFO - Sleeping: 2.874451 seconds
2025-06-12 18:21:54,387 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
2025-06-12 18:21:55,722 - INFO - Sleeping: 2.859315 seconds
2025-06-12 18:21:58,585 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
2025-06-12 18:21:59,503 - INFO - Sleeping: 2.863854 seconds
2025-06-12 18:22:02,377 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
2025-06-12 18:22:02,618 - INFO - Sleeping: 2.997967 seconds
2025-06-12 18:22:05,628 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
2025-06-12 18:22:06,677 - INFO - Sleeping: 2.844775 seconds
2025-06-12 18:22:09,533 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
2025-06-12 18:22:09,792 - INFO - Sleeping: 2.998977 seconds
2025-06-12 18:22:12,797 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
2025-06-12 18:22:13,677 - INFO - Sleeping: 2.860952 seconds
2025-06-12 18:22:16,551 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
2025-06-12 18:22:17,381 - INFO - Sleeping: 2.862895 seconds
2025-06-12 18:22:20,259 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
2025-06-12 18:22:21,092 - INFO - Sleeping: 2.865440 seconds
2025-06-12 18:22:23,963 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
2025-06-12 18:22:24,738 - INFO - Sleeping: 2.854685 seconds
2025-06-12 18:22:27,605 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
2025-06-12 18:22:28,443 - INFO - Sleeping: 2.866245 seconds
2025-06-12 18:22:31,321 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
2025-06-12 18:22:32,401 - INFO - Sleeping: 2.857156 seconds
2025-06-12 18:22:35,269 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
2025-06-12 18:22:35,481 - INFO - Sleeping: 2.997016 seconds
2025-06-12 18:22:38,486 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
2025-06-12 18:22:39,346 - INFO - Sleeping: 2.856990 seconds
2025-06-12 18:22:42,208 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
2025-06-12 18:22:43,031 - INFO - Sleeping: 2.852790 seconds
2025-06-12 18:22:45,889 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
2025-06-12 18:22:46,748 - INFO - Sleeping: 2.858054 seconds
2025-06-12 18:22:49,610 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:49,923 - INFO - Sleeping: 2.997999 seconds
2025-06-12 18:22:52,927 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:53,180 - INFO - Sleeping: 2.998443 seconds
2025-06-12 18:22:56,182 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:57,297 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
2025-06-12 18:22:57,297 - INFO - Collected 5989 arXiv papers in 296.26s
2025-06-12 18:22:57,310 - INFO - Starting PubMed paper collection...
2025-06-12 18:23:14,143 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
2025-06-12 18:23:14,143 - INFO - Collected 2671 PubMed papers in 16.83s
2025-06-12 18:23:14,143 - INFO - Starting FineWeb-Edu collection...
2025-06-12 18:23:34,470 - INFO - Collected 10000 FineWeb samples
2025-06-12 18:23:38,652 - INFO - Collected 20000 FineWeb samples
2025-06-12 18:23:43,218 - INFO - Collected 30000 FineWeb samples
2025-06-12 18:23:43,221 - INFO - Processing 30000 FineWeb samples
2025-06-12 18:24:03,830 - INFO - Saved checkpoint to scientific_corpus_data\fineweb_edu.jsonl
2025-06-12 18:24:03,831 - INFO - Collected 29616 FineWeb-Edu papers in 49.69s
2025-06-12 18:24:03,873 - INFO - Processing 5989 arxiv papers...
2025-06-12 18:24:05,244 - INFO - Processed 5989/5989 arxiv papers
2025-06-12 18:24:05,244 - INFO - Unknown domains: 0, Unknown sections: 3349
2025-06-12 18:24:05,244 - INFO - Processing 2671 biology papers...
2025-06-12 18:24:05,765 - INFO - Processed 2605/2671 biology papers
2025-06-12 18:24:05,765 - INFO - Unknown domains: 0, Unknown sections: 1015
2025-06-12 18:24:05,765 - INFO - Processing 29616 education papers...
2025-06-12 18:24:39,231 - INFO - Processed 159402/29616 education papers
2025-06-12 18:24:39,231 - INFO - Unknown domains: 29616, Unknown sections: 21161
2025-06-12 19:06:41,335 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E5AF0BBC0, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\threading.py', line 320, code wait>
2025-06-12 19:06:43,708 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
2025-06-12 19:06:43,710 - INFO - Starting arXiv paper collection...
2025-06-12 19:06:43,711 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
2025-06-12 19:06:43,712 - INFO - Collected 0 arXiv papers in 0.00s
2025-06-12 19:06:43,713 - INFO - Starting PubMed paper collection...
2025-06-12 19:06:43,715 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
2025-06-12 19:06:43,715 - INFO - Collected 0 PubMed papers in 0.00s
2025-06-12 19:06:43,716 - INFO - Shutdown in progress, aborting retries.
2025-06-12 19:16:11,718 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E7696F880, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\selectors.py', line 315, code _select>
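The log above reflects the harvesting pattern used for arXiv: pages of 100 results requested by increasing `start` offset, a roughly three-second sleep between requests, and a retry of the same offset (try 1, try 2) when a page yields nothing. The sketch below illustrates that loop only; the actual implementation lives in Tokenization/Main_2.py (not shown here), and `fetch_page`, `MAX_TRIES`, and `harvest` are hypothetical names used for illustration.

# Illustrative sketch of the request/sleep/retry pattern visible in the log above.
# fetch_page, MAX_TRIES, and harvest are hypothetical; see Tokenization/Main_2.py for the real logic.
import time
import urllib.request

BASE = ("https://export.arxiv.org/api/query"
        "?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A"
        "&sortBy=submittedDate&sortOrder=descending")
MAX_TRIES = 3

def fetch_page(start: int, max_results: int = 100) -> str:
    """Fetch one Atom feed page from the arXiv export API."""
    url = f"{BASE}&start={start}&max_results={max_results}"
    with urllib.request.urlopen(url, timeout=30) as resp:
        return resp.read().decode("utf-8")

def harvest(pages: int = 30, page_size: int = 100) -> list:
    feeds = []
    for page in range(pages):
        start = page * page_size
        for attempt in range(MAX_TRIES):
            feed = fetch_page(start, page_size)
            if "<entry>" in feed:      # page actually contains results
                feeds.append(feed)
                break
            time.sleep(3)              # back off and retry the same offset
        time.sleep(3)                  # stay under the arXiv rate limit
    return feeds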
Tokenization/generate_dataset.py
ADDED
@@ -0,0 +1,77 @@
import json
from typing import Optional, Callable, Dict, Any

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig

def generate_dataset(
    domain: str = None,
    token_budget: int = 1000,
    plan: str = "free",
    custom_seed: Optional[str] = None,
    job_type: str = "tokenize",
    progress_callback: Optional[Callable[[int, str], None]] = None
) -> Dict[str, Any]:
    """
    Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.

    Args:
        domain (str): Domain for dataset.
        token_budget (int): Token budget.
        plan (str): Plan type.
        custom_seed (str): Optional seed data.
        job_type (str): "tokenize" or "corpus".
        progress_callback (callable): Progress update callback.

    Returns:
        dict: {"jsonl_lines": [...], "stats": {...}}
    """
    if job_type == "corpus":
        # Use Main_2 pipeline
        if progress_callback:
            progress_callback(1, "Initializing scientific corpus builder...")
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        if progress_callback:
            progress_callback(2, "Fetching arXiv papers...")
        arxiv_papers = builder.fetch_arxiv_papers()
        if progress_callback:
            progress_callback(3, "Fetching PubMed papers...")
        pubmed_papers = builder.fetch_pubmed_papers()
        if progress_callback:
            progress_callback(4, "Fetching FineWeb-Edu samples...")
        fineweb_papers = builder.fetch_fineweb_edu()
        if progress_callback:
            progress_callback(5, "Processing and tagging papers...")
        all_papers = []
        all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
        all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
        all_papers.extend(builder.process_papers(fineweb_papers, "education"))
        if progress_callback:
            progress_callback(6, "Ranking and deduplicating...")
        ranked_papers = builder.ranker.rank_samples(all_papers)
        if progress_callback:
            progress_callback(7, "Preparing dataset for download...")
        jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
        stats = builder.analyzer.get_dataset_stats(ranked_papers)
        if progress_callback:
            progress_callback(8, "Dataset ready for download.")
        return {"jsonl_lines": jsonl_lines, "stats": stats}

    # Standard "tokenize" job
    if progress_callback:
        progress_callback(1, "Cleaning input text...")
    cleaned_text = clean_text(custom_seed or "")
    if progress_callback:
        progress_callback(2, "Tokenizing input...")
    preprocessor = QLoRAPreprocessor()
    # For demonstration, split cleaned_text into fixed-size character chunks (replace with real tokenization logic)
    tokens = [cleaned_text[i:i+token_budget] for i in range(0, len(cleaned_text), token_budget)]
    if progress_callback:
        progress_callback(3, "Formatting samples...")
    jsonl_lines = [json.dumps({"text": t}) for t in tokens]
    stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}
    if progress_callback:
        progress_callback(4, "Dataset ready for download.")
    return {"jsonl_lines": jsonl_lines, "stats": stats}
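A minimal usage sketch for the function above, assuming the Tokenization package and its dependencies are importable; the progress callback here simply prints each step, and the seed text and domain label are made-up inputs.

# Example call of generate_dataset for a "tokenize" job (illustrative only).
from Tokenization.generate_dataset import generate_dataset

def show_progress(step: int, message: str) -> None:
    print(f"[{step}] {message}")

result = generate_dataset(
    domain="materials",          # free-form label, unused by the "tokenize" path
    token_budget=500,
    plan="free",
    custom_seed="Perovskite solar cells degrade under humidity and heat.",
    job_type="tokenize",
    progress_callback=show_progress,
)
print(result["stats"])           # e.g. {'token_count': ..., 'total_samples': ...}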
Tokenization/hf_upload.py
ADDED
@@ -0,0 +1,163 @@
import logging
import os
import sys
from datetime import datetime
from pathlib import Path

from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")

def debug_jsonl_head(jsonl_path, n=5):
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i in range(n):
                line = f.readline()
                if not line:
                    break
                logging.info(f"Line {i+1}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")

def infer_features_from_sample(jsonl_path, n=100):
    import json
    from collections import defaultdict
    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")

def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)
        # Try loading a small sample first for debugging
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # Try to load with explicit features if possible
            # Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # Uncomment and adjust the following lines if you know the schema:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # try:
            #     sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            #     logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
            # except Exception as e2:
            #     logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
            raise
        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
        dataset.to_file(str(arrow_path))
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise

def create_readme(dataset):
    content = f"""# Scientific Research Tokenized Dataset

- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""
    with open(README_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("README.md created.")

def upload_to_hf():
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")

def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match expected import in Main_2.py"""
    return upload_to_hf(*args, **kwargs)

def cleanup():
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")

def main():
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()

if __name__ == "__main__":
    main()
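A sketch of how the inspection helpers above can be exercised on a small local file before attempting a full conversion and upload; the sample path is hypothetical, no upload is performed, and the real upload path still requires HF_TOKEN in a .env file.

# Illustrative dry run of the JSONL inspection helpers (no upload performed).
from pathlib import Path
from Tokenization.hf_upload import debug_jsonl_head, infer_features_from_sample

sample_path = Path("scientific_corpus_sample.jsonl")   # hypothetical local file
debug_jsonl_head(sample_path, n=3)                     # logs the first lines for schema inspection
infer_features_from_sample(sample_path, n=50)          # logs the inferred field types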
Tokenization/preprocessing/Clean_text.py
ADDED
@@ -0,0 +1,16 @@
import re
import unicodedata

def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion."""
    if not isinstance(text, str):
        return ""
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Remove control characters
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text
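For example, applied to a string containing a control character and irregular whitespace:

# Example of clean_text behaviour (illustrative).
from Tokenization.preprocessing.Clean_text import clean_text

raw = "Perovskite\x0csolar   cells\n\nare  promising."
print(clean_text(raw))   # -> "Perovskite solar cells are promising."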
Tokenization/preprocessing/Preprocess_sample.py
ADDED
@@ -0,0 +1,31 @@
from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

def preprocess_sample(paper: Dict) -> List[Dict]:
    """
    Clean and segment a paper into samples for LLM ingestion.
    Returns a list of dicts: one for title+abstract, and one per paragraph.
    """
    title = clean_text(paper.get("title", ""))
    abstract = clean_text(paper.get("abstract", ""))
    full_text = clean_text(paper.get("full_text", ""))
    paragraphs = segment_paragraphs(full_text) if full_text else []
    samples = []
    # Title + abstract sample
    if title or abstract:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = abstract
        sample["full_text"] = ""
        sample["section"] = "abstract"
        samples.append(sample)
    # Paragraph samples
    for para in paragraphs:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = ""
        sample["full_text"] = para
        sample["section"] = "paragraph"
        samples.append(sample)
    return samples
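A short example of the resulting sample shapes for a paper dict with the fields the function reads (other keys are copied through unchanged). Note that because clean_text collapses newlines before segmentation, paragraph splitting in practice falls back to the 1000-character chunking in Segment_paragraphs.

# Example of preprocess_sample output shape (illustrative).
from Tokenization.preprocessing.Preprocess_sample import preprocess_sample

paper = {
    "title": "A toy paper",
    "abstract": "One-sentence abstract.",
    "full_text": "First paragraph." + " x" * 600,   # long enough to be chunked
    "domain_tag": "materials",
}
samples = preprocess_sample(paper)
print(len(samples))                      # 3: one abstract sample plus two paragraph chunks
print({s["section"] for s in samples})   # {'abstract', 'paragraph'}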
Tokenization/preprocessing/Segment_paragraphs.py
ADDED
@@ -0,0 +1,19 @@
import re

def segment_paragraphs(text: str) -> list:
    """Segment text into paragraphs using double newlines or similar heuristics."""
    if not isinstance(text, str):
        return []
    # Split on two or more newlines
    paras = re.split(r"\n{2,}", text)
    # Fallback: split overly long paragraphs into 1000-character chunks
    result = []
    for para in paras:
        para = para.strip()
        if len(para) > 1000:
            # Split further if too long
            chunks = [para[i:i+1000] for i in range(0, len(para), 1000)]
            result.extend(chunks)
        elif para:
            result.append(para)
    return [p for p in result if p]
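For instance, two short paragraphs followed by one very long paragraph:

# Example of segment_paragraphs (illustrative).
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

text = "Intro paragraph.\n\nMethods paragraph.\n\n" + "A" * 2500
parts = segment_paragraphs(text)
print(len(parts))                            # 5: two short paragraphs + three 1000-char chunks
print(max(len(p) for p in parts) <= 1000)    # True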
Tokenization/preprocessing/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .Clean_text import clean_text
from .Segment_paragraphs import segment_paragraphs
from .Preprocess_sample import preprocess_sample

__all__ = [
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
]
Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc
ADDED
Binary file (544 Bytes).
Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc
ADDED
Binary file (1.03 kB).
Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc
ADDED
Binary file (932 Bytes).
Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (372 Bytes).
Tokenization/pretraining/Dataset_stats.py
ADDED
@@ -0,0 +1,40 @@
from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer


class DatasetAnalyzer:
    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown")
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        stats = []
        domains = Counter()
        sections = Counter()

        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1

        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections)
        }
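A small usage sketch for the analyzer above; instantiating it downloads the facebook/opt-350m tokenizer on first use, and the two in-memory samples are made-up inputs.

# Example of DatasetAnalyzer on a couple of in-memory samples (illustrative).
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer

analyzer = DatasetAnalyzer()
samples = [
    {"abstract": "Short abstract.", "domain_tag": "physics", "section_type": "abstract"},
    {"full_text": "Body text of a paragraph.", "domain_tag": "biology"},
]
stats = analyzer.get_dataset_stats(samples)
print(stats["total_samples"])        # 2
print(stats["domain_distribution"])  # {'physics': 1, 'biology': 1}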
Tokenization/pretraining/Instruction_formatter.py
ADDED
@@ -0,0 +1,18 @@
# Tokenization/pretraining/instruction_formatter.py

class InstructionFormatter:
    @staticmethod
    def format_sample(sample):
        """
        Formats a sample dict with 'instruction', 'input', and 'output' fields.
        This is a placeholder; customize as needed for your data.
        """
        # Ensure required fields exist
        instruction = sample.get("instruction", "")
        input_ = sample.get("input", "")
        output = sample.get("output", "")
        return {
            "instruction": instruction.strip(),
            "input": input_.strip(),
            "output": output.strip(),
        }
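For example, on a sample with stray whitespace:

# Example of InstructionFormatter.format_sample (illustrative).
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter

sample = {"instruction": " Summarize the abstract. ", "input": "Abstract text ", "output": " A summary."}
print(InstructionFormatter.format_sample(sample))
# {'instruction': 'Summarize the abstract.', 'input': 'Abstract text', 'output': 'A summary.'}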
Tokenization/pretraining/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .Dataset_stats import DatasetAnalyzer

__all__ = ["DatasetAnalyzer"]
Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc
ADDED
Binary file (1.97 kB).
Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc
ADDED
Binary file (806 Bytes).
Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (244 Bytes).
Tokenization/requirements.txt
ADDED
@@ -0,0 +1,11 @@
fastapi
uvicorn
gradio
requests
nltk
scikit-learn
beautifulsoup4
arxiv
huggingface_hub
python-dotenv
stripe
Tokenization/run_backend.py
ADDED
@@ -0,0 +1,12 @@
import uvicorn
import os

if __name__ == "__main__":
    os.makedirs("tmp", exist_ok=True)
    print("Starting FastAPI backend at http://localhost:8000 ...")
    uvicorn.run(
        "Tokenization.app:fastapi_app",
        host="0.0.0.0",
        port=8000,
        reload=True
    )