Allanatrix committed on
Commit ef4c8c3 · verified · 1 Parent(s): 554d2ab

Upload 50 files

Files changed (50)
  1. Tokenization/Build_tokenizer.py +89 -0
  2. Tokenization/Cleanser.py +102 -0
  3. Tokenization/Entropy_ranker.py +59 -0
  4. Tokenization/Label_tokens.py +69 -0
  5. Tokenization/Logs/corpus_builder.log +0 -0
  6. Tokenization/Logs/debug_upload.log +4 -0
  7. Tokenization/Main_2.py +922 -0
  8. Tokenization/__init__.py +21 -0
  9. Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc +0 -0
  10. Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc +0 -0
  11. Tokenization/__pycache__/Label_tokens.cpython-310.pyc +0 -0
  12. Tokenization/__pycache__/Main_2.cpython-310.pyc +0 -0
  13. Tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
  14. Tokenization/__pycache__/generate_dataset.cpython-310.pyc +0 -0
  15. Tokenization/__pycache__/hf_upload.cpython-310.pyc +0 -0
  16. Tokenization/app.py +147 -0
  17. Tokenization/app/Api.py +75 -0
  18. Tokenization/app/Config.py +25 -0
  19. Tokenization/app/Core.py +155 -0
  20. Tokenization/app/Payment.py +27 -0
  21. Tokenization/app/Progress.py +37 -0
  22. Tokenization/app/__init__.py +15 -0
  23. Tokenization/app/__pycache__/Api.cpython-310.pyc +0 -0
  24. Tokenization/app/__pycache__/Config.cpython-310.pyc +0 -0
  25. Tokenization/app/__pycache__/Core.cpython-310.pyc +0 -0
  26. Tokenization/app/__pycache__/Payment.cpython-310.pyc +0 -0
  27. Tokenization/app/__pycache__/Progress.cpython-310.pyc +0 -0
  28. Tokenization/app/__pycache__/__init__.cpython-310.pyc +0 -0
  29. Tokenization/combined_scientific_papers.json +0 -0
  30. Tokenization/combined_scientific_papers.jsonl +0 -0
  31. Tokenization/corpus_builder.log +0 -0
  32. Tokenization/debug_upload.log +198 -0
  33. Tokenization/generate_dataset.py +77 -0
  34. Tokenization/hf_upload.py +163 -0
  35. Tokenization/preprocessing/Clean_text.py +16 -0
  36. Tokenization/preprocessing/Preprocess_sample.py +31 -0
  37. Tokenization/preprocessing/Segment_paragraphs.py +19 -0
  38. Tokenization/preprocessing/__init__.py +9 -0
  39. Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc +0 -0
  40. Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc +0 -0
  41. Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc +0 -0
  42. Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc +0 -0
  43. Tokenization/pretraining/Dataset_stats.py +40 -0
  44. Tokenization/pretraining/Instruction_formatter.py +18 -0
  45. Tokenization/pretraining/__init__.py +3 -0
  46. Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc +0 -0
  47. Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc +0 -0
  48. Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc +0 -0
  49. Tokenization/requirements.txt +11 -0
  50. Tokenization/run_backend.py +12 -0
Tokenization/Build_tokenizer.py ADDED
@@ -0,0 +1,89 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Dict
4
+
5
+ from transformers import AutoTokenizer
6
+
7
+ from Tokenization.Entropy_ranker import EntropyRanker
8
+ from Tokenization.Label_tokens import MIN_WORDS, MAX_TOKENS, MAX_TOTAL_TOKENS, TOKEN_TARGETS
9
+ from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
10
+ from Tokenization.pretraining.Instruction_formatter import InstructionFormatter
11
+
12
+
13
+ class QLoRAPreprocessor:
14
+ def __init__(self, model_name: str = "facebook/opt-350m", corpus_type: str = "warm_start"):
15
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
16
+ self.analyzer = DatasetAnalyzer(model_name)
17
+ self.formatter = InstructionFormatter()
18
+ self.ranker = EntropyRanker()
19
+ self.token_target = TOKEN_TARGETS[corpus_type]
20
+ self.current_tokens = 0
21
+
22
+ def track_tokens(self, text: str) -> bool:
23
+ tokens = self.tokenizer.encode(text)
24
+ self.current_tokens += len(tokens)
25
+ return self.current_tokens <= self.token_target
26
+
27
+ def validate_sample(self, sample: Dict) -> bool:
28
+ if not all(k in sample for k in ["instruction", "input", "output"]):
29
+ return False
30
+ total_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
31
+ tokens = self.tokenizer.encode(total_text)
32
+ words = total_text.split()
33
+ return (len(words) >= MIN_WORDS and
34
+ len(tokens) <= MAX_TOKENS and
35
+ len(tokens) <= MAX_TOTAL_TOKENS)
36
+
37
+ def process_dataset(self, input_path: str, output_path: str):
38
+ # Load data, skipping blank lines and malformed JSON
39
+ data = []
40
+ with open(input_path, 'r', encoding='utf-8') as f:
41
+ for i, line in enumerate(f, 1):
42
+ line = line.strip()
43
+ if not line:
44
+ continue
45
+ try:
46
+ data.append(json.loads(line))
47
+ except json.JSONDecodeError as e:
48
+ print(f"Skipping line {i}: {e}")
49
+
50
+ # Analyze dataset
51
+ stats = self.analyzer.get_dataset_stats(data)
52
+ print(f"Dataset stats: {stats}")
53
+
54
+ # Format samples
55
+ formatted_samples = [
56
+ self.formatter.format_sample(sample)
57
+ for sample in data
58
+ ]
59
+
60
+ # Rank and filter samples
61
+ ranked_samples = self.ranker.rank_samples(formatted_samples)
62
+
63
+ # Track token count while processing
64
+ valid_samples = []
65
+ for sample in ranked_samples:
66
+ if not self.validate_sample(sample):
67
+ continue
68
+
69
+ sample_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
70
+ if not self.track_tokens(sample_text):
71
+ break
72
+
73
+ valid_samples.append(sample)
74
+
75
+ # Save to JSONL
76
+ output_file = Path(output_path)
77
+ output_file.parent.mkdir(parents=True, exist_ok=True)
78
+ with open(output_file, 'w', encoding='utf-8') as f:
79
+ for sample in valid_samples:
80
+ f.write(json.dumps(sample) + '\n')
81
+
82
+ print(f"Processed {len(valid_samples)} samples saved to {output_path}")
83
+
84
+ if __name__ == "__main__":
85
+ preprocessor = QLoRAPreprocessor()
86
+ preprocessor.process_dataset(
87
+ "C:/Users/kunya/PycharmProjects/DataVolt/Tokenizers/combined_scientific_papers.json",
88
+ "nexa_scientific_instruction_300k.jsonl"
89
+ )
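
A minimal usage sketch for QLoRAPreprocessor, assuming the repository root is on PYTHONPATH; the model name mirrors the class default and both file paths are hypothetical:

    from Tokenization.Build_tokenizer import QLoRAPreprocessor

    # "instruction" selects the 30M-token budget from TOKEN_TARGETS in Label_tokens.py
    pre = QLoRAPreprocessor(model_name="facebook/opt-350m", corpus_type="instruction")

    # Reads one JSON object per line, validates and ranks samples, stops once the
    # token budget is reached, and writes the survivors to the output JSONL.
    pre.process_dataset(
        "data/raw_instructions.jsonl",    # hypothetical input path
        "nexa_instruction_subset.jsonl",  # hypothetical output path
    )
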
Tokenization/Cleanser.py ADDED
@@ -0,0 +1,102 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from datasets import Dataset
6
+
7
+ # Tag dictionaries
8
+ DOMAIN_TAGS = {
9
+ "physics": "[PHYS]",
10
+ "biology": "[BIO]",
11
+ "materials": "[MAT]",
12
+ "education": "[GEN]",
13
+ }
14
+
15
+ TASK_TAGS = {
16
+ "hypothesis": "[HYP]",
17
+ "method": "[MTH]",
18
+ "experiment": "[EXP]",
19
+ }
20
+
21
+ SECTION_TAGS = {
22
+ "abstract": "[ABSTRACT]",
23
+ "introduction": "[INTRO]",
24
+ "results": "[RESULTS]",
25
+ "discussion": "[DISCUSSION]",
26
+ "conclusion": "[CONCLUSION]",
27
+ "method": "[MTH]",
28
+ "experiment": "[EXP]",
29
+ }
30
+
31
+ SRC_PATH = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
32
+ CLEANED_JSONL_PATH = Path("scientific_corpus_325M.cleaned.jsonl")
33
+ CLEANED_ARROW_PATH = Path("scientific_corpus_325M.cleaned.arrow")
34
+ CHUNK_SIZE = 10000
35
+ MAX_WORKERS = os.cpu_count() or 4
36
+
37
+ def tag_record(record):
38
+ # Tagging logic: add tags to text fields if domain/task/section present
39
+ # You may need to adjust keys based on your schema
40
+ domain = record.get("domain", "").lower()
41
+ task = record.get("task", "").lower()
42
+ section = record.get("section", "").lower()
43
+ text = record.get("full_text", "")
44
+
45
+ tags = []
46
+ if domain in DOMAIN_TAGS:
47
+ tags.append(DOMAIN_TAGS[domain])
48
+ if task in TASK_TAGS:
49
+ tags.append(TASK_TAGS[task])
50
+ if section in SECTION_TAGS:
51
+ tags.append(SECTION_TAGS[section])
52
+
53
+ # Prepend tags to text
54
+ record["tagged_text"] = " ".join(tags) + " " + text if tags else text
55
+ return record
56
+
57
+ def process_chunk(lines):
58
+ cleaned = []
59
+ for line in lines:
60
+ try:
61
+ record = json.loads(line)
62
+ cleaned.append(tag_record(record))
63
+ except Exception:
64
+ continue # skip malformed lines
65
+ return cleaned
66
+
67
+ def chunked_file_reader(path, chunk_size):
68
+ with open(path, "r", encoding="utf-8") as f:
69
+ chunk = []
70
+ for line in f:
71
+ chunk.append(line)
72
+ if len(chunk) == chunk_size:
73
+ yield chunk
74
+ chunk = []
75
+ if chunk:
76
+ yield chunk
77
+
78
+ def main():
79
+ print("Starting cleaning process...")
80
+ # Write cleaned records to a new JSONL file in chunks
81
+ with open(CLEANED_JSONL_PATH, "w", encoding="utf-8") as out_f:
82
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
83
+ futures = []
84
+ for chunk in chunked_file_reader(SRC_PATH, CHUNK_SIZE):
85
+ futures.append(executor.submit(process_chunk, chunk))
86
+ for fut in as_completed(futures):
87
+ for record in fut.result():
88
+ out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
89
+ print(f"Cleaned JSONL written to {CLEANED_JSONL_PATH}")
90
+
91
+ # Convert cleaned JSONL to Arrow using datasets (handles chunking internally)
92
+ print("Saving cleaned dataset to Arrow format...")
93
+ ds = Dataset.from_json(str(CLEANED_JSONL_PATH))
94
+ ds.save_to_disk(str(CLEANED_ARROW_PATH))
95
+ print(f"Saved cleaned Arrow dataset at: {CLEANED_ARROW_PATH}")
96
+
97
+ # Optionally, hand off to hf_upload.py (note: os.system runs it synchronously)
98
+ print("Uploading to HuggingFace using hf_upload.py ...")
99
+ os.system("python hf_upload.py")  # plain string; the f-prefix had no placeholders
100
+
101
+ if __name__ == "__main__":
102
+ main()
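
A minimal sketch of what tag_record does to a single record, assuming records carry the domain/task/section keys mentioned in the comment above; the record content is made up:

    from Tokenization.Cleanser import tag_record

    record = {
        "domain": "biology",
        "task": "method",
        "section": "results",
        "full_text": "We measured enzyme kinetics across three substrates.",
    }

    tagged = tag_record(record)
    # Tags are prepended in domain, task, section order:
    # "[BIO] [MTH] [RESULTS] We measured enzyme kinetics across three substrates."
    print(tagged["tagged_text"])
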
Tokenization/Entropy_ranker.py ADDED
@@ -0,0 +1,59 @@
1
+ import math
2
+ from typing import List, Dict, Optional, Callable
3
+
4
+ class EntropyRanker:
5
+ """
6
+ Scores and filters text samples by Shannon entropy of their token distribution.
7
+ Used to remove low-information or repetitive samples from scientific corpora.
8
+ """
9
+
10
+ def __init__(self, entropy_threshold: float = 3.5, tokenizer: Optional[Callable[[str], List[str]]] = None):
11
+ """
12
+ Args:
13
+ entropy_threshold: Minimum entropy required to keep a sample.
14
+ tokenizer: Function to tokenize text. Defaults to whitespace split.
15
+ """
16
+ self.entropy_threshold = entropy_threshold
17
+ self.tokenizer = tokenizer or (lambda x: x.split())
18
+
19
+ @staticmethod
20
+ def shannon_entropy(tokens: List[str]) -> float:
21
+ """Compute Shannon entropy for a list of tokens."""
22
+ if not tokens:
23
+ return 0.0
24
+ freq = {}
25
+ for t in tokens:
26
+ freq[t] = freq.get(t, 0) + 1
27
+ total = len(tokens)
28
+ entropy = 0.0
29
+ for count in freq.values():
30
+ p = count / total
31
+ entropy -= p * math.log(p, 2)
32
+ return entropy
33
+
34
+ def score_sample(self, text: str) -> float:
35
+ """Tokenize and score a text sample by entropy."""
36
+ tokens = self.tokenizer(text)
37
+ return self.shannon_entropy(tokens)
38
+
39
+ def is_explanatory(self, text: str) -> bool:
40
+ """Return True if sample passes an entropy threshold."""
41
+ return self.score_sample(text) >= self.entropy_threshold
42
+
43
+ def filter_samples(self, samples: List[Dict], text_key: str = "text") -> List[Dict]:
44
+ """Filter a list of dict samples, keeping only those above a threshold."""
45
+ return [s for s in samples if self.is_explanatory(s.get(text_key, ""))]
46
+
47
+ def rank_samples(self, samples: List[Dict], text_key: str = "text", top_k: Optional[int] = None) -> List[Dict]:
48
+ """
49
+ Rank samples by entropy, descending. Optionally return only top_k.
50
+ """
51
+ scored = [
52
+ (self.score_sample(s.get(text_key, "")), s)
53
+ for s in samples
54
+ ]
55
+ scored.sort(reverse=True, key=lambda x: x[0])
56
+ ranked = [s for score, s in scored if score >= self.entropy_threshold]
57
+ if top_k is not None:
58
+ ranked = ranked[:top_k]
59
+ return ranked
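
A short, self-contained sketch of the ranker's behaviour; 3.5 is the class default threshold and the sample texts are made up:

    from Tokenization.Entropy_ranker import EntropyRanker

    ranker = EntropyRanker(entropy_threshold=3.5)

    samples = [
        {"text": "the the the the the the the the"},   # zero entropy, filtered out
        {"text": "Entropy measures how evenly tokens are distributed "
                 "across a sample, so varied vocabulary scores higher."},
    ]

    kept = ranker.filter_samples(samples)            # keeps only the varied sample
    ranked = ranker.rank_samples(samples, top_k=1)   # highest-entropy sample first
    print(len(kept), ranked[0]["text"][:20])
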
Tokenization/Label_tokens.py ADDED
@@ -0,0 +1,69 @@
1
+ # Tokenization/Label_tokens.py
2
+
3
+ # Domain tags
4
+ DOMAIN_TAGS = {
5
+ "physics": "[PHYS]",
6
+ "biology": "[BIO]",
7
+ "materials": "[MAT]",
8
+ "education": "[GEN]",
9
+ }
10
+
11
+ # Task tags
12
+ TASK_TAGS = {
13
+ "hypothesis": "[HYP]",
14
+ "method": "[MTH]",
15
+ "experiment": "[EXP]",
16
+ }
17
+
18
+ # Section tags (for further granularity, e.g., for long-context or future models)
19
+ SECTION_TAGS = {
20
+ "abstract": "[ABSTRACT]",
21
+ "introduction": "[INTRO]",
22
+ "results": "[RESULTS]",
23
+ "discussion": "[DISCUSSION]",
24
+ "conclusion": "[CONCLUSION]",
25
+ "method": "[MTH]",
26
+ "experiment": "[EXP]",
27
+ }
28
+
29
+ # Routing tags
30
+ ROUTING_TAGS = {
31
+ "general": "[GEN]",
32
+ "specific": "[SPEC]",
33
+ }
34
+
35
+ # Token/word limits for validation and filtering
36
+ MIN_WORDS = 8
37
+ MAX_TOKENS = 1024
38
+ MAX_TOTAL_TOKENS = 327_680_000 # roughly the 325M-token corpus target
39
+
40
+ # Token targets for different corpus types
41
+ TOKEN_TARGETS = {
42
+ "warm_start": 100_000_000,
43
+ "scientific": 225_000_000,
44
+ "instruction": 30_000_000,
45
+ "default": 325_000_000,
46
+ }
47
+
48
+ def build_tag_string(
49
+ domain: str,
50
+ task: str = None,
51
+ section: str = None,
52
+ routing: str = "general",
53
+ subdomain: str = None
54
+ ) -> str:
55
+ """
56
+ Build a tag string for a sample, e.g. [PHYS][HYP][GEN] or [BIO][MTH][SPEC: Genomics]
57
+ """
58
+ tags = []
59
+ if domain in DOMAIN_TAGS:
60
+ tags.append(DOMAIN_TAGS[domain])
61
+ if task in TASK_TAGS:
62
+ tags.append(TASK_TAGS[task])
63
+ if section in SECTION_TAGS:
64
+ tags.append(SECTION_TAGS[section])
65
+ if routing == "general":
66
+ tags.append(ROUTING_TAGS["general"])
67
+ elif routing == "specific" and subdomain:
68
+ tags.append(f"[SPEC:{subdomain}]")
69
+ return "".join(tags)
Tokenization/Logs/corpus_builder.log ADDED
The diff for this file is too large to render.
 
Tokenization/Logs/debug_upload.log ADDED
@@ -0,0 +1,4 @@
1
+ 2025-06-07 20:23:13,293 - INFO - Converting C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl to Arrow format at scientific_corpus_325M.arrow ...
2
+ 2025-06-07 20:23:36,951 - ERROR - An error occurred while generating the dataset: An error occurred while generating the dataset
3
+ 2025-06-07 20:23:36,951 - ERROR - Process failed: An error occurred while generating the dataset
4
+ 2025-06-07 20:23:36,952 - INFO - Cleaned up local files.
Tokenization/Main_2.py ADDED
@@ -0,0 +1,922 @@
1
+ # python
2
+ """
3
+ Main pipeline for building a scientific corpus from multiple sources.
4
+
5
+ Responsibilities:
6
+ - Orchestrates collection, processing, ranking, and deduplication of papers from arXiv, PubMed, and FineWeb-Edu.
7
+ - Handles error logging, checkpointing, and metrics for observability.
8
+ - Modular design for extensibility and maintainability.
9
+
10
+ Usage:
11
+ python Main_2.py
12
+
13
+ Classes:
14
+ - SourceMetrics: Tracks per-source metrics.
15
+ - CorpusConfig: Configuration for corpus building.
16
+ - ScientificCorpusBuilder: Main pipeline class.
17
+
18
+ Functions:
19
+ - main: Entry point for running the pipeline.
20
+
21
+ Environment:
22
+ - Requires ENTREZ_EMAIL for PubMed API.
23
+ - Outputs logs and intermediate checkpoints to ./scientific_corpus_data.
24
+
25
+ """
26
+
27
+ import concurrent.futures
28
+ import json
29
+ import logging
30
+ import os
31
+ import signal
32
+ import time
33
+ from dataclasses import dataclass
34
+ from pathlib import Path
35
+ from types import FrameType
36
+ from typing import List, Dict, Set, Optional, Callable, Any
37
+ from urllib.error import URLError, HTTPError
38
+ from xml.parsers.expat import ExpatError
39
+
40
+ import arxiv
41
+ from Bio import Entrez
42
+ from datasets import load_dataset
43
+ from tqdm import tqdm
44
+
45
+ from Tokenization.Build_tokenizer import QLoRAPreprocessor
46
+ from Tokenization.Entropy_ranker import EntropyRanker
47
+ from Tokenization.hf_upload import upload_to_huggingface
48
+ from Tokenization.Label_tokens import TASK_TAGS, ROUTING_TAGS
49
+ from Tokenization.preprocessing import clean_text, segment_paragraphs
50
+ from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
51
+ from Tokenization.app.Config import PLAN_LIMITS
52
+
53
+ # Configure logging
54
+ logging.basicConfig(
55
+ level=logging.INFO,
56
+ format="%(asctime)s - %(levelname)s - %(message)s",
57
+ handlers=[
58
+ logging.FileHandler("corpus_builder.log"),
59
+ logging.StreamHandler()
60
+ ]
61
+ )
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
+ is_shutdown = False
66
+ """Global flag indicating whether a shutdown signal has been received.
67
+
68
+ This flag is set to True by the signal handler to allow for graceful shutdown
69
+ of long-running operations throughout the pipeline.
70
+ """
71
+
72
+ def signal_handler(sig: int, frame: FrameType) -> None:
73
+ """Handle shutdown signals gracefully and set shutdown flag."""
74
+ global is_shutdown
75
+ logger.info(f"Received signal {sig}, shutting down gracefully. Frame: {frame}")
76
+ is_shutdown = True
77
+
78
+
79
+ # Register signal handlers for graceful shutdown
80
+ signal.signal(signal.SIGINT, signal_handler)
81
+ signal.signal(signal.SIGTERM, signal_handler)
82
+
83
+
84
+ def retry(max_retries: int = 3, backoff_factor: float = 1.0,
85
+ exceptions: tuple = (Exception,)) -> Callable:
86
+ """
87
+ Decorator for retrying a function with exponential backoff.
88
+
89
+ Args:
90
+ max_retries: Maximum number of retries.
91
+ backoff_factor: Multiplier for exponential backoff.
92
+ exceptions: Exception types to catch and retry.
93
+
94
+ Returns:
95
+ Decorated function with retry logic.
96
+ """
97
+ def decorator(func: Callable) -> Callable:
98
+ def wrapper(*args, **kwargs) -> Any:
99
+ retries = 0
100
+ while retries < max_retries:
101
+ if is_shutdown:
102
+ logger.info("Shutdown in progress, aborting retries.")
103
+ raise KeyboardInterrupt("Shutdown requested")
104
+ try:
105
+ return func(*args, **kwargs)
106
+ except exceptions as e:
107
+ wait = backoff_factor * (2 ** retries)
108
+ logger.warning(f"Error in {func.__name__}: {e}. Retrying in {wait:.1f}s...")
109
+ time.sleep(wait)
110
+ retries += 1
111
+ logger.error(f"Function {func.__name__} failed after {max_retries} attempts.")
112
+ raise RuntimeError(f"{func.__name__} failed after {max_retries} attempts")
113
+ return wrapper
114
+ return decorator
115
+
116
+
117
+ @dataclass
118
+ class SourceMetrics:
119
+ """Metrics for tracking source performance."""
120
+ papers: int = 0
121
+ tokens: int = 0
122
+ time: float = 0.0
123
+ errors: int = 0
124
+
125
+
126
+ @dataclass
127
+ class CorpusConfig:
128
+ """
129
+ Configuration for corpus building parameters.
130
+
131
+ Attributes:
132
+ max_arxiv_papers: Maximum number of arXiv papers to fetch.
133
+ max_pubmed_papers: Maximum number of PubMed papers to fetch.
134
+ max_fineweb_samples: Maximum number of FineWeb-Edu samples to fetch.
135
+ max_workers: Number of workers for parallel processing.
136
+ timeout: Timeout for API requests.
137
+ chunk_size: Chunk size for batch processing.
138
+ """
139
+ max_arxiv_papers: int = 9000
140
+ max_pubmed_papers: int = 3000
141
+ max_fineweb_samples: int = 30000
142
+ max_workers: int = 8
143
+ timeout: int = 30
144
+ chunk_size: int = 1000
145
+
146
+
147
+ class ScientificCorpusBuilder:
148
+ """
149
+ Main class for building a scientific corpus from multiple sources.
150
+
151
+ Methods:
152
+ fetch_arxiv_papers: Collects papers from arXiv.
153
+ fetch_pubmed_papers: Collects papers from PubMed.
154
+ fetch_fineweb_edu: Collects educational content from FineWeb-Edu.
155
+ preprocess_sample: Cleans and segments a paper into samples.
156
+ process_papers: Tags, filters, and preprocesses papers.
157
+ build_corpus: Orchestrates the full pipeline and builds the corpus.
158
+ print_report: Prints a summary report of the build process.
159
+ """
160
+
161
+ def __init__(self, config: Optional[CorpusConfig] = None):
162
+ """
163
+ Initialize the corpus builder with configuration and dependencies.
164
+
165
+ Args:
166
+ config: Optional CorpusConfig object.
167
+ """
168
+ self.config = config or CorpusConfig()
169
+ self.preprocessor = QLoRAPreprocessor(corpus_type="scientific")
170
+ self.analyzer = DatasetAnalyzer()
171
+ self.ranker = EntropyRanker()
172
+ self.data_dir = Path("scientific_corpus_data")
173
+ self.data_dir.mkdir(exist_ok=True)
174
+ self._setup_apis()
175
+ self.seen_titles: Set[str] = set()
176
+ self.metrics = {
177
+ "arxiv": SourceMetrics(),
178
+ "pubmed": SourceMetrics(),
179
+ "fineweb_edu": SourceMetrics(),
180
+ "total_tokens": 0,
181
+ "total_time": 0.0
182
+ }
183
+
184
+ @staticmethod
185
+ def _setup_apis() -> None:
186
+ """
187
+ Setup API configurations for external data sources.
188
+ """
189
+ Entrez.email = os.getenv("ENTREZ_EMAIL", "[email protected]")
190
+ if Entrez.email == "[email protected]":
191
+ logger.warning("Using default email for Entrez. Set ENTREZ_EMAIL environment variable.")
192
+
193
+ @retry(max_retries=3, backoff_factor=2,
194
+ exceptions=(arxiv.ArxivError, HTTPError, URLError, ConnectionError))
195
+ def _fetch_arxiv_search(self, query: str, max_results: int) -> List[Any]:
196
+ """
197
+ Fetch arXiv search results with error handling and exponential backoff.
198
+
199
+ Args:
200
+ query: arXiv API query string.
201
+ max_results: Maximum number of results to fetch.
202
+
203
+ Returns:
204
+ List of arXiv result objects.
205
+ """
206
+ try:
207
+ search = arxiv.Search(
208
+ query=query,
209
+ max_results=max_results,
210
+ sort_by=arxiv.SortCriterion.SubmittedDate,
211
+ )
212
+ client = arxiv.Client()
213
+ results = list(client.results(search))
214
+ if not results:
215
+ logger.warning(f"Empty page returned for query '{query}'")
216
+ return results
217
+ except (arxiv.UnexpectedEmptyPageError, arxiv.HTTPError) as e:
218
+ logger.warning(f"Empty page returned for query '{query}': {e}")
219
+ return []
220
+ except Exception as e:
221
+ logger.error(f"Error in _fetch_arxiv_search for query '{query}': {e}")
222
+ raise
223
+
224
+ def fetch_arxiv_papers(self) -> List[Dict]:
225
+ """
226
+ Fetch papers from arXiv across multiple domains with verification and checkpoint saving.
227
+
228
+ Returns:
229
+ List of arXiv paper dictionaries.
230
+ """
231
+ logger.info("Starting arXiv paper collection...")
232
+ start_time = time.time()
233
+ papers = []
234
+ queries = [
235
+ ("physics", "cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph"),
236
+ ("biology", "cat:q-bio*"),
237
+ ("materials", "cat:cond-mat.mtrl-sci OR cat:materials*")
238
+ ]
239
+ for domain, query in queries:
240
+ if is_shutdown:
241
+ break
242
+ try:
243
+ results = self._fetch_arxiv_search(query, self.config.max_arxiv_papers // 3)
244
+ for result in tqdm(results, desc=f"arXiv {domain}"):
245
+ if is_shutdown:
246
+ break
247
+ try:
248
+ paper = {
249
+ "title": result.title.strip() if result.title else "",
250
+ "abstract": result.summary.strip() if result.summary else "",
251
+ "full_text": "",
252
+ "domain": domain,
253
+ "section": "abstract",
254
+ "source": "arxiv",
255
+ "authors": [str(a) for a in result.authors] if result.authors else [],
256
+ "published": result.published.isoformat() if result.published else None,
257
+ "provenance": {"arxiv_id": result.get_short_id()},
258
+ "categories": [c for c in getattr(result, "categories", [])] if hasattr(result, "categories") else [],
259
+ "text": result.summary.strip() if result.summary else ""
260
+ }
261
+ if paper["title"] and paper["title"] not in self.seen_titles:
262
+ papers.append(paper)
263
+ self.seen_titles.add(paper["title"])
264
+ except Exception as e:
265
+ logger.warning(f"Error processing arXiv result: {e}")
266
+ self.metrics["arxiv"].errors += 1
267
+ continue
268
+ except Exception as e:
269
+ logger.error(f"arXiv {domain} search failed: {e}")
270
+ self.metrics["arxiv"].errors += 1
271
+ self._save_intermediate(papers, "arxiv_papers.jsonl")
272
+ elapsed = time.time() - start_time
273
+ self.metrics["arxiv"].papers = len(papers)
274
+ self.metrics["arxiv"].time = elapsed
275
+ logger.info(f"Collected {len(papers)} arXiv papers in {elapsed:.2f}s")
276
+ return papers
277
+
278
+ @retry(max_retries=3, backoff_factor=2,
279
+ exceptions=(HTTPError, URLError, ConnectionError, ExpatError))
280
+ def _fetch_pubmed_batch(self, chunk_pmids: List[str]) -> Dict:
281
+ """
282
+ Fetch a batch of PubMed records with error handling.
283
+
284
+ Args:
285
+ chunk_pmids: List of PubMed IDs.
286
+
287
+ Returns:
288
+ Dictionary of PubMed records.
289
+ """
290
+ try:
291
+ fetch_handle = Entrez.efetch (
292
+ db="pubmed",
293
+ id=",".join (chunk_pmids),
294
+ rettype="medline",
295
+ retmode="xml"
296
+ )
297
+ records = Entrez.read (fetch_handle)
298
+ fetch_handle.close ()
299
+ return records
300
+ except ExpatError as e:
301
+ logger.error (f"XML parsing error in PubMed batch: {e}")
302
+ raise
303
+ except (HTTPError, URLError) as e:
304
+ logger.error (f"Network error fetching PubMed batch: {e}")
305
+ raise
306
+
307
+ def fetch_pubmed_papers(self) -> List[Dict]:
308
+ """
309
+ Fetch papers from PubMed with biology focus.
310
+
311
+ Returns:
312
+ List of PubMed paper dictionaries.
313
+ """
314
+ logger.info ("Starting PubMed paper collection...")
315
+ start_time = time.time ()
316
+ papers = []
317
+
318
+ search_terms = [
319
+ "(methods[Title/Abstract]) AND (biology[MeSH Terms])",
320
+ "(computational biology[MeSH Terms]) AND (methods[Title/Abstract])",
321
+ "(bioinformatics[MeSH Terms]) AND (algorithm[Title/Abstract])",
322
+ "(molecular biology[MeSH Terms]) AND (technique[Title/Abstract])"
323
+ ]
324
+
325
+ for search_term in search_terms:
326
+ if is_shutdown:
327
+ break
328
+
329
+ try:
330
+ handle = Entrez.esearch (
331
+ db="pubmed",
332
+ term=search_term,
333
+ retmax=self.config.max_pubmed_papers // len (search_terms),
334
+ sort="relevance"
335
+ )
336
+ record = Entrez.read (handle)
337
+ handle.close ()
338
+ pmids = record.get ("IdList", [])
339
+
340
+ for i in tqdm (range (0, len (pmids), self.config.chunk_size), desc="PubMed batch"):
341
+ if is_shutdown:
342
+ break
343
+
344
+ chunk_pmids = pmids [i:i + self.config.chunk_size]
345
+ try:
346
+ records = self._fetch_pubmed_batch (chunk_pmids)
347
+
348
+ for rec in records.get ("PubmedArticle", []):
349
+ try:
350
+ medline_citation = rec.get ("MedlineCitation", {})
351
+ article = medline_citation.get ("Article", {})
352
+
353
+ title = article.get ("ArticleTitle", "")
354
+ abstract_list = article.get ("Abstract", {}).get ("AbstractText", [""])
355
+ abstract = abstract_list [0] if abstract_list else ""
356
+
357
+ if title and isinstance (title, str) and title not in self.seen_titles:
358
+ paper = {
359
+ "title": title.strip (),
360
+ "abstract": abstract.strip () if isinstance (abstract, str) else "",
361
+ "full_text": "",
362
+ "domain": "biology",
363
+ "section": "abstract",
364
+ "source": "pubmed",
365
+ "authors": [],
366
+ "published": None,
367
+ "provenance": {"pubmed_id": str (medline_citation.get ("PMID", ""))},
368
+ "categories": ["biology"],
369
+ "text": abstract.strip () if isinstance (abstract, str) else ""
370
+ }
371
+ papers.append (paper)
372
+ self.seen_titles.add (title)
373
+
374
+ except (KeyError, TypeError, AttributeError) as e:
375
+ logger.warning (f"Error processing PubMed record: {e}")
376
+ self.metrics ["pubmed"].errors += 1
377
+ continue
378
+
379
+ except (HTTPError, URLError, ConnectionError, ExpatError) as e:
380
+ self.metrics ["pubmed"].errors += 1
381
+ logger.warning (f"Failed to fetch PubMed batch: {e}")
382
+ continue
383
+
384
+ except (HTTPError, URLError, ConnectionError, ExpatError) as e:
385
+ self.metrics ["pubmed"].errors += 1
386
+ logger.error (f"PubMed search failed for {search_term}: {e}")
387
+ except KeyboardInterrupt:
388
+ logger.info ("PubMed collection interrupted by user")
389
+ break
390
+
391
+ self._save_intermediate (papers, "pubmed_papers.jsonl")
392
+ elapsed = time.time () - start_time
393
+ self.metrics ["pubmed"].papers = len (papers)
394
+ self.metrics ["pubmed"].time = elapsed
395
+ logger.info (f"Collected {len (papers)} PubMed papers in {elapsed:.2f}s")
396
+ return papers
397
+
398
+ @retry (max_retries=3, backoff_factor=2,
399
+ exceptions=(ConnectionError, HTTPError, URLError, OSError))
400
+ def fetch_fineweb_edu(self) -> List [Dict]:
401
+ """
402
+ Fetch educational content from FineWeb-Edu dataset.
403
+
404
+ Returns:
405
+ List of FineWeb-Edu paper dictionaries.
406
+ """
407
+ logger.info ("Starting FineWeb-Edu collection...")
408
+ start_time = time.time ()
409
+ papers = []
410
+
411
+ try:
412
+ ds = load_dataset ("HuggingFaceFW/fineweb-edu", "sample-10BT",
413
+ split="train", streaming=True)
414
+ samples = []
415
+
416
+ for i, sample in enumerate (ds):
417
+ if is_shutdown:
418
+ break
419
+ if i >= self.config.max_fineweb_samples:
420
+ break
421
+
422
+ if not isinstance (sample, dict) or "text" not in sample:
423
+ logger.warning (f"Invalid sample structure at index {i}")
424
+ continue
425
+
426
+ samples.append (sample)
427
+ if (i + 1) % 10000 == 0:
428
+ logger.info (f"Collected {i + 1} FineWeb samples")
429
+
430
+ logger.info (f"Processing {len (samples)} FineWeb samples")
431
+
432
+ def is_educational_content(sample: Dict) -> bool:
433
+ """Check if content is educational and suitable."""
434
+ try:
435
+ text = sample.get ("text", "")
436
+ if not isinstance (text, str) or len (text) < 500:
437
+ return False
438
+ return self.ranker.is_explanatory (text)
439
+ except (AttributeError, TypeError, ValueError) as e:
440
+ logger.debug (f"Error evaluating educational content: {e}")
441
+ return False
442
+
443
+ with concurrent.futures.ThreadPoolExecutor (max_workers=self.config.max_workers) as executor:
444
+ filtered_results = list (tqdm (
445
+ executor.map (is_educational_content, samples),
446
+ total=len (samples),
447
+ desc="Filtering FineWeb content"
448
+ ))
449
+
450
+ for sample, is_good in zip (samples, filtered_results):
451
+ if is_shutdown:
452
+ break
453
+ if is_good:
454
+ try:
455
+ url = sample.get ("url", "")
456
+ meta = sample.get ("meta", {})
457
+ title = meta.get ("title", "") if isinstance (meta, dict) else ""
458
+ title = title or url or f"Document_{len (papers)}"
459
+
460
+ if title not in self.seen_titles:
461
+ paper = {
462
+ "title": title,
463
+ "abstract": "",
464
+ "full_text": sample.get ("text", ""),
465
+ "domain": "education",
466
+ "section": "full_text",
467
+ "source": "fineweb_edu",
468
+ "authors": [],
469
+ "published": None,
470
+ "provenance": {"url": url},
471
+ "categories": ["education"],
472
+ "text": sample.get("text", "")
473
+ }
474
+ papers.append (paper)
475
+ self.seen_titles.add (title)
476
+ except (KeyError, TypeError, AttributeError) as e:
477
+ logger.warning (f"Error processing FineWeb sample: {e}")
478
+ self.metrics ["fineweb_edu"].errors += 1
479
+ continue
480
+
481
+ except (ConnectionError, HTTPError, URLError, OSError) as e:
482
+ logger.error (f"FineWeb-Edu fetch failed: {e}")
483
+ self.metrics ["fineweb_edu"].errors += 1
484
+ except KeyboardInterrupt:
485
+ logger.info ("FineWeb-Edu collection interrupted by user")
486
+ except ImportError as e:
487
+ logger.error (f"Failed to import required dataset library: {e}")
488
+ self.metrics ["fineweb_edu"].errors += 1
489
+
490
+ self._save_intermediate (papers, "fineweb_edu.jsonl")
491
+ elapsed = time.time () - start_time
492
+ self.metrics ["fineweb_edu"].papers = len (papers)
493
+ self.metrics ["fineweb_edu"].time = elapsed
494
+ logger.info (f"Collected {len (papers)} FineWeb-Edu papers in {elapsed:.2f}s")
495
+ return papers
496
+
497
+ @staticmethod
498
+ def preprocess_sample(paper: Dict) -> List [Dict]:
499
+ """
500
+ Preprocess a paper sample into multiple training samples.
501
+
502
+ Args:
503
+ paper: Dictionary representing a paper.
504
+
505
+ Returns:
506
+ List of processed sample dictionaries.
507
+ """
508
+ try:
509
+ title = clean_text (paper.get ("title", "")) if paper.get ("title") else ""
510
+ abstract = clean_text (paper.get ("abstract", "")) if paper.get ("abstract") else ""
511
+ full_text = clean_text (paper.get ("full_text", "")) if paper.get ("full_text") else ""
512
+
513
+ paragraphs = segment_paragraphs (full_text) if full_text else []
514
+ samples = []
515
+
516
+ if title or abstract:
517
+ sample = dict (paper)
518
+ sample ["title"] = title
519
+ sample ["abstract"] = abstract
520
+ sample ["full_text"] = ""
521
+ sample ["section"] = "abstract"
522
+ samples.append (sample)
523
+
524
+ for para in paragraphs:
525
+ if para.strip ():
526
+ sample = dict (paper)
527
+ sample ["title"] = title
528
+ sample ["abstract"] = ""
529
+ sample ["full_text"] = para
530
+ sample ["section"] = "paragraph"
531
+ samples.append (sample)
532
+
533
+ return samples
534
+
535
+ except (AttributeError, TypeError, ValueError) as e:
536
+ logger.warning (f"Error preprocessing sample: {e}")
537
+ return []
538
+
539
+ def process_papers(self, papers: List[Dict], domain: str) -> List[Dict]:
540
+ """
541
+ Process papers with domain-specific tagging and filtering.
542
+
543
+ Args:
544
+ papers: List of paper dictionaries.
545
+ domain: Domain string for tagging.
546
+
547
+ Returns:
548
+ List of processed and filtered sample dictionaries.
549
+ """
550
+ logger.info(f"Processing {len(papers)} {domain} papers...")
551
+ processed = []
552
+ unknown_domains = 0
553
+ unknown_sections = 0
554
+
555
+ def label_domain(paper):
556
+ cats = paper.get('categories', [])
557
+ if not cats:
558
+ return 'unknown'
559
+ cats_str = " ".join(cats).lower()
560
+ if 'bio' in cats_str:
561
+ return '[BIO]'
562
+ if 'gen' in cats_str:
563
+ return '[GEN]'
564
+ if 'phys' in cats_str:
565
+ return '[PHY]'
566
+ if 'math' in cats_str:
567
+ return '[MATH]'
568
+ if 'mat' in cats_str or 'materials' in cats_str:
569
+ return '[MAT]'
570
+ if 'astro' in cats_str:
571
+ return '[ASTRO]'
572
+ if 'cs' in cats_str:
573
+ return '[CS]'
574
+ return 'unknown'
575
+
576
+ def label_section(paper):
577
+ text = paper.get('text', '') or paper.get('abstract', '') or ''
578
+ text_lower = text.lower()
579
+ if not text_lower:
580
+ return 'unknown'
581
+ if 'abstract' in text_lower:
582
+ return '[ABSTRACT]'
583
+ if 'introduction' in text_lower:
584
+ return '[INTRO]'
585
+ if 'methods' in text_lower:
586
+ return '[METHODS]'
587
+ if 'results' in text_lower:
588
+ return '[RESULTS]'
589
+ if 'discussion' in text_lower:
590
+ return '[DISCUSSION]'
591
+ if 'conclusion' in text_lower:
592
+ return '[CONCLUSION]'
593
+ return 'unknown'
594
+
595
+ for paper in tqdm(papers, desc=f"Processing {domain} papers"):
596
+ try:
597
+ domain_tag = label_domain(paper)
598
+ section_tag = label_section(paper)
599
+ paper["domain_tag"] = domain_tag
600
+ paper["section_tag"] = section_tag
601
+ if domain_tag == 'unknown':
602
+ unknown_domains += 1
603
+ if section_tag == 'unknown':
604
+ unknown_sections += 1
605
+
606
+ task = paper.get("task", None)
607
+ if task and task in TASK_TAGS:
608
+ paper["task_tag"] = TASK_TAGS[task]
609
+
610
+ routing = paper.get("routing", "general")
611
+ paper["routing_tag"] = ROUTING_TAGS.get(routing, ROUTING_TAGS["general"])
612
+
613
+ samples = self.preprocess_sample(paper)
614
+
615
+ for sample in samples:
616
+ try:
617
+ content_parts = []
618
+ if sample.get("title"):
619
+ content_parts.append(str(sample["title"]))
620
+ if sample.get("abstract"):
621
+ content_parts.append(str(sample["abstract"]))
622
+ if sample.get("full_text"):
623
+ content_parts.append(str(sample["full_text"])[:1000])
624
+ content = " ".join(content_parts)
625
+ if content.strip() and self.ranker.is_explanatory(content):
626
+ sample["domain_tag"] = paper["domain_tag"]
627
+ sample["section_tag"] = paper["section_tag"]
628
+ sample["routing_tag"] = paper["routing_tag"]
629
+ if "task_tag" in paper:
630
+ sample["task_tag"] = paper["task_tag"]
631
+ processed.append(sample)
632
+ except Exception as e:
633
+ logger.debug(f"Error evaluating sample content: {e}")
634
+ continue
635
+
636
+ except Exception as e:
637
+ logger.warning(f"Paper processing error: {e}")
638
+ continue
639
+
640
+ logger.info(f"Processed {len(processed)}/{len(papers)} {domain} papers")
641
+ logger.info(f"Unknown domains: {unknown_domains}, Unknown sections: {unknown_sections}")
642
+ return processed
643
+
644
+ def _save_intermediate(self, papers: List[Dict], filename: str) -> None:
645
+ """
646
+ Save intermediate results to disk as JSONL.
647
+
648
+ Args:
649
+ papers: List of paper/sample dictionaries.
650
+ filename: Output filename.
651
+ """
652
+ path = self.data_dir / filename
653
+ try:
654
+ with open (path, "w", encoding="utf-8") as f:
655
+ for paper in papers:
656
+ f.write (json.dumps (paper, ensure_ascii=False) + "\n")
657
+ logger.info (f"Saved checkpoint to {path}")
658
+ except (OSError, IOError, PermissionError) as e:
659
+ logger.error (f"Failed to save intermediate file {filename}: {e}")
660
+ except (TypeError, ValueError) as e:
661
+ logger.error (f"JSON serialization error for {filename}: {e}")
662
+
663
+ def build_corpus(self, output_path: str, verify_only: bool = False) -> None:
664
+ """
665
+ Build the complete scientific corpus with checkpoint verification.
666
+
667
+ Args:
668
+ output_path: Path to save the final corpus.
669
+ verify_only: If True, only verify checkpoints and skip merging.
670
+ """
671
+ logger.info("Starting scientific corpus build...")
672
+ total_start = time.time()
673
+ all_papers = []
674
+
675
+ sources = [
676
+ ("arXiv", self.fetch_arxiv_papers, None),
677
+ ("PubMed", self.fetch_pubmed_papers, "biology"),
678
+ ("FineWeb-Edu", self.fetch_fineweb_edu, "education")
679
+ ]
680
+ for source_name, fetch_func, domain in sources:
681
+ if is_shutdown:
682
+ break
683
+ logger.info(f"Fetching {source_name} papers...")
684
+ try:
685
+ papers = fetch_func()
686
+ if domain:
687
+ processed = []
688
+ for i in range(0, len(papers), self.config.chunk_size):
689
+ chunk = papers[i:i + self.config.chunk_size]
690
+ processed.extend(self.process_papers(chunk, domain))
691
+ papers = processed
692
+ chkpt_filename = f"{source_name.lower()}_papers.jsonl"
693
+ self._save_intermediate(papers, chkpt_filename)
694
+ if not papers:
695
+ logger.error(f"{source_name} checkpoint {chkpt_filename} is empty!")
696
+ all_papers.extend(papers)
697
+ logger.info(f"Added {len(papers)} papers from {source_name}")
698
+ except Exception as e:
699
+ logger.error(f"Critical error fetching from {source_name}: {e}")
700
+ continue
701
+
702
+ logger.info(f"Total papers collected: {len(all_papers)}")
703
+ if verify_only:
704
+ logger.info("Verification flag enabled; skipping merge and build.")
705
+ self.print_report({})
706
+ return
707
+
708
+ if not all_papers:
709
+ logger.error("No papers collected. Cannot build corpus.")
710
+ self.print_report({})
711
+ return
712
+
713
+ logger.info("Ranking and deduplicating papers...")
714
+ try:
715
+ ranked_papers = self.ranker.rank_samples(all_papers)
716
+ if not ranked_papers:
717
+ logger.error("Final corpus is empty after ranking. Using unranked papers as fallback.")
718
+ ranked_papers = all_papers
719
+ logger.info(f"Final corpus size: {len(ranked_papers)} papers")
720
+ except Exception as e:
721
+ logger.error(f"Error ranking papers: {e}")
722
+ ranked_papers = all_papers
723
+
724
+ if not ranked_papers:
725
+ logger.error("Final corpus is empty. No data to process or save.")
726
+ self.print_report({})
727
+ return
728
+
729
+ self._save_intermediate(ranked_papers, "ranked_papers.jsonl")
730
+ try:
731
+ stats = self.analyzer.get_dataset_stats(ranked_papers)
732
+ self.metrics["total_tokens"] = int(stats.get("avg_tokens", 0) * stats.get("total_samples", 0))
733
+ except Exception as e:
734
+ logger.error(f"Error generating dataset statistics: {e}")
735
+ stats = {}
736
+
737
+ self.metrics["total_time"] = time.time() - total_start
738
+ logger.info("Processing final dataset in batches...")
739
+ try:
740
+ with open(output_path, "w", encoding="utf-8") as out_f:
741
+ for i in range(0, len(ranked_papers), self.config.chunk_size):
742
+ chunk = ranked_papers[i:i + self.config.chunk_size]
743
+ for paper in chunk:
744
+ out_f.write(json.dumps(paper, ensure_ascii=False) + "\n")
745
+ except Exception as e:
746
+ logger.error(f"Error processing final dataset: {e}")
747
+
748
+ # HuggingFace upload: warn if a file is too large
749
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 10 * 1024 * 1024:
750
+ logger.warning(
751
+ f"{output_path} is larger than 10 MiB. HuggingFace will reject files >10 MiB unless you use Git LFS. "
752
+ "See https://hf.co/docs/hub/repositories-getting-started#terminal"
753
+ )
754
+ logger.warning(
755
+ "To fix: install git-lfs and run 'git lfs track \"*.jsonl\"' before pushing, or split your file."
756
+ )
757
+
758
+ self.print_report(stats)
759
+ logger.info(f"Scientific corpus successfully built: {output_path}")
760
+
761
+ def build_corpus_scoped(self, plan: str, token_budget: int) -> tuple[list, dict]:
762
+ """
763
+ Build a scientific corpus, limiting the total number of tokens to the plan's budget.
764
+ Returns the corpus and stats.
765
+ """
766
+ logger.info(f"Building corpus for plan '{plan}' with token budget {token_budget}")
767
+ all_papers = []
768
+ all_papers.extend(self.process_papers(self.fetch_arxiv_papers(), "arxiv"))
769
+ all_papers.extend(self.process_papers(self.fetch_pubmed_papers(), "biology"))
770
+ all_papers.extend(self.process_papers(self.fetch_fineweb_edu(), "education"))
771
+
772
+ # Rank and deduplicate
773
+ ranked_papers = self.ranker.rank_samples(all_papers)
774
+ corpus = []
775
+ total_tokens = 0
776
+ for paper in ranked_papers:
777
+ tokens = paper.get("text", "").split()
778
+ if total_tokens + len(tokens) > token_budget:
779
+ break
780
+ corpus.append(paper)
781
+ total_tokens += len(tokens)
782
+ stats = self.analyzer.get_dataset_stats(corpus)
783
+ stats["total_tokens"] = total_tokens
784
+ logger.info(f"Corpus built: {len(corpus)} samples, {total_tokens} tokens")
785
+ return corpus, stats
786
+
787
+ def print_report(self, stats: Dict) -> None:
788
+ """
789
+ Print a comprehensive build report.
790
+
791
+ Args:
792
+ stats: Dictionary of dataset statistics.
793
+ """
794
+ print("\n" + "=" * 67)
795
+ print(" SCIENTIFIC CORPUS BUILD REPORT")
796
+ print("=" * 67)
797
+ print("\nSOURCE METRICS:")
798
+ print("-" * 40)
799
+ for source_name, label in zip(["arxiv", "pubmed", "fineweb_edu"],
800
+ ["ARXIV", "PUBMED", "FINEWEB_EDU"]):
801
+ metrics = self.metrics[source_name]
802
+ print(f"{label:15}: {metrics.papers:6d} papers | {metrics.errors:3d} errors | {metrics.time:9.2f}s")
803
+ print("\nOVERALL METRICS:")
804
+ print("-" * 40)
805
+ total_papers = sum(self.metrics[src].papers for src in ["arxiv", "pubmed", "fineweb_edu"])
806
+ total_errors = sum(self.metrics[src].errors for src in ["arxiv", "pubmed", "fineweb_edu"])
807
+ print(f"Total Papers: {total_papers:,}")
808
+ print(f"Total Tokens: {self.metrics['total_tokens']:,}")
809
+ print(f"Total Time: {self.metrics['total_time']:.2f}s")
810
+ print(f"Total Errors: {total_errors}")
811
+ success_rate = (1 - total_errors / max(total_papers + total_errors, 1)) * 100
812
+ print(f"Success Rate: {success_rate:.2f}%")
813
+ if stats:
814
+ print("\nDATASET STATISTICS:")
815
+ print("-" * 40)
816
+ for key, value in stats.items():
817
+ print(f"{key:20}: {value}")
818
+ print("=" * 67)
819
+ print()
820
+
821
+
822
+ def main() -> None:
823
+ """
824
+ Main entry point for the corpus builder.
825
+ """
826
+ try:
827
+ config = CorpusConfig()
828
+ builder = ScientificCorpusBuilder(config)
829
+ output_path = "scientific_corpus_325M.jsonl"
830
+ builder.build_corpus(output_path)
831
+
832
+ # --- Hugging Face upload with improved error handling ---
833
+ try:
834
+ # Split large files if needed
835
+ file_size = os.path.getsize(output_path)
836
+ if file_size > 10 * 1024 * 1024: # 10 MB
837
+ logger.info("Large file detected, splitting into chunks...")
838
+ chunk_size = 10 * 1024 * 1024 # 10 MB chunks
839
+ base_path = os.path.splitext(output_path)[0]
840
+
841
+ with open(output_path, 'r', encoding='utf-8') as f:
842
+ chunk_num = 0
843
+ chunk = []
844
+ current_size = 0
845
+
846
+ for line in f:
847
+ line_size = len(line.encode('utf-8'))
848
+ if current_size + line_size > chunk_size and chunk:
849
+ chunk_path = f"{base_path}_part{chunk_num}.jsonl"
850
+ with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
851
+ chunk_file.writelines(chunk)
852
+ logger.info(f"Created chunk {chunk_num}: {chunk_path}")
853
+ chunk = []
854
+ current_size = 0
855
+ chunk_num += 1
856
+
857
+ chunk.append(line)
858
+ current_size += line_size
859
+
860
+ # Write final chunk
861
+ if chunk:
862
+ chunk_path = f"{base_path}_part{chunk_num}.jsonl"
863
+ with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
864
+ chunk_file.writelines(chunk)
865
+ logger.info(f"Created final chunk {chunk_num}: {chunk_path}")
866
+
867
+ # Upload each chunk
868
+ for i in range(chunk_num + 1):
869
+ chunk_path = f"{base_path}_part{i}.jsonl"
870
+ logger.info(f"Uploading chunk {i}...")
871
+ upload_to_huggingface(
872
+ dataset_path=chunk_path,
873
+ repo_id="Allanatrix/Scientific_Research_Tokenized",
874
+ auto_generate_readme=(i == 0), # Only generate README for first chunk
875
+ compress=True,
876
+ keep_local=True # Keep files until all uploads complete
877
+ )
878
+ else:
879
+ # Upload single file
880
+ upload_to_huggingface(
881
+ dataset_path=output_path,
882
+ repo_id="Allanatrix/Scientific_Research_Tokenized",
883
+ auto_generate_readme=True,
884
+ compress=True
885
+ )
886
+
887
+ except ImportError:
888
+ logger.error("Hugging Face upload module not found. Please ensure hf_upload.py exists.")
889
+ except Exception as e:
890
+ logger.error(f"Error during Hugging Face upload: {e}")
891
+ if "EOF" in str(e) or "timeout" in str(e):
892
+ logger.warning("Upload interrupted. Try using smaller chunks or increasing timeout.")
893
+ finally:
894
+ # Cleanup temporary files
895
+ if 'chunk_num' in locals():
896
+ for i in range(chunk_num + 1):
897
+ try:
898
+ os.remove(f"{base_path}_part{i}.jsonl")
899
+ except OSError:
900
+ pass
901
+
902
+ except KeyboardInterrupt:
903
+ logger.info("Build process interrupted by user")
904
+ except Exception as e:
905
+ logger.error(f"Unexpected error in main: {e}")
906
+ raise
907
+
908
+ # Alternative CLI entry point for testing plan-scoped corpus builds:
909
+ def main_scoped(plan: str = "free"):
910
+ config = CorpusConfig()
911
+ builder = ScientificCorpusBuilder(config)
912
+ token_budget = PLAN_LIMITS.get(plan, 1000)
913
+ corpus, stats = builder.build_corpus_scoped(plan, token_budget)
914
+ output_path = f"scientific_corpus_{plan}_{token_budget}.jsonl"
915
+ with open(output_path, "w", encoding="utf-8") as f:
916
+ for paper in corpus:
917
+ f.write(json.dumps(paper, ensure_ascii=False) + "\n")
918
+ print(f"Saved {len(corpus)} samples ({stats['total_tokens']} tokens) to {output_path}")
919
+
920
+ if __name__ == "__main__":
921
+ # main() # old entry point
922
+ main_scoped("free") # new entry point for plan-scoped corpus
Tokenization/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # Tokenization/__init__.py
2
+
3
+ from .Entropy_ranker import EntropyRanker
4
+ from .Label_tokens import DOMAIN_TAGS, TASK_TAGS, SECTION_TAGS, ROUTING_TAGS, build_tag_string
5
+ from .preprocessing import clean_text, segment_paragraphs, preprocess_sample
6
+
7
+ # Expose the main dataset generation pipeline for external use
8
+ from .generate_dataset import generate_dataset
9
+
10
+ __all__ = [
11
+ "EntropyRanker",
12
+ "DOMAIN_TAGS",
13
+ "TASK_TAGS",
14
+ "SECTION_TAGS",
15
+ "ROUTING_TAGS",
16
+ "build_tag_string",
17
+ "clean_text",
18
+ "segment_paragraphs",
19
+ "preprocess_sample",
20
+ "generate_dataset",
21
+ ]
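
With these exports, downstream code can import at the package level, assuming the repository root is on PYTHONPATH:

    from Tokenization import EntropyRanker, build_tag_string

    ranker = EntropyRanker()
    tag = build_tag_string("physics", task="hypothesis")   # "[PHYS][HYP][GEN]"
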
Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc ADDED
Binary file (3.54 kB).
 
Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc ADDED
Binary file (3.39 kB).
 
Tokenization/__pycache__/Label_tokens.cpython-310.pyc ADDED
Binary file (1.35 kB).
 
Tokenization/__pycache__/Main_2.cpython-310.pyc ADDED
Binary file (26.8 kB).
 
Tokenization/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (565 Bytes).
 
Tokenization/__pycache__/generate_dataset.cpython-310.pyc ADDED
Binary file (3.14 kB).
 
Tokenization/__pycache__/hf_upload.cpython-310.pyc ADDED
Binary file (5.56 kB).
 
Tokenization/app.py ADDED
@@ -0,0 +1,147 @@
1
+ import gradio as gr
2
+ import time
3
+
4
+ def calculate_price(payment_mode, tokens, plan, custom_price, file):
5
+ if payment_mode == "Pay as you go":
6
+ price = round(tokens * 0.01, 2) # Example: $0.01 per token
7
+ return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
8
+ elif payment_mode == "Plan":
9
+ if plan == "Free":
10
+ return "0 tokens\nPrice: $0", 0
11
+ elif plan == "Starter":
12
+ return "100,000 tokens\nPrice: $15", 15
13
+ elif plan == "Pro":
14
+ return "500,000 tokens\nPrice: $30", 30
15
+ elif plan == "Custom":
16
+ return f"Custom plan\nPrice: ${custom_price}", float(custom_price or 0)
17
+ elif file is not None:
18
+ # Simulate token count from file size
19
+ tokens = 1000 # placeholder; replace with a real token count derived from the uploaded file
20
+ price = round(tokens * 0.01, 2)
21
+ return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
22
+ return "", 0
23
+
24
+ def generate_dataset(*args, **kwargs):
25
+ for i in range(5):
26
+ yield f"Generating... ({(i+1)*20}%)", None, (i+1)/5
27
+ time.sleep(0.3)
28
+ yield "Ready! Please pay to download.", "dataset.jsonl", 1.0
29
+
30
+ with gr.Blocks(
31
+ title="Nexa Data Studio",
32
+ css="""
33
+ body, .gradio-container {
34
+ min-height: 100vh;
35
+ background: #111 !important;
36
+ color: #fff !important;
37
+ }
38
+ .gradio-container {
39
+ max-width: 900px !important;
40
+ margin: 40px auto !important;
41
+ box-shadow: 0 2px 16px #0008;
42
+ border-radius: 16px;
43
+ padding: 32px 32px 24px 32px !important;
44
+ background: #111 !important;
45
+ color: #fff !important;
46
+ display: flex;
47
+ flex-direction: column;
48
+ align-items: center;
49
+ }
50
+ .footer {margin-top: 2em; color: #bbb; font-size: 0.9em; text-align: center;}
51
+ #header {text-align: center;}
52
+ """
53
+ ) as demo:
54
+ gr.Markdown(
55
+ """
56
+ <div style="display:flex;align-items:center;gap:16px;justify-content:center;">
57
+ <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" height="40"/>
58
+ <h1 style="margin-bottom:0;">Nexa Data Studio</h1>
59
+ </div>
60
+ <p style="text-align:center;">
61
+ <b>Generate or label scientific datasets for ML research.</b>
62
+ </p>
63
+ """,
64
+ elem_id="header"
65
+ )
66
+
67
+ payment_mode = gr.Radio(
68
+ ["Pay as you go", "Plan"],
69
+ label="Payment Mode",
70
+ value="Pay as you go"
71
+ )
72
+
73
+ with gr.Row() as payg_row:
74
+ tokens = gr.Slider(100, 100000, value=1000, step=100, label="Tokens Requested")
75
+ with gr.Row(visible=False) as plan_row:
76
+ plan = gr.Dropdown(
77
+ ["Free", "Starter", "Pro", "Custom"],
78
+ label="Plan",
79
+ value="Free"
80
+ )
81
+ custom_price = gr.Number(label="Custom Price ($)", visible=False)
82
+
83
+ job_type = gr.Radio(
84
+ ["Generate Dataset", "Label Uploaded Data"],
85
+ label="Job Type",
86
+ value="Generate Dataset"
87
+ )
88
+
89
+ with gr.Column(visible=False) as label_col:
90
+ file = gr.File(label="Upload Dataset (.txt or .jsonl)")
91
+
92
+ price_info = gr.Textbox(label="Summary", interactive=False)
93
+ download = gr.File(label="Download")
94
+ progress = gr.Slider(0, 1, value=0, step=0.01, label="Progress", interactive=False)
95
+ status = gr.Text(label="Status", interactive=False)
96
+
97
+ def update_payment_ui(payment_mode_val, plan_val):
98
+ return (
99
+ gr.update(visible=payment_mode_val == "Pay as you go"),
100
+ gr.update(visible=payment_mode_val == "Plan"),
101
+ gr.update(visible=payment_mode_val == "Plan" and plan_val == "Custom")
102
+ )
103
+
104
+ payment_mode.change(
105
+ update_payment_ui,
106
+ inputs=[payment_mode, plan],
107
+ outputs=[payg_row, plan_row, custom_price]
108
+ )
109
+ plan.change(
110
+ lambda p: gr.update(visible=p == "Custom"),
111
+ inputs=plan,
112
+ outputs=custom_price
113
+ )
114
+
115
+ def update_label_ui(job_type_val):
116
+ return gr.update(visible=job_type_val == "Label Uploaded Data")
117
+ job_type.change(update_label_ui, inputs=job_type, outputs=label_col)
118
+
119
+ def update_summary(payment_mode, tokens, plan, custom_price, file, job_type):
120
+ if job_type == "Label Uploaded Data" and file is not None:
121
+ return calculate_price("Label", tokens, plan, custom_price, file)[0]
122
+ return calculate_price(payment_mode, tokens, plan, custom_price, file)[0]
123
+
124
+ inputs = [payment_mode, tokens, plan, custom_price, file, job_type]
125
+ gr.Button("Generate", elem_id="generate-btn", variant="primary").click(
126
+ generate_dataset,
127
+ inputs=inputs,
128
+ outputs=[status, download, progress]
129
+ )
130
+ gr.Button("Update Summary").click(
131
+ update_summary,
132
+ inputs=inputs,
133
+ outputs=price_info
134
+ )
135
+
136
+ gr.Markdown(
137
+ f"""
138
+ <div class="footer">
139
+ &copy; {time.strftime("%Y")} Nexa Data Studio &mdash; Powered by Hugging Face Spaces<br>
140
+ For support, contact <a href="mailto:[email protected]">[email protected]</a>
141
+ </div>
142
+ """
143
+ )
144
+
145
+ if __name__ == "__main__":
146
+ print("Nexa Data Studio is starting at http://localhost:7860")
147
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)  # blocks until the server stops
Tokenization/app/Api.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Api.py: FastAPI endpoints for dataset generation, progress polling, and download.
3
+ """
4
+ from fastapi import FastAPI, Request
5
+ from fastapi.responses import JSONResponse, StreamingResponse
6
+ from .Core import job_manager
7
+ from .Progress import progress_tracker
8
+ from .Payment import payment_manager
9
+ import io
10
+
11
+ app = FastAPI()
12
+
13
+ @app.post("/generate-dataset")
14
+ async def generate_dataset(request: Request):
15
+ user_input = await request.json()
16
+ job_id, error = job_manager.start_job(user_input)
17
+ if error:
18
+ return JSONResponse({"error": error}, status_code=400)
19
+ return {"job_id": job_id}
20
+
21
+ @app.get("/progress/{job_id}")
22
+ def get_progress(job_id: str):
23
+ progress = progress_tracker.get(job_id)
24
+ if not progress:
25
+ return JSONResponse({"error": "Job not found"}, status_code=404)
26
+ return progress
27
+
28
+ @app.get("/download/{job_id}")
29
+ def download(job_id: str):
30
+ job = job_manager.get_job_status(job_id)
31
+ if not job or job.get("status") != "complete":
32
+ return JSONResponse({"error": "Job not complete"}, status_code=400)
33
+ # Payment check
34
+ plan = job.get("plan", "free")
35
+ tokens = job.get("token_budget", 0)
36
+ if payment_manager.requires_payment(plan, tokens):
37
+ return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
38
+ # In production, use FileResponse to serve the file
39
+ return {
40
+ "download_url": job["result_path"],
41
+ "stats": job.get("stats", {})
42
+ }
43
+
44
+ @app.get("/download-corpus/{job_id}")
45
+ def download_corpus(job_id: str):
46
+ job = job_manager.get_job_status(job_id)
47
+ if not job or job.get("status") != "complete":
48
+ return JSONResponse({"error": "Job not complete"}, status_code=400)
49
+ if job.get("job_type") != "corpus":
50
+ return JSONResponse({"error": "Not a corpus job"}, status_code=400)
51
+ plan = job.get("plan", "free")
52
+ tokens = job.get("token_budget", 0)
53
+ if payment_manager.requires_payment(plan, tokens):
54
+ return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
55
+ jsonl_lines = job.get("jsonl_lines", [])
56
+ stats = job.get("stats", {})
57
+ # Stream the JSONL as a file
58
+ file_like = io.StringIO("\n".join(jsonl_lines))
59
+ headers = {
60
+ "Content-Disposition": f"attachment; filename=scientific_corpus_{job_id}.jsonl"
61
+ }
62
+ return StreamingResponse(file_like, media_type="application/jsonl", headers=headers)
63
+
64
+ @app.get("/job-stats/{job_id}")
65
+ def job_stats(job_id: str):
66
+ job = job_manager.get_job_status(job_id)
67
+ if not job:
68
+ return JSONResponse({"error": "Job not found"}, status_code=404)
69
+ return {"stats": job.get("stats", {})}
70
+
71
+ @app.get("/price/{plan}")
72
+ def get_price(plan: str):
73
+ price = payment_manager.get_price(plan)
74
+ return {"plan": plan, "price": price}
75
+
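A minimal client sketch for these endpoints, assuming the FastAPI app is served locally on port 8000 (as run_backend.py does) and that the `requests` package is installed; the corpus job below walks the /generate-dataset → /progress → /download-corpus flow:

```python
import time
import requests

BASE = "http://localhost:8000"  # assumes run_backend.py is serving the FastAPI app

# Start a corpus job; the payload fields mirror what JobManager.start_job reads.
resp = requests.post(f"{BASE}/generate-dataset",
                     json={"plan": "free", "token_budget": 1000, "job_type": "corpus"})
job_id = resp.json()["job_id"]

# Poll the progress tracker until the job settles.
while True:
    progress = requests.get(f"{BASE}/progress/{job_id}").json()
    print(progress.get("current"), "/", progress.get("total"), "-", progress.get("message"))
    if progress.get("status") == "complete" or str(progress.get("message", "")).startswith("Job failed"):
        break
    time.sleep(2)

# Stream the finished corpus to disk (a 402 response means payment is still required).
download = requests.get(f"{BASE}/download-corpus/{job_id}")
if download.status_code == 200:
    with open(f"scientific_corpus_{job_id}.jsonl", "wb") as f:
        f.write(download.content)
else:
    print(download.status_code, download.json())
```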
Tokenization/app/Config.py ADDED
@@ -0,0 +1,25 @@
1
+ """
2
+ Config.py: Configuration for plan limits, pricing, and app constants.
3
+ """
4
+
5
+ # Plan limits (tokens per plan)
6
+ PLAN_LIMITS = {
7
+ "free": 1000,
8
+ "starter": 5000,
9
+ "pro": 10000,
10
+ "enterprise": 100000,
11
+ }
12
+
13
+ # Pricing per plan (USD)
14
+ PLAN_PRICING = {
15
+ "free": 0,
16
+ "starter": 15,
17
+ "pro": 30,
18
+ "enterprise": "custom",
19
+ }
20
+
21
+ # Other app-wide constants
22
+ tmp_dir = "./tmp_datasets"
23
+
24
+ # Stripe keys, etc. (to be set via environment variables in production)
25
+ STRIPE_API_KEY = None
Tokenization/app/Core.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Core.py: Orchestrates dataset generation jobs, plan enforcement, and background processing.
3
+ """
4
+ import threading
5
+ import uuid
6
+ import os
7
+ import json
8
+ from .Config import PLAN_LIMITS, tmp_dir
9
+ from .Progress import progress_tracker
10
+ from .Payment import payment_manager
11
+
12
+ # Tokenization pipeline imports
13
+ from Tokenization.generate_dataset import generate_dataset
14
+ from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
15
+ from Tokenization.Build_tokenizer import QLoRAPreprocessor
16
+ import nltk
17
+
18
+ class JobManager:
19
+ def __init__(self):
20
+ self.jobs = {}
21
+ self.lock = threading.Lock()
22
+
23
+ def start_job(self, user_input):
24
+ plan = user_input.get("plan")
25
+ token_budget = user_input.get("token_budget")
26
+ job_type = user_input.get("job_type", "tokenize") # "tokenize", "corpus", or "label"
27
+ # For label jobs, token_budget is determined after upload
28
+ if job_type != "label" and not payment_manager.check_plan_limit(plan, token_budget):
29
+ return None, "Plan limit exceeded"
30
+ job_id = str(uuid.uuid4())
31
+ with self.lock:
32
+ self.jobs[job_id] = {
33
+ "status": "pending",
34
+ "plan": plan,
35
+ "token_budget": token_budget,
36
+ "job_type": job_type,
37
+ "user_input": user_input
38
+ }
39
+ if job_type == "corpus":
40
+ thread = threading.Thread(target=self._run_corpus_pipeline, args=(job_id,))
41
+ elif job_type == "label":
42
+ thread = threading.Thread(target=self._run_label_pipeline, args=(job_id,))
43
+ else:
44
+ thread = threading.Thread(target=self._run_job, args=(job_id, user_input))
45
+ thread.start()
46
+ return job_id, None
47
+
48
+ def _run_job(self, job_id, user_input):
49
+ try:
50
+ progress_tracker.start_job(job_id, total_steps=6)
51
+ # Step 1: Data retrieval
52
+ progress_tracker.update(job_id, 1, "Retrieving data from sources...")
53
+ domain = user_input.get("domain")
54
+ token_budget = user_input.get("token_budget")
55
+ plan = user_input.get("plan")
56
+ custom_seed = user_input.get("custom_seed", None)
57
+ # Step 2: Preprocessing
58
+ progress_tracker.update(job_id, 2, "Preprocessing and cleaning data...")
59
+ # Step 3: Tokenization & Labeling
60
+ progress_tracker.update(job_id, 3, "Tokenizing and labeling samples...")
61
+ # Step 4: Validation & Stats
62
+ progress_tracker.update(job_id, 4, "Validating and computing statistics...")
63
+ # Step 5: Formatting output
64
+ progress_tracker.update(job_id, 5, "Formatting dataset as JSONL...")
65
+ # Call tokenizer pipeline (implement in tokenization/tokenizer.py)
66
+ result = generate_dataset(
67
+ domain=domain,
68
+ token_budget=token_budget,
69
+ plan=plan,
70
+ custom_seed=custom_seed,
71
+ progress_callback=lambda step, msg: progress_tracker.update(job_id, step, msg)
72
+ )
73
+ # Step 6: Save output
74
+ os.makedirs(tmp_dir, exist_ok=True)
75
+ output_path = os.path.join(tmp_dir, f"{domain}_{token_budget}_tokens_{job_id}.jsonl")
76
+ with open(output_path, "w", encoding="utf-8") as f:
77
+ for line in result["jsonl_lines"]:
78
+ f.write(line + "\n")
79
+ progress_tracker.update(job_id, 6, "Dataset ready for download.")
80
+ progress_tracker.complete(job_id)
81
+ with self.lock:
82
+ self.jobs[job_id]["status"] = "complete"
83
+ self.jobs[job_id]["result_path"] = output_path
84
+ self.jobs[job_id]["stats"] = result.get("stats", {})
85
+ except Exception as e:
86
+ progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
87
+ with self.lock:
88
+ self.jobs[job_id]["status"] = "failed"
89
+ self.jobs[job_id]["error"] = str(e)
90
+
91
+ def _run_corpus_pipeline(self, job_id):
92
+ try:
93
+ with self.lock:
94
+ user_input = self.jobs[job_id]["user_input"]
95
+ plan = user_input.get("plan")
96
+ token_budget = user_input.get("token_budget")
97
+ progress_tracker.start_job(job_id, total_steps=5)
98
+ progress_tracker.update(job_id, 1, "Building scientific corpus...")
99
+ config = CorpusConfig()
100
+ builder = ScientificCorpusBuilder(config)
101
+ corpus, stats = builder.build_corpus_scoped(plan, token_budget)
102
+ progress_tracker.update(job_id, 2, "Formatting dataset as JSONL...")
103
+ jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in corpus]
104
+ progress_tracker.update(job_id, 3, "Finalizing output...")
105
+ progress_tracker.update(job_id, 4, "Corpus ready for download.")
106
+ progress_tracker.complete(job_id)
107
+ with self.lock:
108
+ self.jobs[job_id]["status"] = "complete"
109
+ self.jobs[job_id]["jsonl_lines"] = jsonl_lines
110
+ self.jobs[job_id]["stats"] = stats
111
+ self.jobs[job_id]["actual_tokens"] = stats.get("total_tokens", 0)
112
+ except Exception as e:
113
+ progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
114
+ with self.lock:
115
+ self.jobs[job_id]["status"] = "failed"
116
+ self.jobs[job_id]["error"] = str(e)
117
+
118
+ def _run_label_pipeline(self, job_id):
119
+ try:
120
+ with self.lock:
121
+ user_input = self.jobs[job_id]["user_input"]
122
+ plan = self.jobs[job_id]["plan"]
123
+ progress_tracker.start_job(job_id, total_steps=4)
124
+ progress_tracker.update(job_id, 1, "Loading and preprocessing dataset...")
125
+ dataset_text = user_input.get("dataset_text", "")
126
+ if not dataset_text:
127
+ raise ValueError("No dataset text provided.")
128
+ tokens = nltk.word_tokenize(dataset_text)
129
+ num_tokens = len(tokens)
130
+ with self.lock:
131
+ self.jobs[job_id]["actual_tokens"] = num_tokens
132
+ if not payment_manager.check_plan_limit(plan, num_tokens):
133
+ raise ValueError("Plan limit exceeded.")
134
+ progress_tracker.update(job_id, 2, "Tokenizing and labeling dataset...")
135
+ preprocessor = QLoRAPreprocessor()
136
+ labeled_data = preprocessor.preprocess_function(dataset_text)
137
+ jsonl_lines = [json.dumps({"text": item}, ensure_ascii=False) for item in labeled_data]
138
+ stats = {"token_count": num_tokens, "sample_count": len(labeled_data)}
139
+ progress_tracker.update(job_id, 3, "Dataset ready for download.")
140
+ progress_tracker.complete(job_id)
141
+ with self.lock:
142
+ self.jobs[job_id]["status"] = "complete"
143
+ self.jobs[job_id]["jsonl_lines"] = jsonl_lines
144
+ self.jobs[job_id]["stats"] = stats
145
+ except Exception as e:
146
+ progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
147
+ with self.lock:
148
+ self.jobs[job_id]["status"] = "failed"
149
+ self.jobs[job_id]["error"] = str(e)
150
+
151
+ def get_job_status(self, job_id):
152
+ with self.lock:
153
+ return self.jobs.get(job_id, None)
154
+
155
+ job_manager = JobManager()
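For reference, a sketch of driving the JobManager in-process for a "label" job, bypassing HTTP; this assumes the NLTK punkt data and the tokenizer dependencies pulled in by QLoRAPreprocessor are installed, and the sample text is made up:

```python
import time
from Tokenization.app.Core import job_manager
from Tokenization.app.Progress import progress_tracker

job_id, error = job_manager.start_job({
    "plan": "free",
    "job_type": "label",
    "token_budget": None,  # label jobs size themselves after tokenizing the upload
    "dataset_text": "Proteins fold into secondary structures such as alpha helices and beta sheets.",
})
assert error is None, error

# The pipeline runs in a background thread; poll until it finishes or fails.
while True:
    job = job_manager.get_job_status(job_id)
    print(job["status"], progress_tracker.get(job_id))
    if job["status"] in ("complete", "failed"):
        break
    time.sleep(1)

if job["status"] == "complete":
    print(job["stats"])           # {'token_count': ..., 'sample_count': ...}
    print(job["jsonl_lines"][:2])
```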
Tokenization/app/Payment.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Payment.py: Plan enforcement and payment logic (Stripe stub).
3
+ """
4
+ import os
5
+ from .Config import PLAN_LIMITS, PLAN_PRICING
6
+
7
+ class PaymentManager:
8
+ def __init__(self):
9
+ self.stripe_api_key = os.getenv("STRIPE_API_KEY")
10
+
11
+ def check_plan_limit(self, plan, requested_tokens):
12
+ limit = PLAN_LIMITS.get(plan, 0)
13
+ return requested_tokens <= limit
14
+
15
+ def get_price(self, plan):
16
+ return PLAN_PRICING.get(plan, 0)
17
+
18
+ def requires_payment(self, plan, requested_tokens):
19
+ if plan == "free":
20
+ return requested_tokens > PLAN_LIMITS["free"]
21
+ return plan not in PLAN_LIMITS
22
+
23
+ def create_checkout_session(self, plan, job_id):
24
+ # Stub: Integrate with Stripe API in production
25
+ return f"https://checkout.stripe.com/pay/{plan}/{job_id}"
26
+
27
+ payment_manager = PaymentManager()
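The stub is easy to exercise directly; the expected values below follow from PLAN_LIMITS and PLAN_PRICING in Config.py:

```python
from Tokenization.app.Payment import payment_manager

print(payment_manager.check_plan_limit("free", 800))    # True: within the 1000-token free tier
print(payment_manager.check_plan_limit("free", 5000))   # False: exceeds the free tier
print(payment_manager.requires_payment("free", 5000))   # True: a paid checkout would be needed
print(payment_manager.get_price("pro"))                 # 30 (USD)
print(payment_manager.create_checkout_session("pro", "job-123"))  # stubbed Stripe-style URL
```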
Tokenization/app/Progress.py ADDED
@@ -0,0 +1,37 @@
1
+ """
2
+ Progress.py: Thread-safe progress tracking for dataset generation jobs.
3
+ """
4
+ import threading
5
+
6
+ class ProgressTracker:
7
+ def __init__(self):
8
+ self._progress = {}
9
+ self._lock = threading.Lock()
10
+
11
+ def start_job(self, job_id, total_steps):
12
+ with self._lock:
13
+ self._progress[job_id] = {
14
+ "current": 0,
15
+ "total": total_steps,
16
+ "status": "started",
17
+ "message": "Job started"
18
+ }
19
+
20
+ def update(self, job_id, current, message=None):
21
+ with self._lock:
22
+ if job_id in self._progress:
23
+ self._progress[job_id]["current"] = current
24
+ if message:
25
+ self._progress[job_id]["message"] = message # No emoji, just message
26
+
27
+ def complete(self, job_id):
28
+ with self._lock:
29
+ if job_id in self._progress:
30
+ self._progress[job_id]["status"] = "complete"
31
+ self._progress[job_id]["message"] = "Job complete"
32
+
33
+ def get(self, job_id):
34
+ with self._lock:
35
+ return self._progress.get(job_id, None)
36
+
37
+ progress_tracker = ProgressTracker()
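A small usage sketch of the tracker on its own (the job id is arbitrary):

```python
from Tokenization.app.Progress import progress_tracker

progress_tracker.start_job("demo-job", total_steps=3)
progress_tracker.update("demo-job", 1, "Fetching data...")
progress_tracker.update("demo-job", 2, "Tokenizing...")
progress_tracker.complete("demo-job")
print(progress_tracker.get("demo-job"))
# {'current': 2, 'total': 3, 'status': 'complete', 'message': 'Job complete'}
```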
Tokenization/app/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ app/__init__.py: Exposes main backend components for reuse.
3
+ """
4
+
5
+ from .Api import app as fastapi_app
6
+ from .Core import job_manager
7
+ from .Progress import progress_tracker
8
+ from .Payment import payment_manager
9
+
10
+ __all__ = [
11
+ "fastapi_app",
12
+ "job_manager",
13
+ "progress_tracker",
14
+ "payment_manager",
15
+ ]
Tokenization/app/__pycache__/Api.cpython-310.pyc ADDED
Binary file (2.81 kB). View file
 
Tokenization/app/__pycache__/Config.cpython-310.pyc ADDED
Binary file (444 Bytes). View file
 
Tokenization/app/__pycache__/Core.cpython-310.pyc ADDED
Binary file (4.86 kB). View file
 
Tokenization/app/__pycache__/Payment.cpython-310.pyc ADDED
Binary file (1.45 kB). View file
 
Tokenization/app/__pycache__/Progress.cpython-310.pyc ADDED
Binary file (1.66 kB). View file
 
Tokenization/app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (460 Bytes). View file
 
Tokenization/combined_scientific_papers.json ADDED
The diff for this file is too large to render. See raw diff
 
Tokenization/combined_scientific_papers.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Tokenization/corpus_builder.log ADDED
File without changes
Tokenization/debug_upload.log ADDED
@@ -0,0 +1,198 @@
1
+ 2025-06-12 18:18:01,037 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
2
+ 2025-06-12 18:18:01,037 - INFO - Starting arXiv paper collection...
3
+ 2025-06-12 18:18:01,038 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
4
+ 2025-06-12 18:18:03,165 - INFO - Got first page: 100 of 1236760 total results
5
+ 2025-06-12 18:18:03,172 - INFO - Sleeping: 2.828948 seconds
6
+ 2025-06-12 18:18:06,004 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
7
+ 2025-06-12 18:18:06,953 - INFO - Sleeping: 2.866122 seconds
8
+ 2025-06-12 18:18:09,824 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
9
+ 2025-06-12 18:18:11,783 - INFO - Sleeping: 2.823819 seconds
10
+ 2025-06-12 18:18:14,608 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
11
+ 2025-06-12 18:18:16,436 - INFO - Sleeping: 2.857095 seconds
12
+ 2025-06-12 18:18:19,301 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
13
+ 2025-06-12 18:18:22,022 - INFO - Sleeping: 2.790207 seconds
14
+ 2025-06-12 18:18:24,820 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
15
+ 2025-06-12 18:18:25,173 - INFO - Sleeping: 2.998001 seconds
16
+ 2025-06-12 18:18:28,181 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
17
+ 2025-06-12 18:18:28,988 - INFO - Sleeping: 2.999010 seconds
18
+ 2025-06-12 18:18:32,000 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
19
+ 2025-06-12 18:18:32,507 - INFO - Sleeping: 2.998957 seconds
20
+ 2025-06-12 18:18:35,519 - INFO - Requesting page (first: False, try: 3): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
21
+ 2025-06-12 18:18:36,061 - WARNING - Empty page returned for query 'cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph': Page of results was unexpectedly empty (https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100)
22
+ 2025-06-12 18:18:36,065 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
23
+ 2025-06-12 18:18:36,888 - INFO - Got first page: 100 of 50293 total results
24
+ 2025-06-12 18:18:36,896 - INFO - Sleeping: 2.871087 seconds
25
+ 2025-06-12 18:18:39,783 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
26
+ 2025-06-12 18:18:40,466 - INFO - Sleeping: 2.870444 seconds
27
+ 2025-06-12 18:18:43,339 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
28
+ 2025-06-12 18:18:44,012 - INFO - Sleeping: 2.874603 seconds
29
+ 2025-06-12 18:18:46,893 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
30
+ 2025-06-12 18:18:47,688 - INFO - Sleeping: 2.858048 seconds
31
+ 2025-06-12 18:18:50,552 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
32
+ 2025-06-12 18:18:51,370 - INFO - Sleeping: 2.870823 seconds
33
+ 2025-06-12 18:18:54,246 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
34
+ 2025-06-12 18:18:54,960 - INFO - Sleeping: 2.886596 seconds
35
+ 2025-06-12 18:18:57,856 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
36
+ 2025-06-12 18:18:58,568 - INFO - Sleeping: 2.886486 seconds
37
+ 2025-06-12 18:19:01,466 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
38
+ 2025-06-12 18:19:02,219 - INFO - Sleeping: 2.867826 seconds
39
+ 2025-06-12 18:19:05,103 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
40
+ 2025-06-12 18:19:06,346 - INFO - Sleeping: 2.766637 seconds
41
+ 2025-06-12 18:19:09,120 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
42
+ 2025-06-12 18:19:10,043 - INFO - Sleeping: 2.877552 seconds
43
+ 2025-06-12 18:19:12,929 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
44
+ 2025-06-12 18:19:13,641 - INFO - Sleeping: 2.873434 seconds
45
+ 2025-06-12 18:19:16,525 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
46
+ 2025-06-12 18:19:17,281 - INFO - Sleeping: 2.871482 seconds
47
+ 2025-06-12 18:19:20,161 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
48
+ 2025-06-12 18:19:20,990 - INFO - Sleeping: 2.872492 seconds
49
+ 2025-06-12 18:19:23,876 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
50
+ 2025-06-12 18:19:24,633 - INFO - Sleeping: 2.873157 seconds
51
+ 2025-06-12 18:19:27,510 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
52
+ 2025-06-12 18:19:28,249 - INFO - Sleeping: 2.872219 seconds
53
+ 2025-06-12 18:19:31,132 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
54
+ 2025-06-12 18:19:31,787 - INFO - Sleeping: 2.871294 seconds
55
+ 2025-06-12 18:19:34,660 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
56
+ 2025-06-12 18:19:35,423 - INFO - Sleeping: 2.864608 seconds
57
+ 2025-06-12 18:19:38,291 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
58
+ 2025-06-12 18:19:38,496 - INFO - Sleeping: 2.998046 seconds
59
+ 2025-06-12 18:19:41,498 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
60
+ 2025-06-12 18:19:41,682 - INFO - Sleeping: 2.998049 seconds
61
+ 2025-06-12 18:19:44,693 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
62
+ 2025-06-12 18:19:45,568 - INFO - Sleeping: 2.874692 seconds
63
+ 2025-06-12 18:19:48,448 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
64
+ 2025-06-12 18:19:48,654 - INFO - Sleeping: 2.998000 seconds
65
+ 2025-06-12 18:19:51,668 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
66
+ 2025-06-12 18:19:52,436 - INFO - Sleeping: 2.877867 seconds
67
+ 2025-06-12 18:19:55,323 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
68
+ 2025-06-12 18:19:56,074 - INFO - Sleeping: 2.878102 seconds
69
+ 2025-06-12 18:19:58,961 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
70
+ 2025-06-12 18:19:59,730 - INFO - Sleeping: 2.846435 seconds
71
+ 2025-06-12 18:20:02,587 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
72
+ 2025-06-12 18:20:02,802 - INFO - Sleeping: 2.997978 seconds
73
+ 2025-06-12 18:20:05,801 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
74
+ 2025-06-12 18:20:06,645 - INFO - Sleeping: 2.882026 seconds
75
+ 2025-06-12 18:20:09,537 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
76
+ 2025-06-12 18:20:10,681 - INFO - Sleeping: 2.867912 seconds
77
+ 2025-06-12 18:20:13,558 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
78
+ 2025-06-12 18:20:15,163 - INFO - Sleeping: 2.874383 seconds
79
+ 2025-06-12 18:20:18,052 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
80
+ 2025-06-12 18:20:19,022 - INFO - Sleeping: 2.885731 seconds
81
+ 2025-06-12 18:20:21,916 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
82
+ 2025-06-12 18:20:22,743 - INFO - Sleeping: 2.880111 seconds
83
+ 2025-06-12 18:20:25,633 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
84
+ 2025-06-12 18:20:26,848 - INFO - Sleeping: 2.877337 seconds
85
+ 2025-06-12 18:20:29,728 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
86
+ 2025-06-12 18:20:29,961 - INFO - Sleeping: 2.999086 seconds
87
+ 2025-06-12 18:20:32,973 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
88
+ 2025-06-12 18:20:33,783 - INFO - Sleeping: 2.870358 seconds
89
+ 2025-06-12 18:20:36,664 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
90
+ 2025-06-12 18:20:36,929 - INFO - Sleeping: 2.997254 seconds
91
+ 2025-06-12 18:20:39,936 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
92
+ 2025-06-12 18:20:40,834 - INFO - Sleeping: 2.876953 seconds
93
+ 2025-06-12 18:20:43,716 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
94
+ 2025-06-12 18:20:44,816 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
95
+ 2025-06-12 18:20:46,192 - INFO - Got first page: 100 of 100310 total results
96
+ 2025-06-12 18:20:46,198 - INFO - Sleeping: 2.859482 seconds
97
+ 2025-06-12 18:20:49,073 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
98
+ 2025-06-12 18:20:49,789 - INFO - Sleeping: 2.869352 seconds
99
+ 2025-06-12 18:20:52,669 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
100
+ 2025-06-12 18:20:53,467 - INFO - Sleeping: 2.862511 seconds
101
+ 2025-06-12 18:20:56,338 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
102
+ 2025-06-12 18:20:57,071 - INFO - Sleeping: 2.870255 seconds
103
+ 2025-06-12 18:20:59,951 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
104
+ 2025-06-12 18:21:00,728 - INFO - Sleeping: 2.869636 seconds
105
+ 2025-06-12 18:21:03,604 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
106
+ 2025-06-12 18:21:04,393 - INFO - Sleeping: 2.865000 seconds
107
+ 2025-06-12 18:21:07,272 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
108
+ 2025-06-12 18:21:08,029 - INFO - Sleeping: 2.858943 seconds
109
+ 2025-06-12 18:21:10,895 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
110
+ 2025-06-12 18:21:11,768 - INFO - Sleeping: 2.866744 seconds
111
+ 2025-06-12 18:21:14,640 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
112
+ 2025-06-12 18:21:15,488 - INFO - Sleeping: 2.720050 seconds
113
+ 2025-06-12 18:21:18,211 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
114
+ 2025-06-12 18:21:19,122 - INFO - Sleeping: 2.844511 seconds
115
+ 2025-06-12 18:21:21,982 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
116
+ 2025-06-12 18:21:22,772 - INFO - Sleeping: 2.871176 seconds
117
+ 2025-06-12 18:21:25,647 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
118
+ 2025-06-12 18:21:25,925 - INFO - Sleeping: 2.997949 seconds
119
+ 2025-06-12 18:21:28,932 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
120
+ 2025-06-12 18:21:29,774 - INFO - Sleeping: 2.864288 seconds
121
+ 2025-06-12 18:21:32,644 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
122
+ 2025-06-12 18:21:33,454 - INFO - Sleeping: 2.860076 seconds
123
+ 2025-06-12 18:21:36,317 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
124
+ 2025-06-12 18:21:36,605 - INFO - Sleeping: 2.997453 seconds
125
+ 2025-06-12 18:21:39,607 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
126
+ 2025-06-12 18:21:40,404 - INFO - Sleeping: 2.856277 seconds
127
+ 2025-06-12 18:21:43,276 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
128
+ 2025-06-12 18:21:44,085 - INFO - Sleeping: 2.862912 seconds
129
+ 2025-06-12 18:21:46,964 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
130
+ 2025-06-12 18:21:47,858 - INFO - Sleeping: 2.860433 seconds
131
+ 2025-06-12 18:21:50,732 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
132
+ 2025-06-12 18:21:51,504 - INFO - Sleeping: 2.874451 seconds
133
+ 2025-06-12 18:21:54,387 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
134
+ 2025-06-12 18:21:55,722 - INFO - Sleeping: 2.859315 seconds
135
+ 2025-06-12 18:21:58,585 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
136
+ 2025-06-12 18:21:59,503 - INFO - Sleeping: 2.863854 seconds
137
+ 2025-06-12 18:22:02,377 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
138
+ 2025-06-12 18:22:02,618 - INFO - Sleeping: 2.997967 seconds
139
+ 2025-06-12 18:22:05,628 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
140
+ 2025-06-12 18:22:06,677 - INFO - Sleeping: 2.844775 seconds
141
+ 2025-06-12 18:22:09,533 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
142
+ 2025-06-12 18:22:09,792 - INFO - Sleeping: 2.998977 seconds
143
+ 2025-06-12 18:22:12,797 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
144
+ 2025-06-12 18:22:13,677 - INFO - Sleeping: 2.860952 seconds
145
+ 2025-06-12 18:22:16,551 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
146
+ 2025-06-12 18:22:17,381 - INFO - Sleeping: 2.862895 seconds
147
+ 2025-06-12 18:22:20,259 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
148
+ 2025-06-12 18:22:21,092 - INFO - Sleeping: 2.865440 seconds
149
+ 2025-06-12 18:22:23,963 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
150
+ 2025-06-12 18:22:24,738 - INFO - Sleeping: 2.854685 seconds
151
+ 2025-06-12 18:22:27,605 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
152
+ 2025-06-12 18:22:28,443 - INFO - Sleeping: 2.866245 seconds
153
+ 2025-06-12 18:22:31,321 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
154
+ 2025-06-12 18:22:32,401 - INFO - Sleeping: 2.857156 seconds
155
+ 2025-06-12 18:22:35,269 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
156
+ 2025-06-12 18:22:35,481 - INFO - Sleeping: 2.997016 seconds
157
+ 2025-06-12 18:22:38,486 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
158
+ 2025-06-12 18:22:39,346 - INFO - Sleeping: 2.856990 seconds
159
+ 2025-06-12 18:22:42,208 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
160
+ 2025-06-12 18:22:43,031 - INFO - Sleeping: 2.852790 seconds
161
+ 2025-06-12 18:22:45,889 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
162
+ 2025-06-12 18:22:46,748 - INFO - Sleeping: 2.858054 seconds
163
+ 2025-06-12 18:22:49,610 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
164
+ 2025-06-12 18:22:49,923 - INFO - Sleeping: 2.997999 seconds
165
+ 2025-06-12 18:22:52,927 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
166
+ 2025-06-12 18:22:53,180 - INFO - Sleeping: 2.998443 seconds
167
+ 2025-06-12 18:22:56,182 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
168
+ 2025-06-12 18:22:57,297 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
169
+ 2025-06-12 18:22:57,297 - INFO - Collected 5989 arXiv papers in 296.26s
170
+ 2025-06-12 18:22:57,310 - INFO - Starting PubMed paper collection...
171
+ 2025-06-12 18:23:14,143 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
172
+ 2025-06-12 18:23:14,143 - INFO - Collected 2671 PubMed papers in 16.83s
173
+ 2025-06-12 18:23:14,143 - INFO - Starting FineWeb-Edu collection...
174
+ 2025-06-12 18:23:34,470 - INFO - Collected 10000 FineWeb samples
175
+ 2025-06-12 18:23:38,652 - INFO - Collected 20000 FineWeb samples
176
+ 2025-06-12 18:23:43,218 - INFO - Collected 30000 FineWeb samples
177
+ 2025-06-12 18:23:43,221 - INFO - Processing 30000 FineWeb samples
178
+ 2025-06-12 18:24:03,830 - INFO - Saved checkpoint to scientific_corpus_data\fineweb_edu.jsonl
179
+ 2025-06-12 18:24:03,831 - INFO - Collected 29616 FineWeb-Edu papers in 49.69s
180
+ 2025-06-12 18:24:03,873 - INFO - Processing 5989 arxiv papers...
181
+ 2025-06-12 18:24:05,244 - INFO - Processed 5989/5989 arxiv papers
182
+ 2025-06-12 18:24:05,244 - INFO - Unknown domains: 0, Unknown sections: 3349
183
+ 2025-06-12 18:24:05,244 - INFO - Processing 2671 biology papers...
184
+ 2025-06-12 18:24:05,765 - INFO - Processed 2605/2671 biology papers
185
+ 2025-06-12 18:24:05,765 - INFO - Unknown domains: 0, Unknown sections: 1015
186
+ 2025-06-12 18:24:05,765 - INFO - Processing 29616 education papers...
187
+ 2025-06-12 18:24:39,231 - INFO - Processed 159402/29616 education papers
188
+ 2025-06-12 18:24:39,231 - INFO - Unknown domains: 29616, Unknown sections: 21161
189
+ 2025-06-12 19:06:41,335 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E5AF0BBC0, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\threading.py', line 320, code wait>
190
+ 2025-06-12 19:06:43,708 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
191
+ 2025-06-12 19:06:43,710 - INFO - Starting arXiv paper collection...
192
+ 2025-06-12 19:06:43,711 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
193
+ 2025-06-12 19:06:43,712 - INFO - Collected 0 arXiv papers in 0.00s
194
+ 2025-06-12 19:06:43,713 - INFO - Starting PubMed paper collection...
195
+ 2025-06-12 19:06:43,715 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
196
+ 2025-06-12 19:06:43,715 - INFO - Collected 0 PubMed papers in 0.00s
197
+ 2025-06-12 19:06:43,716 - INFO - Shutdown in progress, aborting retries.
198
+ 2025-06-12 19:16:11,718 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E7696F880, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\selectors.py', line 315, code _select>
Tokenization/generate_dataset.py ADDED
@@ -0,0 +1,77 @@
1
+ import json
2
+ from typing import Optional, Callable, Dict, Any
3
+
4
+ from Tokenization.Build_tokenizer import QLoRAPreprocessor
5
+ from Tokenization.preprocessing.Clean_text import clean_text
6
+ from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
7
+
8
+ def generate_dataset(
9
+ domain: str = None,
10
+ token_budget: int = 1000,
11
+ plan: str = "free",
12
+ custom_seed: Optional[str] = None,
13
+ job_type: str = "tokenize",
14
+ progress_callback: Optional[Callable[[int, str], None]] = None
15
+ ) -> Dict[str, Any]:
16
+ """
17
+ Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.
18
+
19
+ Args:
20
+ domain (str): Domain for dataset.
21
+ token_budget (int): Token budget.
22
+ plan (str): Plan type.
23
+ custom_seed (str): Optional seed data.
24
+ job_type (str): "tokenize" or "corpus".
25
+ progress_callback (callable): Progress update callback.
26
+
27
+ Returns:
28
+ dict: {"jsonl_lines": [...], "stats": {...}}
29
+ """
30
+ if job_type == "corpus":
31
+ # Use Main_2 pipeline
32
+ if progress_callback:
33
+ progress_callback(1, "Initializing scientific corpus builder...")
34
+ config = CorpusConfig()
35
+ builder = ScientificCorpusBuilder(config)
36
+ if progress_callback:
37
+ progress_callback(2, "Fetching arXiv papers...")
38
+ arxiv_papers = builder.fetch_arxiv_papers()
39
+ if progress_callback:
40
+ progress_callback(3, "Fetching PubMed papers...")
41
+ pubmed_papers = builder.fetch_pubmed_papers()
42
+ if progress_callback:
43
+ progress_callback(4, "Fetching FineWeb-Edu samples...")
44
+ fineweb_papers = builder.fetch_fineweb_edu()
45
+ if progress_callback:
46
+ progress_callback(5, "Processing and tagging papers...")
47
+ all_papers = []
48
+ all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
49
+ all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
50
+ all_papers.extend(builder.process_papers(fineweb_papers, "education"))
51
+ if progress_callback:
52
+ progress_callback(6, "Ranking and deduplicating...")
53
+ ranked_papers = builder.ranker.rank_samples(all_papers)
54
+ if progress_callback:
55
+ progress_callback(7, "Preparing dataset for download...")
56
+ jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
57
+ stats = builder.analyzer.get_dataset_stats(ranked_papers)
58
+ if progress_callback:
59
+ progress_callback(8, "Dataset ready for download.")
60
+ return {"jsonl_lines": jsonl_lines, "stats": stats}
61
+
62
+ # Standard "tokenize" job
63
+ if progress_callback:
64
+ progress_callback(1, "Cleaning input text...")
65
+ cleaned_text = clean_text(custom_seed or "")
66
+ if progress_callback:
67
+ progress_callback(2, "Tokenizing input...")
68
+ preprocessor = QLoRAPreprocessor()
69
+ # For demonstration, just split cleaned_text into sentences (replace with real logic)
70
+ tokens = [cleaned_text[i:i+token_budget] for i in range(0, len(cleaned_text), token_budget)]
71
+ if progress_callback:
72
+ progress_callback(3, "Formatting samples...")
73
+ jsonl_lines = [json.dumps({"text": t}) for t in tokens]
74
+ stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}
75
+ if progress_callback:
76
+ progress_callback(4, "Dataset ready for download.")
77
+ return {"jsonl_lines": jsonl_lines, "stats": stats}
Tokenization/hf_upload.py ADDED
@@ -0,0 +1,163 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+
7
+ from datasets import Dataset, Features, Value
8
+ from dotenv import load_dotenv
9
+ from huggingface_hub import HfApi
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+ HF_TOKEN = os.getenv("HF_TOKEN")
14
+
15
+ # Logging setup
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format='%(asctime)s - %(levelname)s - %(message)s',
19
+ handlers=[
20
+ logging.StreamHandler(sys.stdout),
21
+ logging.FileHandler('debug_upload.log', mode='w')
22
+ ]
23
+ )
24
+
25
+ REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
26
+ JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
27
+ ARROW_PATH = Path("scientific_corpus_325M.arrow")
28
+ README_PATH = Path("README.md")
29
+
30
+ def debug_jsonl_head(jsonl_path, n=5):
31
+ logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
32
+ try:
33
+ with open(jsonl_path, "r", encoding="utf-8") as f:
34
+ for i in range(n):
35
+ line = f.readline()
36
+ if not line:
37
+ break
38
+ logging.info(f"Line {i+1}: {line.strip()}")
39
+ except Exception as e:
40
+ logging.error(f"Failed to read JSONL head: {e}")
41
+
42
+ def infer_features_from_sample(jsonl_path, n=100):
43
+ import json
44
+ from collections import defaultdict
45
+ types = defaultdict(set)
46
+ try:
47
+ with open(jsonl_path, "r", encoding="utf-8") as f:
48
+ for i, line in enumerate(f):
49
+ if i >= n:
50
+ break
51
+ obj = json.loads(line)
52
+ for k, v in obj.items():
53
+ types[k].add(type(v).__name__)
54
+ logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
55
+ except Exception as e:
56
+ logging.error(f"Failed to infer features: {e}")
57
+
58
+ def convert_jsonl_to_arrow(jsonl_path, arrow_path):
59
+ try:
60
+ logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
61
+ if not jsonl_path.exists():
62
+ logging.error(f"JSONL source file does not exist: {jsonl_path}")
63
+ print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
64
+ raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
65
+ logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
66
+ debug_jsonl_head(jsonl_path, n=5)
67
+ infer_features_from_sample(jsonl_path, n=100)
68
+ # Try loading a small sample first for debugging
69
+ try:
70
+ sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
71
+ logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
72
+ except Exception as sample_e:
73
+ logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
74
+ print(f"\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
75
+ # Try to load with explicit features if possible
76
+ # Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
77
+ # Uncomment and adjust the following lines if you know the schema:
78
+ # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
79
+ # try:
80
+ # sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
81
+ # logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
82
+ # except Exception as e2:
83
+ # logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
84
+ raise
85
+ # Now load the full dataset
86
+ dataset = Dataset.from_json(str(jsonl_path))
87
+ logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
88
+ dataset.to_file(str(arrow_path))
89
+ logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
90
+ return dataset
91
+ except Exception as e:
92
+ logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
93
+ print(f"\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
94
+ raise
95
+
96
+ def create_readme(dataset):
97
+ content = f"""# Scientific Research Tokenized Dataset
98
+
99
+ - **Examples**: {len(dataset):,}
100
+ - **Columns**: {dataset.column_names}
101
+ - **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
102
+
103
+ ## Usage
104
+ ```python
105
+ from datasets import load_dataset
106
+ ds = load_dataset("{REPO_ID}")
107
+ ```
108
+ """
109
+ with open(README_PATH, "w", encoding="utf-8") as f:
110
+ f.write(content)
111
+ logging.info("README.md created.")
112
+
113
+ def upload_to_hf():
114
+ api = HfApi()
115
+ logging.info("Uploading Arrow file to HuggingFace Hub ...")
116
+ api.upload_file(
117
+ path_or_fileobj=str(ARROW_PATH),
118
+ path_in_repo=ARROW_PATH.name,
119
+ repo_id=REPO_ID,
120
+ repo_type="dataset",
121
+ token=HF_TOKEN,
122
+ commit_message="Upload Arrow dataset"
123
+ )
124
+ logging.info("Uploading README.md to HuggingFace Hub ...")
125
+ api.upload_file(
126
+ path_or_fileobj=str(README_PATH),
127
+ path_in_repo="README.md",
128
+ repo_id=REPO_ID,
129
+ repo_type="dataset",
130
+ token=HF_TOKEN,
131
+ commit_message="Update README"
132
+ )
133
+ logging.info("Upload complete.")
134
+
135
+ def upload_to_huggingface(*args, **kwargs):
136
+ """Alias for upload_to_hf to match expected import in Main_2.py"""
137
+ return upload_to_hf(*args, **kwargs)
138
+
139
+ def cleanup():
140
+ if ARROW_PATH.exists():
141
+ ARROW_PATH.unlink()
142
+ if README_PATH.exists():
143
+ README_PATH.unlink()
144
+ logging.info("Cleaned up local files.")
145
+
146
+ def main():
147
+ try:
148
+ if not HF_TOKEN:
149
+ print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
150
+ return
151
+ dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
152
+ create_readme(dataset)
153
+ upload_to_hf()
154
+ print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
155
+ except Exception as e:
156
+ logging.error(f"Process failed: {e}")
157
+ print(f"\n❌ Upload failed. See debug_upload.log for details.")
158
+ sys.exit(1)
159
+ finally:
160
+ cleanup()
161
+
162
+ if __name__ == "__main__":
163
+ main()
Tokenization/preprocessing/Clean_text.py ADDED
@@ -0,0 +1,16 @@
1
+ import re
2
+ import unicodedata
3
+
4
+ def clean_text(text: str) -> str:
5
+ """Clean and normalize text for LLM ingestion."""
6
+ if not isinstance(text, str):
7
+ return ""
8
+ # Normalize unicode
9
+ text = unicodedata.normalize("NFKC", text)
10
+ # Remove control characters
11
+ text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
12
+ # Replace multiple spaces/newlines with a single space
13
+ text = re.sub(r"\s+", " ", text)
14
+ # Strip leading/trailing whitespace
15
+ text = text.strip()
16
+ return text
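For example, NFKC normalization plus the control-character and whitespace passes collapse a messy string like this:

```python
from Tokenization.preprocessing.Clean_text import clean_text

raw = "Ｔｈｅ  spectra\twere\u0007 recorded\n\n at  300\u00a0K."
print(clean_text(raw))
# -> "The spectra were recorded at 300 K."
```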
Tokenization/preprocessing/Preprocess_sample.py ADDED
@@ -0,0 +1,31 @@
1
+ from typing import Dict, List
2
+ from Tokenization.preprocessing.Clean_text import clean_text
3
+ from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs
4
+
5
+ def preprocess_sample(paper: Dict) -> List[Dict]:
6
+ """
7
+ Clean and segment a paper into samples for LLM ingestion.
8
+ Returns a list of dicts: one for title+abstract, and one per paragraph.
9
+ """
10
+ title = clean_text(paper.get("title", ""))
11
+ abstract = clean_text(paper.get("abstract", ""))
12
+ full_text = clean_text(paper.get("full_text", ""))
13
+ paragraphs = segment_paragraphs(full_text) if full_text else []
14
+ samples = []
15
+ # Title + abstract sample
16
+ if title or abstract:
17
+ sample = dict(paper)
18
+ sample["title"] = title
19
+ sample["abstract"] = abstract
20
+ sample["full_text"] = ""
21
+ sample["section"] = "abstract"
22
+ samples.append(sample)
23
+ # Paragraph samples
24
+ for para in paragraphs:
25
+ sample = dict(paper)
26
+ sample["title"] = title
27
+ sample["abstract"] = ""
28
+ sample["full_text"] = para
29
+ sample["section"] = "paragraph"
30
+ samples.append(sample)
31
+ return samples
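A quick sketch with a made-up paper dict, showing the single title+abstract sample plus one sample per paragraph:

```python
from Tokenization.preprocessing.Preprocess_sample import preprocess_sample

paper = {
    "title": "A Toy Paper on Superconductivity",
    "abstract": "We study a toy model.",
    "full_text": "First paragraph about the model.\n\nSecond paragraph with results.",
    "domain_tag": "physics",
}
for sample in preprocess_sample(paper):
    print(sample["section"], "|", sample["abstract"] or sample["full_text"])
# abstract | We study a toy model.
# paragraph | First paragraph about the model.
# paragraph | Second paragraph with results.
```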
Tokenization/preprocessing/Segment_paragraphs.py ADDED
@@ -0,0 +1,19 @@
1
+ import re
2
+
3
+ def segment_paragraphs(text: str) -> list:
4
+ """Segment text into paragraphs using double newlines or similar heuristics."""
5
+ if not isinstance(text, str):
6
+ return []
7
+ # Split on runs of two or more newlines
8
+ paras = re.split(r"\n{2,}", text)
9
+ # Fallback: split overly long paragraphs into 1000-character chunks
10
+ result = []
11
+ for para in paras:
12
+ para = para.strip()
13
+ if len(para) > 1000:
14
+ # Split further if too long
15
+ chunks = [para[i:i+1000] for i in range(0, len(para), 1000)]
16
+ result.extend(chunks)
17
+ elif para:
18
+ result.append(para)
19
+ return [p for p in result if p]
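Behaviour sketch: blank-line-separated paragraphs pass through unchanged, and anything over 1000 characters is chopped into 1000-character chunks:

```python
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

text = "Intro paragraph.\n\nMethods paragraph.\n\n" + "x" * 2500
paras = segment_paragraphs(text)
print([len(p) for p in paras])
# [16, 18, 1000, 1000, 500]
```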
Tokenization/preprocessing/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .Clean_text import clean_text
2
+ from .Segment_paragraphs import segment_paragraphs
3
+ from .Preprocess_sample import preprocess_sample
4
+
5
+ __all__ = [
6
+ "clean_text",
7
+ "segment_paragraphs",
8
+ "preprocess_sample",
9
+ ]
Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc ADDED
Binary file (544 Bytes). View file
 
Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc ADDED
Binary file (1.03 kB). View file
 
Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc ADDED
Binary file (932 Bytes). View file
 
Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (372 Bytes). View file
 
Tokenization/pretraining/Dataset_stats.py ADDED
@@ -0,0 +1,40 @@
1
+ from collections import Counter
2
+ from typing import Dict, List
3
+
4
+ import numpy as np
5
+ from transformers import AutoTokenizer
6
+
7
+
8
+ class DatasetAnalyzer:
9
+ def __init__(self, model_name: str = "facebook/opt-350m"):
10
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
11
+
12
+ def analyze_sample(self, sample: Dict) -> Dict:
13
+ tokens = self.tokenizer.encode(str(sample))
14
+ return {
15
+ "token_count": len(tokens),
16
+ "word_count": len(str(sample).split()),
17
+ "has_abstract": bool(sample.get("abstract")),
18
+ "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
19
+ "has_section": bool(sample.get("section_type")),
20
+ "domain": sample.get("domain_tag", "unknown")
21
+ }
22
+
23
+ def get_dataset_stats(self, samples: List[Dict]) -> Dict:
24
+ stats = []
25
+ domains = Counter()
26
+ sections = Counter()
27
+
28
+ for sample in samples:
29
+ sample_stats = self.analyze_sample(sample)
30
+ stats.append(sample_stats)
31
+ domains[sample_stats["domain"]] += 1
32
+ sections[sample.get("section_type", "unknown")] += 1
33
+
34
+ return {
35
+ "total_samples": len(samples),
36
+ "avg_tokens": np.mean([s["token_count"] for s in stats]),
37
+ "avg_words": np.mean([s["word_count"] for s in stats]),
38
+ "domain_distribution": dict(domains),
39
+ "section_distribution": dict(sections)
40
+ }
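A usage sketch with two toy samples; constructing DatasetAnalyzer downloads the facebook/opt-350m tokenizer on first use:

```python
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer

analyzer = DatasetAnalyzer()
samples = [
    {"title": "Paper A", "abstract": "Short abstract.", "domain_tag": "physics", "section_type": "abstract"},
    {"title": "Paper B", "full_text": "A longer body paragraph.", "domain_tag": "biology"},
]
stats = analyzer.get_dataset_stats(samples)
print(stats["total_samples"])          # 2
print(stats["domain_distribution"])    # {'physics': 1, 'biology': 1}
print(stats["section_distribution"])   # {'abstract': 1, 'unknown': 1}
```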
Tokenization/pretraining/Instruction_formatter.py ADDED
@@ -0,0 +1,18 @@
1
+ # Tokenization/pretraining/instruction_formatter.py
2
+
3
+ class InstructionFormatter:
4
+ @staticmethod
5
+ def format_sample(sample):
6
+ """
7
+ Formats a sample dict with 'instruction', 'input', and 'output' fields.
8
+ This is a placeholder; customize as needed for your data.
9
+ """
10
+ # Ensure required fields exist
11
+ instruction = sample.get("instruction", "")
12
+ input_ = sample.get("input", "")
13
+ output = sample.get("output", "")
14
+ return {
15
+ "instruction": instruction.strip(),
16
+ "input": input_.strip(),
17
+ "output": output.strip(),
18
+ }
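For instance:

```python
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter

sample = {"instruction": " Summarize the abstract. ", "input": "We study a toy model. ", "output": " A toy model is studied."}
print(InstructionFormatter.format_sample(sample))
# {'instruction': 'Summarize the abstract.', 'input': 'We study a toy model.', 'output': 'A toy model is studied.'}
```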
Tokenization/pretraining/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .Dataset_stats import DatasetAnalyzer
2
+
3
+ __all__ = ["DatasetAnalyzer"]
Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc ADDED
Binary file (1.97 kB). View file
 
Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc ADDED
Binary file (806 Bytes). View file
 
Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (244 Bytes). View file
 
Tokenization/requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ fastapi
2
+ uvicorn
3
+ gradio
4
+ requests
5
+ nltk
6
+ scikit-learn
7
+ beautifulsoup4
8
+ arxiv
9
+ huggingface_hub
10
+ python-dotenv
11
+ stripe
Tokenization/run_backend.py ADDED
@@ -0,0 +1,12 @@
1
+ import uvicorn
2
+ import os
3
+
4
+ if __name__ == "__main__":
5
+ os.makedirs("tmp", exist_ok=True)
6
+ print("Starting FastAPI backend at http://localhost:8000 ...")
7
+ uvicorn.run(
8
+ "Tokenization.app:fastapi_app",
9
+ host="0.0.0.0",
10
+ port=8000,
11
+ reload=True
12
+ )