Upload 50 files
- Tokenization/Build_tokenizer.py +89 -0
- Tokenization/Cleanser.py +102 -0
- Tokenization/Entropy_ranker.py +59 -0
- Tokenization/Label_tokens.py +69 -0
- Tokenization/Logs/corpus_builder.log +0 -0
- Tokenization/Logs/debug_upload.log +4 -0
- Tokenization/Main_2.py +922 -0
- Tokenization/__init__.py +21 -0
- Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Label_tokens.cpython-310.pyc +0 -0
- Tokenization/__pycache__/Main_2.cpython-310.pyc +0 -0
- Tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/__pycache__/generate_dataset.cpython-310.pyc +0 -0
- Tokenization/__pycache__/hf_upload.cpython-310.pyc +0 -0
- Tokenization/app.py +147 -0
- Tokenization/app/Api.py +75 -0
- Tokenization/app/Config.py +25 -0
- Tokenization/app/Core.py +155 -0
- Tokenization/app/Payment.py +27 -0
- Tokenization/app/Progress.py +37 -0
- Tokenization/app/__init__.py +15 -0
- Tokenization/app/__pycache__/Api.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Config.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Core.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Payment.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/Progress.cpython-310.pyc +0 -0
- Tokenization/app/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/combined_scientific_papers.json +0 -0
- Tokenization/combined_scientific_papers.jsonl +0 -0
- Tokenization/corpus_builder.log +0 -0
- Tokenization/debug_upload.log +198 -0
- Tokenization/generate_dataset.py +77 -0
- Tokenization/hf_upload.py +163 -0
- Tokenization/preprocessing/Clean_text.py +16 -0
- Tokenization/preprocessing/Preprocess_sample.py +31 -0
- Tokenization/preprocessing/Segment_paragraphs.py +19 -0
- Tokenization/preprocessing/__init__.py +9 -0
- Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc +0 -0
- Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/pretraining/Dataset_stats.py +40 -0
- Tokenization/pretraining/Instruction_formatter.py +18 -0
- Tokenization/pretraining/__init__.py +3 -0
- Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc +0 -0
- Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc +0 -0
- Tokenization/requirements.txt +11 -0
- Tokenization/run_backend.py +12 -0
Tokenization/Build_tokenizer.py
ADDED
@@ -0,0 +1,89 @@
import json
from pathlib import Path
from typing import Dict

from transformers import AutoTokenizer

from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.Label_tokens import MIN_WORDS, MAX_TOKENS, MAX_TOTAL_TOKENS, TOKEN_TARGETS
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter


class QLoRAPreprocessor:
    def __init__(self, model_name: str = "facebook/opt-350m", corpus_type: str = "warm_start"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.analyzer = DatasetAnalyzer(model_name)
        self.formatter = InstructionFormatter()
        self.ranker = EntropyRanker()
        self.token_target = TOKEN_TARGETS[corpus_type]
        self.current_tokens = 0

    def track_tokens(self, text: str) -> bool:
        tokens = self.tokenizer.encode(text)
        self.current_tokens += len(tokens)
        return self.current_tokens <= self.token_target

    def validate_sample(self, sample: Dict) -> bool:
        if not all(k in sample for k in ["instruction", "input", "output"]):
            return False
        total_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
        tokens = self.tokenizer.encode(total_text)
        words = total_text.split()
        return (len(words) >= MIN_WORDS and
                len(tokens) <= MAX_TOKENS and
                len(tokens) <= MAX_TOTAL_TOKENS)

    def process_dataset(self, input_path: str, output_path: str):
        # Load data, skipping blank lines and malformed JSON
        data = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping line {i}: {e}")

        # Analyze dataset
        stats = self.analyzer.get_dataset_stats(data)
        print(f"Dataset stats: {stats}")

        # Format samples
        formatted_samples = [
            self.formatter.format_sample(sample)
            for sample in data
        ]

        # Rank and filter samples
        ranked_samples = self.ranker.rank_samples(formatted_samples)

        # Track token count while processing
        valid_samples = []
        for sample in ranked_samples:
            if not self.validate_sample(sample):
                continue

            sample_text = f"{sample['instruction']} {sample['input']} {sample['output']}"
            if not self.track_tokens(sample_text):
                break

            valid_samples.append(sample)

        # Save to JSONL
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            for sample in valid_samples:
                f.write(json.dumps(sample) + '\n')

        print(f"Processed {len(valid_samples)} samples saved to {output_path}")


if __name__ == "__main__":
    preprocessor = QLoRAPreprocessor()
    preprocessor.process_dataset(
        "C:/Users/kunya/PycharmProjects/DataVolt/Tokenizers/combined_scientific_papers.json",
        "nexa_scientific_instruction_300k.jsonl"
    )
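Note on inputs: process_dataset expects a JSONL file with one object per line containing "instruction", "input", and "output" keys; anything else is dropped by validate_sample. A minimal smoke-test sketch follows; the file names and the record text are illustrative placeholders, not part of the pipeline above.

# Illustrative only: write a tiny instruction-format JSONL and run the preprocessor on it.
import json
from Tokenization.Build_tokenizer import QLoRAPreprocessor

records = [
    {
        "instruction": "Summarize the abstract in one sentence.",
        "input": "We study charge transport in disordered two-dimensional lattices under weak driving.",
        "output": "The paper analyzes how disorder limits charge transport in driven 2D lattices.",
    },
]
with open("sample_input.jsonl", "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r) + "\n")

# Downloads facebook/opt-350m tokenizer on first use; counts tokens against the warm_start budget.
QLoRAPreprocessor(corpus_type="warm_start").process_dataset("sample_input.jsonl", "sample_output.jsonl")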
Tokenization/Cleanser.py
ADDED
@@ -0,0 +1,102 @@
import json
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset

# Tag dictionaries
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

SRC_PATH = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
CLEANED_JSONL_PATH = Path("scientific_corpus_325M.cleaned.jsonl")
CLEANED_ARROW_PATH = Path("scientific_corpus_325M.cleaned.arrow")
CHUNK_SIZE = 10000
MAX_WORKERS = os.cpu_count() or 4


def tag_record(record):
    # Tagging logic: add tags to text fields if domain/task/section present
    # You may need to adjust keys based on your schema
    domain = record.get("domain", "").lower()
    task = record.get("task", "").lower()
    section = record.get("section", "").lower()
    text = record.get("full_text", "")

    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])

    # Prepend tags to text
    record["tagged_text"] = " ".join(tags) + " " + text if tags else text
    return record


def process_chunk(lines):
    cleaned = []
    for line in lines:
        try:
            record = json.loads(line)
            cleaned.append(tag_record(record))
        except Exception:
            continue  # skip malformed lines
    return cleaned


def chunked_file_reader(path, chunk_size):
    with open(path, "r", encoding="utf-8") as f:
        chunk = []
        for line in f:
            chunk.append(line)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk


def main():
    print("Starting cleaning process...")
    # Write cleaned records to a new JSONL file in chunks
    with open(CLEANED_JSONL_PATH, "w", encoding="utf-8") as out_f:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for chunk in chunked_file_reader(SRC_PATH, CHUNK_SIZE):
                futures.append(executor.submit(process_chunk, chunk))
            for fut in as_completed(futures):
                for record in fut.result():
                    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Cleaned JSONL written to {CLEANED_JSONL_PATH}")

    # Convert cleaned JSONL to Arrow using datasets (handles chunking internally)
    print("Saving cleaned dataset to Arrow format...")
    ds = Dataset.from_json(str(CLEANED_JSONL_PATH))
    ds.save_to_disk(str(CLEANED_ARROW_PATH))
    print(f"Saved cleaned Arrow dataset at: {CLEANED_ARROW_PATH}")

    # Optionally, call hf_upload.py asynchronously
    print("Uploading to HuggingFace using hf_upload.py ...")
    os.system("python hf_upload.py")


if __name__ == "__main__":
    main()
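For reference, a small illustration of what tag_record produces; the record below is made up and only the keys matter.

# Illustrative only: domain/task/section tags are prepended to full_text as tagged_text.
from Tokenization.Cleanser import tag_record

record = {
    "domain": "physics",
    "task": "hypothesis",
    "section": "abstract",
    "full_text": "We propose a new mechanism for anomalous diffusion.",
}
tagged = tag_record(dict(record))
print(tagged["tagged_text"])
# -> "[PHYS] [HYP] [ABSTRACT] We propose a new mechanism for anomalous diffusion."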
Tokenization/Entropy_ranker.py
ADDED
@@ -0,0 +1,59 @@
import math
from typing import List, Dict, Optional, Callable


class EntropyRanker:
    """
    Scores and filters text samples by Shannon entropy of their token distribution.
    Used to remove low-information or repetitive samples from scientific corpora.
    """

    def __init__(self, entropy_threshold: float = 3.5, tokenizer: Optional[Callable[[str], List[str]]] = None):
        """
        Args:
            entropy_threshold: Minimum entropy required to keep a sample.
            tokenizer: Function to tokenize text. Defaults to whitespace split.
        """
        self.entropy_threshold = entropy_threshold
        self.tokenizer = tokenizer or (lambda x: x.split())

    @staticmethod
    def shannon_entropy(tokens: List[str]) -> float:
        """Compute Shannon entropy for a list of tokens."""
        if not tokens:
            return 0.0
        freq = {}
        for t in tokens:
            freq[t] = freq.get(t, 0) + 1
        total = len(tokens)
        entropy = 0.0
        for count in freq.values():
            p = count / total
            entropy -= p * math.log(p, 2)
        return entropy

    def score_sample(self, text: str) -> float:
        """Tokenize and score a text sample by entropy."""
        tokens = self.tokenizer(text)
        return self.shannon_entropy(tokens)

    def is_explanatory(self, text: str) -> bool:
        """Return True if sample passes an entropy threshold."""
        return self.score_sample(text) >= self.entropy_threshold

    def filter_samples(self, samples: List[Dict], text_key: str = "text") -> List[Dict]:
        """Filter a list of dict samples, keeping only those above a threshold."""
        return [s for s in samples if self.is_explanatory(s.get(text_key, ""))]

    def rank_samples(self, samples: List[Dict], text_key: str = "text", top_k: Optional[int] = None) -> List[Dict]:
        """
        Rank samples by entropy, descending. Optionally return only top_k.
        """
        scored = [
            (self.score_sample(s.get(text_key, "")), s)
            for s in samples
        ]
        scored.sort(reverse=True, key=lambda x: x[0])
        ranked = [s for score, s in scored if score >= self.entropy_threshold]
        if top_k is not None:
            ranked = ranked[:top_k]
        return ranked
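A short usage sketch for EntropyRanker; the sample strings and the default 3.5-bit threshold are arbitrary choices for illustration.

# Illustrative usage of EntropyRanker with the whitespace tokenizer.
from Tokenization.Entropy_ranker import EntropyRanker

ranker = EntropyRanker(entropy_threshold=3.5)
samples = [
    {"text": "spam spam spam spam spam spam spam spam"},  # one unique token -> 0 bits
    {"text": "Entropy measures how evenly tokens are distributed across a passage of text."},
]
print(ranker.score_sample(samples[0]["text"]))  # 0.0, repetitive text is discarded
print(ranker.filter_samples(samples))           # keeps only the higher-entropy sample
print(ranker.rank_samples(samples, top_k=1))    # highest-entropy sample above threshold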
Tokenization/Label_tokens.py
ADDED
@@ -0,0 +1,69 @@
# Tokenization/label_tokens.py

# Domain tags
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
MAX_TOTAL_TOKENS = 327680000  # Example: 325M tokens

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}


def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None
) -> str:
    """
    Build a tag string for a sample, e.g. [PHYS][HYP][GEN] or [BIO][MTH][SPEC:Genomics]
    """
    tags = []
    if domain in DOMAIN_TAGS:
        tags.append(DOMAIN_TAGS[domain])
    if task in TASK_TAGS:
        tags.append(TASK_TAGS[task])
    if section in SECTION_TAGS:
        tags.append(SECTION_TAGS[section])
    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific" and subdomain:
        tags.append(f"[SPEC:{subdomain}]")
    return "".join(tags)
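A quick illustration of build_tag_string; the "Genomics" subdomain value is made up for the example.

# Illustrative only: tag strings for a general physics hypothesis and a genomics-specific method.
from Tokenization.Label_tokens import build_tag_string

print(build_tag_string("physics", task="hypothesis"))
# -> "[PHYS][HYP][GEN]"
print(build_tag_string("biology", task="method", routing="specific", subdomain="Genomics"))
# -> "[BIO][MTH][SPEC:Genomics]"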
Tokenization/Logs/corpus_builder.log
ADDED
The diff for this file is too large to render.
Tokenization/Logs/debug_upload.log
ADDED
@@ -0,0 +1,4 @@
2025-06-07 20:23:13,293 - INFO - Converting C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl to Arrow format at scientific_corpus_325M.arrow ...
2025-06-07 20:23:36,951 - ERROR - An error occurred while generating the dataset: An error occurred while generating the dataset
2025-06-07 20:23:36,951 - ERROR - Process failed: An error occurred while generating the dataset
2025-06-07 20:23:36,952 - INFO - Cleaned up local files.
Tokenization/Main_2.py
ADDED
@@ -0,0 +1,922 @@
# python
"""
The Main pipeline for building a scientific corpus from multiple sources.

Responsibilities:
- Orchestrates collection, processing, ranking, and deduplication of papers from arXiv, PubMed, and FineWeb-Edu.
- Handles error logging, checkpointing, and metrics for observability.
- Modular design for extensibility and maintainability.

Usage:
    python Main_2.py

Classes:
    - SourceMetrics: Tracks per-source metrics.
    - CorpusConfig: Configuration for corpus building.
    - ScientificCorpusBuilder: Main pipeline class.

Functions:
    - main: Entry point for running the pipeline.

Environment:
    - Requires ENTREZ_EMAIL for PubMed API.
    - Outputs logs and intermediate checkpoints to ./scientific_corpus_data.
"""

import concurrent.futures
import json
import logging
import os
import signal
import time
from dataclasses import dataclass
from pathlib import Path
from types import FrameType
from typing import List, Dict, Set, Optional, Callable, Any
from urllib.error import URLError, HTTPError
from xml.parsers.expat import ExpatError

import arxiv
from Bio import Entrez
from datasets import load_dataset
from tqdm import tqdm

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.Entropy_ranker import EntropyRanker
from Tokenization.hf_upload import upload_to_huggingface
from Tokenization.Label_tokens import TASK_TAGS, ROUTING_TAGS
from Tokenization.preprocessing import clean_text, segment_paragraphs
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer
from Tokenization.app.Config import PLAN_LIMITS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("corpus_builder.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


is_shutdown = False
"""Global flag indicating whether a shutdown signal has been received.

This flag is set to True by the signal handler to allow for graceful shutdown
of long-running operations throughout the pipeline.
"""


def signal_handler(sig: int, frame: FrameType) -> None:
    """Handle shutdown signals gracefully and set shutdown flag."""
    global is_shutdown
    logger.info(f"Received signal {sig}, shutting down gracefully. Frame: {frame}")
    is_shutdown = True


# Register signal handlers for graceful shutdown
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)


def retry(max_retries: int = 3, backoff_factor: float = 1.0,
          exceptions: tuple = (Exception,)) -> Callable:
    """
    Decorator for retrying a function with exponential backoff.

    Args:
        max_retries: Maximum number of retries.
        backoff_factor: Multiplier for exponential backoff.
        exceptions: Exception types to catch and retry.

    Returns:
        Decorated function with retry logic.
    """
    def decorator(func: Callable) -> Callable:
        def wrapper(*args, **kwargs) -> Any:
            retries = 0
            while retries < max_retries:
                if is_shutdown:
                    logger.info("Shutdown in progress, aborting retries.")
                    raise KeyboardInterrupt("Shutdown requested")
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    wait = backoff_factor * (2 ** retries)
                    logger.warning(f"Error in {func.__name__}: {e}. Retrying in {wait:.1f}s...")
                    time.sleep(wait)
                    retries += 1
            logger.error(f"Function {func.__name__} failed after {max_retries} attempts.")
            raise RuntimeError(f"{func.__name__} failed after {max_retries} attempts")
        return wrapper
    return decorator


@dataclass
class SourceMetrics:
    """Metrics for tracking source performance."""
    papers: int = 0
    tokens: int = 0
    time: float = 0.0
    errors: int = 0


@dataclass
class CorpusConfig:
    """
    Configuration for corpus building parameters.

    Attributes:
        max_arxiv_papers: Maximum number of arXiv papers to fetch.
        max_pubmed_papers: Maximum number of PubMed papers to fetch.
        max_fineweb_samples: Maximum number of FineWeb-Edu samples to fetch.
        max_workers: Number of workers for parallel processing.
        timeout: Timeout for API requests.
        chunk_size: Chunk size for batch processing.
    """
    max_arxiv_papers: int = 9000
    max_pubmed_papers: int = 3000
    max_fineweb_samples: int = 30000
    max_workers: int = 8
    timeout: int = 30
    chunk_size: int = 1000


class ScientificCorpusBuilder:
    """
    Main class for building a scientific corpus from multiple sources.

    Methods:
        fetch_arxiv_papers: Collects papers from arXiv.
        fetch_pubmed_papers: Collects papers from PubMed.
        fetch_fineweb_edu: Collects educational content from FineWeb-Edu.
        preprocess_sample: Cleans and segments a paper into samples.
        process_papers: Tags, filters, and preprocesses papers.
        build_corpus: Orchestrates the full pipeline and builds the corpus.
        print_report: Prints a summary report of the build process.
    """

    def __init__(self, config: Optional[CorpusConfig] = None):
        """
        Initialize the corpus builder with configuration and dependencies.

        Args:
            config: Optional CorpusConfig object.
        """
        self.config = config or CorpusConfig()
        self.preprocessor = QLoRAPreprocessor(corpus_type="scientific")
        self.analyzer = DatasetAnalyzer()
        self.ranker = EntropyRanker()
        self.data_dir = Path("scientific_corpus_data")
        self.data_dir.mkdir(exist_ok=True)
        self._setup_apis()
        self.seen_titles: Set[str] = set()
        self.metrics = {
            "arxiv": SourceMetrics(),
            "pubmed": SourceMetrics(),
            "fineweb_edu": SourceMetrics(),
            "total_tokens": 0,
            "total_time": 0.0
        }

    @staticmethod
    def _setup_apis() -> None:
        """
        Setup API configurations for external data sources.
        """
        Entrez.email = os.getenv("ENTREZ_EMAIL", "[email protected]")
        if Entrez.email == "[email protected]":
            logger.warning("Using default email for Entrez. Set ENTREZ_EMAIL environment variable.")

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(arxiv.ArxivError, HTTPError, URLError, ConnectionError))
    def _fetch_arxiv_search(self, query: str, max_results: int) -> List[Any]:
        """
        Fetch arXiv search results with error handling and exponential backoff.

        Args:
            query: arXiv API query string.
            max_results: Maximum number of results to fetch.

        Returns:
            List of arXiv result objects.
        """
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.SubmittedDate,
            )
            client = arxiv.Client()
            results = list(client.results(search))
            if not results:
                logger.warning(f"Empty page returned for query '{query}'")
            return results
        except (arxiv.UnexpectedEmptyPageError, arxiv.HTTPError) as e:
            logger.warning(f"Empty page returned for query '{query}': {e}")
            return []
        except Exception as e:
            logger.error(f"Error in _fetch_arxiv_search for query '{query}': {e}")
            raise

    def fetch_arxiv_papers(self) -> List[Dict]:
        """
        Fetch papers from arXiv across multiple domains with verification and checkpoint saving.

        Returns:
            List of arXiv paper dictionaries.
        """
        logger.info("Starting arXiv paper collection...")
        start_time = time.time()
        papers = []
        queries = [
            ("physics", "cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph"),
            ("biology", "cat:q-bio*"),
            ("materials", "cat:cond-mat.mtrl-sci OR cat:materials*")
        ]
        for domain, query in queries:
            if is_shutdown:
                break
            try:
                results = self._fetch_arxiv_search(query, self.config.max_arxiv_papers // 3)
                for result in tqdm(results, desc=f"arXiv {domain}"):
                    if is_shutdown:
                        break
                    try:
                        paper = {
                            "title": result.title.strip() if result.title else "",
                            "abstract": result.summary.strip() if result.summary else "",
                            "full_text": "",
                            "domain": domain,
                            "section": "abstract",
                            "source": "arxiv",
                            "authors": [str(a) for a in result.authors] if result.authors else [],
                            "published": result.published.isoformat() if result.published else None,
                            "provenance": {"arxiv_id": result.get_short_id()},
                            "categories": [c for c in getattr(result, "categories", [])] if hasattr(result, "categories") else [],
                            "text": result.summary.strip() if result.summary else ""
                        }
                        if paper["title"] and paper["title"] not in self.seen_titles:
                            papers.append(paper)
                            self.seen_titles.add(paper["title"])
                    except Exception as e:
                        logger.warning(f"Error processing arXiv result: {e}")
                        self.metrics["arxiv"].errors += 1
                        continue
            except Exception as e:
                logger.error(f"arXiv {domain} search failed: {e}")
                self.metrics["arxiv"].errors += 1
        self._save_intermediate(papers, "arxiv_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["arxiv"].papers = len(papers)
        self.metrics["arxiv"].time = elapsed
        logger.info(f"Collected {len(papers)} arXiv papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(HTTPError, URLError, ConnectionError, ExpatError))
    def _fetch_pubmed_batch(self, chunk_pmids: List[str]) -> Dict:
        """
        Fetch a batch of PubMed records with error handling.

        Args:
            chunk_pmids: List of PubMed IDs.

        Returns:
            Dictionary of PubMed records.
        """
        try:
            fetch_handle = Entrez.efetch(
                db="pubmed",
                id=",".join(chunk_pmids),
                rettype="medline",
                retmode="xml"
            )
            records = Entrez.read(fetch_handle)
            fetch_handle.close()
            return records
        except ExpatError as e:
            logger.error(f"XML parsing error in PubMed batch: {e}")
            raise
        except (HTTPError, URLError) as e:
            logger.error(f"Network error fetching PubMed batch: {e}")
            raise

    def fetch_pubmed_papers(self) -> List[Dict]:
        """
        Fetch papers from PubMed with biology focus.

        Returns:
            List of PubMed paper dictionaries.
        """
        logger.info("Starting PubMed paper collection...")
        start_time = time.time()
        papers = []

        search_terms = [
            "(methods[Title/Abstract]) AND (biology[MeSH Terms])",
            "(computational biology[MeSH Terms]) AND (methods[Title/Abstract])",
            "(bioinformatics[MeSH Terms]) AND (algorithm[Title/Abstract])",
            "(molecular biology[MeSH Terms]) AND (technique[Title/Abstract])"
        ]

        for search_term in search_terms:
            if is_shutdown:
                break

            try:
                handle = Entrez.esearch(
                    db="pubmed",
                    term=search_term,
                    retmax=self.config.max_pubmed_papers // len(search_terms),
                    sort="relevance"
                )
                record = Entrez.read(handle)
                handle.close()
                pmids = record.get("IdList", [])

                for i in tqdm(range(0, len(pmids), self.config.chunk_size), desc="PubMed batch"):
                    if is_shutdown:
                        break

                    chunk_pmids = pmids[i:i + self.config.chunk_size]
                    try:
                        records = self._fetch_pubmed_batch(chunk_pmids)

                        for rec in records.get("PubmedArticle", []):
                            try:
                                medline_citation = rec.get("MedlineCitation", {})
                                article = medline_citation.get("Article", {})

                                title = article.get("ArticleTitle", "")
                                abstract_list = article.get("Abstract", {}).get("AbstractText", [""])
                                abstract = abstract_list[0] if abstract_list else ""

                                if title and isinstance(title, str) and title not in self.seen_titles:
                                    paper = {
                                        "title": title.strip(),
                                        "abstract": abstract.strip() if isinstance(abstract, str) else "",
                                        "full_text": "",
                                        "domain": "biology",
                                        "section": "abstract",
                                        "source": "pubmed",
                                        "authors": [],
                                        "published": None,
                                        "provenance": {"pubmed_id": str(medline_citation.get("PMID", ""))},
                                        "categories": ["biology"],
                                        "text": abstract.strip() if isinstance(abstract, str) else ""
                                    }
                                    papers.append(paper)
                                    self.seen_titles.add(title)

                            except (KeyError, TypeError, AttributeError) as e:
                                logger.warning(f"Error processing PubMed record: {e}")
                                self.metrics["pubmed"].errors += 1
                                continue

                    except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                        self.metrics["pubmed"].errors += 1
                        logger.warning(f"Failed to fetch PubMed batch: {e}")
                        continue

            except (HTTPError, URLError, ConnectionError, ExpatError) as e:
                self.metrics["pubmed"].errors += 1
                logger.error(f"PubMed search failed for {search_term}: {e}")
            except KeyboardInterrupt:
                logger.info("PubMed collection interrupted by user")
                break

        self._save_intermediate(papers, "pubmed_papers.jsonl")
        elapsed = time.time() - start_time
        self.metrics["pubmed"].papers = len(papers)
        self.metrics["pubmed"].time = elapsed
        logger.info(f"Collected {len(papers)} PubMed papers in {elapsed:.2f}s")
        return papers

    @retry(max_retries=3, backoff_factor=2,
           exceptions=(ConnectionError, HTTPError, URLError, OSError))
    def fetch_fineweb_edu(self) -> List[Dict]:
        """
        Fetch educational content from FineWeb-Edu dataset.

        Returns:
            List of FineWeb-Edu paper dictionaries.
        """
        logger.info("Starting FineWeb-Edu collection...")
        start_time = time.time()
        papers = []

        try:
            ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT",
                              split="train", streaming=True)
            samples = []

            for i, sample in enumerate(ds):
                if is_shutdown:
                    break
                if i >= self.config.max_fineweb_samples:
                    break

                if not isinstance(sample, dict) or "text" not in sample:
                    logger.warning(f"Invalid sample structure at index {i}")
                    continue

                samples.append(sample)
                if (i + 1) % 10000 == 0:
                    logger.info(f"Collected {i + 1} FineWeb samples")

            logger.info(f"Processing {len(samples)} FineWeb samples")

            def is_educational_content(sample: Dict) -> bool:
                """Check if content is educational and suitable."""
                try:
                    text = sample.get("text", "")
                    if not isinstance(text, str) or len(text) < 500:
                        return False
                    return self.ranker.is_explanatory(text)
                except (AttributeError, TypeError, ValueError) as e:
                    logger.debug(f"Error evaluating educational content: {e}")
                    return False

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                filtered_results = list(tqdm(
                    executor.map(is_educational_content, samples),
                    total=len(samples),
                    desc="Filtering FineWeb content"
                ))

            for sample, is_good in zip(samples, filtered_results):
                if is_shutdown:
                    break
                if is_good:
                    try:
                        url = sample.get("url", "")
                        meta = sample.get("meta", {})
                        title = meta.get("title", "") if isinstance(meta, dict) else ""
                        title = title or url or f"Document_{len(papers)}"

                        if title not in self.seen_titles:
                            paper = {
                                "title": title,
                                "abstract": "",
                                "full_text": sample.get("text", ""),
                                "domain": "education",
                                "section": "full_text",
                                "source": "fineweb_edu",
                                "authors": [],
                                "published": None,
                                "provenance": {"url": url},
                                "categories": ["education"],
                                "text": sample.get("text", "")
                            }
                            papers.append(paper)
                            self.seen_titles.add(title)
                    except (KeyError, TypeError, AttributeError) as e:
                        logger.warning(f"Error processing FineWeb sample: {e}")
                        self.metrics["fineweb_edu"].errors += 1
                        continue

        except (ConnectionError, HTTPError, URLError, OSError) as e:
            logger.error(f"FineWeb-Edu fetch failed: {e}")
            self.metrics["fineweb_edu"].errors += 1
        except KeyboardInterrupt:
            logger.info("FineWeb-Edu collection interrupted by user")
        except ImportError as e:
            logger.error(f"Failed to import required dataset library: {e}")
            self.metrics["fineweb_edu"].errors += 1

        self._save_intermediate(papers, "fineweb_edu.jsonl")
        elapsed = time.time() - start_time
        self.metrics["fineweb_edu"].papers = len(papers)
        self.metrics["fineweb_edu"].time = elapsed
        logger.info(f"Collected {len(papers)} FineWeb-Edu papers in {elapsed:.2f}s")
        return papers

    @staticmethod
    def preprocess_sample(paper: Dict) -> List[Dict]:
        """
        Preprocess a paper sample into multiple training samples.

        Args:
            paper: Dictionary representing a paper.

        Returns:
            List of processed sample dictionaries.
        """
        try:
            title = clean_text(paper.get("title", "")) if paper.get("title") else ""
            abstract = clean_text(paper.get("abstract", "")) if paper.get("abstract") else ""
            full_text = clean_text(paper.get("full_text", "")) if paper.get("full_text") else ""

            paragraphs = segment_paragraphs(full_text) if full_text else []
            samples = []

            if title or abstract:
                sample = dict(paper)
                sample["title"] = title
                sample["abstract"] = abstract
                sample["full_text"] = ""
                sample["section"] = "abstract"
                samples.append(sample)

            for para in paragraphs:
                if para.strip():
                    sample = dict(paper)
                    sample["title"] = title
                    sample["abstract"] = ""
                    sample["full_text"] = para
                    sample["section"] = "paragraph"
                    samples.append(sample)

            return samples

        except (AttributeError, TypeError, ValueError) as e:
            logger.warning(f"Error preprocessing sample: {e}")
            return []

    def process_papers(self, papers: List[Dict], domain: str) -> List[Dict]:
        """
        Process papers with domain-specific tagging and filtering.

        Args:
            papers: List of paper dictionaries.
            domain: Domain string for tagging.

        Returns:
            List of processed and filtered sample dictionaries.
        """
        logger.info(f"Processing {len(papers)} {domain} papers...")
        processed = []
        unknown_domains = 0
        unknown_sections = 0

        def label_domain(paper):
            cats = paper.get('categories', [])
            if not cats:
                return 'unknown'
            cats_str = " ".join(cats).lower()
            if 'bio' in cats_str:
                return '[BIO]'
            if 'gen' in cats_str:
                return '[GEN]'
            if 'phys' in cats_str:
                return '[PHY]'
            if 'math' in cats_str:
                return '[MATH]'
            if 'mat' in cats_str or 'materials' in cats_str:
                return '[MAT]'
            if 'astro' in cats_str:
                return '[ASTRO]'
            if 'cs' in cats_str:
                return '[CS]'
            return 'unknown'

        def label_section(paper):
            text = paper.get('text', '') or paper.get('abstract', '') or ''
            text_lower = text.lower()
            if not text_lower:
                return 'unknown'
            if 'abstract' in text_lower:
                return '[ABSTRACT]'
            if 'introduction' in text_lower:
                return '[INTRO]'
            if 'methods' in text_lower:
                return '[METHODS]'
            if 'results' in text_lower:
                return '[RESULTS]'
            if 'discussion' in text_lower:
                return '[DISCUSSION]'
            if 'conclusion' in text_lower:
                return '[CONCLUSION]'
            return 'unknown'

        for paper in tqdm(papers, desc=f"Processing {domain} papers"):
            try:
                domain_tag = label_domain(paper)
                section_tag = label_section(paper)
                paper["domain_tag"] = domain_tag
                paper["section_tag"] = section_tag
                if domain_tag == 'unknown':
                    unknown_domains += 1
                if section_tag == 'unknown':
                    unknown_sections += 1

                task = paper.get("task", None)
                if task and task in TASK_TAGS:
                    paper["task_tag"] = TASK_TAGS[task]

                routing = paper.get("routing", "general")
                paper["routing_tag"] = ROUTING_TAGS.get(routing, ROUTING_TAGS["general"])

                samples = self.preprocess_sample(paper)

                for sample in samples:
                    try:
                        content_parts = []
                        if sample.get("title"):
                            content_parts.append(str(sample["title"]))
                        if sample.get("abstract"):
                            content_parts.append(str(sample["abstract"]))
                        if sample.get("full_text"):
                            content_parts.append(str(sample["full_text"])[:1000])
                        content = " ".join(content_parts)
                        if content.strip() and self.ranker.is_explanatory(content):
                            sample["domain_tag"] = paper["domain_tag"]
                            sample["section_tag"] = paper["section_tag"]
                            sample["routing_tag"] = paper["routing_tag"]
                            if "task_tag" in paper:
                                sample["task_tag"] = paper["task_tag"]
                            processed.append(sample)
                    except Exception as e:
                        logger.debug(f"Error evaluating sample content: {e}")
                        continue

            except Exception as e:
                logger.warning(f"Paper processing error: {e}")
                continue

        logger.info(f"Processed {len(processed)}/{len(papers)} {domain} papers")
        logger.info(f"Unknown domains: {unknown_domains}, Unknown sections: {unknown_sections}")
        return processed

    def _save_intermediate(self, papers: List[Dict], filename: str) -> None:
        """
        Save intermediate results to disk as JSONL.

        Args:
            papers: List of paper/sample dictionaries.
            filename: Output filename.
        """
        path = self.data_dir / filename
        try:
            with open(path, "w", encoding="utf-8") as f:
                for paper in papers:
                    f.write(json.dumps(paper, ensure_ascii=False) + "\n")
            logger.info(f"Saved checkpoint to {path}")
        except (OSError, IOError, PermissionError) as e:
            logger.error(f"Failed to save intermediate file {filename}: {e}")
        except (TypeError, ValueError) as e:
            logger.error(f"JSON serialization error for {filename}: {e}")

    def build_corpus(self, output_path: str, verify_only: bool = False) -> None:
        """
        Build the complete scientific corpus with checkpoint verification.

        Args:
            output_path: Path to save the final corpus.
            verify_only: If True, only verify checkpoints and skip merging.
        """
        logger.info("Starting scientific corpus build...")
        total_start = time.time()
        all_papers = []

        sources = [
            ("arXiv", self.fetch_arxiv_papers, None),
            ("PubMed", self.fetch_pubmed_papers, "biology"),
            ("FineWeb-Edu", self.fetch_fineweb_edu, "education")
        ]
        for source_name, fetch_func, domain in sources:
            if is_shutdown:
                break
            logger.info(f"Fetching {source_name} papers...")
            try:
                papers = fetch_func()
                if domain:
                    processed = []
                    for i in range(0, len(papers), self.config.chunk_size):
                        chunk = papers[i:i + self.config.chunk_size]
                        processed.extend(self.process_papers(chunk, domain))
                    papers = processed
                chkpt_filename = f"{source_name.lower()}_papers.jsonl"
                self._save_intermediate(papers, chkpt_filename)
                if not papers:
                    logger.error(f"{source_name} checkpoint {chkpt_filename} is empty!")
                all_papers.extend(papers)
                logger.info(f"Added {len(papers)} papers from {source_name}")
            except Exception as e:
                logger.error(f"Critical error fetching from {source_name}: {e}")
                continue

        logger.info(f"Total papers collected: {len(all_papers)}")
        if verify_only:
            logger.info("Verification flag enabled; skipping merge and build.")
            self.print_report({})
            return

        if not all_papers:
            logger.error("No papers collected. Cannot build corpus.")
            self.print_report({})
            return

        logger.info("Ranking and deduplicating papers...")
        try:
            ranked_papers = self.ranker.rank_samples(all_papers)
            if not ranked_papers:
                logger.error("Final corpus is empty after ranking. Using unranked papers as fallback.")
                ranked_papers = all_papers
            logger.info(f"Final corpus size: {len(ranked_papers)} papers")
        except Exception as e:
            logger.error(f"Error ranking papers: {e}")
            ranked_papers = all_papers

        if not ranked_papers:
            logger.error("Final corpus is empty. No data to process or save.")
            self.print_report({})
            return

        self._save_intermediate(ranked_papers, "ranked_papers.jsonl")
        try:
            stats = self.analyzer.get_dataset_stats(ranked_papers)
            self.metrics["total_tokens"] = int(stats.get("avg_tokens", 0) * stats.get("total_samples", 0))
        except Exception as e:
            logger.error(f"Error generating dataset statistics: {e}")
            stats = {}

        self.metrics["total_time"] = time.time() - total_start
        logger.info("Processing final dataset in batches...")
        try:
            with open(output_path, "w", encoding="utf-8") as out_f:
                for i in range(0, len(ranked_papers), self.config.chunk_size):
                    chunk = ranked_papers[i:i + self.config.chunk_size]
                    for paper in chunk:
                        out_f.write(json.dumps(paper, ensure_ascii=False) + "\n")
        except Exception as e:
            logger.error(f"Error processing final dataset: {e}")

        # HuggingFace upload: warn if a file is too large
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10 * 1024 * 1024:
            logger.warning(
                f"{output_path} is larger than 10 MiB. HuggingFace will reject files >10 MiB unless you use Git LFS. "
                "See https://hf.co/docs/hub/repositories-getting-started#terminal"
            )
            logger.warning(
                "To fix: install git-lfs and run 'git lfs track \"*.jsonl\"' before pushing, or split your file."
            )

        self.print_report(stats)
        logger.info(f"Scientific corpus successfully built: {output_path}")

    def build_corpus_scoped(self, plan: str, token_budget: int) -> (list, dict):
        """
        Build a scientific corpus, limiting the total number of tokens to the plan's budget.
        Returns the corpus and stats.
        """
        logger.info(f"Building corpus for plan '{plan}' with token budget {token_budget}")
        all_papers = []
        all_papers.extend(self.process_papers(self.fetch_arxiv_papers(), "arxiv"))
        all_papers.extend(self.process_papers(self.fetch_pubmed_papers(), "biology"))
        all_papers.extend(self.process_papers(self.fetch_fineweb_edu(), "education"))

        # Rank and deduplicate
        ranked_papers = self.ranker.rank_samples(all_papers)
        corpus = []
        total_tokens = 0
        for paper in ranked_papers:
            tokens = paper.get("text", "").split()
            if total_tokens + len(tokens) > token_budget:
                break
            corpus.append(paper)
            total_tokens += len(tokens)
        stats = self.analyzer.get_dataset_stats(corpus)
        stats["total_tokens"] = total_tokens
        logger.info(f"Corpus built: {len(corpus)} samples, {total_tokens} tokens")
        return corpus, stats

    def print_report(self, stats: Dict) -> None:
        """
        Print a comprehensive build report.

        Args:
            stats: Dictionary of dataset statistics.
        """
        print("\n" + "=" * 67)
        print(" SCIENTIFIC CORPUS BUILD REPORT")
        print("=" * 67)
        print("\nSOURCE METRICS:")
        print("-" * 40)
        for source_name, label in zip(["arxiv", "pubmed", "fineweb_edu"],
                                      ["ARXIV", "PUBMED", "FINEWEB_EDU"]):
            metrics = self.metrics[source_name]
            print(f"{label:15}: {metrics.papers:6d} papers | {metrics.errors:3d} errors | {metrics.time:9.2f}s")
        print("\nOVERALL METRICS:")
        print("-" * 40)
        total_papers = sum(self.metrics[src].papers for src in ["arxiv", "pubmed", "fineweb_edu"])
        total_errors = sum(self.metrics[src].errors for src in ["arxiv", "pubmed", "fineweb_edu"])
        print(f"Total Papers: {total_papers:,}")
        print(f"Total Tokens: {self.metrics['total_tokens']:,}")
        print(f"Total Time: {self.metrics['total_time']:.2f}s")
        print(f"Total Errors: {total_errors}")
        success_rate = (1 - total_errors / max(total_papers + total_errors, 1)) * 100
        print(f"Success Rate: {success_rate:.2f}%")
        if stats:
            print("\nDATASET STATISTICS:")
            print("-" * 40)
            for key, value in stats.items():
                print(f"{key:20}: {value}")
        print("=" * 67)
        print()


def main() -> None:
    """
    Main entry point for the corpus builder.
    """
    try:
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        output_path = "scientific_corpus_325M.jsonl"
        builder.build_corpus(output_path)

        # --- Hugging Face upload with improved error handling ---
        try:
            # Split large files if needed
            file_size = os.path.getsize(output_path)
            if file_size > 10 * 1024 * 1024:  # 10 MB
                logger.info("Large file detected, splitting into chunks...")
                chunk_size = 10 * 1024 * 1024  # 10 MB chunks
                base_path = os.path.splitext(output_path)[0]

                with open(output_path, 'r', encoding='utf-8') as f:
                    chunk_num = 0
                    chunk = []
                    current_size = 0

                    for line in f:
                        line_size = len(line.encode('utf-8'))
                        if current_size + line_size > chunk_size and chunk:
                            chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                            with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                                chunk_file.writelines(chunk)
                            logger.info(f"Created chunk {chunk_num}: {chunk_path}")
                            chunk = []
                            current_size = 0
                            chunk_num += 1

                        chunk.append(line)
                        current_size += line_size

                    # Write final chunk
                    if chunk:
                        chunk_path = f"{base_path}_part{chunk_num}.jsonl"
                        with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                            chunk_file.writelines(chunk)
                        logger.info(f"Created final chunk {chunk_num}: {chunk_path}")

                # Upload each chunk
                for i in range(chunk_num + 1):
                    chunk_path = f"{base_path}_part{i}.jsonl"
                    logger.info(f"Uploading chunk {i}...")
                    upload_to_huggingface(
                        dataset_path=chunk_path,
                        repo_id="Allanatrix/Scientific_Research_Tokenized",
                        auto_generate_readme=(i == 0),  # Only generate README for first chunk
                        compress=True,
                        keep_local=True  # Keep files until all uploads complete
                    )
            else:
                # Upload single file
                upload_to_huggingface(
                    dataset_path=output_path,
                    repo_id="Allanatrix/Scientific_Research_Tokenized",
                    auto_generate_readme=True,
                    compress=True
                )

        except ImportError:
            logger.error("Hugging Face upload module not found. Please ensure hf_upload.py exists.")
        except Exception as e:
            logger.error(f"Error during Hugging Face upload: {e}")
+
if "EOF" in str(e) or "timeout" in str(e):
|
| 892 |
+
logger.warning("Upload interrupted. Try using smaller chunks or increasing timeout.")
|
| 893 |
+
finally:
|
| 894 |
+
# Cleanup temporary files
|
| 895 |
+
if 'chunk_num' in locals():
|
| 896 |
+
for i in range(chunk_num + 1):
|
| 897 |
+
try:
|
| 898 |
+
os.remove(f"{base_path}_part{i}.jsonl")
|
| 899 |
+
except OSError:
|
| 900 |
+
pass
|
| 901 |
+
|
| 902 |
+
except KeyboardInterrupt:
|
| 903 |
+
logger.info("Build process interrupted by user")
|
| 904 |
+
except Exception as e:
|
| 905 |
+
logger.error(f"Unexpected error in main: {e}")
|
| 906 |
+
raise
|
| 907 |
+
|
| 908 |
+
# Optionally, you can add a CLI entry point for testing:
|
| 909 |
+
def main_scoped(plan: str = "free"):
|
| 910 |
+
config = CorpusConfig()
|
| 911 |
+
builder = ScientificCorpusBuilder(config)
|
| 912 |
+
token_budget = PLAN_LIMITS.get(plan, 1000)
|
| 913 |
+
corpus, stats = builder.build_corpus_scoped(plan, token_budget)
|
| 914 |
+
output_path = f"scientific_corpus_{plan}_{token_budget}.jsonl"
|
| 915 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 916 |
+
for paper in corpus:
|
| 917 |
+
f.write(json.dumps(paper, ensure_ascii=False) + "\n")
|
| 918 |
+
print(f"Saved {len(corpus)} samples ({stats['total_tokens']} tokens) to {output_path}")
|
| 919 |
+
|
| 920 |
+
if __name__ == "__main__":
|
| 921 |
+
# main() # old entry point
|
| 922 |
+
main_scoped("free") # new entry point for plan-scoped corpus
|
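The 10 MB splitting logic inside main() can be lifted into a standalone helper for testing. The sketch below mirrors the same byte-budget loop; it is an illustration only, and the function name split_jsonl is hypothetical rather than part of the repository.

import os

def split_jsonl(path: str, max_bytes: int = 10 * 1024 * 1024) -> list:
    """Split a JSONL file into parts no larger than max_bytes (sketch of the loop used in main())."""
    base, _ = os.path.splitext(path)
    parts, chunk, size, part_num = [], [], 0, 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line_size = len(line.encode("utf-8"))
            if size + line_size > max_bytes and chunk:
                part_path = f"{base}_part{part_num}.jsonl"
                with open(part_path, "w", encoding="utf-8") as out:
                    out.writelines(chunk)
                parts.append(part_path)
                chunk, size, part_num = [], 0, part_num + 1
            chunk.append(line)
            size += line_size
    if chunk:  # flush the final, possibly smaller, part
        part_path = f"{base}_part{part_num}.jsonl"
        with open(part_path, "w", encoding="utf-8") as out:
            out.writelines(chunk)
        parts.append(part_path)
    return parts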
Tokenization/__init__.py
ADDED
@@ -0,0 +1,21 @@
# Tokenization/__init__.py

from .Entropy_ranker import EntropyRanker
from .Label_tokens import DOMAIN_TAGS, TASK_TAGS, SECTION_TAGS, ROUTING_TAGS, build_tag_string
from .preprocessing import clean_text, segment_paragraphs, preprocess_sample

# Expose the main dataset generation pipeline for external use
from .generate_dataset import generate_dataset

__all__ = [
    "EntropyRanker",
    "DOMAIN_TAGS",
    "TASK_TAGS",
    "SECTION_TAGS",
    "ROUTING_TAGS",
    "build_tag_string",
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
    "generate_dataset",
]
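A minimal usage sketch of the exports above, assuming rank_samples accepts a list of dicts with a "text" field, as it is called in Main_2.py. The sample records and the no-argument EntropyRanker construction are assumptions for illustration.

from Tokenization import EntropyRanker

# Hypothetical in-memory samples; in the pipeline these come from the arXiv/PubMed/FineWeb fetchers.
papers = [
    {"text": "Quantum error correction protects logical qubits from decoherence."},
    {"text": "CRISPR-Cas9 enables targeted genome editing in mammalian cells."},
]

ranker = EntropyRanker()           # assumes a no-argument constructor
ranked = ranker.rank_samples(papers)  # same call the corpus builder makes before truncating to a token budget
print(len(ranked))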
Tokenization/__pycache__/Build_tokenizer.cpython-310.pyc
ADDED
Binary file (3.54 kB).
Tokenization/__pycache__/Entropy_ranker.cpython-310.pyc
ADDED
Binary file (3.39 kB).
Tokenization/__pycache__/Label_tokens.cpython-310.pyc
ADDED
Binary file (1.35 kB).
Tokenization/__pycache__/Main_2.cpython-310.pyc
ADDED
Binary file (26.8 kB).
Tokenization/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (565 Bytes).
Tokenization/__pycache__/generate_dataset.cpython-310.pyc
ADDED
Binary file (3.14 kB).
Tokenization/__pycache__/hf_upload.cpython-310.pyc
ADDED
Binary file (5.56 kB).
Tokenization/app.py
ADDED
@@ -0,0 +1,147 @@
import gradio as gr
import time

def calculate_price(payment_mode, tokens, plan, custom_price, file):
    if payment_mode == "Pay as you go":
        price = round(tokens * 0.01, 2)  # Example: $0.01 per token
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    elif payment_mode == "Plan":
        if plan == "Free":
            return "0 tokens\nPrice: $0", 0
        elif plan == "Starter":
            return "100,000 tokens\nPrice: $15", 15
        elif plan == "Pro":
            return "500,000 tokens\nPrice: $30", 30
        elif plan == "Custom":
            return f"Custom plan\nPrice: ${custom_price}", float(custom_price or 0)
    elif file is not None:
        # Simulate a token count from the uploaded file; replace with a real calculation
        tokens = 1000
        price = round(tokens * 0.01, 2)
        return f"{tokens:,} tokens\nPrice: ${price:.2f}", price
    return "", 0

def generate_dataset(*args, **kwargs):
    for i in range(5):
        yield f"Generating... ({(i+1)*20}%)", None, (i+1)/5
        time.sleep(0.3)
    yield "Ready! Please pay to download.", "dataset.jsonl", 1.0

with gr.Blocks(
    title="Nexa Data Studio",
    css="""
    body, .gradio-container {
        min-height: 100vh;
        background: #111 !important;
        color: #fff !important;
    }
    .gradio-container {
        max-width: 900px !important;
        margin: 40px auto !important;
        box-shadow: 0 2px 16px #0008;
        border-radius: 16px;
        padding: 32px 32px 24px 32px !important;
        background: #111 !important;
        color: #fff !important;
        display: flex;
        flex-direction: column;
        align-items: center;
    }
    .footer {margin-top: 2em; color: #bbb; font-size: 0.9em; text-align: center;}
    #header {text-align: center;}
    """
) as demo:
    gr.Markdown(
        """
        <div style="display:flex;align-items:center;gap:16px;justify-content:center;">
            <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" height="40"/>
            <h1 style="margin-bottom:0;">Nexa Data Studio</h1>
        </div>
        <p style="text-align:center;">
            <b>Generate or label scientific datasets for ML research.</b>
        </p>
        """,
        elem_id="header"
    )

    payment_mode = gr.Radio(
        ["Pay as you go", "Plan"],
        label="Payment Mode",
        value="Pay as you go"
    )

    with gr.Row() as payg_row:
        tokens = gr.Slider(100, 100000, value=1000, step=100, label="Tokens Requested")
    with gr.Row(visible=False) as plan_row:
        plan = gr.Dropdown(
            ["Free", "Starter", "Pro", "Custom"],
            label="Plan",
            value="Free"
        )
        custom_price = gr.Number(label="Custom Price ($)", visible=False)

    job_type = gr.Radio(
        ["Generate Dataset", "Label Uploaded Data"],
        label="Job Type",
        value="Generate Dataset"
    )

    with gr.Column(visible=False) as label_col:
        file = gr.File(label="Upload Dataset (.txt or .jsonl)")

    price_info = gr.Textbox(label="Summary", interactive=False)
    download = gr.File(label="Download")
    progress = gr.Slider(0, 1, value=0, step=0.01, label="Progress", interactive=False)
    status = gr.Text(label="Status", interactive=False)

    def update_payment_ui(payment_mode_val, plan_val):
        return (
            gr.update(visible=payment_mode_val == "Pay as you go"),
            gr.update(visible=payment_mode_val == "Plan"),
            gr.update(visible=payment_mode_val == "Plan" and plan_val == "Custom")
        )

    payment_mode.change(
        update_payment_ui,
        inputs=[payment_mode, plan],
        outputs=[payg_row, plan_row, custom_price]
    )
    plan.change(
        lambda p: gr.update(visible=p == "Custom"),
        inputs=plan,
        outputs=custom_price
    )

    def update_label_ui(job_type_val):
        return gr.update(visible=job_type_val == "Label Uploaded Data")
    job_type.change(update_label_ui, inputs=job_type, outputs=label_col)

    def update_summary(payment_mode, tokens, plan, custom_price, file, job_type):
        if job_type == "Label Uploaded Data" and file is not None:
            return calculate_price("Label", tokens, plan, custom_price, file)[0]
        return calculate_price(payment_mode, tokens, plan, custom_price, file)[0]

    inputs = [payment_mode, tokens, plan, custom_price, file, job_type]
    gr.Button("Generate", elem_id="generate-btn", variant="primary").click(
        generate_dataset,
        inputs=inputs,
        outputs=[status, download, progress]
    )
    gr.Button("Update Summary").click(
        update_summary,
        inputs=inputs,
        outputs=price_info
    )

    gr.Markdown(
        f"""
        <div class="footer">
            © {time.strftime("%Y")} Nexa Data Studio — Powered by Hugging Face Spaces<br>
            For support, contact <a href="mailto:[email protected]">[email protected]</a>
        </div>
        """
    )

if __name__ == "__main__":
    print("Nexa Data Studio is running at http://localhost:7860")
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
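For reference, the pay-as-you-go summary above is a linear rule of $0.01 per token rounded to cents. A small standalone sketch of the same arithmetic (payg_price is a hypothetical helper name, not part of the repository):

PRICE_PER_TOKEN = 0.01  # same example rate used in calculate_price above

def payg_price(tokens: int) -> float:
    """Pay-as-you-go price in dollars, rounded to cents."""
    return round(tokens * PRICE_PER_TOKEN, 2)

assert payg_price(1000) == 10.0     # the slider default of 1,000 tokens costs $10
assert payg_price(12345) == 123.45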
Tokenization/app/Api.py
ADDED
@@ -0,0 +1,75 @@
"""
Api.py: FastAPI endpoints for dataset generation, progress polling, and download.
"""
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager
import io

app = FastAPI()

@app.post("/generate-dataset")
async def generate_dataset(request: Request):
    user_input = await request.json()
    job_id, error = job_manager.start_job(user_input)
    if error:
        return JSONResponse({"error": error}, status_code=400)
    return {"job_id": job_id}

@app.get("/progress/{job_id}")
def get_progress(job_id: str):
    progress = progress_tracker.get(job_id)
    if not progress:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return progress

@app.get("/download/{job_id}")
def download(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    # Payment check
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    # In production, use FileResponse to serve the file
    return {
        "download_url": job["result_path"],
        "stats": job.get("stats", {})
    }

@app.get("/download-corpus/{job_id}")
def download_corpus(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job or job.get("status") != "complete":
        return JSONResponse({"error": "Job not complete"}, status_code=400)
    if job.get("job_type") != "corpus":
        return JSONResponse({"error": "Not a corpus job"}, status_code=400)
    plan = job.get("plan", "free")
    tokens = job.get("token_budget", 0)
    if payment_manager.requires_payment(plan, tokens):
        return JSONResponse({"error": "Payment required", "checkout_url": payment_manager.create_checkout_session(plan, job_id)}, status_code=402)
    jsonl_lines = job.get("jsonl_lines", [])
    stats = job.get("stats", {})
    # Stream the JSONL as a file
    file_like = io.StringIO("\n".join(jsonl_lines))
    headers = {
        "Content-Disposition": f"attachment; filename=scientific_corpus_{job_id}.jsonl"
    }
    return StreamingResponse(file_like, media_type="application/jsonl", headers=headers)

@app.get("/job-stats/{job_id}")
def job_stats(job_id: str):
    job = job_manager.get_job_status(job_id)
    if not job:
        return JSONResponse({"error": "Job not found"}, status_code=404)
    return {"stats": job.get("stats", {})}

@app.get("/price/{plan}")
def get_price(plan: str):
    price = payment_manager.get_price(plan)
    return {"plan": plan, "price": price}
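A client-side sketch of how these endpoints are meant to be chained: submit a job, poll its progress, then download the corpus. Only the endpoint paths and payload fields come from Api.py; the base URL, port, sleep interval, and use of the requests library are assumptions for illustration (and the 402 payment branch is not handled here).

import time
import requests

BASE = "http://localhost:8000"  # assumed local server address

resp = requests.post(f"{BASE}/generate-dataset",
                     json={"plan": "free", "token_budget": 500, "job_type": "corpus"})
job_id = resp.json()["job_id"]

while True:
    progress = requests.get(f"{BASE}/progress/{job_id}").json()
    print(progress.get("message"))
    if progress.get("status") == "complete":
        break
    time.sleep(2)

corpus = requests.get(f"{BASE}/download-corpus/{job_id}")
with open("corpus.jsonl", "wb") as out:
    out.write(corpus.content)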
Tokenization/app/Config.py
ADDED
@@ -0,0 +1,25 @@
"""
Config.py: Configuration for plan limits, pricing, and app constants.
"""

# Plan limits (tokens per plan)
PLAN_LIMITS = {
    "free": 1000,
    "starter": 5000,
    "pro": 10000,
    "enterprise": 100000,
}

# Pricing per plan (USD)
PLAN_PRICING = {
    "free": 0,
    "starter": 15,
    "pro": 30,
    "enterprise": "custom",
}

# Other app-wide constants
tmp_dir = "./tmp_datasets"

# Stripe keys, etc. (to be set via environment variables in production)
STRIPE_API_KEY = None
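As an illustration only, these constants imply an effective per-token price for the numerically priced plans ("enterprise" is priced as "custom" and is skipped); a short sketch:

from Tokenization.app.Config import PLAN_LIMITS, PLAN_PRICING

for plan, limit in PLAN_LIMITS.items():
    price = PLAN_PRICING.get(plan)
    if isinstance(price, (int, float)) and limit:
        # e.g. "starter: 5,000 tokens for $15 ($0.0030/token)"
        print(f"{plan}: {limit:,} tokens for ${price} (${price / limit:.4f}/token)")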
Tokenization/app/Core.py
ADDED
@@ -0,0 +1,155 @@
"""
Core.py: Orchestrates dataset generation jobs, plan enforcement, and background processing.
"""
import threading
import uuid
import os
import json
from .Config import PLAN_LIMITS, tmp_dir
from .Progress import progress_tracker
from .Payment import payment_manager

# Import your tokenizer module here (example)
from Tokenization.generate_dataset import generate_dataset
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
from Tokenization.Build_tokenizer import QLoRAPreprocessor
import nltk

class JobManager:
    def __init__(self):
        self.jobs = {}
        self.lock = threading.Lock()

    def start_job(self, user_input):
        plan = user_input.get("plan")
        token_budget = user_input.get("token_budget")
        job_type = user_input.get("job_type", "tokenize")  # "tokenize", "corpus", or "label"
        # For label jobs, token_budget is determined after upload
        if job_type != "label" and not payment_manager.check_plan_limit(plan, token_budget):
            return None, "Plan limit exceeded"
        job_id = str(uuid.uuid4())
        with self.lock:
            self.jobs[job_id] = {
                "status": "pending",
                "plan": plan,
                "token_budget": token_budget,
                "job_type": job_type,
                "user_input": user_input
            }
        if job_type == "corpus":
            thread = threading.Thread(target=self._run_corpus_pipeline, args=(job_id,))
        elif job_type == "label":
            thread = threading.Thread(target=self._run_label_pipeline, args=(job_id,))
        else:
            thread = threading.Thread(target=self._run_job, args=(job_id, user_input))
        thread.start()
        return job_id, None

    def _run_job(self, job_id, user_input):
        try:
            progress_tracker.start_job(job_id, total_steps=6)
            # Step 1: Data retrieval
            progress_tracker.update(job_id, 1, "Retrieving data from sources...")
            domain = user_input.get("domain")
            token_budget = user_input.get("token_budget")
            plan = user_input.get("plan")
            custom_seed = user_input.get("custom_seed", None)
            # Step 2: Preprocessing
            progress_tracker.update(job_id, 2, "Preprocessing and cleaning data...")
            # Step 3: Tokenization & Labeling
            progress_tracker.update(job_id, 3, "Tokenizing and labeling samples...")
            # Step 4: Validation & Stats
            progress_tracker.update(job_id, 4, "Validating and computing statistics...")
            # Step 5: Formatting output
            progress_tracker.update(job_id, 5, "Formatting dataset as JSONL...")
            # Call tokenizer pipeline (implement in tokenization/tokenizer.py)
            result = generate_dataset(
                domain=domain,
                token_budget=token_budget,
                plan=plan,
                custom_seed=custom_seed,
                progress_callback=lambda step, msg: progress_tracker.update(job_id, step, msg)
            )
            # Step 6: Save output
            os.makedirs(tmp_dir, exist_ok=True)
            output_path = os.path.join(tmp_dir, f"{domain}_{token_budget}_tokens_{job_id}.jsonl")
            with open(output_path, "w", encoding="utf-8") as f:
                for line in result["jsonl_lines"]:
                    f.write(line + "\n")
            progress_tracker.update(job_id, 6, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["result_path"] = output_path
                self.jobs[job_id]["stats"] = result.get("stats", {})
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_corpus_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = user_input.get("plan")
                token_budget = user_input.get("token_budget")
            progress_tracker.start_job(job_id, total_steps=5)
            progress_tracker.update(job_id, 1, "Building scientific corpus...")
            config = CorpusConfig()
            builder = ScientificCorpusBuilder(config)
            corpus, stats = builder.build_corpus_scoped(plan, token_budget)
            progress_tracker.update(job_id, 2, "Formatting dataset as JSONL...")
            jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in corpus]
            progress_tracker.update(job_id, 3, "Finalizing output...")
            progress_tracker.update(job_id, 4, "Corpus ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
                self.jobs[job_id]["actual_tokens"] = stats.get("total_tokens", 0)
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def _run_label_pipeline(self, job_id):
        try:
            with self.lock:
                user_input = self.jobs[job_id]["user_input"]
                plan = self.jobs[job_id]["plan"]
            progress_tracker.start_job(job_id, total_steps=4)
            progress_tracker.update(job_id, 1, "Loading and preprocessing dataset...")
            dataset_text = user_input.get("dataset_text", "")
            if not dataset_text:
                raise ValueError("No dataset text provided.")
            tokens = nltk.word_tokenize(dataset_text)
            num_tokens = len(tokens)
            with self.lock:
                self.jobs[job_id]["actual_tokens"] = num_tokens
            if not payment_manager.check_plan_limit(plan, num_tokens):
                raise ValueError("Plan limit exceeded.")
            progress_tracker.update(job_id, 2, "Tokenizing and labeling dataset...")
            preprocessor = QLoRAPreprocessor()
            labeled_data = preprocessor.preprocess_function(dataset_text)
            jsonl_lines = [json.dumps({"text": item}, ensure_ascii=False) for item in labeled_data]
            stats = {"token_count": num_tokens, "sample_count": len(labeled_data)}
            progress_tracker.update(job_id, 3, "Dataset ready for download.")
            progress_tracker.complete(job_id)
            with self.lock:
                self.jobs[job_id]["status"] = "complete"
                self.jobs[job_id]["jsonl_lines"] = jsonl_lines
                self.jobs[job_id]["stats"] = stats
        except Exception as e:
            progress_tracker.update(job_id, 0, f"Job failed: {str(e)}")
            with self.lock:
                self.jobs[job_id]["status"] = "failed"
                self.jobs[job_id]["error"] = str(e)

    def get_job_status(self, job_id):
        with self.lock:
            return self.jobs.get(job_id, None)

job_manager = JobManager()
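An in-process sketch of driving JobManager directly, without the HTTP layer. The payload fields mirror what Api.py forwards to start_job; the polling loop and sleep interval are assumptions for illustration.

import time
from Tokenization.app.Core import job_manager
from Tokenization.app.Progress import progress_tracker

job_id, error = job_manager.start_job({"plan": "free", "token_budget": 500, "job_type": "corpus"})
if error:
    raise SystemExit(error)

while True:
    state = progress_tracker.get(job_id)
    if state:
        print(f"[{state['current']}/{state['total']}] {state['message']}")
        if state["status"] == "complete":
            break
    time.sleep(2)

job = job_manager.get_job_status(job_id)
print(job["stats"])  # e.g. includes "total_tokens" for corpus jobs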
Tokenization/app/Payment.py
ADDED
@@ -0,0 +1,27 @@
"""
Payment.py: Plan enforcement and payment logic (Stripe stub).
"""
import os
from .Config import PLAN_LIMITS, PLAN_PRICING

class PaymentManager:
    def __init__(self):
        self.stripe_api_key = os.getenv("STRIPE_API_KEY")

    def check_plan_limit(self, plan, requested_tokens):
        limit = PLAN_LIMITS.get(plan, 0)
        return requested_tokens <= limit

    def get_price(self, plan):
        return PLAN_PRICING.get(plan, 0)

    def requires_payment(self, plan, requested_tokens):
        if plan == "free":
            return requested_tokens > PLAN_LIMITS["free"]
        return plan not in PLAN_LIMITS

    def create_checkout_session(self, plan, job_id):
        # Stub: Integrate with Stripe API in production
        return f"https://checkout.stripe.com/pay/{plan}/{job_id}"

payment_manager = PaymentManager()
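Under the rule above, only over-budget free requests and unrecognised plan names are gated; a quick sketch of the resulting behaviour:

from Tokenization.app.Payment import payment_manager

assert payment_manager.requires_payment("free", 500) is False    # within the free 1,000-token limit
assert payment_manager.requires_payment("free", 5000) is True    # over the free limit
assert payment_manager.requires_payment("pro", 5000) is False    # known paid plan, no extra gate here
assert payment_manager.requires_payment("unknown", 10) is True   # unrecognised plan names are treated as payable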
Tokenization/app/Progress.py
ADDED
@@ -0,0 +1,37 @@
"""
Progress.py: Thread-safe progress tracking for dataset generation jobs.
"""
import threading

class ProgressTracker:
    def __init__(self):
        self._progress = {}
        self._lock = threading.Lock()

    def start_job(self, job_id, total_steps):
        with self._lock:
            self._progress[job_id] = {
                "current": 0,
                "total": total_steps,
                "status": "started",
                "message": "Job started"
            }

    def update(self, job_id, current, message=None):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["current"] = current
                if message:
                    self._progress[job_id]["message"] = message  # No emoji, just message

    def complete(self, job_id):
        with self._lock:
            if job_id in self._progress:
                self._progress[job_id]["status"] = "complete"
                self._progress[job_id]["message"] = "Job complete"

    def get(self, job_id):
        with self._lock:
            return self._progress.get(job_id, None)

progress_tracker = ProgressTracker()
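A minimal standalone exercise of the tracker (the job id and step count here are arbitrary):

from Tokenization.app.Progress import progress_tracker

progress_tracker.start_job("demo-job", total_steps=3)
progress_tracker.update("demo-job", 1, "Fetching data...")
progress_tracker.update("demo-job", 2, "Tokenizing...")
progress_tracker.complete("demo-job")

print(progress_tracker.get("demo-job"))
# {'current': 2, 'total': 3, 'status': 'complete', 'message': 'Job complete'}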
Tokenization/app/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""
app/__init__.py: Exposes main backend components for reuse.
"""

from .Api import app as fastapi_app
from .Core import job_manager
from .Progress import progress_tracker
from .Payment import payment_manager

__all__ = [
    "fastapi_app",
    "job_manager",
    "progress_tracker",
    "payment_manager",
]
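A sketch of serving the exported FastAPI app locally. The use of uvicorn, the host, and the port are assumptions for illustration; the repository also ships a Tokenization/run_backend.py that presumably plays this role.

import uvicorn
from Tokenization.app import fastapi_app

if __name__ == "__main__":
    # Serve the API locally; host and port here are illustrative defaults.
    uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)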
Tokenization/app/__pycache__/Api.cpython-310.pyc
ADDED
Binary file (2.81 kB).
Tokenization/app/__pycache__/Config.cpython-310.pyc
ADDED
Binary file (444 Bytes).
Tokenization/app/__pycache__/Core.cpython-310.pyc
ADDED
Binary file (4.86 kB).
Tokenization/app/__pycache__/Payment.cpython-310.pyc
ADDED
Binary file (1.45 kB).
Tokenization/app/__pycache__/Progress.cpython-310.pyc
ADDED
Binary file (1.66 kB).
Tokenization/app/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (460 Bytes).
Tokenization/combined_scientific_papers.json
ADDED
The diff for this file is too large to render. See raw diff.
Tokenization/combined_scientific_papers.jsonl
ADDED
The diff for this file is too large to render. See raw diff.
Tokenization/corpus_builder.log
ADDED
File without changes
Tokenization/debug_upload.log
ADDED
@@ -0,0 +1,198 @@
2025-06-12 18:18:01,037 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
|
| 2 |
+
2025-06-12 18:18:01,037 - INFO - Starting arXiv paper collection...
|
| 3 |
+
2025-06-12 18:18:01,038 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 4 |
+
2025-06-12 18:18:03,165 - INFO - Got first page: 100 of 1236760 total results
|
| 5 |
+
2025-06-12 18:18:03,172 - INFO - Sleeping: 2.828948 seconds
|
| 6 |
+
2025-06-12 18:18:06,004 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 7 |
+
2025-06-12 18:18:06,953 - INFO - Sleeping: 2.866122 seconds
|
| 8 |
+
2025-06-12 18:18:09,824 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 9 |
+
2025-06-12 18:18:11,783 - INFO - Sleeping: 2.823819 seconds
|
| 10 |
+
2025-06-12 18:18:14,608 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 11 |
+
2025-06-12 18:18:16,436 - INFO - Sleeping: 2.857095 seconds
|
| 12 |
+
2025-06-12 18:18:19,301 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 13 |
+
2025-06-12 18:18:22,022 - INFO - Sleeping: 2.790207 seconds
|
| 14 |
+
2025-06-12 18:18:24,820 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 15 |
+
2025-06-12 18:18:25,173 - INFO - Sleeping: 2.998001 seconds
|
| 16 |
+
2025-06-12 18:18:28,181 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 17 |
+
2025-06-12 18:18:28,988 - INFO - Sleeping: 2.999010 seconds
|
| 18 |
+
2025-06-12 18:18:32,000 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 19 |
+
2025-06-12 18:18:32,507 - INFO - Sleeping: 2.998957 seconds
|
| 20 |
+
2025-06-12 18:18:35,519 - INFO - Requesting page (first: False, try: 3): https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 21 |
+
2025-06-12 18:18:36,061 - WARNING - Empty page returned for query 'cat:physics* OR cat:astro-ph* OR cat:cond-mat* OR cat:hep-th OR cat:quant-ph OR cat:math-ph': Page of results was unexpectedly empty (https://export.arxiv.org/api/query?search_query=cat%3Aphysics%2A+OR+cat%3Aastro-ph%2A+OR+cat%3Acond-mat%2A+OR+cat%3Ahep-th+OR+cat%3Aquant-ph+OR+cat%3Amath-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100)
|
| 22 |
+
2025-06-12 18:18:36,065 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 23 |
+
2025-06-12 18:18:36,888 - INFO - Got first page: 100 of 50293 total results
|
| 24 |
+
2025-06-12 18:18:36,896 - INFO - Sleeping: 2.871087 seconds
|
| 25 |
+
2025-06-12 18:18:39,783 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 26 |
+
2025-06-12 18:18:40,466 - INFO - Sleeping: 2.870444 seconds
|
| 27 |
+
2025-06-12 18:18:43,339 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 28 |
+
2025-06-12 18:18:44,012 - INFO - Sleeping: 2.874603 seconds
|
| 29 |
+
2025-06-12 18:18:46,893 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 30 |
+
2025-06-12 18:18:47,688 - INFO - Sleeping: 2.858048 seconds
|
| 31 |
+
2025-06-12 18:18:50,552 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 32 |
+
2025-06-12 18:18:51,370 - INFO - Sleeping: 2.870823 seconds
|
| 33 |
+
2025-06-12 18:18:54,246 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 34 |
+
2025-06-12 18:18:54,960 - INFO - Sleeping: 2.886596 seconds
|
| 35 |
+
2025-06-12 18:18:57,856 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
| 36 |
+
2025-06-12 18:18:58,568 - INFO - Sleeping: 2.886486 seconds
|
| 37 |
+
2025-06-12 18:19:01,466 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
| 38 |
+
2025-06-12 18:19:02,219 - INFO - Sleeping: 2.867826 seconds
|
| 39 |
+
2025-06-12 18:19:05,103 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
| 40 |
+
2025-06-12 18:19:06,346 - INFO - Sleeping: 2.766637 seconds
|
| 41 |
+
2025-06-12 18:19:09,120 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
| 42 |
+
2025-06-12 18:19:10,043 - INFO - Sleeping: 2.877552 seconds
|
| 43 |
+
2025-06-12 18:19:12,929 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
| 44 |
+
2025-06-12 18:19:13,641 - INFO - Sleeping: 2.873434 seconds
|
| 45 |
+
2025-06-12 18:19:16,525 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 46 |
+
2025-06-12 18:19:17,281 - INFO - Sleeping: 2.871482 seconds
|
| 47 |
+
2025-06-12 18:19:20,161 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
| 48 |
+
2025-06-12 18:19:20,990 - INFO - Sleeping: 2.872492 seconds
|
| 49 |
+
2025-06-12 18:19:23,876 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 50 |
+
2025-06-12 18:19:24,633 - INFO - Sleeping: 2.873157 seconds
|
| 51 |
+
2025-06-12 18:19:27,510 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
| 52 |
+
2025-06-12 18:19:28,249 - INFO - Sleeping: 2.872219 seconds
|
| 53 |
+
2025-06-12 18:19:31,132 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
| 54 |
+
2025-06-12 18:19:31,787 - INFO - Sleeping: 2.871294 seconds
|
| 55 |
+
2025-06-12 18:19:34,660 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
|
| 56 |
+
2025-06-12 18:19:35,423 - INFO - Sleeping: 2.864608 seconds
|
| 57 |
+
2025-06-12 18:19:38,291 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 58 |
+
2025-06-12 18:19:38,496 - INFO - Sleeping: 2.998046 seconds
|
| 59 |
+
2025-06-12 18:19:41,498 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 60 |
+
2025-06-12 18:19:41,682 - INFO - Sleeping: 2.998049 seconds
|
| 61 |
+
2025-06-12 18:19:44,693 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
|
| 62 |
+
2025-06-12 18:19:45,568 - INFO - Sleeping: 2.874692 seconds
|
| 63 |
+
2025-06-12 18:19:48,448 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
| 64 |
+
2025-06-12 18:19:48,654 - INFO - Sleeping: 2.998000 seconds
|
| 65 |
+
2025-06-12 18:19:51,668 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
|
| 66 |
+
2025-06-12 18:19:52,436 - INFO - Sleeping: 2.877867 seconds
|
| 67 |
+
2025-06-12 18:19:55,323 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
|
| 68 |
+
2025-06-12 18:19:56,074 - INFO - Sleeping: 2.878102 seconds
|
| 69 |
+
2025-06-12 18:19:58,961 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
|
| 70 |
+
2025-06-12 18:19:59,730 - INFO - Sleeping: 2.846435 seconds
|
| 71 |
+
2025-06-12 18:20:02,587 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
| 72 |
+
2025-06-12 18:20:02,802 - INFO - Sleeping: 2.997978 seconds
|
| 73 |
+
2025-06-12 18:20:05,801 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
|
| 74 |
+
2025-06-12 18:20:06,645 - INFO - Sleeping: 2.882026 seconds
|
| 75 |
+
2025-06-12 18:20:09,537 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
|
| 76 |
+
2025-06-12 18:20:10,681 - INFO - Sleeping: 2.867912 seconds
|
| 77 |
+
2025-06-12 18:20:13,558 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
|
| 78 |
+
2025-06-12 18:20:15,163 - INFO - Sleeping: 2.874383 seconds
|
| 79 |
+
2025-06-12 18:20:18,052 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
|
| 80 |
+
2025-06-12 18:20:19,022 - INFO - Sleeping: 2.885731 seconds
|
| 81 |
+
2025-06-12 18:20:21,916 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
|
| 82 |
+
2025-06-12 18:20:22,743 - INFO - Sleeping: 2.880111 seconds
|
| 83 |
+
2025-06-12 18:20:25,633 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
|
| 84 |
+
2025-06-12 18:20:26,848 - INFO - Sleeping: 2.877337 seconds
|
| 85 |
+
2025-06-12 18:20:29,728 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
| 86 |
+
2025-06-12 18:20:29,961 - INFO - Sleeping: 2.999086 seconds
|
| 87 |
+
2025-06-12 18:20:32,973 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
|
| 88 |
+
2025-06-12 18:20:33,783 - INFO - Sleeping: 2.870358 seconds
|
| 89 |
+
2025-06-12 18:20:36,664 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
| 90 |
+
2025-06-12 18:20:36,929 - INFO - Sleeping: 2.997254 seconds
|
| 91 |
+
2025-06-12 18:20:39,936 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
|
| 92 |
+
2025-06-12 18:20:40,834 - INFO - Sleeping: 2.876953 seconds
|
| 93 |
+
2025-06-12 18:20:43,716 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Aq-bio%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
|
| 94 |
+
2025-06-12 18:20:44,816 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
|
| 95 |
+
2025-06-12 18:20:46,192 - INFO - Got first page: 100 of 100310 total results
|
| 96 |
+
2025-06-12 18:20:46,198 - INFO - Sleeping: 2.859482 seconds
|
| 97 |
+
2025-06-12 18:20:49,073 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=100&max_results=100
|
| 98 |
+
2025-06-12 18:20:49,789 - INFO - Sleeping: 2.869352 seconds
|
| 99 |
+
2025-06-12 18:20:52,669 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=200&max_results=100
|
| 100 |
+
2025-06-12 18:20:53,467 - INFO - Sleeping: 2.862511 seconds
|
| 101 |
+
2025-06-12 18:20:56,338 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=300&max_results=100
|
| 102 |
+
2025-06-12 18:20:57,071 - INFO - Sleeping: 2.870255 seconds
|
| 103 |
+
2025-06-12 18:20:59,951 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=400&max_results=100
|
| 104 |
+
2025-06-12 18:21:00,728 - INFO - Sleeping: 2.869636 seconds
|
| 105 |
+
2025-06-12 18:21:03,604 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=500&max_results=100
|
| 106 |
+
2025-06-12 18:21:04,393 - INFO - Sleeping: 2.865000 seconds
|
| 107 |
+
2025-06-12 18:21:07,272 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=600&max_results=100
|
| 108 |
+
2025-06-12 18:21:08,029 - INFO - Sleeping: 2.858943 seconds
|
| 109 |
+
2025-06-12 18:21:10,895 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=700&max_results=100
|
| 110 |
+
2025-06-12 18:21:11,768 - INFO - Sleeping: 2.866744 seconds
|
| 111 |
+
2025-06-12 18:21:14,640 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=800&max_results=100
|
| 112 |
+
2025-06-12 18:21:15,488 - INFO - Sleeping: 2.720050 seconds
|
| 113 |
+
2025-06-12 18:21:18,211 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=900&max_results=100
|
| 114 |
+
2025-06-12 18:21:19,122 - INFO - Sleeping: 2.844511 seconds
|
| 115 |
+
2025-06-12 18:21:21,982 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1000&max_results=100
|
| 116 |
+
2025-06-12 18:21:22,772 - INFO - Sleeping: 2.871176 seconds
|
| 117 |
+
2025-06-12 18:21:25,647 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 118 |
+
2025-06-12 18:21:25,925 - INFO - Sleeping: 2.997949 seconds
|
| 119 |
+
2025-06-12 18:21:28,932 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1100&max_results=100
|
| 120 |
+
2025-06-12 18:21:29,774 - INFO - Sleeping: 2.864288 seconds
|
| 121 |
+
2025-06-12 18:21:32,644 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1200&max_results=100
|
| 122 |
+
2025-06-12 18:21:33,454 - INFO - Sleeping: 2.860076 seconds
|
| 123 |
+
2025-06-12 18:21:36,317 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 124 |
+
2025-06-12 18:21:36,605 - INFO - Sleeping: 2.997453 seconds
|
| 125 |
+
2025-06-12 18:21:39,607 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1300&max_results=100
|
| 126 |
+
2025-06-12 18:21:40,404 - INFO - Sleeping: 2.856277 seconds
|
| 127 |
+
2025-06-12 18:21:43,276 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1400&max_results=100
|
| 128 |
+
2025-06-12 18:21:44,085 - INFO - Sleeping: 2.862912 seconds
|
| 129 |
+
2025-06-12 18:21:46,964 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1500&max_results=100
|
| 130 |
+
2025-06-12 18:21:47,858 - INFO - Sleeping: 2.860433 seconds
|
| 131 |
+
2025-06-12 18:21:50,732 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1600&max_results=100
2025-06-12 18:21:51,504 - INFO - Sleeping: 2.874451 seconds
2025-06-12 18:21:54,387 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1700&max_results=100
2025-06-12 18:21:55,722 - INFO - Sleeping: 2.859315 seconds
2025-06-12 18:21:58,585 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1800&max_results=100
2025-06-12 18:21:59,503 - INFO - Sleeping: 2.863854 seconds
2025-06-12 18:22:02,377 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
2025-06-12 18:22:02,618 - INFO - Sleeping: 2.997967 seconds
2025-06-12 18:22:05,628 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=1900&max_results=100
2025-06-12 18:22:06,677 - INFO - Sleeping: 2.844775 seconds
2025-06-12 18:22:09,533 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
2025-06-12 18:22:09,792 - INFO - Sleeping: 2.998977 seconds
2025-06-12 18:22:12,797 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2000&max_results=100
2025-06-12 18:22:13,677 - INFO - Sleeping: 2.860952 seconds
2025-06-12 18:22:16,551 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2100&max_results=100
2025-06-12 18:22:17,381 - INFO - Sleeping: 2.862895 seconds
2025-06-12 18:22:20,259 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2200&max_results=100
2025-06-12 18:22:21,092 - INFO - Sleeping: 2.865440 seconds
2025-06-12 18:22:23,963 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2300&max_results=100
2025-06-12 18:22:24,738 - INFO - Sleeping: 2.854685 seconds
2025-06-12 18:22:27,605 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2400&max_results=100
2025-06-12 18:22:28,443 - INFO - Sleeping: 2.866245 seconds
2025-06-12 18:22:31,321 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2500&max_results=100
2025-06-12 18:22:32,401 - INFO - Sleeping: 2.857156 seconds
2025-06-12 18:22:35,269 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
2025-06-12 18:22:35,481 - INFO - Sleeping: 2.997016 seconds
2025-06-12 18:22:38,486 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2600&max_results=100
2025-06-12 18:22:39,346 - INFO - Sleeping: 2.856990 seconds
2025-06-12 18:22:42,208 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2700&max_results=100
2025-06-12 18:22:43,031 - INFO - Sleeping: 2.852790 seconds
2025-06-12 18:22:45,889 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2800&max_results=100
2025-06-12 18:22:46,748 - INFO - Sleeping: 2.858054 seconds
2025-06-12 18:22:49,610 - INFO - Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:49,923 - INFO - Sleeping: 2.997999 seconds
2025-06-12 18:22:52,927 - INFO - Requesting page (first: False, try: 1): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:53,180 - INFO - Sleeping: 2.998443 seconds
2025-06-12 18:22:56,182 - INFO - Requesting page (first: False, try: 2): https://export.arxiv.org/api/query?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A&id_list=&sortBy=submittedDate&sortOrder=descending&start=2900&max_results=100
2025-06-12 18:22:57,297 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
2025-06-12 18:22:57,297 - INFO - Collected 5989 arXiv papers in 296.26s
2025-06-12 18:22:57,310 - INFO - Starting PubMed paper collection...
2025-06-12 18:23:14,143 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
2025-06-12 18:23:14,143 - INFO - Collected 2671 PubMed papers in 16.83s
2025-06-12 18:23:14,143 - INFO - Starting FineWeb-Edu collection...
2025-06-12 18:23:34,470 - INFO - Collected 10000 FineWeb samples
2025-06-12 18:23:38,652 - INFO - Collected 20000 FineWeb samples
2025-06-12 18:23:43,218 - INFO - Collected 30000 FineWeb samples
2025-06-12 18:23:43,221 - INFO - Processing 30000 FineWeb samples
2025-06-12 18:24:03,830 - INFO - Saved checkpoint to scientific_corpus_data\fineweb_edu.jsonl
2025-06-12 18:24:03,831 - INFO - Collected 29616 FineWeb-Edu papers in 49.69s
2025-06-12 18:24:03,873 - INFO - Processing 5989 arxiv papers...
2025-06-12 18:24:05,244 - INFO - Processed 5989/5989 arxiv papers
2025-06-12 18:24:05,244 - INFO - Unknown domains: 0, Unknown sections: 3349
2025-06-12 18:24:05,244 - INFO - Processing 2671 biology papers...
2025-06-12 18:24:05,765 - INFO - Processed 2605/2671 biology papers
2025-06-12 18:24:05,765 - INFO - Unknown domains: 0, Unknown sections: 1015
2025-06-12 18:24:05,765 - INFO - Processing 29616 education papers...
2025-06-12 18:24:39,231 - INFO - Processed 159402/29616 education papers
2025-06-12 18:24:39,231 - INFO - Unknown domains: 29616, Unknown sections: 21161
2025-06-12 19:06:41,335 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E5AF0BBC0, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\threading.py', line 320, code wait>
2025-06-12 19:06:43,708 - WARNING - Using default email for Entrez. Set ENTREZ_EMAIL environment variable.
2025-06-12 19:06:43,710 - INFO - Starting arXiv paper collection...
2025-06-12 19:06:43,711 - INFO - Saved checkpoint to scientific_corpus_data\arxiv_papers.jsonl
2025-06-12 19:06:43,712 - INFO - Collected 0 arXiv papers in 0.00s
2025-06-12 19:06:43,713 - INFO - Starting PubMed paper collection...
2025-06-12 19:06:43,715 - INFO - Saved checkpoint to scientific_corpus_data\pubmed_papers.jsonl
2025-06-12 19:06:43,715 - INFO - Collected 0 PubMed papers in 0.00s
2025-06-12 19:06:43,716 - INFO - Shutdown in progress, aborting retries.
2025-06-12 19:16:11,718 - INFO - Received signal 2, shutting down gracefully. Frame: <frame at 0x0000023E7696F880, file 'C:\\Users\\kunya\\AppData\\Local\\Programs\\Python\\Python310\\lib\\selectors.py', line 315, code _select>
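The log above reflects the harvesting pattern used for arXiv: pages of 100 results requested by increasing `start` offset, a roughly three-second sleep between requests, and a retry of the same offset (try 1, try 2) when a page yields nothing. The sketch below illustrates that loop only; the actual implementation lives in Tokenization/Main_2.py (not shown here), and `fetch_page`, `MAX_TRIES`, and `harvest` are hypothetical names used for illustration.

# Illustrative sketch of the request/sleep/retry pattern visible in the log above.
# fetch_page, MAX_TRIES, and harvest are hypothetical; see Tokenization/Main_2.py for the real logic.
import time
import urllib.request

BASE = ("https://export.arxiv.org/api/query"
        "?search_query=cat%3Acond-mat.mtrl-sci+OR+cat%3Amaterials%2A"
        "&sortBy=submittedDate&sortOrder=descending")
MAX_TRIES = 3

def fetch_page(start: int, max_results: int = 100) -> str:
    """Fetch one Atom feed page from the arXiv export API."""
    url = f"{BASE}&start={start}&max_results={max_results}"
    with urllib.request.urlopen(url, timeout=30) as resp:
        return resp.read().decode("utf-8")

def harvest(pages: int = 30, page_size: int = 100) -> list:
    feeds = []
    for page in range(pages):
        start = page * page_size
        for attempt in range(MAX_TRIES):
            feed = fetch_page(start, page_size)
            if "<entry>" in feed:      # page actually contains results
                feeds.append(feed)
                break
            time.sleep(3)              # back off and retry the same offset
        time.sleep(3)                  # stay under the arXiv rate limit
    return feeds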
Tokenization/generate_dataset.py
ADDED
@@ -0,0 +1,77 @@
import json
from typing import Optional, Callable, Dict, Any

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig

def generate_dataset(
    domain: str = None,
    token_budget: int = 1000,
    plan: str = "free",
    custom_seed: Optional[str] = None,
    job_type: str = "tokenize",
    progress_callback: Optional[Callable[[int, str], None]] = None
) -> Dict[str, Any]:
    """
    Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.

    Args:
        domain (str): Domain for dataset.
        token_budget (int): Token budget.
        plan (str): Plan type.
        custom_seed (str): Optional seed data.
        job_type (str): "tokenize" or "corpus".
        progress_callback (callable): Progress update callback.

    Returns:
        dict: {"jsonl_lines": [...], "stats": {...}}
    """
    if job_type == "corpus":
        # Use Main_2 pipeline
        if progress_callback:
            progress_callback(1, "Initializing scientific corpus builder...")
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        if progress_callback:
            progress_callback(2, "Fetching arXiv papers...")
        arxiv_papers = builder.fetch_arxiv_papers()
        if progress_callback:
            progress_callback(3, "Fetching PubMed papers...")
        pubmed_papers = builder.fetch_pubmed_papers()
        if progress_callback:
            progress_callback(4, "Fetching FineWeb-Edu samples...")
        fineweb_papers = builder.fetch_fineweb_edu()
        if progress_callback:
            progress_callback(5, "Processing and tagging papers...")
        all_papers = []
        all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
        all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
        all_papers.extend(builder.process_papers(fineweb_papers, "education"))
        if progress_callback:
            progress_callback(6, "Ranking and deduplicating...")
        ranked_papers = builder.ranker.rank_samples(all_papers)
        if progress_callback:
            progress_callback(7, "Preparing dataset for download...")
        jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
        stats = builder.analyzer.get_dataset_stats(ranked_papers)
        if progress_callback:
            progress_callback(8, "Dataset ready for download.")
        return {"jsonl_lines": jsonl_lines, "stats": stats}

    # Standard "tokenize" job
    if progress_callback:
        progress_callback(1, "Cleaning input text...")
    cleaned_text = clean_text(custom_seed or "")
    if progress_callback:
        progress_callback(2, "Tokenizing input...")
    preprocessor = QLoRAPreprocessor()
    # For demonstration, split cleaned_text into fixed-size character chunks (replace with real tokenization logic)
    tokens = [cleaned_text[i:i+token_budget] for i in range(0, len(cleaned_text), token_budget)]
    if progress_callback:
        progress_callback(3, "Formatting samples...")
    jsonl_lines = [json.dumps({"text": t}) for t in tokens]
    stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}
    if progress_callback:
        progress_callback(4, "Dataset ready for download.")
    return {"jsonl_lines": jsonl_lines, "stats": stats}
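A minimal usage sketch for the function above, assuming the Tokenization package and its dependencies are importable; the progress callback here simply prints each step, and the seed text and domain label are made-up inputs.

# Example call of generate_dataset for a "tokenize" job (illustrative only).
from Tokenization.generate_dataset import generate_dataset

def show_progress(step: int, message: str) -> None:
    print(f"[{step}] {message}")

result = generate_dataset(
    domain="materials",          # free-form label, unused by the "tokenize" path
    token_budget=500,
    plan="free",
    custom_seed="Perovskite solar cells degrade under humidity and heat.",
    job_type="tokenize",
    progress_callback=show_progress,
)
print(result["stats"])           # e.g. {'token_count': ..., 'total_samples': ...}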
Tokenization/hf_upload.py
ADDED
@@ -0,0 +1,163 @@
import logging
import os
import sys
from datetime import datetime
from pathlib import Path

from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")

def debug_jsonl_head(jsonl_path, n=5):
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i in range(n):
                line = f.readline()
                if not line:
                    break
                logging.info(f"Line {i+1}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")

def infer_features_from_sample(jsonl_path, n=100):
    import json
    from collections import defaultdict
    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")

def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)
        # Try loading a small sample first for debugging
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # Try to load with explicit features if possible
            # Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # Uncomment and adjust the following lines if you know the schema:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # try:
            #     sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            #     logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
            # except Exception as e2:
            #     logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
            raise
        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
        dataset.to_file(str(arrow_path))
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise

def create_readme(dataset):
    content = f"""# Scientific Research Tokenized Dataset

- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""
    with open(README_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("README.md created.")

def upload_to_hf():
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")

def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match expected import in Main_2.py"""
    return upload_to_hf(*args, **kwargs)

def cleanup():
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")

def main():
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()

if __name__ == "__main__":
    main()
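A sketch of how the inspection helpers above can be exercised on a small local file before attempting a full conversion and upload; the sample path is hypothetical, no upload is performed, and the real upload path still requires HF_TOKEN in a .env file.

# Illustrative dry run of the JSONL inspection helpers (no upload performed).
from pathlib import Path
from Tokenization.hf_upload import debug_jsonl_head, infer_features_from_sample

sample_path = Path("scientific_corpus_sample.jsonl")   # hypothetical local file
debug_jsonl_head(sample_path, n=3)                     # logs the first lines for schema inspection
infer_features_from_sample(sample_path, n=50)          # logs the inferred field types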
Tokenization/preprocessing/Clean_text.py
ADDED
@@ -0,0 +1,16 @@
import re
import unicodedata

def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion."""
    if not isinstance(text, str):
        return ""
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Remove control characters
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text
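For example, applied to a string containing a control character and irregular whitespace:

# Example of clean_text behaviour (illustrative).
from Tokenization.preprocessing.Clean_text import clean_text

raw = "Perovskite\x0csolar   cells\n\nare  promising."
print(clean_text(raw))   # -> "Perovskite solar cells are promising."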
Tokenization/preprocessing/Preprocess_sample.py
ADDED
@@ -0,0 +1,31 @@
from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

def preprocess_sample(paper: Dict) -> List[Dict]:
    """
    Clean and segment a paper into samples for LLM ingestion.
    Returns a list of dicts: one for title+abstract, and one per paragraph.
    """
    title = clean_text(paper.get("title", ""))
    abstract = clean_text(paper.get("abstract", ""))
    full_text = clean_text(paper.get("full_text", ""))
    paragraphs = segment_paragraphs(full_text) if full_text else []
    samples = []
    # Title + abstract sample
    if title or abstract:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = abstract
        sample["full_text"] = ""
        sample["section"] = "abstract"
        samples.append(sample)
    # Paragraph samples
    for para in paragraphs:
        sample = dict(paper)
        sample["title"] = title
        sample["abstract"] = ""
        sample["full_text"] = para
        sample["section"] = "paragraph"
        samples.append(sample)
    return samples
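A short example of the resulting sample shapes for a paper dict with the fields the function reads (other keys are copied through unchanged). Note that because clean_text collapses newlines before segmentation, paragraph splitting in practice falls back to the 1000-character chunking in Segment_paragraphs.

# Example of preprocess_sample output shape (illustrative).
from Tokenization.preprocessing.Preprocess_sample import preprocess_sample

paper = {
    "title": "A toy paper",
    "abstract": "One-sentence abstract.",
    "full_text": "First paragraph." + " x" * 600,   # long enough to be chunked
    "domain_tag": "materials",
}
samples = preprocess_sample(paper)
print(len(samples))                      # 3: one abstract sample plus two paragraph chunks
print({s["section"] for s in samples})   # {'abstract', 'paragraph'}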
Tokenization/preprocessing/Segment_paragraphs.py
ADDED
@@ -0,0 +1,19 @@
import re

def segment_paragraphs(text: str) -> list:
    """Segment text into paragraphs using double newlines or similar heuristics."""
    if not isinstance(text, str):
        return []
    # Split on two or more newlines
    paras = re.split(r"\n{2,}", text)
    # Fallback: split overly long paragraphs into 1000-character chunks
    result = []
    for para in paras:
        para = para.strip()
        if len(para) > 1000:
            # Split further if too long
            chunks = [para[i:i+1000] for i in range(0, len(para), 1000)]
            result.extend(chunks)
        elif para:
            result.append(para)
    return [p for p in result if p]
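For instance, two short paragraphs followed by one very long paragraph:

# Example of segment_paragraphs (illustrative).
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

text = "Intro paragraph.\n\nMethods paragraph.\n\n" + "A" * 2500
parts = segment_paragraphs(text)
print(len(parts))                            # 5: two short paragraphs + three 1000-char chunks
print(max(len(p) for p in parts) <= 1000)    # True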
Tokenization/preprocessing/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .Clean_text import clean_text
from .Segment_paragraphs import segment_paragraphs
from .Preprocess_sample import preprocess_sample

__all__ = [
    "clean_text",
    "segment_paragraphs",
    "preprocess_sample",
]
Tokenization/preprocessing/__pycache__/Clean_text.cpython-310.pyc
ADDED
Binary file (544 Bytes).
Tokenization/preprocessing/__pycache__/Preprocess_sample.cpython-310.pyc
ADDED
Binary file (1.03 kB).
Tokenization/preprocessing/__pycache__/Segment_paragraphs.cpython-310.pyc
ADDED
Binary file (932 Bytes).
Tokenization/preprocessing/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (372 Bytes).
Tokenization/pretraining/Dataset_stats.py
ADDED
@@ -0,0 +1,40 @@
from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer


class DatasetAnalyzer:
    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown")
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        stats = []
        domains = Counter()
        sections = Counter()

        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1

        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections)
        }
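A small usage sketch for the analyzer above; instantiating it downloads the facebook/opt-350m tokenizer on first use, and the two in-memory samples are made-up inputs.

# Example of DatasetAnalyzer on a couple of in-memory samples (illustrative).
from Tokenization.pretraining.Dataset_stats import DatasetAnalyzer

analyzer = DatasetAnalyzer()
samples = [
    {"abstract": "Short abstract.", "domain_tag": "physics", "section_type": "abstract"},
    {"full_text": "Body text of a paragraph.", "domain_tag": "biology"},
]
stats = analyzer.get_dataset_stats(samples)
print(stats["total_samples"])        # 2
print(stats["domain_distribution"])  # {'physics': 1, 'biology': 1}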
Tokenization/pretraining/Instruction_formatter.py
ADDED
@@ -0,0 +1,18 @@
# Tokenization/pretraining/instruction_formatter.py

class InstructionFormatter:
    @staticmethod
    def format_sample(sample):
        """
        Formats a sample dict with 'instruction', 'input', and 'output' fields.
        This is a placeholder; customize as needed for your data.
        """
        # Ensure required fields exist
        instruction = sample.get("instruction", "")
        input_ = sample.get("input", "")
        output = sample.get("output", "")
        return {
            "instruction": instruction.strip(),
            "input": input_.strip(),
            "output": output.strip(),
        }
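For example, on a sample with stray whitespace:

# Example of InstructionFormatter.format_sample (illustrative).
from Tokenization.pretraining.Instruction_formatter import InstructionFormatter

sample = {"instruction": " Summarize the abstract. ", "input": "Abstract text ", "output": " A summary."}
print(InstructionFormatter.format_sample(sample))
# {'instruction': 'Summarize the abstract.', 'input': 'Abstract text', 'output': 'A summary.'}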
Tokenization/pretraining/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .Dataset_stats import DatasetAnalyzer

__all__ = ["DatasetAnalyzer"]
Tokenization/pretraining/__pycache__/Dataset_stats.cpython-310.pyc
ADDED
Binary file (1.97 kB).
Tokenization/pretraining/__pycache__/Instruction_formatter.cpython-310.pyc
ADDED
Binary file (806 Bytes).
Tokenization/pretraining/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (244 Bytes).
Tokenization/requirements.txt
ADDED
@@ -0,0 +1,11 @@
fastapi
uvicorn
gradio
requests
nltk
scikit-learn
beautifulsoup4
arxiv
huggingface_hub
python-dotenv
stripe
Tokenization/run_backend.py
ADDED
@@ -0,0 +1,12 @@
import uvicorn
import os

if __name__ == "__main__":
    os.makedirs("tmp", exist_ok=True)
    print("Starting FastAPI backend at http://localhost:8000 ...")
    uvicorn.run(
        "Tokenization.app:fastapi_app",
        host="0.0.0.0",
        port=8000,
        reload=True
    )