import json
from typing import Optional, Callable, Dict, Any

from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig

def generate_dataset(
    domain: Optional[str] = None,
    token_budget: int = 1000,
    plan: str = "free",
    custom_seed: Optional[str] = None,
    job_type: str = "tokenize",
    progress_callback: Optional[Callable[[int, str], None]] = None,
) -> Dict[str, Any]:
    """

    Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.



    Args:

        domain (str): Domain for dataset.

        token_budget (int): Token budget.

        plan (str): Plan type.

        custom_seed (str): Optional seed data.

        job_type (str): "tokenize" or "corpus".

        progress_callback (callable): Progress update callback.



    Returns:

        dict: {"jsonl_lines": [...], "stats": {...}}

    """
    if job_type == "corpus":
        # Use Main_2 pipeline
        if progress_callback:
            progress_callback(1, "Initializing scientific corpus builder...")
        config = CorpusConfig()
        builder = ScientificCorpusBuilder(config)
        if progress_callback:
            progress_callback(2, "Fetching arXiv papers...")
        arxiv_papers = builder.fetch_arxiv_papers()
        if progress_callback:
            progress_callback(3, "Fetching PubMed papers...")
        pubmed_papers = builder.fetch_pubmed_papers()
        if progress_callback:
            progress_callback(4, "Fetching FineWeb-Edu samples...")
        fineweb_papers = builder.fetch_fineweb_edu()
        if progress_callback:
            progress_callback(5, "Processing and tagging papers...")
        all_papers = []
        all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
        all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
        all_papers.extend(builder.process_papers(fineweb_papers, "education"))
        if progress_callback:
            progress_callback(6, "Ranking and deduplicating...")
        ranked_papers = builder.ranker.rank_samples(all_papers)
        if progress_callback:
            progress_callback(7, "Preparing dataset for download...")
        jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
        stats = builder.analyzer.get_dataset_stats(ranked_papers)
        if progress_callback:
            progress_callback(8, "Dataset ready for download.")
        return {"jsonl_lines": jsonl_lines, "stats": stats}

    # Standard "tokenize" job
    if progress_callback:
        progress_callback(1, "Cleaning input text...")
    cleaned_text = clean_text(custom_seed or "")
    if progress_callback:
        progress_callback(2, "Tokenizing input...")
    preprocessor = QLoRAPreprocessor()  # instantiated for future use; not applied in this demo path
    # Demonstration only: split cleaned_text into fixed-size character chunks of
    # length token_budget (replace with real tokenization via the preprocessor).
    tokens = [cleaned_text[i:i + token_budget] for i in range(0, len(cleaned_text), token_budget)]
    if progress_callback:
        progress_callback(3, "Formatting samples...")
    jsonl_lines = [json.dumps({"text": t}) for t in tokens]
    stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}
    if progress_callback:
        progress_callback(4, "Dataset ready for download.")
    return {"jsonl_lines": jsonl_lines, "stats": stats}