import json
from typing import Optional, Callable, Dict, Any
from Tokenization.Build_tokenizer import QLoRAPreprocessor
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.Main_2 import ScientificCorpusBuilder, CorpusConfig
def generate_dataset(
    domain: Optional[str] = None,
    token_budget: int = 1000,
    plan: str = "free",
    custom_seed: Optional[str] = None,
    job_type: str = "tokenize",
    progress_callback: Optional[Callable[[int, str], None]] = None,
) -> Dict[str, Any]:
    """
    Unified dataset generation pipeline for both 'tokenize' and 'corpus' jobs.

    Args:
        domain (str, optional): Target domain for the generated dataset.
        token_budget (int): Token budget used to chunk the cleaned seed text in the 'tokenize' job.
        plan (str): Subscription plan type (e.g. "free").
        custom_seed (str, optional): Seed text to clean and tokenize.
        job_type (str): Either "tokenize" or "corpus".
        progress_callback (callable, optional): Called with (step, message) to report progress.

    Returns:
        dict: {"jsonl_lines": [...], "stats": {...}}
    """
if job_type == "corpus":
# Use Main_2 pipeline
if progress_callback:
progress_callback(1, "Initializing scientific corpus builder...")
config = CorpusConfig()
builder = ScientificCorpusBuilder(config)
if progress_callback:
progress_callback(2, "Fetching arXiv papers...")
arxiv_papers = builder.fetch_arxiv_papers()
if progress_callback:
progress_callback(3, "Fetching PubMed papers...")
pubmed_papers = builder.fetch_pubmed_papers()
if progress_callback:
progress_callback(4, "Fetching FineWeb-Edu samples...")
fineweb_papers = builder.fetch_fineweb_edu()
if progress_callback:
progress_callback(5, "Processing and tagging papers...")
all_papers = []
all_papers.extend(builder.process_papers(arxiv_papers, "arxiv"))
all_papers.extend(builder.process_papers(pubmed_papers, "biology"))
all_papers.extend(builder.process_papers(fineweb_papers, "education"))
if progress_callback:
progress_callback(6, "Ranking and deduplicating...")
ranked_papers = builder.ranker.rank_samples(all_papers)
if progress_callback:
progress_callback(7, "Preparing dataset for download...")
jsonl_lines = [json.dumps(paper, ensure_ascii=False) for paper in ranked_papers]
stats = builder.analyzer.get_dataset_stats(ranked_papers)
if progress_callback:
progress_callback(8, "Dataset ready for download.")
return {"jsonl_lines": jsonl_lines, "stats": stats}
    # Standard "tokenize" job
    if progress_callback:
        progress_callback(1, "Cleaning input text...")
    cleaned_text = clean_text(custom_seed or "")

    if progress_callback:
        progress_callback(2, "Tokenizing input...")
    preprocessor = QLoRAPreprocessor()  # Instantiated for the tokenization step; not yet used below.
    # Placeholder logic: split the cleaned text into fixed-size character chunks
    # of length `token_budget` (replace with real tokenization via `preprocessor`).
    tokens = [cleaned_text[i:i + token_budget] for i in range(0, len(cleaned_text), token_budget)]

    if progress_callback:
        progress_callback(3, "Formatting samples...")
    jsonl_lines = [json.dumps({"text": t}) for t in tokens]
    stats = {"token_count": sum(len(t.split()) for t in tokens), "total_samples": len(tokens)}

    if progress_callback:
        progress_callback(4, "Dataset ready for download.")
    return {"jsonl_lines": jsonl_lines, "stats": stats}