import logging
import os
import sys
from datetime import datetime
from pathlib import Path

import pyarrow as pa
from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")


def debug_jsonl_head(jsonl_path, n=5):
    """Log the first n lines of the JSONL file for schema inspection."""
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i in range(n):
                line = f.readline()
                if not line:
                    break
                logging.info(f"Line {i + 1}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")


def infer_features_from_sample(jsonl_path, n=100):
    """Log the Python type(s) observed for each field in the first n JSONL records."""
    import json
    from collections import defaultdict

    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")


def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    """Convert the JSONL source file to an Arrow file and return the loaded dataset."""
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\nāŒ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")

        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)

        # Try loading a small sample first for debugging
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\nāŒ Failed to load sample from JSONL. See debug_upload.log for details.")
            # If automatic type inference fails, loading with explicit features may help.
            # Uncomment and adjust the following lines if you know the schema:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # try:
            #     sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            #     logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
            # except Exception as e2:
            #     logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
            raise

        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")

        # Write the backing pyarrow table to an Arrow IPC stream file
        # (the same format that Dataset.from_file reads back).
        with pa.OSFile(str(arrow_path), "wb") as sink:
            with pa.ipc.new_stream(sink, dataset.data.schema) as writer:
                writer.write_table(dataset.data.table)
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\nāŒ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise
def create_readme(dataset):
    content = f"""# Scientific Research Tokenized Dataset

- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage

```python
from datasets import load_dataset

ds = load_dataset("{REPO_ID}")
```
"""
    with open(README_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("README.md created.")


def upload_to_hf():
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")


def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match expected import in Main_2.py."""
    return upload_to_hf(*args, **kwargs)


def cleanup():
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")


def main():
    try:
        if not HF_TOKEN:
            print("āŒ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\nšŸŽ‰ SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\nāŒ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()


if __name__ == "__main__":
    main()