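"""Convert a local JSONL scientific corpus into a single Arrow file, generate a
README, and upload both to the Allanatrix/Scientific_Research_Tokenized dataset
repo on the Hugging Face Hub. Local artifacts are cleaned up afterwards."""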
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
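# Uploading requires a token with write access to the target dataset repo.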
# Logging setup
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(sys.stdout),
logging.FileHandler('debug_upload.log', mode='w')
]
)
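# Target dataset repo on the Hub; adjust the absolute JSONL path below to your local copy of the corpus.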
REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")
def debug_jsonl_head(jsonl_path, n=5):
logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
try:
with open(jsonl_path, "r", encoding="utf-8") as f:
for i in range(n):
line = f.readline()
if not line:
break
logging.info(f"Line {i+1}: {line.strip()}")
except Exception as e:
logging.error(f"Failed to read JSONL head: {e}")
def infer_features_from_sample(jsonl_path, n=100):
import json
from collections import defaultdict
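    # Record every Python type name observed for each field in the sampled lines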
types = defaultdict(set)
try:
with open(jsonl_path, "r", encoding="utf-8") as f:
for i, line in enumerate(f):
if i >= n:
break
obj = json.loads(line)
for k, v in obj.items():
types[k].add(type(v).__name__)
logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
except Exception as e:
logging.error(f"Failed to infer features: {e}")
def convert_jsonl_to_arrow(jsonl_path, arrow_path):
try:
logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
if not jsonl_path.exists():
logging.error(f"JSONL source file does not exist: {jsonl_path}")
print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
debug_jsonl_head(jsonl_path, n=5)
infer_features_from_sample(jsonl_path, n=100)
# Try loading a small sample first for debugging
try:
sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
except Exception as sample_e:
logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
print(f"\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
# Try to load with explicit features if possible
# Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
# Uncomment and adjust the following lines if you know the schema:
# features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
# try:
# sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
# logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
# except Exception as e2:
# logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
raise
# Now load the full dataset
dataset = Dataset.from_json(str(jsonl_path))
logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
        # datasets.Dataset has no to_file(); write the backing Arrow table to a single
        # .arrow file with pyarrow (an existing dependency of the datasets library)
        import pyarrow.feather as feather
        feather.write_feather(dataset.data.table, str(arrow_path), compression="uncompressed")
logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
return dataset
except Exception as e:
logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
print(f"\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
raise
def create_readme(dataset):
content = f"""# Scientific Research Tokenized Dataset
- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""
with open(README_PATH, "w", encoding="utf-8") as f:
f.write(content)
logging.info("README.md created.")
def upload_to_hf():
api = HfApi()
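    # Optional safeguard: create the dataset repo if it does not exist yet (no-op otherwise).
    api.create_repo(repo_id=REPO_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)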
logging.info("Uploading Arrow file to HuggingFace Hub ...")
api.upload_file(
path_or_fileobj=str(ARROW_PATH),
path_in_repo=ARROW_PATH.name,
repo_id=REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
commit_message="Upload Arrow dataset"
)
logging.info("Uploading README.md to HuggingFace Hub ...")
api.upload_file(
path_or_fileobj=str(README_PATH),
path_in_repo="README.md",
repo_id=REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
commit_message="Update README"
)
logging.info("Upload complete.")
def upload_to_huggingface(*args, **kwargs):
"""Alias for upload_to_hf to match expected import in Main_2.py"""
return upload_to_hf(*args, **kwargs)
def cleanup():
if ARROW_PATH.exists():
ARROW_PATH.unlink()
if README_PATH.exists():
README_PATH.unlink()
logging.info("Cleaned up local files.")
def main():
try:
if not HF_TOKEN:
print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
return
dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
create_readme(dataset)
upload_to_hf()
print(f"\nπŸŽ‰ SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
except Exception as e:
logging.error(f"Process failed: {e}")
print(f"\n❌ Upload failed. See debug_upload.log for details.")
sys.exit(1)
finally:
cleanup()
if __name__ == "__main__":
main()