# borrs / my_dataset.py
'''
Load the Wikipedia dataset from the Hugging Face Hub and attach a FAISS index to it.
See: https://huggingface.co/docs/datasets/loading#hugging-face-hub
'''
from datasets import load_dataset, load_from_disk
from huggingface_hub import hf_hub_download
import faiss  # not used directly, but ensures FAISS is installed for load_faiss_index
# load the Wikipedia dataset from the Hub
datasetx = load_dataset("JosueElias/pipeline_dataset2")
# download the prebuilt FAISS index file and get its local path
path2 = hf_hub_download(repo_id="JosueElias/pipeline_faiss", filename="faiss.index", repo_type="dataset")
# save the Wikipedia dataset locally
datasetx.save_to_disk("./directory")
# delete the variable to free memory before reloading from disk
del datasetx
# reload only the train split, memory-mapped from the Arrow files on disk
datasetx = load_from_disk("./directory/train")
# attach the FAISS index to the "embeddings" column of the dataset
datasetx.load_faiss_index('embeddings', path2)
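# With the index attached, the dataset can be queried for nearest-neighbour
# retrieval via Dataset.get_nearest_examples. A minimal sketch follows; it
# assumes the "embeddings" column was encoded with a sentence-transformers
# model (the model name and query below are placeholders, not confirmed by
# this repo), so swap in the encoder actually used to build the index.
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # placeholder encoder (assumption)
query_embedding = encoder.encode("example question about Wikipedia")  # must match the index dimension (assumed)
scores, retrieved = datasetx.get_nearest_examples("embeddings", query_embedding, k=5)
print(scores)            # similarity scores of the 5 nearest passages
print(retrieved.keys())  # columns of the retrieved examples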