Sigrid De los Santos
Remove remaining binary file for Hugging Face
9df4cc0
raw
history blame
1.07 kB
import datasets
import pandas as pd
from tqdm.notebook import tqdm
import json
import os
def load_dataset(dataset_name, **kwargs):
    """Load a named dataset and return it as a ``datasets.Dataset``.

    Only ``"Stocknet"`` is supported. The loader walks a two-level directory
    tree (``<root>/<stock>/<date file>``), parses every non-blank line of
    every date file as one JSON document, and stacks all records into a
    single table.

    Args:
        dataset_name: Name of the dataset to load; must be ``"Stocknet"``.
        **kwargs: Optional settings. ``root_path`` overrides the default
            location of the raw tweet directory; any other key is ignored.

    Returns:
        A ``datasets.Dataset`` built from the combined records, re-indexed
        from zero. It is empty when no records are found.

    Raises:
        NotImplementedError: If ``dataset_name`` is not ``"Stocknet"``.
    """
    if dataset_name != "Stocknet":
        raise NotImplementedError("Only support Stocknet dataset for now")

    root_path = kwargs.get("root_path", r"../../../stocknet-dataset/tweet/raw")

    # Collect one frame per file and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all prior rows each time.
    frames = []
    # Sorted listings make the resulting row order reproducible, since
    # os.listdir returns entries in arbitrary order.
    for stock in tqdm(sorted(os.listdir(root_path)), desc="Loading Stocknet dataset..."):
        stock_path = os.path.join(root_path, stock)
        if not os.path.isdir(stock_path):
            # Stray files next to the per-stock folders (e.g. OS metadata)
            # cannot be listed as directories; skip them.
            continue
        for date in sorted(os.listdir(stock_path)):
            # Open the file for *this* date. Previously date_files[0] was
            # opened on every iteration, duplicating the first file and
            # dropping all others.
            with open(os.path.join(stock_path, date), encoding="utf-8") as f:
                records = [json.loads(line) for line in f if line.strip()]
            if records:
                frames.append(pd.DataFrame(records))

    if frames:
        combined = pd.concat(frames, axis=0).reset_index(drop=True)
    else:
        # pd.concat rejects an empty list; fall back to an empty table.
        combined = pd.DataFrame()
    return datasets.Dataset.from_pandas(combined)