Spaces:
Sleeping
Sleeping
import os | |
import pandas as pd | |
import string | |
from backend.utils import logger | |
# Module-wide logger obtained from the project's logging helper.
# NOTE: this rebinds the imported ``logger`` module name to the logger object.
logger = logger.get_logger()

# Local CSV file that get_data_set() reads when present and writes otherwise.
DATASET_PATH = "src/backend/data/dataset.csv"

# Remote Parquet file read by get_data_set() when the local CSV is absent.
PARAQUET_DATASET_PATH = "hf://datasets/lavita/ChatDoctor-HealthCareMagic-100k/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet"
def get_data_set():
    """Load the dataset, clean it, and cache the cleaned copy as CSV.

    The local CSV at ``DATASET_PATH`` is used when it exists; otherwise the
    Parquet file at ``PARAQUET_DATASET_PATH`` is read and, once cleaned,
    written to ``DATASET_PATH`` so later calls can load it locally.

    Returns:
        The cleaned ``pandas.DataFrame``, or ``None`` if loading or cleaning
        raised an exception (the error is logged with its traceback).
        A failure to write the cache file is logged but does not discard the
        frame that was already loaded.
    """
    try:
        cache_exists = os.path.exists(DATASET_PATH)
        if cache_exists:
            logger.info(f"Loading existing dataset from: {DATASET_PATH}")
            # Empty CSV cells come back as NaN; restore them to "".
            df = pd.read_csv(DATASET_PATH).fillna("")
        else:
            logger.info(f"{DATASET_PATH} not found. Reading from Parquet file.")
            df = pd.read_parquet(PARAQUET_DATASET_PATH)

        # The same cleaning is applied to both Parquet and CSV data.
        df = _clean_dataset(df)

        if not cache_exists:
            if df.empty:
                # Caching an empty frame would make every later call load
                # nothing from the CSV, so skip the write.
                logger.info(f"No rows left after cleaning; not writing {DATASET_PATH}")
            else:
                try:
                    parent_dir = os.path.dirname(DATASET_PATH)
                    if parent_dir:
                        os.makedirs(parent_dir, exist_ok=True)
                    df.to_csv(DATASET_PATH, index=False)
                    logger.info(f"CSV file created and cleaned at: {DATASET_PATH}")
                except OSError as e:
                    # The cache is an optimisation: keep the loaded data.
                    logger.error(f"Could not write dataset cache to {DATASET_PATH}: {e}", exc_info=True)

        return df
    except Exception as e:
        logger.error(f"Error while loading dataset: {e}", exc_info=True)
        return None


def _clean_dataset(df):
    """Normalise the text columns, then drop duplicate and blank rows.

    The "input", "output" and "instruction" columns are lower-cased and have
    every character in ``string.punctuation`` removed (other symbols, such as
    emojis, are left as they are). Normalisation runs *before* de-duplication
    and blank filtering so that cleaning an already-cleaned frame (for
    example one reloaded from the cached CSV) yields the same rows.
    """
    text_columns = ["input", "output", "instruction"]
    strip_punctuation = str.maketrans("", "", string.punctuation)

    for column in text_columns:
        df[column] = df[column].fillna("").str.lower().str.translate(strip_punctuation)

    df = df.drop_duplicates(subset=text_columns)

    # Keep only rows where both the question and the answer still have text.
    has_input = df["input"].str.strip() != ""
    has_output = df["output"].str.strip() != ""
    return df[has_input & has_output]