Spaces:
Runtime error
Runtime error
File size: 1,799 Bytes
a602362 d1608e1 b1db484 a602362 d1608e1 a602362 6a8def7 8eb4c46 6a8def7 d1608e1 6a8def7 a602362 d1608e1 7565a64 d1608e1 7565a64 a602362 7565a64 8eb4c46 c40364d 874f99e c40364d 7565a64 8eb4c46 6a8def7 a602362 7565a64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
from datasets import load_dataset
from datasets import Dataset
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
#from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from huggingface_hub import Repository, upload_file
from datasets import Dataset
import pandas as pd
import os
# Directory that is scanned for the markdown files to be indexed.
DATA_PATH = "./data"
# Hugging Face access token, read from the `HF_Token` environment variable
# (None when the variable is not set).
HF_TOKEN = os.environ.get("HF_Token")
#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
##url = "https://www.webmd.com/"
#loader = WebBaseLoader(url)
#document = loader.load()
def create_vector_db():
    """Chunk the markdown files under DATA_PATH and push them to the Hub.

    Steps:
      1. Load every ``*.md`` file found in ``DATA_PATH`` with ``TextLoader``.
      2. Split the loaded documents into chunks of 256 characters with an
         overlap of 50 characters.
      3. Put the text of each chunk into a single-column DataFrame.
      4. Convert the DataFrame to a ``Dataset`` and push it to the
         ``Namitg02/Test`` repository using ``HF_TOKEN``.

    A few sample rows are printed along the way as debugging output; rows
    that do not exist (small corpora) are skipped instead of raising.

    Returns:
        None. The result is the dataset pushed to the Hub.
    """
    loader = DirectoryLoader(
        DATA_PATH, glob='*.md', loader_cls=TextLoader, show_progress=True
    )
    documents = loader.load()

    # Split the documents into chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    # The original passed the undefined name `document` here (NameError).
    texts = text_splitter.split_documents(documents)

    # Debug output: show a few sample chunks, skipping indices out of range.
    for index in (1, 3, 17):
        if index < len(texts):
            print(texts[index])

    # Read the chunk text straight from `page_content` instead of
    # stringifying the (field, value) tuples and slicing off a fixed-width
    # prefix/suffix, which depended on field order and stored repr-escaped
    # text. The column label 0 is kept from the original implementation so
    # the resulting dataset schema does not change.
    df = pd.DataFrame({0: [chunk.page_content for chunk in texts]})
    df[0] = df[0].astype('string')

    print(list(df.columns.values))
    pd.options.display.max_colwidth = 400
    print(df.dtypes)
    for index in (3, 17):
        if index < len(df):
            print(df.iloc[[index]])

    dataset = Dataset.from_pandas(df)
    print("check2b")
    if len(dataset) > 3:
        print(dataset[3])
    dataset.push_to_hub("Namitg02/Test", token=HF_TOKEN)
if __name__ == "__main__":
    # Script entry point: print a progress marker, then build and upload
    # the chunked dataset.
    print("check31")
    create_vector_db()