import os

import pandas as pd
from datasets import Dataset, load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
#from langchain_community.document_loaders import WebBaseLoader
from sentence_transformers import SentenceTransformer

DATA_PATH = './data'
# Note: the environment variable is read as 'HF_Token' (mixed case), so it
# must be exported under exactly that name.
HF_TOKEN = os.getenv('HF_Token')

# Alternative data sources, kept for reference:
#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
#url = "https://www.webmd.com/"
#loader = WebBaseLoader(url)
#document = loader.load()

def create_vector_db():
    """Load Markdown files, split them into chunks, and push the chunk
    text to the Hugging Face Hub as a dataset."""

    loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader, show_progress=True)
    documents = loader.load()

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Debug: spot-check a few chunks (assumes at least 18 chunks exist).
    print(texts[1])
    print(texts[3])
    print(texts[17])
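    # Each element of texts is a LangChain Document, roughly of the form
    #   Document(page_content='...chunk text...', metadata={'source': '...'})
    # The DataFrame cleanup below relies on exactly this structure.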

    # pd.DataFrame expands each Document into its (field, value) pairs, so
    # column 0 holds ('page_content', <text>) tuples and columns 1-2 hold
    # the metadata and type fields.
    df = pd.DataFrame(texts)

    column_headers = list(df.columns.values)
    print(column_headers)
    pd.options.display.max_colwidth = 400
    df = df.drop(columns=[1, 2])  # keep only the page_content column

    print(df.iloc[[3]])
    df[0] = df[0].astype('string', errors='raise').copy()
    print(df.dtypes)
    # Strip the "('page_content', '" prefix (18 chars) and the "')" suffix
    # so that only the bare chunk text remains.
    df[0] = df[0].str[18:]
    df[0] = df[0].str[:-2]

    print(df.iloc[[3]])
    print(df.iloc[[17]])

    dataset = Dataset.from_pandas(df)
    print("check2b")  # debug checkpoint
    print(dataset[3])
    dataset.push_to_hub("Namitg02/Test", token=HF_TOKEN)
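
    # Possible next step (a sketch, not part of the original script): the
    # function name and the SentenceTransformer import suggest the chunks
    # were meant to be embedded into a vector index. The model name below
    # is an assumption, not taken from this file.
    # st_model = SentenceTransformer("all-MiniLM-L6-v2")
    # embeddings = st_model.encode(df[0].tolist())
    # dataset = dataset.add_column("embeddings", embeddings.tolist())
    # dataset.add_faiss_index(column="embeddings")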


if __name__ == "__main__":
    print("check31")  # debug checkpoint
    create_vector_db()
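
# To run this ingestion script (the filename is not shown in this view;
# "ingest.py" below is a placeholder), export a Hub token with write
# access to Namitg02/Test and invoke the script directly:
#   export HF_Token=hf_...
#   python ingest.py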