File size: 1,307 Bytes
a602362
 
 
 
b1db484
 
a602362
 
 
 
 
6a8def7
 
 
 
a602362
 
 
 
7565a64
 
 
 
 
 
 
 
 
 
 
a602362
7565a64
 
 
 
6a8def7
 
 
 
 
a602362
7565a64
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from datasets import load_dataset
from datasets import Dataset
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import WebBaseLoader


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from huggingface_hub import Repository, upload_file
from datasets import Dataset
import os

# Hugging Face access token, read from the "HF_Token" environment variable
# (None when that variable is not set).
HF_TOKEN = os.environ.get('HF_Token')

# The blog page is fetched as soon as this module is executed or imported;
# `document` holds whatever the loader returns and is read by
# create_vector_db().
url = "https://oxyjon.com/blog/"
loader = WebBaseLoader(url)
document = loader.load()

def create_vector_db():
    """Chunk the loaded web page and publish the chunks as a Hub dataset.

    Splits the module-level ``document`` into overlapping text chunks
    (256 characters, 50 overlap), tabulates them in a pandas DataFrame with
    one row per chunk (``page_content`` and ``metadata`` columns), converts
    the frame to a ``datasets.Dataset`` and pushes it to the
    ``Namitg02/Test`` repository using ``HF_TOKEN``.

    A few sample chunks/rows are printed along the way as a sanity check;
    samples whose index lies beyond the number of chunks are skipped.

    Raises:
        ValueError: if splitting produced no chunks, so that an empty
            dataset is never pushed.
    """
    # Imported locally: the module's import section does not provide pandas.
    import pandas as pd

    # split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(document)
    if not texts:
        raise ValueError("no text chunks were produced from the loaded document")

    # Show a few sample chunks, skipping indices a short page does not reach.
    for sample_index in (1, 3, 17):
        if sample_index < len(texts):
            print(texts[sample_index])

    # Build the frame from explicit fields rather than from the Document
    # objects themselves, so the resulting columns have stable, named headers.
    df = pd.DataFrame(
        {
            "page_content": [chunk.page_content for chunk in texts],
            "metadata": [chunk.metadata for chunk in texts],
        }
    )

    column_headers = list(df.columns.values)
    print(column_headers)
    if len(df) > 3:
        # Widen the column display only for this print instead of changing
        # the process-wide pandas option.
        with pd.option_context("display.max_colwidth", 300):
            print(df.iloc[[3]])

    dataset = Dataset.from_pandas(df)
    print("check2b")
    if len(dataset) > 3:
        print(dataset[3])
    dataset.push_to_hub("Namitg02/Test", token=HF_TOKEN)


# Script entry point: the pipeline below runs only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    print("check31")  # progress marker printed before the pipeline starts
    create_vector_db()