Namitg02 committed on
Commit d1608e1 · verified · 1 Parent(s): 7c830c3

Update app.py

Files changed (1): app.py +12 -5
app.py CHANGED
@@ -2,11 +2,12 @@ from datasets import load_dataset
 from datasets import Dataset
 from langchain.docstore.document import Document as LangchainDocument
 from sentence_transformers import SentenceTransformer
-from langchain_community.document_loaders import WebBaseLoader
+#from langchain_community.document_loaders import WebBaseLoader


 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
+from langchain_community.document_loaders import TextLoader, DirectoryLoader
+
 from sentence_transformers import SentenceTransformer
 from huggingface_hub import Repository, upload_file
 from datasets import Dataset
@@ -14,14 +15,20 @@ import pandas as pd

 import os

+DATA_PATH = './data'
 HF_TOKEN = os.getenv('HF_Token')

-url = "https://www.webmd.com/"
-loader = WebBaseLoader(url)
-document = loader.load()
+#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
+
+##url = "https://www.webmd.com/"
+#loader = WebBaseLoader(url)
+#document = loader.load()

 def create_vector_db():

+    loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader, show_progress=True)
+    documents = loader.load()
+
     # split the document into chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
     texts = text_splitter.split_documents(document)
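
Note that as committed, create_vector_db() loads into a new variable named documents, while the unchanged splitter call still reads document, which this commit removed along with the WebBaseLoader path. A minimal sketch of the intended loading-and-splitting step, assuming only that rename is the fix, might look like:

# Sketch of the loading/splitting path this commit sets up, assuming the
# splitter is meant to consume the newly loaded `documents` (the committed
# code still passes the removed `document` variable).
import os

from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

DATA_PATH = './data'
HF_TOKEN = os.getenv('HF_Token')

def create_vector_db():
    # Load every Markdown file under ./data as plain text.
    loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader,
                             show_progress=True)
    documents = loader.load()

    # Split the loaded documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    return texts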