Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,11 +2,12 @@ from datasets import load_dataset
|
|
2 |
from datasets import Dataset
|
3 |
from langchain.docstore.document import Document as LangchainDocument
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
-
from langchain_community.document_loaders import WebBaseLoader
|
6 |
|
7 |
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
-
from langchain_community.document_loaders import
|
|
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
from huggingface_hub import Repository, upload_file
|
12 |
from datasets import Dataset
|
@@ -14,14 +15,20 @@ import pandas as pd
|
|
14 |
|
15 |
import os
|
16 |
|
|
|
17 |
HF_TOKEN = os.getenv('HF_Token')
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
22 |
|
23 |
def create_vector_db():
|
24 |
|
|
|
|
|
|
|
25 |
# split the document into chunks
|
26 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
|
27 |
texts = text_splitter.split_documents(document)
|
|
|
2 |
from datasets import Dataset
|
3 |
from langchain.docstore.document import Document as LangchainDocument
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
+
#from langchain_community.document_loaders import WebBaseLoader
|
6 |
|
7 |
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
+
from langchain_community.document_loaders import TextLoader, DirectoryLoader
|
10 |
+
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
from huggingface_hub import Repository, upload_file
|
13 |
from datasets import Dataset
|
|
|
15 |
|
16 |
import os
|
17 |
|
18 |
+
DATA_PATH='./data'
|
19 |
HF_TOKEN = os.getenv('HF_Token')
|
20 |
|
21 |
+
#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
|
22 |
+
|
23 |
+
##url = "https://www.webmd.com/"
|
24 |
+
#loader = WebBaseLoader(url)
|
25 |
+
#document = loader.load()
|
26 |
|
27 |
def create_vector_db():
|
28 |
|
29 |
+
loader = DirectoryLoader(DATA_PATH, glob='*.md', loader_cls=TextLoader, show_progress=True)
|
30 |
+
documents =loader.load()
|
31 |
+
|
32 |
# split the document into chunks
|
33 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
|
34 |
texts = text_splitter.split_documents(document)
|