```python
import glob
import os

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from langchain_community.document_loaders import PyMuPDFLoader

path_to_data = "./data/"


def process_markdown():
    # Split markdown files on heading levels so each chunk keeps its section context.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    files = glob.glob(path_to_data + "*.md")
    print(files)

    docs = []
    for file in files:
        try:
            with open(file) as f:
                docs.append(f.read())
        except Exception as e:
            print("Exception: ", e)

    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
    print(len(docs_processed))
    print(docs_processed[0])
```
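For reference, a minimal sketch of what `MarkdownHeaderTextSplitter.split_text` produces (the markdown string below is illustrative, not a file from `./data/`): each element is a `Document` whose `metadata` records the headers the chunk sits under.

```python
# Illustrative sketch, not part of the pipeline above.
sample_md = "# Guide\n\n## Setup\nInstall the package.\n\n## Usage\nCall the API."
chunks = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
).split_text(sample_md)
for chunk in chunks:
    # Each chunk is a Document; metadata holds the enclosing headers,
    # e.g. {'Header 1': 'Guide', 'Header 2': 'Setup'}.
    print(chunk.metadata, "->", chunk.page_content)
```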
```python
def process_pdf():
    # Load every PDF in the data directory; each loader call returns a list of page-level Documents.
    files = glob.glob(path_to_data + "*.pdf")
    docs = []
    for file in files:
        try:
            docs.append(PyMuPDFLoader(file).load())
        except Exception as e:
            print("Exception: ", e)

    # Chunk by token count using the embedding model's own tokenizer,
    # so every chunk fits within the encoder's context window.
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = [text_splitter.split_documents(doc) for doc in docs]
    # Flatten the per-file lists into a single list of chunks.
    docs_processed = [item for sublist in docs_processed for item in sublist]
    print("length of text chunks:", len(docs_processed))
    return docs_processed
```
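A short usage sketch, assuming `./data/` contains at least one PDF: `process_pdf()` returns token-bounded `Document` chunks whose metadata carries the source file and, because `add_start_index=True`, the chunk's start offset within its page.

```python
# Usage sketch, assuming ./data/ holds at least one PDF.
if __name__ == "__main__":
    chunks = process_pdf()
    if chunks:
        first = chunks[0]
        # Each chunk is a Document: text lives in page_content,
        # metadata includes the source path and start_index.
        print(first.metadata)
        print(first.page_content[:200])
```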