File size: 2,046 Bytes
c379a6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
from typing import Any
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from markdown import markdown
from pathlib import Path
from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter, TextSplitter
from src.utils import batched
def read_markdown_file(path: str | Path) -> [str, str]:
path = Path(path)
with open(path, 'r', encoding="utf8") as f_r:
text = f_r.read()
# text = markdown(text)
# text = ''.join(BeautifulSoup(text).findAll(text=True))
return text, str(path)
def split_markdown(md: str | list[str],
metadata=dict[str, Any] | list[dict[str, Any]],
chunk_size=512,
overlap=64,
splitter: TextSplitter = None) -> list[Document]:
if isinstance(md, str):
md = [md]
if isinstance(metadata, list):
raise ValueError("metadata should be a single dict")
metadata = [metadata]
if splitter is None:
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
("###", "Header 3"),
]
md = [MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False).split_text(i) for i in md]
metadata = [{**metadata[i], **text.metadata} for i, text_split in enumerate(md) for text in text_split]
md = [j.page_content for i in md for j in i]
splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
docs = splitter.create_documents(md, metadata)
return docs
def process_markdown_files(paths: list[str | Path], batch_size=1, chunk_size=512, overlap=64):
    """Read Markdown files batch by batch and yield their chunks.

    The paths are grouped with ``batched(paths, batch_size)`` (presumably
    successive groups of up to ``batch_size`` paths; confirm against
    ``src.utils``). Every file of a group is read, tagged with a
    ``{"path": ...}`` metadata dict and passed to ``split_markdown``.

    Args:
        paths: Markdown files to process.
        batch_size: Group size forwarded to ``batched``.
        chunk_size: Forwarded to ``split_markdown``.
        overlap: Forwarded to ``split_markdown``.

    Yields:
        One ``(contents, metadatas)`` pair per batch: the ``page_content`` of
        every produced document and, in the same order, its ``metadata``.
    """
    for batch in batched(paths, batch_size):
        texts = []
        sources = []
        for file_path in batch:
            content, source = read_markdown_file(file_path)
            texts.append(content)
            sources.append({"path": source})

        chunks = split_markdown(texts, sources, chunk_size=chunk_size, overlap=overlap)

        contents = [chunk.page_content for chunk in chunks]
        chunk_metadata = [chunk.metadata for chunk in chunks]
        yield contents, chunk_metadata
|