|
from typing import Any |
|
|
|
from bs4 import BeautifulSoup |
|
from langchain_core.documents import Document |
|
from markdown import markdown |
|
from pathlib import Path |
|
from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter, TextSplitter |
|
|
|
from src.utils import batched |
|
|
|
|
|
def read_markdown_file(path: str | Path) -> [str, str]: |
|
path = Path(path) |
|
with open(path, 'r', encoding="utf8") as f_r: |
|
text = f_r.read() |
|
|
|
|
|
|
|
return text, str(path) |
|
|
|
|
|
def split_markdown(md: str | list[str],
                   metadata: dict[str, Any] | list[dict[str, Any]] | None = None,
                   chunk_size=512,
                   overlap=64,
                   splitter: TextSplitter | None = None) -> list[Document]:
    """Split one or more Markdown texts into chunked documents.

    When no ``splitter`` is supplied, each text is first cut at level 1-3
    headers (the header lines are kept in the content), the header metadata
    of every section is merged over that text's own metadata, and the
    sections are then chunked with a ``MarkdownTextSplitter``.

    Args:
        md: A single Markdown string or a list of Markdown strings.
        metadata: Metadata for the texts. A single dict is applied to every
            text; a list must contain one dict per text. ``None`` means
            empty metadata. A list is rejected when ``md`` is a single string.
        chunk_size: Chunk size passed to the default ``MarkdownTextSplitter``.
            Ignored when ``splitter`` is given.
        overlap: Chunk overlap passed to the default ``MarkdownTextSplitter``.
            Ignored when ``splitter`` is given.
        splitter: Optional splitter whose ``create_documents`` is called
            directly on the texts; header-based pre-splitting is then skipped.

    Returns:
        The documents produced by the splitter's ``create_documents``.

    Raises:
        ValueError: If ``md`` is a string and ``metadata`` is a list, or if
            the number of metadata dicts differs from the number of texts.
    """
    if isinstance(md, str):
        if isinstance(metadata, list):
            raise ValueError("metadata should be a single dict")
        md = [md]
        metadata = [{} if metadata is None else metadata]
    elif metadata is None:
        metadata = [{} for _ in md]
    elif isinstance(metadata, dict):
        # A single dict describes every text; indexing it by position would
        # look up integer keys instead of selecting a per-text dict.
        metadata = [metadata] * len(md)

    if len(metadata) != len(md):
        raise ValueError(
            f"got {len(metadata)} metadata entries for {len(md)} markdown texts"
        )

    if splitter is None:
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        # Build the header splitter once instead of once per text.
        header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

        sections = []
        section_metadata = []
        for text, base_metadata in zip(md, metadata):
            for section in header_splitter.split_text(text):
                sections.append(section.page_content)
                # Header metadata wins over the caller's keys on conflict.
                section_metadata.append({**base_metadata, **section.metadata})

        md, metadata = sections, section_metadata
        splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    return splitter.create_documents(md, metadata)
|
|
|
|
|
def process_markdown_files(paths: list[str | Path], batch_size=1, chunk_size=512, overlap=64):
    """Read Markdown files in batches and yield their chunked contents.

    The paths are grouped with ``batched(paths, batch_size)`` (imported from
    ``src.utils``; presumably consecutive groups of up to ``batch_size``
    paths - confirm against that helper). Every file in a group is read,
    tagged with ``{"path": <path string>}`` metadata and handed to
    ``split_markdown``.

    Args:
        paths: Markdown files to process.
        batch_size: Group size forwarded to ``batched``.
        chunk_size: Forwarded to ``split_markdown``.
        overlap: Forwarded to ``split_markdown``.

    Yields:
        For each group, a ``(contents, metadatas)`` pair of parallel lists:
        the ``page_content`` and the ``metadata`` of every resulting document.
    """
    for batch in batched(paths, batch_size):
        texts = []
        path_metadata = []
        for file_path in batch:
            text, path_str = read_markdown_file(file_path)
            texts.append(text)
            path_metadata.append({"path": path_str})

        documents = split_markdown(texts, path_metadata, chunk_size=chunk_size, overlap=overlap)

        contents = [document.page_content for document in documents]
        metadatas = [document.metadata for document in documents]
        yield contents, metadatas
|
|