File size: 2,046 Bytes
c379a6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from typing import Any

from bs4 import BeautifulSoup
from langchain_core.documents import Document
from markdown import markdown
from pathlib import Path
from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter, TextSplitter

from src.utils import batched


def read_markdown_file(path: str | Path) -> [str, str]:
    path = Path(path)
    with open(path, 'r', encoding="utf8") as f_r:
        text = f_r.read()

    # text = markdown(text)
    # text = ''.join(BeautifulSoup(text).findAll(text=True))
    return text, str(path)


def split_markdown(md: str | list[str],
                   metadata: dict[str, Any] | list[dict[str, Any]] | None = None,
                   chunk_size=512,
                   overlap=64,
                   splitter: TextSplitter | None = None) -> list[Document]:
    """Split one or more markdown texts into chunked documents.

    When no ``splitter`` is supplied, each text is first split on level 1-3
    headers (the header lines are kept in the content), and the resulting
    sections are then chunked with a ``MarkdownTextSplitter``. The header
    metadata of each section is merged over the caller's metadata for the
    text it came from.

    Args:
        md: A single markdown text or a list of texts.
        metadata: Metadata for ``md``. For a single text this must be one
            dict (or ``None``). For a list of texts it is either one dict
            per text, a single dict applied to every text, or ``None`` for
            empty metadata.
        chunk_size: Chunk size passed to the default ``MarkdownTextSplitter``.
            Ignored when ``splitter`` is given.
        overlap: Chunk overlap passed to the default ``MarkdownTextSplitter``.
            Ignored when ``splitter`` is given.
        splitter: Optional splitter to use instead of the default pipeline;
            its ``create_documents(texts, metadatas)`` is called directly on
            the unsplit texts.

    Returns:
        The documents produced by the splitter's ``create_documents``.

    Raises:
        ValueError: If ``md`` is a single string but ``metadata`` is a list,
            or if fewer metadata dicts than texts are supplied.
    """
    if isinstance(md, str):
        if isinstance(metadata, list):
            raise ValueError("metadata should be a single dict")
        md = [md]
        metadata = [{} if metadata is None else metadata]
    else:
        md = list(md)
        if metadata is None:
            metadata = [{} for _ in md]
        elif isinstance(metadata, dict):
            # One dict for many texts: give every text its own copy.
            metadata = [dict(metadata) for _ in md]
        elif len(metadata) < len(md):
            raise ValueError(
                f"got {len(metadata)} metadata dicts for {len(md)} texts"
            )

    if splitter is None:
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
        sections = []
        section_metadata = []
        for base_metadata, text in zip(metadata, md):
            for section in header_splitter.split_text(text):
                sections.append(section.page_content)
                # Header metadata wins over caller metadata on key clashes.
                section_metadata.append({**base_metadata, **section.metadata})
        md, metadata = sections, section_metadata
        splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    return splitter.create_documents(md, metadata)


def process_markdown_files(paths: list[str | Path], batch_size=1, chunk_size=512, overlap=64):
    """Lazily read and chunk markdown files, one batch of paths at a time.

    For every group of ``batch_size`` paths, the files are read, tagged with
    ``{"path": <file path>}`` metadata and passed to ``split_markdown``.

    Yields:
        A pair ``(contents, metadatas)`` per batch: the ``page_content`` of
        each resulting chunk and, in the same order, each chunk's metadata.
    """
    for batch in batched(paths, batch_size):
        texts = []
        sources = []
        for file_path in batch:
            text, source = read_markdown_file(file_path)
            texts.append(text)
            sources.append({"path": source})

        chunks = split_markdown(texts, sources, chunk_size=chunk_size, overlap=overlap)
        contents = [chunk.page_content for chunk in chunks]
        chunk_metadata = [chunk.metadata for chunk in chunks]
        yield contents, chunk_metadata