File size: 606 Bytes
7f7b773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from typing import Iterator
from langchain.docstore.document import Document
from langchain.document_loaders.parsers import GrobidParser
from langchain.document_loaders.generic import GenericLoader

class GrobidLoader:
    """Walk a directory tree for PDF files and parse them with a given parser.

    The parser passed to the constructor is kept on ``self.parser`` and is
    handed to ``GenericLoader`` each time ``load_documents`` is called.
    """

    def __init__(self, grobid_parser: GrobidParser):
        """Remember the parser to use for every subsequent load.

        Args:
            grobid_parser: Parser instance forwarded to ``GenericLoader``.
        """
        self.parser = grobid_parser

    def load_documents(self, root: str) -> Iterator[Document]:
        """Build a filesystem loader rooted at *root* and start a lazy load.

        Args:
            root: Directory passed to ``GenericLoader.from_filesystem`` as the
                starting path.

        Returns:
            The result of the loader's ``lazy_load()`` call, annotated here
            as an iterator of ``Document`` objects.
        """
        # Both the glob and the suffix list restrict the scan to ".pdf" files;
        # progress reporting is switched on.
        filesystem_options = dict(
            glob="**/*.pdf",
            show_progress=True,
            suffixes=[".pdf"],
            parser=self.parser,
        )
        return GenericLoader.from_filesystem(root, **filesystem_options).lazy_load()