|
import glob |
|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
class DataExtractor:
    """Load PDF files from a directory and split their text into chunks.

    Attributes:
        pdf_directory: Directory scanned (non-recursively) for ``*.pdf`` files.
        pdf_text: List of per-file page-document lists built by
            ``extract_text`` (one inner list per PDF).
        docs: Unused placeholder; kept for backward compatibility.
        split_docs: Result of the most recent ``clean_and_split_text`` call,
            or ``None`` if it has not been called yet.
    """

    def __init__(self, pdf_directory):
        self.pdf_directory = pdf_directory
        self.pdf_text = []
        self.docs = []
        self.split_docs = None

    def extract_text(self):
        """Load every PDF in ``pdf_directory`` and collect its page documents.

        Returns:
            list: ``self.pdf_text`` — one list of page documents per PDF.

        Note:
            Results are appended to ``self.pdf_text``, so calling this
            method more than once accumulates duplicates.
        """
        print(f'Extracting text from pdf files in {self.pdf_directory}')
        pdf_files = glob.glob(f'{self.pdf_directory}/*.pdf')
        print(pdf_files)
        for pdf_file in pdf_files:
            print(pdf_file)
            # PyPDFLoader.load() yields one document per page of the PDF.
            loader = PyPDFLoader(pdf_file)
            self.pdf_text.append(loader.load())
        return self.pdf_text

    def clean_and_split_text(self, documents):
        """Split page documents into overlapping ~1000-character chunks.

        Args:
            documents: Sequence of per-file document lists, as returned by
                ``extract_text``.

        Returns:
            list: Flat list of split document chunks. The result is also
            cached on ``self.split_docs``.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = []
        print(f'Cleaning and splitting text from {len(documents)} documents')
        for doc in documents:
            split_docs.extend(splitter.split_documents(doc))
        print(f'Number of documents after splitting: {len(split_docs)}')
        # Fix: __init__ declares self.split_docs but the original never
        # assigned it; cache the result so callers can reuse it.
        self.split_docs = split_docs
        return split_docs
|
|
|
|
|
|