genai_service / rag /extractor.py
ahmed-eisa's picture
started RAG
43e97e3
raw
history blame contribute delete
389 Bytes
from pathlib import Path

from pypdf import PdfReader
def pdf_text_extractor(filepath: str) -> None:
    """Extract the text of a PDF and write it to a sibling ``.txt`` file.

    Every page that yields non-empty text contributes that text followed
    by a blank line. Pages for which ``extract_text()`` returns nothing
    are skipped. The result is written as UTF-8 to the same location as
    the input, with the final file extension replaced by ``.txt``.

    Args:
        filepath: Path of the PDF file to read. The file is opened with
            pypdf's ``strict=True`` setting.
    """
    pdf_reader = PdfReader(filepath, strict=True)

    # Build the output in one pass; join avoids repeated string re-allocation.
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    content = "".join(f"{page_text}\n\n" for page_text in page_texts if page_text)

    # Replace only the trailing extension. A plain str.replace("pdf", "txt")
    # would also rewrite "pdf" inside directory or file names, and would leave
    # an upper-case ".PDF" untouched, so the source PDF itself got overwritten.
    output_path = Path(filepath).with_suffix(".txt")
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(content)