# pdf2tmx / utils.py
# nelsonjq's picture
# Update utils.py
# 38a37b1 verified
# raw
# history blame
# 458 Bytes
import os
import pandas as pd
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
from PyPDF2 import PdfReader
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in document order, with a newline
        appended after each page. An empty string if the PDF has no pages.
    """
    with open(pdf_path, "rb") as file:
        # Named ``pdf`` rather than ``reader`` so the ``reader`` module
        # imported from lingtrain_aligner is not shadowed in this function.
        pdf = PdfReader(file)
        # Extract inside the ``with`` block so the file handle is still open
        # while pages are read. A missing extraction result (None) is treated
        # as empty text instead of raising TypeError on concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join once rather than growing a string with ``+=`` page by page.
    return "".join(text + "\n" for text in page_texts)
def save_to_excel(df, file_name: str):
    """Write ``df`` to an Excel file via its ``to_excel`` method.

    Args:
        df: Object exposing ``to_excel`` (presumably a pandas DataFrame;
            confirm against callers).
        file_name: Destination passed straight through to ``to_excel``.
    """
    # ``index=False`` is forwarded unchanged so the index is not written.
    write_options = {"index": False}
    df.to_excel(file_name, **write_options)