pdf2tmx / utils.py
nelsonjq's picture
get_paragraphs returns 4 values
9209794 verified
raw
history blame
2.61 kB
import os
import pandas as pd
from PyPDF2 import PdfReader
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages in document order, with a newline
        appended after each page. An empty string if the PDF has no pages.
    """
    with open(pdf_path, "rb") as file:
        # Named ``pdf`` (not ``reader``) so the module-level ``reader``
        # imported from lingtrain_aligner is not shadowed inside this function.
        pdf = PdfReader(file)
        # ``or ""`` guards against a page whose extraction yields None
        # (possible with some PyPDF2 versions -- TODO confirm); such a page
        # then contributes just its trailing newline instead of raising
        # TypeError on ``None + "\n"``.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
    """Align two texts line by line and return the pairs as a DataFrame.

    Both texts are split on newlines, passed through the lingtrain_aligner
    paragraph marker and sentence splitter, stored in a freshly created
    database file, aligned, and then run through up to three
    conflict-resolution passes before the aligned paragraphs are read back.

    Side effects:
        * ``bilingualdata.db`` in the current working directory is deleted if
          it exists and then recreated.
        * On every resolution pass an alignment picture is written to
          ``img_test1.png`` and the visualizer is called with
          ``plt_show=True``.

    Args:
        txt1: Full text on the "from" side.
        txt2: Full text on the "to" side.
        lang1: Language identifier handed to the splitter/aligner for ``txt1``.
        lang2: Language identifier handed to the splitter/aligner for ``txt2``.

    Returns:
        A DataFrame with columns ``From`` and ``To``, one row per paired line.
    """
    db_path = "bilingualdata.db"
    available_models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
    model_name = available_models[0]

    # Prepare both sides: raw lines -> paragraph-marked lines -> sentences.
    lines_from = txt1.split("\n")
    lines_to = txt2.split("\n")
    marked_from = preprocessor.mark_paragraphs(lines_from)
    marked_to = preprocessor.mark_paragraphs(lines_to)
    sentences_from = splitter.split_by_sentences_wrapper(marked_from, lang1)
    sentences_to = splitter.split_by_sentences_wrapper(marked_to, lang2)

    # Always start from an empty database file.
    if os.path.isfile(db_path):
        os.remove(db_path)
    aligner.fill_db(db_path, lang1, lang2, sentences_from, sentences_to)

    aligner.align_db(
        db_path,
        model_name,
        batch_size=100,
        window=40,
        batch_ids=[0, 1],
        save_pic=False,
        embed_batch_size=10,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    # Report statistics for the initial conflict set; the returned values are
    # not used beyond these two calls.
    initial_conflicts, initial_rest = resolver.get_all_conflicts(
        db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1
    )
    resolver.get_statistics(initial_conflicts)
    resolver.get_statistics(initial_rest)

    # Each pass widens the search (longer minimum chain, larger maximum
    # conflict length); stop early once no conflicts are left over.
    max_passes = 3
    all_batches = -1
    for pass_no in range(max_passes):
        conflicts, leftover = resolver.get_all_conflicts(
            db_path,
            min_chain_length=2 + pass_no,
            max_conflicts_len=6 * (pass_no + 1),
            batch_id=all_batches,
        )
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        vis_helper.visualize_alignment_by_db(
            db_path,
            output_path="img_test1.png",
            lang_name_from=lang1,
            lang_name_to=lang2,
            batch_size=400,
            size=(600, 600),
            plt_show=True,
        )
        if len(leftover) == 0:
            break

    # NOTE(review): this unpacking assumes the first two of the four returned
    # values are the "from" and "to" paragraph collections -- confirm against
    # the installed lingtrain_aligner version.
    paragraphs_from, paragraphs_to, _meta, _ = reader.get_paragraphs(db_path)

    # Pair paragraphs, then pair the lines inside each paragraph pair.
    rows = [
        {"From": line_from, "To": line_to}
        for par_from, par_to in zip(paragraphs_from, paragraphs_to)
        for line_from, line_to in zip(par_from, par_to)
    ]
    return pd.DataFrame(rows)
def save_to_excel(df, file_name: str):
    """Write ``df`` to the Excel file ``file_name`` without the index column.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this module).
        file_name: Destination path of the Excel file.
    """
    df.to_excel(excel_writer=file_name, index=False)