File size: 3,134 Bytes
38a37b1 9d0b477 38a37b1 b120a5a a9d9c73 9d0b477 88433d1 a9d9c73 b24478e b120a5a a9d9c73 88433d1 b120a5a 88433d1 8bdbe52 88433d1 e20569a 88433d1 e20569a 88433d1 e20569a 88433d1 7518e72 00d6aea 2dc55ed 00d6aea 88433d1 7dc89f6 2dc55ed 7dc89f6 88433d1 b120a5a a9d9c73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import pandas as pd
import logging
from PyPDF2 import PdfReader
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
# Configure logging: set the root logger's level to INFO.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages in document order, with a newline
        appended after each page's text (including the last one).
    """
    with open(pdf_path, "rb") as pdf_file:
        # Named ``pdf`` rather than ``reader`` so the local does not shadow
        # the ``reader`` module imported from lingtrain_aligner.
        pdf = PdfReader(pdf_file)
        # Build the result with one join instead of growing a string page by
        # page; this must happen while the file is still open.
        return "".join(page.extract_text() + "\n" for page in pdf.pages)
def _resolve_conflicts(db_path, model_name, steps=3):
    """Run up to *steps* conflict-resolution passes over the alignment database.

    Pass ``i`` (starting at 0) asks for conflicts with
    ``min_chain_length=2 + i`` and ``max_conflicts_len=6 * (i + 1)`` and hands
    them to ``resolver.resolve_all_conflicts``.  The loop stops early as soon
    as the second value returned by ``resolver.get_all_conflicts`` is empty.
    """
    # Report statistics for the conflicts found before any resolution pass runs.
    conflicts_to_solve, rest = resolver.get_all_conflicts(
        db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1
    )
    resolver.get_statistics(conflicts_to_solve)
    resolver.get_statistics(rest)

    for step in range(steps):
        conflicts, rest = resolver.get_all_conflicts(
            db_path,
            min_chain_length=2 + step,
            max_conflicts_len=6 * (step + 1),
            batch_id=-1,
        )
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        if len(rest) == 0:
            break


def _paragraphs_to_frame(paragraphs_from, paragraphs_to):
    """Pair the lines of corresponding paragraphs into a ``From``/``To`` table.

    Paragraphs and their lines are paired with ``zip``, so pairing stops at
    the shorter side; unmatched trailing items are not included.
    """
    rows = []
    for from_paragraph, to_paragraph in zip(paragraphs_from, paragraphs_to):
        # An int entry cannot be iterated for lines: log the pair and skip it.
        if isinstance(from_paragraph, int) or isinstance(to_paragraph, int):
            logger.warning(
                "from_paragraph: %s, to_paragraph: %s", from_paragraph, to_paragraph
            )
            continue
        rows.extend(
            {"From": from_line, "To": to_line}
            for from_line, to_line in zip(from_paragraph, to_paragraph)
        )
    # Explicit columns keep the From/To schema even when no rows were produced.
    return pd.DataFrame(rows, columns=["From", "To"])


def align_text(
    txt1: str,
    txt2: str,
    lang1: str,
    lang2: str,
    *,
    db_path: str = "bilingualdata.db",
    model_name: str = "sentence_transformer_multilingual",
    batch_ids=(0, 1),
) -> pd.DataFrame:
    """Align two texts with lingtrain_aligner and return the paired lines.

    The texts are split on newlines, paragraph-marked, split into sentences,
    written to a database file, aligned, passed through conflict resolution,
    and finally read back as paragraphs.

    Args:
        txt1: Source text; split on ``"\\n"`` before preprocessing.
        txt2: Target text; split on ``"\\n"`` before preprocessing.
        lang1: Language identifier handed to the splitter and the database
            for ``txt1``.
        lang2: Language identifier handed to the splitter and the database
            for ``txt2``.
        db_path: Database file used for the alignment.  An existing file at
            this path is deleted first; the new file is left on disk after
            the call.
        model_name: Model name passed to the aligner and the resolver.
            This module also lists
            ``"sentence_transformer_multilingual_labse"`` as an alternative.
            NOTE(review): confirm against the installed lingtrain_aligner.
        batch_ids: Batch ids passed to ``aligner.align_db`` (as a list).
            NOTE(review): with ``batch_size=100`` the default presumably
            covers only the first two batches; confirm for longer texts.

    Returns:
        A DataFrame with columns ``From`` and ``To``, one row per paired line.
    """
    lines_from = txt1.split("\n")
    lines_to = txt2.split("\n")
    marked_from = preprocessor.mark_paragraphs(lines_from)
    marked_to = preprocessor.mark_paragraphs(lines_to)
    splitted_from = splitter.split_by_sentences_wrapper(marked_from, lang1)
    splitted_to = splitter.split_by_sentences_wrapper(marked_to, lang2)

    # Remove any database left by a previous run before filling a fresh one.
    if os.path.isfile(db_path):
        os.unlink(db_path)
    aligner.fill_db(db_path, lang1, lang2, splitted_from, splitted_to)

    aligner.align_db(
        db_path,
        model_name,
        batch_size=100,
        window=40,
        batch_ids=list(batch_ids),
        save_pic=False,
        embed_batch_size=10,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    _resolve_conflicts(db_path, model_name)

    # Only the paragraphs mapping is needed; the other return values are unused.
    paragraphs_dict, _par_ids, _meta_info, _sent_counter = reader.get_paragraphs(db_path)
    logger.info("paragraphs_dict keys: %s", paragraphs_dict.keys())
    return _paragraphs_to_frame(paragraphs_dict["from"], paragraphs_dict["to"])
def save_to_excel(df: pd.DataFrame, file_name: str) -> None:
    """Write *df* to an Excel file, omitting the index column.

    Args:
        df: Table to export.
        file_name: Destination passed to ``DataFrame.to_excel``.
    """
    df.to_excel(file_name, index=False)