nelsonjq committed on
Commit
88433d1
·
verified ·
1 Parent(s): fcafcbd

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +58 -4
utils.py CHANGED
@@ -3,14 +3,68 @@ import pandas as pd
3
  from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
4
  from PyPDF2 import PdfReader
5
 
6
def pdf_to_text(pdf_path: str) -> str:
    """Return the extracted text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page's text.
    """
    with open(pdf_path, "rb") as pdf_file:
        document = PdfReader(pdf_file)
        # Extraction happens while the underlying file is still open.
        return "".join(page.extract_text() + "\n" for page in document.pages)
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def save_to_excel(df, file_name: str):
16
  df.to_excel(file_name, index=False)
 
3
  from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
4
  from PyPDF2 import PdfReader
5
 
6
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Uses ``PdfReader`` (the class this module imports from PyPDF2) and its
    ``pages`` sequence. The previous body called ``PdfFileReader``, which is
    not imported here, together with the legacy ``numPages``/``getPage``
    accessors.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page's text. An empty string if the PDF has no pages.
    """
    page_texts = []
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Pages must be read while the file handle is still open.
        for page in reader.pages:
            page_texts.append(page.extract_text() + "\n")
    return "".join(page_texts)
14
 
15
def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
    """Align two texts with lingtrain_aligner and return the paired lines.

    Both texts are split on newlines, passed through paragraph marking and
    sentence splitting, loaded into a freshly created ``docsdata.db``,
    aligned, and then run through up to three conflict-resolution rounds.
    Each round also calls the alignment visualizer with output path
    ``img_test1.png`` and ``plt_show=True``.

    Args:
        txt1: Newline-separated text in the first language.
        txt2: Newline-separated text in the second language.
        lang1: Language identifier passed to the splitter/aligner for txt1.
        lang2: Language identifier passed to the splitter/aligner for txt2.

    Returns:
        DataFrame with one row per paired line, in columns "From" and "To".
    """
    db_path = "docsdata.db"
    # The second entry is an alternative model name; only the first is used.
    available_models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
    model_name = available_models[0]

    lines_from = txt1.split("\n")
    lines_to = txt2.split("\n")

    marked_from = preprocessor.mark_paragraphs(lines_from)
    marked_to = preprocessor.mark_paragraphs(lines_to)
    sentences_from = splitter.split_by_sentences_wrapper(marked_from, lang1, leave_marks=True)
    sentences_to = splitter.split_by_sentences_wrapper(marked_to, lang2, leave_marks=True)

    # Start from an empty database: drop whatever a previous run left behind.
    if os.path.isfile(db_path):
        os.unlink(db_path)

    aligner.fill_db(db_path, lang1, lang2, sentences_from, sentences_to)

    # NOTE(review): only batches 0 and 1 are requested with batch_size=100 —
    # confirm whether longer texts need additional batch ids.
    aligner.align_db(
        db_path,
        model_name,
        batch_size=100,
        window=40,
        batch_ids=[0, 1],
        save_pic=False,
        embed_batch_size=10,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    # Report conflict statistics once before any resolution is attempted.
    initial_conflicts, initial_rest = resolver.get_all_conflicts(
        db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1
    )
    resolver.get_statistics(initial_conflicts)
    resolver.get_statistics(initial_rest)

    # Each round widens the search: longer minimum chains, larger conflicts.
    for round_no in range(3):
        conflicts, rest = resolver.get_all_conflicts(
            db_path,
            min_chain_length=2 + round_no,
            max_conflicts_len=6 * (round_no + 1),
            batch_id=-1,
        )
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        vis_helper.visualize_alignment_by_db(
            db_path,
            output_path="img_test1.png",
            lang_name_from=lang1,
            lang_name_to=lang2,
            batch_size=400,
            size=(600, 600),
            plt_show=True,
        )

        if len(rest) == 0:
            break

    paragraphs_from, paragraphs_to, _meta = reader.get_paragraphs(db_path)

    # Pair paragraphs positionally, then pair the lines inside each paragraph.
    rows = [
        {"From": line_from, "To": line_to}
        for para_from, para_to in zip(paragraphs_from, paragraphs_to)
        for line_from, line_to in zip(para_from, para_to)
    ]
    return pd.DataFrame(rows)
68
+
69
  def save_to_excel(df, file_name: str):
70
  df.to_excel(file_name, index=False)