nelsonjq committed on
Commit
b120a5a
·
verified ·
1 Parent(s): b24478e

deprecated

Browse files
Files changed (1) hide show
  1. utils.py +5 -5
utils.py CHANGED
@@ -1,19 +1,19 @@
1
  import os
2
  import pandas as pd
3
- from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
4
  from PyPDF2 import PdfReader
 
5
 
6
  def pdf_to_text(pdf_path):
7
  text = ""
8
  with open(pdf_path, "rb") as file:
9
  reader = PdfReader(file)
10
- for page_num in range(reader.numPages):
11
- page = reader.getPage(page_num)
12
  text += page.extract_text() + "\n"
13
  return text
14
 
15
  def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
16
- db_path = "docsdata.db"
17
  models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
18
  model_name = models[0]
19
 
@@ -65,6 +65,6 @@ def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
65
 
66
  df = pd.DataFrame(data)
67
  return df
68
-
69
  def save_to_excel(df, file_name: str):
70
  df.to_excel(file_name, index=False)
 
1
  import os
2
  import pandas as pd
 
3
  from PyPDF2 import PdfReader
4
+ from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
5
 
6
  def pdf_to_text(pdf_path):
7
  text = ""
8
  with open(pdf_path, "rb") as file:
9
  reader = PdfReader(file)
10
+ for page_num in range(len(reader.pages)):
11
+ page = reader.pages[page_num]
12
  text += page.extract_text() + "\n"
13
  return text
14
 
15
  def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
16
+ db_path = "bilingualdata.db"
17
  models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
18
  model_name = models[0]
19
 
 
65
 
66
  df = pd.DataFrame(data)
67
  return df
68
+
69
  def save_to_excel(df, file_name: str):
70
  df.to_excel(file_name, index=False)