nelsonjq committed on
Commit
bbdbe4c
·
verified ·
1 Parent(s): dc9f272

Upload 3 files

Browse files
Files changed (3) hide show
  1. requirements.txt +5 -0
  2. src/app.py +44 -0
  3. src/utils.py +12 -0
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Gradio
2
+ pandas
3
+ PyMuPDF
4
+ openpyxl
5
+ lingtrain-aligner
src/app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio import Interface, File, Dropdown, Button, HTML
2
+ import pandas as pd
3
+ import os
4
+ from utils import pdf_to_text, align_text
5
+
6
def process_files(source_file, target_file, lang1, lang2):
    """Extract text from two uploaded PDFs, align it, and export the result.

    Args:
        source_file: Upload for the source document. Either a filesystem
            path or an object exposing that path as ``.name``; ``None``
            when nothing was uploaded.
        target_file: Upload for the target document, same conventions as
            ``source_file``.
        lang1: Language code forwarded to ``align_text`` for the source text.
        lang2: Language code forwarded to ``align_text`` for the target text.

    Returns:
        A ``(html, excel_path)`` pair: the aligned table rendered as HTML
        and the path of an ``.xlsx`` file holding the same table. When
        either upload is missing, ``(message, None)`` is returned instead.
    """
    import os
    import tempfile

    if source_file is None or target_file is None:
        # The callback feeds two output components, so the early exit has
        # to produce two values as well; a lone string is a count mismatch.
        return "Please upload both PDF files.", None

    # Uploads may arrive as plain path strings or as file-like wrappers
    # carrying the path in ``.name``; accept both.
    source_path = getattr(source_file, "name", source_file)
    target_path = getattr(target_file, "name", target_file)

    # Convert PDFs to text
    text_content1 = pdf_to_text(source_path)
    text_content2 = pdf_to_text(target_path)

    # Align the texts
    # NOTE(review): align_text is imported from utils but is not defined in
    # the part of utils.py visible alongside this code — confirm it exists.
    aligned_df = align_text(text_content1, text_content2, lang1, lang2)

    # Convert DataFrame to HTML
    aligned_html = aligned_df.to_html(index=False)

    # Save the workbook in a fresh directory per call so simultaneous
    # requests cannot overwrite each other's file, while the download keeps
    # the same "aligned_data.xlsx" name.
    excel_path = os.path.join(tempfile.mkdtemp(), "aligned_data.xlsx")
    aligned_df.to_excel(excel_path, index=False)

    return aligned_html, excel_path
25
+
26
+ # Define the Gradio interface
27
# Language codes offered by both dropdowns, defined once so the two lists
# cannot drift apart.
# NOTE(review): "ch" is not the ISO 639-1 code for Chinese ("zh"); confirm
# which code align_text expects before changing it.
LANGUAGE_CHOICES = ["en", "es", "fr", "ch", "ar", "ru", "pt", "sw"]

interface = Interface(
    fn=process_files,
    inputs=[
        File(label="Upload Source PDF"),
        File(label="Upload Target PDF"),
        Dropdown(choices=LANGUAGE_CHOICES, label="Select Language 1"),
        Dropdown(choices=LANGUAGE_CHOICES, label="Select Language 2"),
    ],
    outputs=[
        HTML(label="Aligned DataFrame"),
        # process_files returns a file path as its second value; a File
        # output presents that path as a download, which a Button cannot.
        File(label="Download Aligned DataFrame"),
    ],
    title="PDF Text Aligner",
    description="Upload two PDF files and select languages to align the text.",
)

if __name__ == "__main__":
    interface.launch()
src/utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def pdf_to_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, each page followed by "\\n".

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text of all pages concatenated in page order, with a
        newline appended after each page. A page that yields no text
        contributes only its newline.
    """
    # Imported inside the function, as in the original, so the module can be
    # imported without PyPDF2 present.
    # NOTE(review): the requirements.txt accompanying this code lists
    # PyMuPDF, not PyPDF2 — confirm PyPDF2 is actually installed.
    from PyPDF2 import PdfReader

    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Pages are read from the open stream, so extraction must finish
        # before the file is closed. ``or ""`` guards against a page whose
        # extraction returns nothing, and join avoids repeated string
        # re-allocation on long documents.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )
10
+
11
def save_to_excel(df, file_name: str):
    """Write ``df`` to ``file_name`` via its ``to_excel`` method.

    The call passes ``index=False``; nothing is returned.
    """
    export_options = {"index": False}
    df.to_excel(file_name, **export_options)