nelsonjq committed
Commit ffece79 · verified · 1 Parent(s): 1ac36aa

remove pdf doc libraries

Files changed (1)
  1. app.py +2 -296
app.py CHANGED
@@ -1,4 +1,4 @@
-
+import gradio as gr
 import re
 import requests
 import sys
@@ -11,11 +11,6 @@ from urllib3.exceptions import InsecureRequestWarning
 from urllib3 import disable_warnings
 import email.utils
 import pandas as pd
-import pypandoc
-import fitz
-from docx import Document
-from spire.doc import *
-from spire.doc.common import *

 disable_warnings(InsecureRequestWarning)

@@ -813,59 +808,12 @@ def localize_URL(mi_URL: str, lengua: str="en") -> str:
 #print(localize_URL(url5, "fr"))


-
-def convert_docx_to_html(docx_file_path):
-    output = pypandoc.convert_file(docx_file_path, 'html')
-    return output
-
 def extract_href_attributes(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     # creates a list
     href_values = [a['href'] for a in soup.find_all('a', href=True)]
     return href_values

-def generate_table_URLs_from_Docx(docx_path, lang_code):
-    # Open the document
-    document = Document(docx_path)
-
-    # Extract hyperlinks
-    input_urls = []
-    for paragraph in document.paragraphs:
-        for run in paragraph.runs:
-            hyperlink = run.hyperlink
-            if hyperlink is not None:
-                input_urls.append(hyperlink.address)
-
-    #input_urls
-    data = []
-
-
-    # Initialize lists to store data for the DataFrame
-    index_list = []
-    original_url_list = []
-    localized_url_list = []
-
-    # Apply localizeURL to each URL in the list
-    for index, url in enumerate(input_urls):
-        localized_url = localize_URL(url, lang_code) # Replace 'en' with the desired language code
-        index_list.append(index)
-        original_url_list.append(url)
-        localized_url_list.append(localized_url)
-
-    # Create a DataFrame
-    df_docx = pd.DataFrame({
-        'index': index_list,
-        'url': original_url_list,
-        'localized_url': localized_url_list
-    })
-
-    # Export the DataFrame to a CSV file
-    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df_docx
-
-
 #language_code = "es"
 #UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"

@@ -975,248 +923,6 @@ def localize_UNEP_html(language_code, soup):
     return str(soup)

 #Code created by Nelson JAIMES-QUINTERO
-# -------------------- ## -------------------- ## -------------------- #
-# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #
-
-# DOC-HTML
-def docx2_bitable(docx_path: str, output_lang: str):
-    """Takes an input doc/docx file and creates a CSV file with 3 columns:
-    List number, URL found in the file, Localized URL in the input language.
-    """
-
-    if not docx_path.lower().endswith(".doc") and not docx_path.endswith(".docx"):
-        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
-        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
-    input_docx_path = docx_path # document
-
-    # Name the output_file based on the docx's name
-    last_slash_index = input_docx_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = input_docx_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    #output_csv_path = f"output_{output_lang}_{docx_path[0:len(docx_path)//2]}.html"
-
-    # Convert DOCX to HTML
-    html_content = convert_docx_to_html(input_docx_path)
-    print("Doc converted into html successfully.")
-    # Write HTML content to a file
-    #with open(output_html_path, "w", encoding="utf-8") as html_file:
-        #html_file.write(html_content)
-
-    #print("Conversion complete. HTML file saved at:", output_html_path)
-
-    # Extract href attributes
-    href_attributes = extract_href_attributes(html_content)
-    #print("Extracted href attributes:", href_attributes)
-
-    output_urls = [localize_URL(url, output_lang) for url in href_attributes]
-
-    # Create a pandas DataFrame
-    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})
-
-    # Export the DataFrame to a CSV file
-    if not df.empty:
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        df.to_csv(output_csv_path, index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df
-
-
-# From PDF file -------------------- ##
-# NEEDS FITZ
-def pdf2_bitable(pdf_path: str, output_lang: str):
-    if not pdf_path.lower().endswith("pdf"):
-        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
-        return None
-    # Create a document object
-    doc = fitz.open(pdf_path) # or fitz.Document(filename)
-
-    # Create a pandas DataFrame
-    data = []
-
-    # get the links on all pages
-    for i in range(doc.page_count):
-        page = doc.load_page(i)
-        links = page.get_links()
-        if links:
-            for item in links:
-                input_url = item.get('uri')
-                if input_url != None:
-                    localized_url = localize_URL(input_url, output_lang)
-                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})
-
-
-
-    # Create a pandas DataFrame
-    df_pdf = pd.DataFrame(data)
-
-    # Name the file based on the pdf's name
-    last_slash_index = pdf_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = pdf_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    if not df_pdf.empty:
-        # Export the DataFrame to a CSV file
-        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        return df_pdf
-    else:
-        print("ERROR: File not found or is not .pdf. Verify the input_path field above.")
-        return None
-
-
-# DOCX REPLACER -------------------- ##
-
-#Replace links in Docx with SpireDoc
-
-def docx2docx_replacer(my_chemin_docx: str, my_langue):
-    # Create a Document object
-    doc = Document()
-
-    # Load a Word file
-    doc.LoadFromFile(my_chemin_docx)
-
-    # Find all hyperlinks in the document
-    hyperlinks = []
-    for i in range(doc.Sections.Count):
-        section = doc.Sections.get_Item(i)
-        for j in range(section.Body.ChildObjects.Count):
-            sec = section.Body.ChildObjects.get_Item(j)
-            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
-                for k in range((sec if isinstance(sec, Paragraph) else None).ChildObjects.Count):
-                    para = (sec if isinstance(sec, Paragraph)
-                            else None).ChildObjects.get_Item(k)
-                    if para.DocumentObjectType == DocumentObjectType.Field:
-                        field = para if isinstance(para, Field) else None
-                        if field.Type == FieldType.FieldHyperlink:
-                            hyperlinks.append(field)
-
-    # Iterate through hyperlinks and update them
-    for hyperlink in hyperlinks:
-        # Get the current display text and URL
-        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
-        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
-        if match:
-            current_url = match.group(1)
-
-        current_display_text = hyperlink.FieldText
-        localized_url = localize_URL(current_url, my_langue)
-        if localized_url:
-
-            # Update the display text and URL of the hyperlink
-            #hyperlink.FieldText = "NEW DISPLAY TEXT" # Replace with your new display text
-            hyperlink.Code = f'HYPERLINK "{localized_url}"'
-
-    if len(hyperlinks)>0:
-        # Naming output file
-        last_slash_index = my_chemin_docx.rfind('/')
-        if last_slash_index != -1:
-            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
-            extracted_string = extracted_string.replace("#", "")
-            #print(extracted_string)
-        else:
-            #print("No '/' found in the URL.")
-            extracted_string = my_chemin_docx
-            extracted_string = extracted_string.replace("#", "")
-
-        output_directory = '/content'
-        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"
-
-        # Create the output directory if it doesn't exist
-        os.makedirs(output_directory, exist_ok=True)
-
-
-
-        # Save the document to a docx file
-        print("\n\nSaving the output file:")
-        doc.SaveToFile(output_path, FileFormat.Docx)
-        print(f"Output file saved successfuly in your content folder as:\n\t{output_path}")
-        doc.Close()
-    else:
-        print(f"ERROR on processing the file: {my_chemin_docx}")
-
-
-
-
-# 6. HTML downloader and link replacer -------------------- ##
-
-def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
-    """Takes an input link from UNEP website. It downloads the webpage
-    translatable content, replace its links with the localized version and
-    exports a .txt file with the HTML tags ready to be used in any CAT tool
-    for human translation.
-    """
-    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)
-
-    if not modified_html:
-        print("ERROR: The input URL might not be accessible, or not an URL.")
-        raise ValueError("The input URL might not be accessible, or not an URL.")
-
-    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")
-
-    # Name the file based on the webpage's name
-    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
-        extracted_string = extracted_string.replace("#", "")
-
-    # Save the modified HTML content to a .txt file in the current folder
-    with open(extracted_string, 'w', encoding='utf-8') as file:
-        print(type(modified_html))
-        print(modified_html)
-        file.write(modified_html)
-    print(f"File {extracted_string} exported succesfully")
-
-    # Force download in Google Colab
-    try:
-        from google.colab import files
-        files.download(extracted_string)
-    except ImportError:
-        pass
-
-# Install necessary libraries
-#!pip install gradio
-
-import gradio as gr
-from bs4 import BeautifulSoup
-
-

 # Define your custom function
 def render_html(htmltext, language):
@@ -1247,4 +953,4 @@ with gr.Blocks() as demo:
     run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

 # Launch the Gradio app with debug=True and share=True
-demo.launch(debug=True, share=True)
+demo.launch()
 
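With the DOCX and PDF helpers gone, the HTML route is the only link-table path left in app.py. For reference, a minimal sketch of how the surviving pieces compose; the import path and the sample HTML string are assumptions for illustration, not part of the commit:

    # Sketch only: build the 3-column link table from raw HTML using what remains.
    # Assumes extract_href_attributes() and localize_URL() are importable from app.py
    # (hypothetical import path; importing app will also start the Gradio demo).
    import pandas as pd
    from app import extract_href_attributes, localize_URL

    html = '<p><a href="https://www.unep.org/resources">Resources</a></p>'
    urls = extract_href_attributes(html)               # every href, in document order
    localized = [localize_URL(u, "es") for u in urls]  # localized counterparts

    df = pd.DataFrame({
        "index": range(1, len(urls) + 1),
        "input_url": urls,
        "output_url": localized,
    })
    df.to_csv("output_es_links.csv", index=False, encoding="utf-8")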
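If the dropped DOCX hyperlink extraction is ever needed again without reinstalling pypandoc, python-docx, or Spire.Doc, the external link targets can be read straight from the OOXML package with the standard library. A rough standalone sketch, not part of app.py:

    # Sketch only: list external hyperlink targets of a .docx with stdlib alone.
    # A .docx is a ZIP archive; external link URLs are stored as Relationship
    # entries (Type ending in "/hyperlink") in word/_rels/document.xml.rels.
    import zipfile
    import xml.etree.ElementTree as ET

    def docx_hyperlinks(docx_path):
        with zipfile.ZipFile(docx_path) as zf:
            rels = ET.fromstring(zf.read("word/_rels/document.xml.rels"))
        ns = "{http://schemas.openxmlformats.org/package/2006/relationships}"
        return [r.get("Target") for r in rels.findall(ns + "Relationship")
                if r.get("Type", "").endswith("/hyperlink")]

    # Example: print(docx_hyperlinks("input.docx"))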