remove pdf doc libraries
app.py CHANGED
@@ -1,4 +1,4 @@
-
+import gradio as gr
 import re
 import requests
 import sys
@@ -11,11 +11,6 @@ from urllib3.exceptions import InsecureRequestWarning
 from urllib3 import disable_warnings
 import email.utils
 import pandas as pd
-import pypandoc
-import fitz
-from docx import Document
-from spire.doc import *
-from spire.doc.common import *

 disable_warnings(InsecureRequestWarning)

@@ -813,59 +808,12 @@ def localize_URL(mi_URL: str, lengua: str="en") -> str:
 #print(localize_URL(url5, "fr"))


-
-def convert_docx_to_html(docx_file_path):
-    output = pypandoc.convert_file(docx_file_path, 'html')
-    return output
-
 def extract_href_attributes(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     # creates a list
     href_values = [a['href'] for a in soup.find_all('a', href=True)]
     return href_values

-def generate_table_URLs_from_Docx(docx_path, lang_code):
-    # Open the document
-    document = Document(docx_path)
-
-    # Extract hyperlinks
-    input_urls = []
-    for paragraph in document.paragraphs:
-        for run in paragraph.runs:
-            hyperlink = run.hyperlink
-            if hyperlink is not None:
-                input_urls.append(hyperlink.address)
-
-    #input_urls
-    data = []
-
-
-    # Initialize lists to store data for the DataFrame
-    index_list = []
-    original_url_list = []
-    localized_url_list = []
-
-    # Apply localizeURL to each URL in the list
-    for index, url in enumerate(input_urls):
-        localized_url = localize_URL(url, lang_code)  # Replace 'en' with the desired language code
-        index_list.append(index)
-        original_url_list.append(url)
-        localized_url_list.append(localized_url)
-
-    # Create a DataFrame
-    df_docx = pd.DataFrame({
-        'index': index_list,
-        'url': original_url_list,
-        'localized_url': localized_url_list
-    })
-
-    # Export the DataFrame to a CSV file
-    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df_docx
-
-
 #language_code = "es"
 #UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"

@@ -975,248 +923,6 @@ def localize_UNEP_html(language_code, soup):
     return str(soup)

 #Code created by Nelson JAIMES-QUINTERO
-# -------------------- ## -------------------- ## -------------------- #
-# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #
-
-# DOC-HTML
-def docx2_bitable(docx_path: str, output_lang: str):
-    """Takes an input doc/docx file and creates a CSV file with 3 columns:
-    List number, URL found in the file, Localized URL in the input language.
-    """
-
-    if not docx_path.lower().endswith(".doc") and not docx_path.endswith(".docx"):
-        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
-        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
-    input_docx_path = docx_path  # document
-
-    # Name the output_file based on the docx's name
-    last_slash_index = input_docx_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = input_docx_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    #output_csv_path = f"output_{output_lang}_{docx_path[0:len(docx_path)//2]}.html"
-
-    # Convert DOCX to HTML
-    html_content = convert_docx_to_html(input_docx_path)
-    print("Doc converted into html successfully.")
-    # Write HTML content to a file
-    #with open(output_html_path, "w", encoding="utf-8") as html_file:
-        #html_file.write(html_content)
-
-    #print("Conversion complete. HTML file saved at:", output_html_path)
-
-    # Extract href attributes
-    href_attributes = extract_href_attributes(html_content)
-    #print("Extracted href attributes:", href_attributes)
-
-    output_urls = [localize_URL(url, output_lang) for url in href_attributes]
-
-    # Create a pandas DataFrame
-    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})
-
-    # Export the DataFrame to a CSV file
-    if not df.empty:
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        df.to_csv(output_csv_path, index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df
-
-
-# From PDF file -------------------- ##
-# NEEDS FITZ
-def pdf2_bitable(pdf_path: str, output_lang: str):
-    if not pdf_path.lower().endswith("pdf"):
-        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
-        return None
-    # Create a document object
-    doc = fitz.open(pdf_path)  # or fitz.Document(filename)
-
-    # Create a pandas DataFrame
-    data = []
-
-    # get the links on all pages
-    for i in range(doc.page_count):
-        page = doc.load_page(i)
-        links = page.get_links()
-        if links:
-            for item in links:
-                input_url = item.get('uri')
-                if input_url != None:
-                    localized_url = localize_URL(input_url, output_lang)
-                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})
-
-
-
-    # Create a pandas DataFrame
-    df_pdf = pd.DataFrame(data)
-
-    # Name the file based on the pdf's name
-    last_slash_index = pdf_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = pdf_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    if not df_pdf.empty:
-        # Export the DataFrame to a CSV file
-        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        return df_pdf
-    else:
-        print("ERROR: File not found or is not .pdf. Verify the input_path field above.")
-        return None
-
-
-# DOCX REPLACER -------------------- ##
-
-#Replace links in Docx with SpireDoc
-
-def docx2docx_replacer(my_chemin_docx: str, my_langue):
-    # Create a Document object
-    doc = Document()
-
-    # Load a Word file
-    doc.LoadFromFile(my_chemin_docx)
-
-    # Find all hyperlinks in the document
-    hyperlinks = []
-    for i in range(doc.Sections.Count):
-        section = doc.Sections.get_Item(i)
-        for j in range(section.Body.ChildObjects.Count):
-            sec = section.Body.ChildObjects.get_Item(j)
-            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
-                for k in range((sec if isinstance(sec, Paragraph) else None).ChildObjects.Count):
-                    para = (sec if isinstance(sec, Paragraph)
-                            else None).ChildObjects.get_Item(k)
-                    if para.DocumentObjectType == DocumentObjectType.Field:
-                        field = para if isinstance(para, Field) else None
-                        if field.Type == FieldType.FieldHyperlink:
-                            hyperlinks.append(field)
-
-    # Iterate through hyperlinks and update them
-    for hyperlink in hyperlinks:
-        # Get the current display text and URL
-        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
-        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
-        if match:
-            current_url = match.group(1)
-
-        current_display_text = hyperlink.FieldText
-        localized_url = localize_URL(current_url, my_langue)
-        if localized_url:
-
-            # Update the display text and URL of the hyperlink
-            #hyperlink.FieldText = "NEW DISPLAY TEXT"  # Replace with your new display text
-            hyperlink.Code = f'HYPERLINK "{localized_url}"'
-
-    if len(hyperlinks)>0:
-        # Naming output file
-        last_slash_index = my_chemin_docx.rfind('/')
-        if last_slash_index != -1:
-            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
-            extracted_string = extracted_string.replace("#", "")
-            #print(extracted_string)
-        else:
-            #print("No '/' found in the URL.")
-            extracted_string = my_chemin_docx
-            extracted_string = extracted_string.replace("#", "")
-
-        output_directory = '/content'
-        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"
-
-        # Create the output directory if it doesn't exist
-        os.makedirs(output_directory, exist_ok=True)
-
-
-
-        # Save the document to a docx file
-        print("\n\nSaving the output file:")
-        doc.SaveToFile(output_path, FileFormat.Docx)
-        print(f"Output file saved successfuly in your content folder as:\n\t{output_path}")
-        doc.Close()
-    else:
-        print(f"ERROR on processing the file: {my_chemin_docx}")
-
-
-
-
-# 6. HTML downloader and link replacer -------------------- ##
-
-def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
-    """Takes an input link from UNEP website. It downloads the webpage
-    translatable content, replace its links with the localized version and
-    exports a .txt file with the HTML tags ready to be used in any CAT tool
-    for human translation.
-    """
-    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)
-
-    if not modified_html:
-        print("ERROR: The input URL might not be accessible, or not an URL.")
-        raise ValueError("The input URL might not be accessible, or not an URL.")
-
-    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")
-
-    # Name the file based on the webpage's name
-    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
-        extracted_string = extracted_string.replace("#", "")
-
-    # Save the modified HTML content to a .txt file in the current folder
-    with open(extracted_string, 'w', encoding='utf-8') as file:
-        print(type(modified_html))
-        print(modified_html)
-        file.write(modified_html)
-    print(f"File {extracted_string} exported succesfully")
-
-    # Force download in Google Colab
-    try:
-        from google.colab import files
-        files.download(extracted_string)
-    except ImportError:
-        pass
-
-# Install necessary libraries
-#!pip install gradio
-
-import gradio as gr
-from bs4 import BeautifulSoup
-
-

 # Define your custom function
 def render_html(htmltext, language):
@@ -1247,4 +953,4 @@ with gr.Blocks() as demo:
     run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

     # Launch the Gradio app with debug=True and share=True
-demo.launch(
+demo.launch()