nelsonjq committed
Commit ffece79 · verified · 1 Parent(s): 1ac36aa

remove pdf doc libraries

Files changed (1)
  1. app.py +2 -296
app.py CHANGED
@@ -1,4 +1,4 @@
-
+import gradio as gr
 import re
 import requests
 import sys
@@ -11,11 +11,6 @@ from urllib3.exceptions import InsecureRequestWarning
 from urllib3 import disable_warnings
 import email.utils
 import pandas as pd
-import pypandoc
-import fitz
-from docx import Document
-from spire.doc import *
-from spire.doc.common import *

 disable_warnings(InsecureRequestWarning)

@@ -813,59 +808,12 @@ def localize_URL(mi_URL: str, lengua: str="en") -> str:
 #print(localize_URL(url5, "fr"))


-
-def convert_docx_to_html(docx_file_path):
-    output = pypandoc.convert_file(docx_file_path, 'html')
-    return output
-
 def extract_href_attributes(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     # creates a list
     href_values = [a['href'] for a in soup.find_all('a', href=True)]
     return href_values

-def generate_table_URLs_from_Docx(docx_path, lang_code):
-    # Open the document
-    document = Document(docx_path)
-
-    # Extract hyperlinks
-    input_urls = []
-    for paragraph in document.paragraphs:
-        for run in paragraph.runs:
-            hyperlink = run.hyperlink
-            if hyperlink is not None:
-                input_urls.append(hyperlink.address)
-
-    #input_urls
-    data = []
-
-
-    # Initialize lists to store data for the DataFrame
-    index_list = []
-    original_url_list = []
-    localized_url_list = []
-
-    # Apply localizeURL to each URL in the list
-    for index, url in enumerate(input_urls):
-        localized_url = localize_URL(url, lang_code) # Replace 'en' with the desired language code
-        index_list.append(index)
-        original_url_list.append(url)
-        localized_url_list.append(localized_url)
-
-    # Create a DataFrame
-    df_docx = pd.DataFrame({
-        'index': index_list,
-        'url': original_url_list,
-        'localized_url': localized_url_list
-    })
-
-    # Export the DataFrame to a CSV file
-    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df_docx
-
-
 #language_code = "es"
 #UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"

@@ -975,248 +923,6 @@ def localize_UNEP_html(language_code, soup):
     return str(soup)

 #Code created by Nelson JAIMES-QUINTERO
-# -------------------- ## -------------------- ## -------------------- #
-# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #
-
-# DOC-HTML
-def docx2_bitable(docx_path: str, output_lang: str):
-    """Takes an input doc/docx file and creates a CSV file with 3 columns:
-    List number, URL found in the file, Localized URL in the input language.
-    """
-
-    if not docx_path.lower().endswith(".doc") and not docx_path.endswith(".docx"):
-        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
-        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
-    input_docx_path = docx_path # document
-
-    # Name the output_file based on the docx's name
-    last_slash_index = input_docx_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = input_docx_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    #output_csv_path = f"output_{output_lang}_{docx_path[0:len(docx_path)//2]}.html"
-
-    # Convert DOCX to HTML
-    html_content = convert_docx_to_html(input_docx_path)
-    print("Doc converted into html successfully.")
-    # Write HTML content to a file
-    #with open(output_html_path, "w", encoding="utf-8") as html_file:
-        #html_file.write(html_content)
-
-    #print("Conversion complete. HTML file saved at:", output_html_path)
-
-    # Extract href attributes
-    href_attributes = extract_href_attributes(html_content)
-    #print("Extracted href attributes:", href_attributes)
-
-    output_urls = [localize_URL(url, output_lang) for url in href_attributes]
-
-    # Create a pandas DataFrame
-    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})
-
-    # Export the DataFrame to a CSV file
-    if not df.empty:
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        df.to_csv(output_csv_path, index=False, encoding="utf-8")
-
-    # Display the DataFrame
-    return df
-
-
-# From PDF file -------------------- ##
-# NEEDS FITZ
-def pdf2_bitable(pdf_path: str, output_lang: str):
-    if not pdf_path.lower().endswith("pdf"):
-        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
-        return None
-    # Create a document object
-    doc = fitz.open(pdf_path) # or fitz.Document(filename)
-
-    # Create a pandas DataFrame
-    data = []
-
-    # get the links on all pages
-    for i in range(doc.page_count):
-        page = doc.load_page(i)
-        links = page.get_links()
-        if links:
-            for item in links:
-                input_url = item.get('uri')
-                if input_url != None:
-                    localized_url = localize_URL(input_url, output_lang)
-                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})
-
-
-
-    # Create a pandas DataFrame
-    df_pdf = pd.DataFrame(data)
-
-    # Name the file based on the pdf's name
-    last_slash_index = pdf_path.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = pdf_path
-        extracted_string = extracted_string.replace("#", "")
-
-
-
-    # Naming the output file
-    output_directory = '/content'
-    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
-
-    # Create the output directory if it doesn't exist
-    os.makedirs(output_directory, exist_ok=True)
-
-    if not df_pdf.empty:
-        # Export the DataFrame to a CSV file
-        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
-        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
-        return df_pdf
-    else:
-        print("ERROR: File not found or is not .pdf. Verify the input_path field above.")
-        return None
-
-
-# DOCX REPLACER -------------------- ##
-
-#Replace links in Docx with SpireDoc
-
-def docx2docx_replacer(my_chemin_docx: str, my_langue):
-    # Create a Document object
-    doc = Document()
-
-    # Load a Word file
-    doc.LoadFromFile(my_chemin_docx)
-
-    # Find all hyperlinks in the document
-    hyperlinks = []
-    for i in range(doc.Sections.Count):
-        section = doc.Sections.get_Item(i)
-        for j in range(section.Body.ChildObjects.Count):
-            sec = section.Body.ChildObjects.get_Item(j)
-            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
-                for k in range((sec if isinstance(sec, Paragraph) else None).ChildObjects.Count):
-                    para = (sec if isinstance(sec, Paragraph)
-                            else None).ChildObjects.get_Item(k)
-                    if para.DocumentObjectType == DocumentObjectType.Field:
-                        field = para if isinstance(para, Field) else None
-                        if field.Type == FieldType.FieldHyperlink:
-                            hyperlinks.append(field)
-
-    # Iterate through hyperlinks and update them
-    for hyperlink in hyperlinks:
-        # Get the current display text and URL
-        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
-        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
-        if match:
-            current_url = match.group(1)
-
-        current_display_text = hyperlink.FieldText
-        localized_url = localize_URL(current_url, my_langue)
-        if localized_url:
-
-            # Update the display text and URL of the hyperlink
-            #hyperlink.FieldText = "NEW DISPLAY TEXT" # Replace with your new display text
-            hyperlink.Code = f'HYPERLINK "{localized_url}"'
-
-    if len(hyperlinks)>0:
-        # Naming output file
-        last_slash_index = my_chemin_docx.rfind('/')
-        if last_slash_index != -1:
-            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
-            extracted_string = extracted_string.replace("#", "")
-            #print(extracted_string)
-        else:
-            #print("No '/' found in the URL.")
-            extracted_string = my_chemin_docx
-            extracted_string = extracted_string.replace("#", "")
-
-        output_directory = '/content'
-        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"
-
-        # Create the output directory if it doesn't exist
-        os.makedirs(output_directory, exist_ok=True)
-
-
-
-        # Save the document to a docx file
-        print("\n\nSaving the output file:")
-        doc.SaveToFile(output_path, FileFormat.Docx)
-        print(f"Output file saved successfuly in your content folder as:\n\t{output_path}")
-        doc.Close()
-    else:
-        print(f"ERROR on processing the file: {my_chemin_docx}")
-
-
-
-
-# 6. HTML downloader and link replacer -------------------- ##
-
-def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
-    """Takes an input link from UNEP website. It downloads the webpage
-    translatable content, replace its links with the localized version and
-    exports a .txt file with the HTML tags ready to be used in any CAT tool
-    for human translation.
-    """
-    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)
-
-    if not modified_html:
-        print("ERROR: The input URL might not be accessible, or not an URL.")
-        raise ValueError("The input URL might not be accessible, or not an URL.")
-
-    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")
-
-    # Name the file based on the webpage's name
-    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
-    if last_slash_index != -1:
-        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
-        extracted_string = extracted_string.replace("#", "")
-        #print(extracted_string)
-    else:
-        #print("No '/' found in the URL.")
-        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
-        extracted_string = extracted_string.replace("#", "")
-
-    # Save the modified HTML content to a .txt file in the current folder
-    with open(extracted_string, 'w', encoding='utf-8') as file:
-        print(type(modified_html))
-        print(modified_html)
-        file.write(modified_html)
-    print(f"File {extracted_string} exported succesfully")
-
-    # Force download in Google Colab
-    try:
-        from google.colab import files
-        files.download(extracted_string)
-    except ImportError:
-        pass
-
-# Install necessary libraries
-#!pip install gradio
-
-import gradio as gr
-from bs4 import BeautifulSoup
-
-

 # Define your custom function
 def render_html(htmltext, language):
@@ -1247,4 +953,4 @@ with gr.Blocks() as demo:
     run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

 # Launch the Gradio app with debug=True and share=True
-demo.launch(debug=True, share=True)
+demo.launch()
 
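With the DOCX and PDF helpers gone, the HTML route is the only link-table path left in app.py. For reference, a minimal sketch of how the surviving pieces compose; the import path and the sample HTML string are assumptions for illustration, not part of the commit:

    # Sketch only: build the 3-column link table from raw HTML using what remains.
    # Assumes extract_href_attributes() and localize_URL() are importable from app.py
    # (hypothetical import path; importing app will also start the Gradio demo).
    import pandas as pd
    from app import extract_href_attributes, localize_URL

    html = '<p><a href="https://www.unep.org/resources">Resources</a></p>'
    urls = extract_href_attributes(html)               # every href, in document order
    localized = [localize_URL(u, "es") for u in urls]  # localized counterparts

    df = pd.DataFrame({
        "index": range(1, len(urls) + 1),
        "input_url": urls,
        "output_url": localized,
    })
    df.to_csv("output_es_links.csv", index=False, encoding="utf-8")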
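If the dropped DOCX hyperlink extraction is ever needed again without reinstalling pypandoc, python-docx, or Spire.Doc, the external link targets can be read straight from the OOXML package with the standard library. A rough standalone sketch, not part of app.py:

    # Sketch only: list external hyperlink targets of a .docx with stdlib alone.
    # A .docx is a ZIP archive; external link URLs are stored as Relationship
    # entries (Type ending in "/hyperlink") in word/_rels/document.xml.rels.
    import zipfile
    import xml.etree.ElementTree as ET

    def docx_hyperlinks(docx_path):
        with zipfile.ZipFile(docx_path) as zf:
            rels = ET.fromstring(zf.read("word/_rels/document.xml.rels"))
        ns = "{http://schemas.openxmlformats.org/package/2006/relationships}"
        return [r.get("Target") for r in rels.findall(ns + "Relationship")
                if r.get("Type", "").endswith("/hyperlink")]

    # Example: print(docx_hyperlinks("input.docx"))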