nelsonjq committed on
Commit ebf2da8 · verified · 1 Parent(s): 6a59b4e

Create app.py

Files changed (1)
  1. app.py +1250 -0
app.py ADDED
@@ -0,0 +1,1250 @@
import re
import requests
import sys
import os
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse, urljoin
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import email.utils
import pandas as pd
import pypandoc
import fitz  # PyMuPDF
from docx import Document as DocxDocument  # python-docx, aliased to avoid clashing with Spire's Document
from spire.doc import *
from spire.doc.common import *

disable_warnings(InsecureRequestWarning)

def get_language_code(query):
    """
    Search for a value given a key or search for a key given a value in the language_dict.

    Args:
        query (str): The key or value to search for.

    Returns:
        str: The corresponding value or key.
    """
    for key, value in language_dict.items():
        if query.lower() == key.lower():
            return value
        elif query.lower() == value.lower():
            return key

    return None

# Example usage:
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar"
}

#result_key = get_language_code("Spanish")
#result_value = get_language_code("fr")

#print(result_key)    # Output: "es"
#print(result_value)  # Output: "French"
#print(type(result_value))

# Extract node's number from UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """find_UNEP_node accesses the input URL, finds the language version
    of the webpage and returns the URL's node that is common to all UNEP languages.

    Args:
        unep_full_link (str): Full web URL in the UNEP website.

    Returns:
        str: URL's node

    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')

            # Find the <ul> element with class 'links'
            ul_element = soup.find('ul', class_='links')

            # Find the <li> element whose class ends with 'is-active'
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))

            # Extract the value of the 'data-drupal-link-system-path' attribute (e.g. 'node/34817')
            attribute_value = li_element.get('data-drupal-link-system-path')
            return attribute_value.split('node/')[1]

# test
#print(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'))
#print(type(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')))

# Main function: finds the language version of a web article in the UNEP website.

def convert_UNEP_url(unep_full_link: str, target_lang: str = 'en') -> str:
    """convert_UNEP_url accesses the input URL, finds the URL of the translated version
    of the webpage in the input language and returns that URL.

    Args:
        unep_full_link (str): Full web URL in the UNEP website.
        target_lang (str): Target language, default = 'en'.

    Returns:
        str: New converted URL

    Examples:
        >>> convert_UNEP_url('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts', 'es')
        'https://www.unep.org/es/noticias-y-reportajes/reportajes/los-pueblos-indigenas-recurren-los-tribunales-ante-la-crisis'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')

            # Look for the link in the target language, whose class is "language-link"
            lenguas = soup.find("a", class_="language-link", hreflang=target_lang)
            #print(lenguas)
            if lenguas:
                if lenguas['href'].endswith('/node'):
                    return f"https://www.unep.org{lenguas['href'][0:-5]}"
                return f"https://www.unep.org{lenguas['href']}"
            else:
                # Find the <ul> element with class 'links'
                ul_element = soup.find('ul', class_='links')
                if ul_element:
                    # Find the <li> element whose class ends with 'is-active'
                    li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))

                    # Extract the value of the 'data-drupal-link-system-path' attribute
                    node_value = li_element.get('data-drupal-link-system-path')
                    return find_from_nodeLink(int(node_value.split("/")[1]), target_lang)
                    #return f"https://www.unep.org/{node_value}"
                else:
                    raise ValueError("Error: Webpage accessed but the tag 'a', class_='language-link' was not found. Probably because the website was blocked by firewall/CloudFlare")
        else:
            print("\n<-- Error code. The programme could not access the webpage, forbidden")
            return None

# test
#input_url = input("Enter your UNEP url:")
#input_url = 'https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'
#input_url = "https://www.unep.org/ru"
#print(convert_UNEP_url(input_url, 'es'))
#print(convert_UNEP_url(input_url, 'fr'))

UNEP_LANG_CODES = ['ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'ch', 'zh', 'zh-hans', 'en']

def find_from_nodeLink(node_input, target_lang='empty'):
    """Replaces a node link with the corresponding language version.

    Args:
        node_input (str, int): Either a string of a web URL containing the word 'node' and its ID, or an integer ID (or a string representation of an integer).
        target_lang (str): Target language, default = 'empty' (treated as English).

    Returns:
        str: New converted URL

    Examples:
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'empty')
        'https://www.unep.org/en/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans')
        'https://www.unep.org/zh-hans/node/30010'
        >>> find_from_nodeLink(30010, 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('30010', 'fr')
        'https://www.unep.org/fr/node/30010'
    """
    if isinstance(node_input, str) and node_input.isdigit():
        node_input = int(node_input)

    if isinstance(node_input, int):
        node_url = f'https://www.unep.org/{target_lang}/node/{node_input}'
    elif isinstance(node_input, str):
        node_url = node_input
    else:
        raise ValueError("Error: Provide either a string URL or an integer ID (or a string representation of an integer)")

    pattern = r"https://www\.unep\.org/[a-z]*-?[a-z]*/?node/(\d+)"

    if target_lang == "empty":
        target_lang = "en"
    if target_lang in ["ch", 'zh', 'cn']:
        target_lang = "zh-hans"
    if target_lang in ['pt', 'pt-pt']:
        target_lang = "pt-br"
    if target_lang in UNEP_LANG_CODES:
        if re.findall(pattern, node_url):
            # Replace the language part in the URL
            new_url = re.sub(pattern, r"https://www.unep.org/{}/node/\1".format(target_lang), node_url)
            return new_url
        else:
            raise ValueError("Error: URL not found, or website blocked by firewall/CloudFlare")
    else:
        raise ValueError("Error: Provide a language code among these: 'ar','es','fr','ru','sw','pt-br','zh-hans', 'en' or leave empty")

# Generic scraper

def get_HTML_generic(any_url: str) -> BeautifulSoup:
    """Generic website fetcher: accesses the URL and returns the parsed HTML.

    Args:
        any_url (str): String of any web URL.

    Returns:
        BeautifulSoup: parsed HTML, or None if the page could not be accessed.
    """
    req = urllib.request.Request(any_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason} when accessing {any_url}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason} when accessing {any_url}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e} when accessing {any_url}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup

# Example usage with an integer ID provided as a string
#print(find_from_nodeLink('30010', 'fr'))
#print(find_from_nodeLink(30010, 'fr'))
#print(find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans'))

def try_lang_switcher(switcher_soup, lang_code: str, base_url) -> str:
    """Looks for a 'language-switcher' <ul> in the parsed page and returns the link
    whose href contains the requested language code, resolved against base_url."""
    # Find the <ul> element whose class starts with "language-switcher"
    language_switcher_ul = switcher_soup.find('ul', class_=lambda value: value and value.startswith('language-switcher'))

    # Extract href values from <a> elements within the <ul>
    if language_switcher_ul:
        href_values = [a['href'] for a in language_switcher_ul.find_all('a')]

        for i, element in enumerate(href_values):
            if lang_code in element:
                new_link = urljoin(base_url, href_values[i])
                return new_link
    return None

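# Illustrative sketch (not in the original file; variable names are hypothetical): given
#   page_soup = get_HTML_generic(base)
# the call try_lang_switcher(page_soup, 'es', base) returns the absolute URL of the Spanish
# entry in the page's language switcher, or None when no such <ul> exists.
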
# Function to concatenate absolute path segments if the switcher URL cannot be accessed directly

def concatenate_missing_segments(arg1, arg2):
    """
    Concatenates the URL segments missing from Arg2 using Arg1.

    Args:
        arg1 (str): The URL containing the missing segments, i.e. the longer URL, like
                    "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
        arg2 (str): The target URL, i.e. the shorter URL, like
                    "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"

    Returns:
        str: The concatenated URL, like
             "https://www.unep.org/interactive/explore-ecosystems/mountains/ar/index.php#/mountain-intro"
    """
    if len(arg1) > len(arg2):
        missing_segment = arg1[len(arg2):]
        return arg2 + missing_segment

# Example usage:
#arg1 = "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
#arg2 = "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"

#result = concatenate_missing_segments(arg1, arg2)
#print(result)

def convert_URL_anyWebsite(any_web_url: str, lang_code) -> str:
    """Tries several heuristics to find the language version of any webpage."""
    # Access the URL to get the HTML with BeautifulSoup --> soup object
    sauce_html = get_HTML_generic(any_web_url)
    print(type(sauce_html))
    if sauce_html:
        # Search the language-switcher HTML tag and get the language-specific link
        switcher_link = try_lang_switcher(sauce_html, lang_code.lower(), any_web_url)
        if switcher_link and get_HTML_generic(switcher_link):
            return switcher_link
        elif switcher_link:
            return concatenate_missing_segments(any_web_url, switcher_link)
        elif sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code):  # working for WHO news
            print("trying WHO")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code)
            if matching_tags:
                print(matching_tags)
                return matching_tags[0]['value']
        elif sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code):
            print("trying hreflang")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code)
            if matching_tags:
                return matching_tags[0]['href']
        else:
            print("trying language_link")  # working for UNESCO
            lang_tag = sauce_html.find("a", class_="language-link", hreflang=lang_code)
            if lang_tag is not None:
                return urljoin(any_web_url, lang_tag['href'])
    else:
        return None

#output_li = convert_URL_anyWebsite("[email protected]", "es")
#print(output_li)

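# Illustrative call (not in the original file; URL taken from the concatenate_missing_segments
# example above, with no guarantee that the page exposes a language switcher):
#   convert_URL_anyWebsite("https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro", "ar")
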
def weDocs_short(weDocs_url) -> str:
    """Replaces a language-specific WeDocs link with the document's landing page.

    Args:
        weDocs_url (str): String of a web URL from wedocs.unep.org

    Returns:
        str: Landing page of the document, so it is not language specific.

    Example:
        >>> weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
        'https://wedocs.unep.org/handle/20.500.11822/43104'
    """
    return re.sub(r"https://wedocs.unep.org/(bitstream/)?handle/([\w.-]+/\d+).+", r"https://wedocs.unep.org/handle/\2", weDocs_url)

# WeDocs link converter: accesses a short WeDocs link and returns a language-specific URL (pdf)

def convert_WeDocs_href(url: str, target_lang: str = 'English') -> str:
    """WeDocs link converter: accesses a short WeDocs link
    and returns a language-specific URL (pdf).

    Args:
        url (str): String of a web URL from wedocs.unep.org
        target_lang (str): Language name of the document to find.

    Returns:
        str: Download link of the PDF in the requested language.

    Example:
        >>> convert_WeDocs_href('https://wedocs.unep.org/handle/20.500.11822/43104', 'Chinese')
        'https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/PracticalGuide_ZH.pdf?sequence=5&isAllowed=y'
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, verify=False)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            pattern = re.compile(r".*{}.*".format(re.escape(target_lang.capitalize())))  # TODO normalize to take into account the dictionary's key, in case the user enters RU instead of Russian
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the <a> tag with the entered language name in its text
            # and extract its href attribute value
            lang_link = soup.find(string=pattern).parent['href']

            if lang_link:
                # Merge the domain and PDF name to create the complete link
                clean_link = "https://wedocs.unep.org" + lang_link
                return clean_link
            else:
                return f"No link with '{target_lang}' text found."
        else:
            return "Failed to retrieve the URL."

    except Exception as e:
        return str(e)

# Example usage:
#url = 'https://wedocs.unep.org/handle/20.500.11822/43104'
#spanish_href = convert_WeDocs_href(url, "Spanish")
#portuguese_href = convert_WeDocs_href(url, "Portuguese")
#ch_href = convert_WeDocs_href(url, "Chinese")
#print(spanish_href)
#print(portuguese_href)
#print(ch_href)

def access_un_library_by_id(user_input_id):
    """Searches digitallibrary.un.org for a document (job) ID and returns the first record URL."""
    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"

        # Construct the URL with the user-provided ID
        url = f"{base_url}ln=fr&p={user_input_id}&f=&c=Resource%20Type&c=UN%20Bodies&sf=&so=d&rg=50&fti=0"

        # Send an HTTP GET request to the URL
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the <div> with class="result-title"
            result_title_div = soup.find('div', class_='result-title')

            if result_title_div:
                # Find the first <a> tag within the result-title div and get its href value
                result_title_a = result_title_div.find('a', href=True)
                if result_title_a:
                    href_value = result_title_a['href']
                    return f"https://digitallibrary.un.org{href_value}"
                else:
                    print("No <a> tag found inside result-title.")
            else:
                print("No result-title div found in the HTML.")
            return None

        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Get user input for the ID
#user_input_id = input("Enter the ID: ")

# Call the function with user input
#resultado = access_un_library_by_id(user_input_id)
#print(resultado)

# Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
def access_un_library_byResourceURL(landing_url: str) -> BeautifulSoup:
    """Fetches a digitallibrary.un.org record page and returns the parsed HTML."""
    req = urllib.request.Request(landing_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup
        else:
            # HTML error
            raise ValueError("Error in parsing the website content in HTML")

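# Illustrative usage (not in the original file; the record URL is derived from the
# digitallibrary example cited in the comments of find_lang_UNdoc below):
#   record_soup = access_un_library_byResourceURL("https://digitallibrary.un.org/record/606782")
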
def extract_info_UNdocLink(url, lang2_code):
    """
    Rewrites a undocs.org URL so that it points to the requested language.

    Args:
        url (str): The UNDocs URL, e.g. https://undocs.org/en/UNEP/EA.5/28/Corr.1
        lang2_code (str): Two-letter UN language code (es, fr, en, ru, ar, ch).

    Returns:
        str: The rewritten URL, or None if the URL does not match the expected pattern.
    """
    # Define a regex pattern to match the components in the URL
    pattern = r'https://undocs\.org/([a-z]{2})?/?([A-Z]+)/(.*?)/(\d+)/(.*?)$'

    # Use regex to find the components in the URL
    match = re.match(pattern, url)

    if match:
        symbol = match.group(2)
        doc_type = match.group(3)
        unga = match.group(4)
        resolution_id = match.group(5)
        language_code = match.group(1) if match.group(1) else None  # Optional language code in the original URL
        return f"https://undocs.org/{lang2_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}"
    else:
        return None

# Example usage:
#url = "https://undocs.org/en/UNEP/EA.5/28/Corr.1"
#result = extract_info_UNdocLink(url, "fr")
#print(result)

# NOTE: the module-level language_dict defined near the top of this file is the one used
# by get_language_code(); redefining it here with only the six UN languages would break
# the Swahili, Portuguese and zh-hans lookups, so the UN-specific mapping is kept local
# to find_lang_UNdoc() below (UN_languages_dict).

#input_language = "Russian"

# 1.7 UN Docs
def get_jobID_undocs(url):
    """
    Extracts the job ID from a given URL of the documents-dds-ny.un.org website.

    Args:
        url (str): The URL of the document on dds-ny.un.org.

    Returns:
        str: The extracted job ID, or None if not found.
    """
    # Define a regex pattern to match the job ID in the URL
    pattern = r'dds-ny.*/([A-Za-z0-9]+)\.pdf'

    # Use regex to find the job ID in the URL
    match = re.search(pattern, url)

    # Return the matched job ID or None if not found
    return match.group(1) if match else None

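# Illustrative sketch (not in the original file): get_jobID_undocs only parses the URL string,
# so for the dds-ny link used in the localize_URL tests further below it would return the
# trailing file name without the .pdf extension, e.g.
#   get_jobID_undocs("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement")
#   -> 'N0651207'
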
# Extract the `value` attribute of <option> tags with the specified regex pattern
def find_lang_UNdoc(un_docs_link, input_language):
    """Finds the language versions of a UN document link (undocs.org, documents-dds-ny.un.org
    or digitallibrary.un.org) and returns a list of links for the requested language."""
    un_library_url = un_docs_link

    # Define the language dictionary (UN official languages)
    UN_languages_dict = {
        "Spanish": "es",
        "French": "fr",
        "English": "en",
        "Chinese": "ch",
        "Russian": "ru",
        "Arabic": "ar"
    }

    if "undocs.org" in un_docs_link:
        # input_language is expected to be a language name (e.g. "French"); map it to the
        # two-letter code expected by extract_info_UNdocLink, falling back to the raw value.
        return extract_info_UNdocLink(un_docs_link, UN_languages_dict.get(input_language, input_language))
    elif "dds-ny" in un_docs_link:
        # Extract the job ID
        un_library_url_ID = get_jobID_undocs(un_docs_link)
        print(un_library_url_ID)
        # Get URL from ID
        un_library_url = access_un_library_by_id(un_library_url_ID)
        print(un_library_url)
    elif "digitallibrary.un.org" in un_docs_link:
        un_library_url = un_docs_link

    try:
        # Get HTML from the UN Library URL
        my_soup = access_un_library_byResourceURL(un_library_url)
        if my_soup is None:
            return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # Define the regex pattern
        regex_pattern = r"-(\w{2})\.pdf"

        # Find all <option> tags
        options = my_soup.find_all('option', value=re.compile(regex_pattern))

        # Collect the `value` attribute values for the requested language
        output_links = None
        for option in options:
            value = option['value']
            match = re.search(regex_pattern, value)
            if match:
                language_code = match.group(1)
                # Check if the language code is in UN_languages_dict
                language = next((k for k, v in UN_languages_dict.items() if v.startswith(language_code.lower())), 'Unknown')

                # Prepare the direct link for the requested language
                if language == input_language:
                    output_links = [value]

                    # Define a regular expression pattern with capture groups
                    pattern = r"https://digitallibrary.un.org/record/(\d+)/files/([A-Z]+)_([A-Z]+)_([\d]+)_([\d]+)-(\w{2})\.pdf"

                    # Use re.search to find matches and capture groups
                    match = re.search(pattern, value)

                    if match:
                        # Extract capture group values
                        record_id = match.group(1)
                        symbol = match.group(2)          # A
                        doc_type = match.group(3)        # RES
                        unga = match.group(4)            # 61
                        resolution_id = match.group(5)   # 295
                        language_code = match.group(6)   # es

                        # Construct the output strings  # e.g. https://undocs.org/es/A/RES/61/295
                        output_links.append(f"https://undocs.org/{symbol}/{doc_type}/{unga}/{resolution_id}")
                        output_links.append(f"https://undocs.org/{language_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}")
                    else:
                        print("No match found for the input string.")

        # Output is a list of 3 links:
        # 1 is UN Library: https://digitallibrary.un.org/record/606782/files/A_RES_61_295-ZH.pdf
        # 2 is UN Docs multilingual shortlink: https://undocs.org/A/RES/61/295
        # 3 is UN Docs MONO-lingual shortlink: https://undocs.org/zh/A/RES/61/295
        return output_links

# Call the function to extract and print the option values
#print(find_lang_UNdoc("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "Russian"))
#print(get_language_code("fr"))
#print(find_lang_UNdoc("https://www.ohchr.org/en/documents/thematic-reports/ahrc3917-report-special-rapporteur-rights-indigenous-peoples", get_language_code("fr")))

def convert_Intl_Day(url, language_code):
    """
    Converts the language code in a un.org International Day (observance) URL to the specified language.

    Args:
        url (str): The UN URL.
        language_code (str): The target language code.

    Returns:
        str: The modified URL with the specified language code.
    """
    # Use regex to replace the language code in the URL
    if language_code.lower() == "ch":
        return re.sub(r'/([a-z]{2})/observances', '/zh/observances', url)
    else:
        return re.sub(r'/([a-z]{2})/observances', f'/{language_code}/observances', url)

# Example usage:
#url = "https://www.un.org/es/observances/cities-day"
#modified_url = convert_Intl_Day(url, "ch")
#print(modified_url)

def convert_URLendingBy_langEqualsCode(url, language_code):
    """
    Converts the language code in a URL with the pattern ?lang=[A-Z]{2} to the specified language.
    No URL validation.

    Args:
        url (str): The URL.
        language_code (str): The target language code.

    Returns:
        str: The modified URL with the specified language code.
    """
    if language_code.lower() == "ch":
        return re.sub(r'(\?lang=)[A-Z]{2}', r'\1ZH', url)
    else:
        # Use regex to replace the language code in the URL
        return re.sub(r'(\?lang=)[A-Z]{2}', fr'\1{language_code.upper()}', url)

# Example usage:
#url = "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES"
#modified_url = convert_URLendingBy_langEqualsCode(url, "ch")
#print(modified_url)

# Ultimate finder function

def localize_URL(mi_URL: str, lengua: str = "en") -> str:
    '''Apply all functions to try to find a language version of the input webpage
    in the provided language code.
    '''
    resulting_link = None

    def is_email(string):
        print(f"Validating if {string} is an email:")
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        return bool(email_pattern.match(string))

    # Check if the URL is not an email
    if is_email(mi_URL):
        print(f"{mi_URL} is an email")
        return None
    else:
        # try UN Docs
        # TODO find a way to scrape this search engine https://documents.un.org/prod/ods.nsf/home.xsp
        # or how to download the PDF, access the symbol tag and join the url to undocs.org/
        print("Trying find_lang_UNdoc for ", mi_URL)
        resulting_link = find_lang_UNdoc(mi_URL, get_language_code(lengua))
        if resulting_link:
            return resulting_link[-1]

        # International Days
        if "/observances/" in mi_URL and "un.org/" in mi_URL:
            print("Trying convert_Intl_Day")
            resulting_link = convert_Intl_Day(mi_URL, lengua)
            return resulting_link

        # WeDocs UNEP
        if "wedocs.unep.org" in mi_URL:
            print("Trying convert_WeDocs_href")
            short_weDocs_url = weDocs_short(mi_URL)
            resulting_link = convert_WeDocs_href(short_weDocs_url, get_language_code(lengua))
            return resulting_link

        # try UNEP articles
        if "unep.org" in mi_URL and "wedocs" not in mi_URL:
            print("Trying convert_UNEP_url")
            resulting_link = convert_UNEP_url(mi_URL, lengua)
            return resulting_link

        elif ".pdf" not in mi_URL:
            print("Trying convert_URL_anyWebsite")
            resulting_link = convert_URL_anyWebsite(mi_URL, lengua)
            print(resulting_link)
            if resulting_link is not None:
                return resulting_link
            else:
                return None

#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement", "fr"))
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/G16/015/38/PDF/G1601538.pdf?OpenElement", "fr"))
#print(localize_URL("https://undocs.org/FCCC/CP/2015/10/Add.1", "fr"))
#print(localize_URL("https://www.un.org/en/observances/environment-in-war-protection-day", "fr"))
#print(localize_URL(url5, "fr"))

def convert_docx_to_html(docx_file_path):
    """Converts a .docx file to an HTML string using pypandoc."""
    output = pypandoc.convert_file(docx_file_path, 'html')
    return output

def extract_href_attributes(html_content):
    """Returns the list of href values of all <a> tags in the given HTML string."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # creates a list
    href_values = [a['href'] for a in soup.find_all('a', href=True)]
    return href_values

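# Illustrative sketch (not in the original file): extract_href_attributes works on any HTML string,
# e.g.
#   extract_href_attributes('<p><a href="https://www.unep.org/">UNEP</a> and <a>no link</a></p>')
#   -> ['https://www.unep.org/']
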
def generate_table_URLs_from_Docx(docx_path, lang_code):
    """Extracts hyperlinks from a .docx file (via python-docx), localizes them and exports a CSV table."""
    # Open the document with python-docx (aliased as DocxDocument at the top of this file)
    document = DocxDocument(docx_path)

    # Extract hyperlinks
    # NOTE: run.hyperlink assumes a python-docx version that exposes hyperlinks on runs;
    # if your version does not, this loop will need to be adapted.
    input_urls = []
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            hyperlink = run.hyperlink
            if hyperlink is not None:
                input_urls.append(hyperlink.address)

    # Initialize lists to store data for the DataFrame
    index_list = []
    original_url_list = []
    localized_url_list = []

    # Apply localize_URL to each URL in the list
    for index, url in enumerate(input_urls):
        localized_url = localize_URL(url, lang_code)
        index_list.append(index)
        original_url_list.append(url)
        localized_url_list.append(localized_url)

    # Create a DataFrame
    df_docx = pd.DataFrame({
        'index': index_list,
        'url': original_url_list,
        'localized_url': localized_url_list
    })

    # Export the DataFrame to a CSV file
    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")

    # Display the DataFrame
    return df_docx

#language_code = "es"
#UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"

def extract_content_by_language(soup):
    """Extracts the translatable body content (div id='field_body') from a parsed UNEP page."""
    # Find the div with id="field_body"
    field_body_div = soup.find('div', id='field_body')

    if field_body_div:
        # Helper function to recursively clean div tags deeper than direct children
        def clean_div_tags(tag):
            for child in tag.children:
                if child.name == 'div':
                    clean_div_tags(child)
                else:
                    content.append(str(child))

        # Ignore secondary div tags and extract their children tags (except div tags)
        content = []
        for tag in field_body_div.find_all(recursive=False):
            if tag.name == 'div':
                # Clean div tags deeper than direct children
                clean_div_tags(tag)
            else:
                # Include children tags (except div tags)
                content.append(str(tag))

        return ''.join(content).strip()
    else:
        print("Div with id='field_body' not found in the HTML.")
        return None

# Filter video frames and images HTML tags

def transform_html_content(html_content):
    """Converts YouTube <iframe> embeds to <oembed> tags and flattens <figure> blocks into single <img> tags."""
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Transform iframe tags with "youtu" in the src attribute into oembed tags
    for iframe_tag in soup.find_all('iframe', src=lambda x: x and 'youtu' in x):
        src_attribute = iframe_tag['src']
        video_id = src_attribute.split('/')[-1]  # Extract the video ID from the src attribute
        oembed_tag = soup.new_tag('oembed')
        oembed_tag.string = f'https://www.youtube.com/watch?v={video_id}'
        iframe_tag.replace_with(oembed_tag)

    # Merge figure tags and their children into a single img tag
    for figure_tag in soup.find_all('figure'):
        img_tag = figure_tag.find('img')
        if img_tag:
            # Create a new img tag with merged attributes
            new_img_tag = soup.new_tag('img')
            new_img_tag.attrs = img_tag.attrs
            figcaption_tag = figure_tag.find('figcaption')
            if figcaption_tag:
                # Extract the content of the figcaption tag for the data-caption attribute
                new_img_tag['data-caption'] = str(figcaption_tag.contents[0])
            figure_tag.replace_with(new_img_tag)

    # Return the modified HTML content
    return soup

# Link Replacer for HTML

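# Illustrative sketch (not in the original file; VIDEOID and a.jpg are placeholders):
#   '<iframe src="https://www.youtube.com/embed/VIDEOID"></iframe>'
# comes back roughly as '<oembed>https://www.youtube.com/watch?v=VIDEOID</oembed>', and
#   '<figure><img src="a.jpg"/><figcaption>Cap</figcaption></figure>'
# comes back roughly as '<img src="a.jpg" data-caption="Cap"/>'.
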
def localize_UNEP_html(language_code, page_url):
    """
    Downloads a UNEP article, keeps only its translatable body, transforms media tags
    and localizes the href attributes of its <a> tags based on the given language code.

    Args:
        language_code (str): The language code used for URL localization.
        page_url (str): The URL of the UNEP article to download and process.

    Returns:
        str: The modified HTML content with localized href attributes, or None on failure.

    Example:
        language_code = "en"
        modified_html = localize_UNEP_html(language_code, "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#")
        print(modified_html)
    """
    # Access the URL
    print(f"Accessing the URL, type: {type(page_url)}")
    soup = get_HTML_generic(page_url)
    print(f"Accessing parsed HTML: {type(soup)}")
    if soup is None:
        # Could not download the page; let the caller handle the error
        return None

    # Filter only translatable content
    soup = extract_content_by_language(soup)
    print(f"Filtered HTML: {type(soup)}")
    if soup is None:
        return None

    # Transform images and embedded YouTube videos
    soup = transform_html_content(soup)
    print(f"Transformed IMG and IFRAME tags: {type(soup)}")

    # Find all <a> tags in the HTML content
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')

        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language_code)

        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url

    # Return the modified HTML content
    return str(soup)

# Code created by Nelson JAIMES-QUINTERO
# -------------------- ## -------------------- ## -------------------- #
# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #

# DOC-HTML
def docx2_bitable(docx_path: str, output_lang: str):
    """Takes an input doc/docx file and creates a CSV file with 3 columns:
    list number, URL found in the file, localized URL in the input language.
    """
    if not docx_path.lower().endswith((".doc", ".docx")):
        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
    input_docx_path = docx_path  # document

    # Name the output file based on the docx's name
    last_slash_index = input_docx_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = input_docx_path
        extracted_string = extracted_string.replace("#", "")

    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Convert DOCX to HTML
    html_content = convert_docx_to_html(input_docx_path)
    print("Doc converted into html successfully.")

    # Write HTML content to a file (optional)
    #with open(output_html_path, "w", encoding="utf-8") as html_file:
    #    html_file.write(html_content)
    #print("Conversion complete. HTML file saved at:", output_html_path)

    # Extract href attributes
    href_attributes = extract_href_attributes(html_content)
    #print("Extracted href attributes:", href_attributes)

    output_urls = [localize_URL(url, output_lang) for url in href_attributes]

    # Create a pandas DataFrame
    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})

    # Export the DataFrame to a CSV file
    if not df.empty:
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        df.to_csv(output_csv_path, index=False, encoding="utf-8")

    # Display the DataFrame
    return df

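# Illustrative usage (not in the original file; the path is hypothetical and assumes a file
# uploaded to the /content folder):
#   df_links = docx2_bitable("/content/my_report.docx", "fr")
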
# From PDF file -------------------- ##
# NEEDS FITZ (PyMuPDF)
def pdf2_bitable(pdf_path: str, output_lang: str):
    """Extracts the hyperlinks of a PDF file, localizes them and exports a CSV table."""
    if not pdf_path.lower().endswith("pdf"):
        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
        return None
    # Create a document object
    doc = fitz.open(pdf_path)  # or fitz.Document(filename)

    # Collect rows for the DataFrame
    data = []

    # Get the links on all pages
    for i in range(doc.page_count):
        page = doc.load_page(i)
        links = page.get_links()
        if links:
            for item in links:
                input_url = item.get('uri')
                if input_url is not None:
                    localized_url = localize_URL(input_url, output_lang)
                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})

    # Create a pandas DataFrame
    df_pdf = pd.DataFrame(data)

    # Name the file based on the pdf's name
    last_slash_index = pdf_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = pdf_path
        extracted_string = extracted_string.replace("#", "")

    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    if not df_pdf.empty:
        # Export the DataFrame to a CSV file
        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        return df_pdf
    else:
        print("ERROR: No hyperlinks were found in the PDF.")
        return None

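# Illustrative usage (not in the original file; hypothetical path):
#   df_pdf_links = pdf2_bitable("/content/my_report.pdf", "es")
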
# DOCX REPLACER -------------------- ##

# Replace links in a Docx with Spire.Doc

def docx2docx_replacer(my_chemin_docx: str, my_langue):
    """Loads a Word file, replaces its hyperlinks with localized versions and saves a new .docx."""
    # Create a Document object (Spire.Doc)
    doc = Document()

    # Load a Word file
    doc.LoadFromFile(my_chemin_docx)

    # Find all hyperlinks in the document
    hyperlinks = []
    for i in range(doc.Sections.Count):
        section = doc.Sections.get_Item(i)
        for j in range(section.Body.ChildObjects.Count):
            sec = section.Body.ChildObjects.get_Item(j)
            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
                for k in range((sec if isinstance(sec, Paragraph) else None).ChildObjects.Count):
                    para = (sec if isinstance(sec, Paragraph) else None).ChildObjects.get_Item(k)
                    if para.DocumentObjectType == DocumentObjectType.Field:
                        field = para if isinstance(para, Field) else None
                        if field.Type == FieldType.FieldHyperlink:
                            hyperlinks.append(field)

    # Iterate through hyperlinks and update them
    for hyperlink in hyperlinks:
        # Get the current display text and URL
        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
        if match:
            current_url = match.group(1)

        current_display_text = hyperlink.FieldText
        localized_url = localize_URL(current_url, my_langue)
        if localized_url:
            # Update the URL of the hyperlink (the display text is kept)
            #hyperlink.FieldText = "NEW DISPLAY TEXT"  # Replace with your new display text
            hyperlink.Code = f'HYPERLINK "{localized_url}"'

    if len(hyperlinks) > 0:
        # Naming the output file
        last_slash_index = my_chemin_docx.rfind('/')
        if last_slash_index != -1:
            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
            extracted_string = extracted_string.replace("#", "")
        else:
            #print("No '/' found in the URL.")
            extracted_string = my_chemin_docx
            extracted_string = extracted_string.replace("#", "")

        output_directory = '/content'
        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"

        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Save the document to a docx file
        print("\n\nSaving the output file:")
        doc.SaveToFile(output_path, FileFormat.Docx)
        print(f"Output file saved successfully in your content folder as:\n\t{output_path}")
        doc.Close()
    else:
        print(f"ERROR on processing the file: {my_chemin_docx}")

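# Illustrative usage (not in the original file; hypothetical path):
#   docx2docx_replacer("/content/my_report.docx", "ru")
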
# 6. HTML downloader and link replacer -------------------- ##

def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
    """Takes an input link from the UNEP website. It downloads the webpage's
    translatable content, replaces its links with the localized versions and
    exports a .txt file with the HTML tags, ready to be used in any CAT tool
    for human translation.
    """
    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)

    if not modified_html:
        print("ERROR: The input URL might not be accessible, or is not a URL.")
        raise ValueError("The input URL might not be accessible, or is not a URL.")

    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")

    # Name the file based on the webpage's name
    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
        extracted_string = extracted_string.replace("#", "")

    # Save the modified HTML content to a .txt file in the current folder
    with open(extracted_string, 'w', encoding='utf-8') as file:
        file.write(modified_html)
        print(f"File {extracted_string} exported successfully")

    # Force download in Google Colab (ignored elsewhere)
    try:
        from google.colab import files
        files.download(extracted_string)
    except ImportError:
        pass

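# Illustrative usage (not in the original file; URL taken from the commented UNEP_URL_DOWNREPLACE
# example above):
#   link2_html_converter("https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#", "es")
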
# Install necessary libraries
#!pip install gradio

import gradio as gr

# Define the function wired to the Gradio interface
def render_html(htmltext, language):
    """Replaces the href of every <a> tag in the pasted HTML with its localized version."""
    soup = BeautifulSoup(htmltext, 'html.parser')
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')

        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language)

        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url

    # Return the modified HTML content
    output = str(soup)
    return output

# Create the Gradio interface
with gr.Blocks() as demo:

    html_input = gr.Textbox(label="Enter HTML Code", lines=10, placeholder="Paste your HTML code here. You can convert a Word file's content into HTML by using html-cleaner.com")
    language_dropdown = gr.Dropdown(label="Select Language", choices=['es', 'fr', 'sw', 'en', 'zh-hans', 'pt-br', 'ru', 'ar'], value='es')
    html_output = gr.HTML(label="Rendered HTML")
    run_button = gr.Button("Find links!")

    run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

# Launch the Gradio app with debug=True and share=True
demo.launch(debug=True, share=True)