import gradio as gr
import re
import requests
import sys
import os
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse, urljoin
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import email.utils
import pandas as pd

disable_warnings(InsecureRequestWarning)
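
# The scrapers below all attach the same browser-like headers to get past
# CloudFlare (per https://stackoverflow.com/a/74674276), each rebuilding the
# request by hand. The helper below is a minimal sketch of how that logic
# could be centralized; the name `fetch_html` is an assumption and the
# functions that follow still build their requests inline.
def fetch_html(url: str) -> str:
    """Fetch `url` with browser-like headers and return the decoded HTML."""
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    return urllib.request.urlopen(req).read().decode('utf-8')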
def get_language_code(query):
    """
    Search for a value given a key, or for a key given a value, in the
    module-level language_dict.
    Args:
        query (str): The key or value to search for.
    Returns:
        str: The corresponding value or key, or None if not found.
    """
    for key, value in language_dict.items():
        if query.lower() == key.lower():
            return value
        elif query.lower() == value.lower():
            return key
    return None
# Language dictionary used by get_language_code
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar"
}

# Example usage:
#result_key = get_language_code("Spanish")
#result_value = get_language_code("fr")
#print(result_key)    # Output: "es"
#print(result_value)  # Output: "French"
#print(type(result_value))
# Extract the node number from a UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """find_UNEP_node accesses the input URL, finds the language version
    of the webpage and returns the URL's node, which is common to all UNEP languages.
    Args:
        unep_full_link (str): Full web URL on the UNEP website.
    Returns:
        str: The URL's node ID.
    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Send a URL request with headers to bypass CloudFlare, as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue processing the response
        print("Scraping successful")
        r = response.read().decode('utf-8')  # reuse the open response instead of fetching twice
        if r:
            # Parse the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            # Find the <ul> element with class 'links'
            ul_element = soup.find('ul', class_='links')
            # Find the <li> element whose class ends with 'is-active'
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
            # Extract the value of the 'data-drupal-link-system-path' attribute
            attribute_value = li_element.get('data-drupal-link-system-path')
            return attribute_value.split('node/')[1]

# test
#print(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'))
#print(type(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')))
# Main function: finds the language version of a web article on the UNEP website.
def convert_UNEP_url(unep_full_link: str, target_lang: str = 'en') -> str:
    """convert_UNEP_url accesses the input URL, finds the URL of the translated
    version of the webpage in the requested language, and returns that URL.
    Args:
        unep_full_link (str): Full web URL on the UNEP website.
        target_lang (str): Target language, default = 'en'.
    Returns:
        str: New converted URL.
    Examples:
        >>> convert_UNEP_url('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts', 'es')
        'https://www.unep.org/es/noticias-y-reportajes/reportajes/los-pueblos-indigenas-recurren-los-tribunales-ante-la-crisis'
    """
    # Send a URL request with headers to bypass CloudFlare, as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue processing the response
        print("Scraping successful")
        r = response.read().decode('utf-8')  # reuse the open response instead of fetching twice
        if r:
            # Parse the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            # Look for the link in the target language, whose class is "language-link"
            lenguas = soup.find("a", class_="language-link", hreflang=target_lang)
            #print(lenguas)
            if lenguas:
                #print(f"https://www.unep.org{lenguas['href']}")
                if lenguas['href'].endswith('/node'):
                    return f"https://www.unep.org{lenguas['href'][0:-5]}"
                return f"https://www.unep.org{lenguas['href']}"
            else:
                # No language link found: fall back to the Drupal node ID
                # Find the <ul> element with class 'links'
                ul_element = soup.find('ul', class_='links')
                if ul_element:
                    # Find the <li> element whose class ends with 'is-active'
                    li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
                    # Extract the value of the 'data-drupal-link-system-path' attribute
                    node_value = li_element.get('data-drupal-link-system-path')
                    return find_from_nodeLink(int(node_value.split("/")[1]), target_lang)
                    #return f"https://www.unep.org/{node_value}"
                else:
                    raise ValueError("Error: webpage accessed but the tag <a class='language-link'> was not found, probably because the website was blocked by a firewall/CloudFlare")
        else:
            print("\n<-- Error code. The programme could not access the webpage, forbidden")
            return None

# test
#input = input("Enter your UNEP url:")
#input = 'https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'
#input = "https://www.unep.org/ru"
#print(convert_UNEP_url(input, 'es'))
#print(convert_UNEP_url(input, 'fr'))
UNEP_LANG_CODES = ['ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'ch', 'zh', 'zh-hans', 'en']

def find_from_nodeLink(node_input, target_lang='empty'):
    """Rewrites a node link to point at the corresponding language.
    Args:
        node_input (str, int): Either a web URL containing the word 'node' and its ID,
            or an integer ID (or a string representation of an integer).
        target_lang (str): Target language, default = 'empty' (treated as 'en').
    Returns:
        str: New converted URL.
    Examples:
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'empty')
        'https://www.unep.org/en/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans')
        'https://www.unep.org/zh-hans/node/30010'
        >>> find_from_nodeLink(30010, 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('30010', 'fr')
        'https://www.unep.org/fr/node/30010'
    """
    if isinstance(node_input, str) and node_input.isdigit():
        node_input = int(node_input)
    if isinstance(node_input, int):
        node_url = f'https://www.unep.org/{target_lang}/node/{node_input}'
    elif isinstance(node_input, str):
        node_url = node_input
    else:
        raise ValueError("Error: Provide either a string URL or an integer ID (or a string representation of an integer)")
    pattern = r"https://www\.unep\.org/[a-z]*-?[a-z]*/?node/(\d+)"
    # Normalize language codes
    if target_lang == "empty":
        target_lang = "en"
    if target_lang in ["ch", 'zh', 'cn']:
        target_lang = "zh-hans"
    if target_lang in ['pt', 'pt-pt']:
        target_lang = "pt-br"
    if target_lang in UNEP_LANG_CODES:
        if re.findall(pattern, node_url):
            # Replace the language part of the URL
            new_url = re.sub(pattern, r"https://www.unep.org/{}/node/\1".format(target_lang), node_url)
            return new_url
        else:
            raise ValueError("Error: URL not found, or website blocked by firewall/CloudFlare")
    else:
        raise ValueError("Error: Provide a language code among these: 'ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'zh-hans', 'en' or leave empty")
# Generic scraper
def get_HTML_generic(any_url: str) -> BeautifulSoup:
    """Generic website fetcher: accesses the URL and returns the parsed HTML.
    Args:
        any_url (str): Any web URL.
    Returns:
        BeautifulSoup: The HTML parsed with BeautifulSoup, or None on failure.
    """
    req = urllib.request.Request(any_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason} when accessing {any_url}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason} when accessing {any_url}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e} when accessing {any_url}")
        return None
    else:
        # If no exception occurred, continue processing the response
        print("Scraping successful")
        r = response.read().decode('utf-8')  # reuse the open response instead of fetching twice
        if r:
            # Parse the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup
        return None
# Example usage with an integer ID provided as a string
#print(find_from_nodeLink('30010', 'fr'))
#print(find_from_nodeLink(30010, 'fr'))
#print(find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans'))
def try_lang_switcher(switcher_soup, lang_code: str, base_url) -> str:
    # Find the <ul> element whose class starts with "language-switcher"
    #language_switcher_ul = switcher_soup.find('ul', class_='language-switcher')
    language_switcher_ul = switcher_soup.find('ul', class_=lambda value: value and value.startswith('language-switcher'))
    # Extract href values from the <a> elements within the <ul>
    if language_switcher_ul:
        href_values = [a['href'] for a in language_switcher_ul.find_all('a')]
        for href in href_values:
            if lang_code in href:
                return urljoin(base_url, href)
    return None
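
# A minimal, self-contained check of the switcher lookup above; the HTML
# fragment and URLs are hypothetical, purely to illustrate the behaviour.
_demo_switcher = BeautifulSoup(
    '<ul class="language-switcher-links"><li><a href="/fr/page">FR</a></li></ul>',
    'html.parser')
assert try_lang_switcher(_demo_switcher, 'fr', 'https://example.org/page') == 'https://example.org/fr/page'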
# Function to concatenate absolute path segments when a URL cannot be accessed
def concatenate_missing_segments(arg1, arg2):
    """
    Concatenates the URL segments missing from arg2 using arg1.
    Args:
        arg1 (str): The URL containing the missing segments (the longer URL), e.g.
            "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
        arg2 (str): The target URL (the shorter URL), e.g.
            "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"
    Returns:
        str: The concatenated URL, e.g.
            "https://www.unep.org/interactive/explore-ecosystems/mountains/ar/index.php#/mountain-intro"
    """
    if len(arg1) > len(arg2):
        missing_segment = arg1[len(arg2):]
        return arg2 + missing_segment
    return None  # nothing to append

# Example usage:
#arg1 = "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
#arg2 = "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"
#result = concatenate_missing_segments(arg1, arg2)
#print(result)
def convert_URL_anyWebsite(any_web_url: str, lang_code) -> str:
    # Access the URL to get the HTML as a BeautifulSoup object
    sauce_html = get_HTML_generic(any_web_url)
    print(type(sauce_html))
    if sauce_html:
        # Search the language-switcher HTML tag for the language code
        switcher_link = try_lang_switcher(sauce_html, lang_code.lower(), any_web_url)
        if switcher_link and get_HTML_generic(switcher_link):
            return switcher_link
        elif switcher_link:
            return concatenate_missing_segments(any_web_url, switcher_link)
        # Works for WHO news
        matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code)
        if matching_tags:
            print("trying WHO")
            print(matching_tags)
            return matching_tags[0]['value']
        matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code)
        if matching_tags:
            print("trying hreflang")
            return matching_tags[0]['href']
        # Works for UNESCO
        print("trying language_link")
        lang_tag = sauce_html.find("a", class_="language-link", hreflang=lang_code)
        #print(lang_tag)
        if lang_tag is not None:
            return urljoin(any_web_url, lang_tag['href'])
    return None

#output_li = convert_URL_anyWebsite("[email protected]", "es")
#print(output_li)
def weDocs_short(weDocs_url) -> str:
    """Replaces a language-specific WeDocs link with its landing page.
    Args:
        weDocs_url (str): Web URL from wedocs.unep.org.
    Returns:
        str: Landing page of the document, which is not language-specific.
    Example:
        >>> weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
        'https://wedocs.unep.org/handle/20.500.11822/43104'
    """
    return re.sub(r"https://wedocs.unep.org/(bitstream/)?handle/([\w.-]+/\d+).+", r"https://wedocs.unep.org/handle/\2", weDocs_url)
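
# Quick sanity check mirroring the docstring example above; pure string
# manipulation, no network access.
assert weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y') == 'https://wedocs.unep.org/handle/20.500.11822/43104'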
# WeDocs link converter: accesses a short WeDocs link and returns a language-specific URL (PDF)
def convert_WeDocs_href(url: str, target_lang: str = 'English') -> str:
    """WeDocs link converter: accesses a short WeDocs link
    and returns a language-specific URL (PDF).
    Args:
        url (str): Web URL from wedocs.unep.org.
        target_lang (str): Language name of the document to find.
    Returns:
        str: Download link of the PDF in the requested language.
    Example:
        >>> convert_WeDocs_href('https://wedocs.unep.org/handle/20.500.11822/43104', 'Chinese')
        'https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/PracticalGuide_ZH.pdf?sequence=5&isAllowed=y'
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, verify=False)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            pattern = re.compile(r".*{}.*".format(re.escape(target_lang.capitalize())))  # TODO: normalize against the dictionary keys, in case the user enters RU instead of Russian
            soup = BeautifulSoup(response.text, 'html.parser')
            #print(soup.prettify())
            # Find the <a> tag whose text contains the entered language name
            lang_string = soup.find(string=pattern)
            lang_link = lang_string.parent.get('href') if lang_string else None
            #print(lang_link)
            if lang_link:
                # Merge the domain and the PDF path to build the complete link
                clean_link = "https://wedocs.unep.org" + lang_link
                return clean_link
            else:
                return f"No link with '{target_lang}' text found."
        else:
            return "Failed to retrieve the URL."
    except Exception as e:
        return str(e)

#spanish_href = convert_WeDocs_href(url, "Spanish")
#portuguese_href = convert_WeDocs_href(url, "Portuguese")
#ch_href = convert_WeDocs_href(url, "Chinese")
#print(spanish_href)
#print(portuguese_href)
#print(ch_href)
def access_un_library_by_id(user_input_id):
    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"
        # Construct the URL with the user-provided ID
        url = f"{base_url}ln=fr&p={user_input_id}&f=&c=Resource%20Type&c=UN%20Bodies&sf=&so=d&rg=50&fti=0"
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the <div> with class="result-title"
            result_title_div = soup.find('div', class_='result-title')
            if result_title_div:
                # Find the first <a> tag within the result-title div and get its href value
                result_title_a = result_title_div.find('a', href=True)
                if result_title_a:
                    href_value = result_title_a['href']
                    return f"https://digitallibrary.un.org{href_value}"
                else:
                    print("No <a> tag found inside result-title.")
            else:
                print("No result-title div found in the HTML.")
            return None
        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Get user input for the ID
#user_input_id = input("Enter the ID: ")
# Call the function with user input
#resultado = access_un_library_by_id(user_input_id)
#print(resultado)
# Send a URL request with headers to bypass CloudFlare, as suggested by https://stackoverflow.com/a/74674276
def access_un_library_byResourceURL(landing_url: str) -> BeautifulSoup:
    req = urllib.request.Request(landing_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue processing the response
        print("Scraping successful")
        r = response.read().decode('utf-8')  # reuse the open response instead of fetching twice
        if r:
            # Parse the HTML into a BeautifulSoup object
            return BeautifulSoup(r, 'html.parser')
        else:
            # HTML error
            raise ValueError("Error in parsing the website content as HTML")
def extract_info_UNdocLink(url, lang2_code):
    """
    Rewrites a given UNDocs URL to the requested language.
    Args:
        url (str): The UNDocs URL, e.g. https://undocs.org/en/UNEP/EA.5/28/Corr.1
        lang2_code (str): Two-letter target language code.
    Returns:
        str: The rewritten URL, or None if the URL does not match the pattern.
    """
    # Regex pattern matching the components of the URL
    pattern = r'https://undocs\.org/([a-z]{2})?/?([A-Z]+)/(.*?)/(\d+)/(.*?)$'
    # Use the regex to find the components in the URL
    match = re.match(pattern, url)
    if match:
        # match.group(1) is the optional language code of the input URL, replaced below
        symbol = match.group(2)
        doc_type = match.group(3)
        unga = match.group(4)
        resolution_id = match.group(5)
        return f"https://undocs.org/{lang2_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}"
    else:
        return None

# Example usage:
#url = "https://undocs.org/en/UNEP/EA.5/28/Corr.1"
#result = extract_info_UNdocLink(url, "fr")
#print(result)
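
# Sanity check mirroring the commented example above; pure string
# manipulation, no network access.
assert extract_info_UNdocLink("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "fr") == "https://undocs.org/fr/UNEP/EA.5/28/Corr.1"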
# NOTE: the module-level language_dict defined near the top of the file is
# reused here; redefining it with only the six UN languages would break the
# 'sw', 'pt-br' and 'zh-hans' dropdown options.
#input_language = "Russian"
# 1.7 UN Docs
def get_jobID_undocs(url):
    """
    Extracts the job ID from a given URL of the ny.un.org website.
    Args:
        url (str): The URL of the document on ny.un.org.
    Returns:
        str: The extracted job ID, or None if not found.
    """
    # Regex pattern matching the job ID in the URL
    pattern = r'dds-ny.*/([A-Za-z0-9]+)\.pdf'
    # Use the regex to find the job ID in the URL
    match = re.search(pattern, url)
    # Return the matched job ID or None if not found
    return match.group(1) if match else None
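
# Quick sanity check for the regex above. The URL string is taken from the
# commented localize_URL tests at the end of this file; no network access.
assert get_jobID_undocs("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement") == "N0651207"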
# Extract the `value` attribute of <option> tags matching the language-suffix regex
def find_lang_UNdoc(un_docs_link, input_language):
    un_library_url = un_docs_link
    # Language dictionary for the six official UN languages
    UN_languages_dict = {
        "Spanish": "es",
        "French": "fr",
        "English": "en",
        "Chinese": "ch",
        "Russian": "ru",
        "Arabic": "ar"
    }
    if "undocs.org" in un_docs_link:
        #return extract_info_UNdocLink(un_docs_link, UN_languages_dict[input_language])
        return extract_info_UNdocLink(un_docs_link, input_language)
    elif "dds-ny" in un_docs_link:
        # Extract the job ID from the dds-ny link
        un_library_url_ID = get_jobID_undocs(un_docs_link)
        print(un_library_url_ID)
        # Get the UN Library URL from the ID
        un_library_url = access_un_library_by_id(un_library_url_ID)
        print(un_library_url)
    elif "digitallibrary.un.org" in un_docs_link:
        un_library_url = un_docs_link
    try:
        # Get the HTML from the UN Library URL
        my_soup = access_un_library_byResourceURL(un_library_url)
        if my_soup is None:
            return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # Regex pattern matching the language suffix of the PDF name
        regex_pattern = r"-(\w{2})\.pdf"
        # Find all <option> tags whose value matches the pattern
        options = my_soup.find_all('option', value=re.compile(regex_pattern))
        output_links = []  # initialized so the return below is safe when no option matches
        # Extract the `value` attribute of each option
        for option in options:
            value = option['value']
            match = re.search(regex_pattern, value)
            if match:
                language_code = match.group(1)
                # Map the two-letter code back to a language name from the dictionary
                language = next((k for k, v in UN_languages_dict.items() if v.startswith(language_code.lower())), 'Unknown')
                #print(f"Option Value: {value}, Language Code: {language_code}, Language: {language}")
                # Prepare the direct link for the requested language
                if language == input_language:
                    output_links = [value]
                    # Regular expression with capture groups for the PDF link components
                    pattern = r"https://digitallibrary.un.org/record/(\d+)/files/([A-Z]+)_([A-Z]+)_([\d]+)_([\d]+)-(\w{2})\.pdf"
                    # Use re.search to find matches and capture groups
                    match = re.search(pattern, value)
                    if match:
                        # Extract the capture group values
                        record_id = match.group(1)
                        symbol = match.group(2)         # A
                        doc_type = match.group(3)       # RES
                        unga = match.group(4)           # 61
                        resolution_id = match.group(5)  # 295
                        language_code = match.group(6)  # es
                        # Construct the output links, e.g. https://undocs.org/es/A/RES/61/295
                        output_links.append(f"https://undocs.org/{symbol}/{doc_type}/{unga}/{resolution_id}")
                        output_links.append(f"https://undocs.org/{language_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}")
                    else:
                        print("No match found for the input string.")
        # Output is a list of 3 links:
        # 1 is UN Library: https://digitallibrary.un.org/record/606782/files/A_RES_61_295-ZH.pdf
        # 2 is UN Docs multilingual shortlink: https://undocs.org/A/RES/61/295
        # 3 is UN Docs MONO-lingual shortlink: https://undocs.org/zh/A/RES/61/295
        return output_links

# Call the function to extract and print the option values
#print(find_lang_UNdoc("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "Russian"))
#print(get_language_code("fr"))
#print(find_lang_UNdoc("https://www.ohchr.org/en/documents/thematic-reports/ahrc3917-report-special-rapporteur-rights-indigenous-peoples", get_language_code("fr")))
def convert_Intl_Day(url, language_code):
    """
    Converts the language code in a UN URL to the specified language.
    Args:
        url (str): The UN URL.
        language_code (str): The target language code.
    Returns:
        str: The modified URL with the specified language code.
    """
    # Use regex to replace the language code in the URL ("ch" is normalized to "zh")
    if language_code.lower() == "ch":
        return re.sub(r'/([a-z]{2})/observances', '/zh/observances', url)
    else:
        return re.sub(r'/([a-z]{2})/observances', f'/{language_code}/observances', url)

# Example usage:
#url = "https://www.un.org/es/observances/cities-day"
#modified_url = convert_Intl_Day(url, "ch")
#print(modified_url)
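
# Sanity check mirroring the commented example above; pure string
# manipulation, no network access.
assert convert_Intl_Day("https://www.un.org/es/observances/cities-day", "ch") == "https://www.un.org/zh/observances/cities-day"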
def convert_URLendingBy_langEqualsCode(url, language_code):
    """
    Converts the language code in a URL with the pattern ?lang=[A-Z]{2} to the
    specified language. No URL validation.
    Args:
        url (str): The URL.
        language_code (str): The target language code.
    Returns:
        str: The modified URL with the specified language code.
    """
    if language_code.lower() == "ch":
        return re.sub(r'(\?lang=)[A-Z]{2}', r'\1ZH', url)
    else:
        # Use regex to replace the language code in the URL
        return re.sub(r'(\?lang=)[A-Z]{2}', fr'\1{language_code.upper()}', url)

# Example usage:
#url = "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES"
#modified_url = convert_URLendingBy_langEqualsCode(url, "ch")
#print(modified_url)
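
# Sanity check mirroring the commented example above; pure string
# manipulation, no network access.
assert convert_URLendingBy_langEqualsCode("https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES", "ch") == "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ZH"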
# Ultimate finder function
def localize_URL(mi_URL: str, lengua: str = "en") -> str:
    '''Apply all the functions above to try to find a language version of the
    input webpage in the provided language code.
    '''
    resulting_link = None

    def is_email(string):
        print(f"Validating if {string} is an email:")
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        return bool(email_pattern.match(string))

    # Check that the URL is not an email
    if is_email(mi_URL):
        print(f"{mi_URL} is an email")
        return None
    else:
        # Try UN Docs
        #TODO find a way to scrape this search engine https://documents.un.org/prod/ods.nsf/home.xsp
        # or how to download the PDF, access the symbol tag and join the url to undocs.org/
        print("Trying find_lang_UNdoc for ", mi_URL)
        resulting_link = find_lang_UNdoc(mi_URL, get_language_code(lengua))
        if resulting_link:
            return resulting_link[-1]
        # International Days
        if "/observances/" in mi_URL and "un.org/" in mi_URL:
            print("Trying convert_Intl_Day")
            return convert_Intl_Day(mi_URL, lengua)
        # WeDocs UNEP
        if "wedocs.unep.org" in mi_URL:
            print("Trying convert_WeDocs_href")
            short_weDocs_url = weDocs_short(mi_URL)
            return convert_WeDocs_href(short_weDocs_url, get_language_code(lengua))
        # Try UNEP articles
        if "unep.org" in mi_URL and "wedocs" not in mi_URL:
            print("Trying convert_UNEP_url")
            return convert_UNEP_url(mi_URL, lengua)
        elif ".pdf" not in mi_URL:
            print("Trying convert_URL_anyWebsite")
            resulting_link = convert_URL_anyWebsite(mi_URL, lengua)
            print(resulting_link)
            return resulting_link

#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement", "fr"))
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/G16/015/38/PDF/G1601538.pdf?OpenElement", "fr"))
#print(localize_URL("https://undocs.org/FCCC/CP/2015/10/Add.1", "fr"))
#print(localize_URL("https://www.un.org/en/observances/environment-in-war-protection-day", "fr"))
#print(localize_URL(url5, "fr"))
def extract_href_attributes(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Build a list of the href values of all <a> tags
    href_values = [a['href'] for a in soup.find_all('a', href=True)]
    return href_values

#language_code = "es"
#UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"
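
# A minimal, self-contained example of the helper above; the HTML snippet is
# hypothetical and illustrates that anchors without an href are skipped.
assert extract_href_attributes('<p><a href="/es/node/1">one</a><a>no href</a></p>') == ['/es/node/1']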
def extract_content_by_language(soup):
    # Find the div with id="field_body"
    field_body_div = soup.find('div', id='field_body')
    if field_body_div:
        # Helper function to recursively unwrap div tags deeper than the direct children
        def clean_div_tags(tag):
            for child in tag.children:
                if child.name == 'div':
                    clean_div_tags(child)
                else:
                    content.append(str(child))
        # Ignore wrapper div tags and keep their children (except nested div tags)
        content = []
        for tag in field_body_div.find_all(recursive=False):
            if tag.name == 'div':
                # Unwrap div tags deeper than the direct children
                clean_div_tags(tag)
            else:
                # Keep non-div children as-is
                content.append(str(tag))
        return ''.join(content).strip()
    else:
        print("Div with id='field_body' not found in the HTML.")
        return None
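
# A hedged sketch of the flattening above, on a hypothetical snippet:
# wrapper <div> tags inside field_body are stripped while their children are kept.
_demo_body = BeautifulSoup(
    '<div id="field_body"><p>Intro</p><div><p>Nested</p></div></div>', 'html.parser')
assert extract_content_by_language(_demo_body) == '<p>Intro</p><p>Nested</p>'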
# Filter video iframes and image HTML tags
def transform_html_content(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Transform iframe tags with "youtu" in the src attribute into oembed tags
    for iframe_tag in soup.find_all('iframe', src=lambda x: x and 'youtu' in x):
        src_attribute = iframe_tag['src']
        video_id = src_attribute.split('/')[-1]  # Extract the video ID from the src attribute
        oembed_tag = soup.new_tag('oembed')
        oembed_tag.string = f'https://www.youtube.com/watch?v={video_id}'
        iframe_tag.replace_with(oembed_tag)
    # Merge figure tags and their children into a single img tag
    for figure_tag in soup.find_all('figure'):
        img_tag = figure_tag.find('img')
        if img_tag:
            # Create a new img tag with merged attributes
            new_img_tag = soup.new_tag('img')
            new_img_tag.attrs = img_tag.attrs
            figcaption_tag = figure_tag.find('figcaption')
            if figcaption_tag:
                # Use the content of the figcaption tag as the data-caption attribute
                new_img_tag['data-caption'] = str(figcaption_tag.contents[0])
            figure_tag.replace_with(new_img_tag)
    # Return the modified HTML content
    return soup
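
# Illustrative check of the iframe-to-oembed rewrite above; the video ID is
# hypothetical and no network access is involved.
_demo_embed = transform_html_content('<iframe src="https://www.youtube.com/embed/abc123"></iframe>')
assert str(_demo_embed) == '<oembed>https://www.youtube.com/watch?v=abc123</oembed>'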
# Link replacer for HTML
def localize_UNEP_html(language_code, page_url):
    """
    Fetches a UNEP page and localizes the href attributes of its <a> tags
    based on the given language code.
    Args:
        language_code (str): The language code used for URL localization.
        page_url (str): The URL of the page to fetch and localize.
    Returns:
        str: The modified HTML content with localized href attributes.
    Example:
        language_code = "en"
        modified_html = localize_UNEP_html(language_code, page_url)
        print(modified_html)
    """
    # Access the URL
    print(f"Accessing the URL, type: {type(page_url)}")
    soup = get_HTML_generic(page_url)
    print(f"Accessing parsed HTML: {type(soup)}")
    # Filter only the translatable content
    soup = extract_content_by_language(soup)
    print(f"Filtered HTML: {type(soup)}")
    if soup is None:
        return None
    # Transform images and embedded YouTube videos
    soup = transform_html_content(soup)
    print(f"Transformed IMG and IFRAME tags: {type(soup)}")
    # Find all <a> tags in the HTML content
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')
        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language_code)
        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url
    # Return the modified HTML content
    return str(soup)
#Code created by Nelson JAIMES-QUINTERO
# Define the custom function driving the Gradio app
def render_html(htmltext, language):
    soup = BeautifulSoup(htmltext, 'html.parser')
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')
        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language)
        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url
    # Return the modified HTML content
    output = str(soup)
    return output
# Create the Gradio interface
with gr.Blocks() as demo:
    html_input = gr.Textbox(label="Enter HTML Code", lines=10, placeholder="Paste your HTML code here. You can convert a Word file's content into HTML by using html-cleaner.com")
    language_dropdown = gr.Dropdown(label="Select Language", choices=['es', 'fr', 'sw', 'en', 'zh-hans', 'pt-br', 'ru', 'ar'], value='es')
    html_output = gr.HTML(label="Rendered HTML")
    run_button = gr.Button("Find links!")
    run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

# Launch the Gradio app
demo.launch()