import re
import requests
import sys
import os
import urllib
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse, urljoin
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import email.utils
import pandas as pd
import pypandoc
import fitz
from docx import Document
from spire.doc import *
from spire.doc.common import *

disable_warnings(InsecureRequestWarning)
def get_language_code(query):
    """
    Search for a value given a key, or search for a key given a value, in language_dict.
    Args:
        query (str): The key or value to search for.
    Returns:
        str: The corresponding value or key.
    """
    for key, value in language_dict.items():
        if query.lower() == key.lower():
            return value
        elif query.lower() == value.lower():
            return key
    return None
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar"
}

# Example usage:
#result_key = get_language_code("Spanish")
#result_value = get_language_code("fr")
#print(result_key)  # Output: "es"
#print(result_value)  # Output: "French"
#print(type(result_value))
# Extract the node number from a UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """find_UNEP_node accesses the input URL, finds the active language version
    of the webpage, and returns the URL's node ID, which is common to all UNEP language versions.
    Args:
        unep_full_link (str): String of a full web URL on the UNEP website.
    Returns:
        str: The URL's node ID
    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        # You can raise a custom exception or handle the error in any other way
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        # Handle other URL-related errors
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Handle other unexpected errors
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")
        r = response.read().decode('utf-8')
        if r:
            # Convert the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            # Find the <ul> element with class 'links'
            ul_element = soup.find('ul', class_='links')
            # Find the <li> element whose class ends with 'is-active'
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
            # Extract the value of the 'data-drupal-link-system-path' attribute
            attribute_value = li_element.get('data-drupal-link-system-path')
            return attribute_value.split('node/')[1]

# test
#print(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'))
#print(type(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')))
# Main function: finds the language version of a web article on the UNEP website.
def convert_UNEP_url(unep_full_link: str, target_lang: str = 'en') -> str:
    """convert_UNEP_url accesses the input URL, finds the URL of the translated version
    of the webpage in the requested language, and returns that URL.
    Args:
        unep_full_link (str): String of a full web URL on the UNEP website.
        target_lang (str): Target language, default = 'en'.
    Returns:
        str: New converted URL
    Examples:
        >>> convert_UNEP_url('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts', 'es')
        'https://www.unep.org/es/noticias-y-reportajes/reportajes/los-pueblos-indigenas-recurren-los-tribunales-ante-la-crisis'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        # You can raise a custom exception or handle the error in any other way
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        # Handle other URL-related errors
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Handle other unexpected errors
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")
        r = response.read().decode('utf-8')
        if r:
            # Convert the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            # Look for the link in the target language, whose class is "language-link"
            lenguas = soup.find("a", class_="language-link", hreflang=target_lang)
            #print(lenguas)
            if lenguas:
                #print(f"https://www.unep.org{lenguas['href']}")
                if lenguas['href'].endswith('/node'):
                    return f"https://www.unep.org{lenguas['href'][0:-5]}"
                return f"https://www.unep.org{lenguas['href']}"
            else:
                # Fall back to the node ID exposed by Drupal
                # Find the <ul> element with class 'links'
                ul_element = soup.find('ul', class_='links')
                if ul_element:
                    # Find the <li> element whose class ends with 'is-active'
                    li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
                    # Extract the value of the 'data-drupal-link-system-path' attribute
                    node_value = li_element.get('data-drupal-link-system-path')
                    return find_from_nodeLink(int(node_value.split("/")[1]), target_lang)
                    #return f"https://www.unep.org/{node_value}"
                else:
                    raise ValueError("Error: Webpage accessed but the tag 'a', class_='language-link' was not found. Probably because the website was blocked by a firewall/CloudFlare")
        else:
            print("\n<-- Error code. The programme could not access the webpage, forbidden")
            return None

# test
#input = input("Enter your UNEP url:")
#input = 'https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'
#input = "https://www.unep.org/ru"
#print(convert_UNEP_url(input, 'es'))
#print(convert_UNEP_url(input, 'fr'))
UNEP_LANG_CODES = ['ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'ch', 'zh', 'zh-hans', 'en']

def find_from_nodeLink(node_input, target_lang='empty'):
    """Builds the node link for the corresponding language version.
    Args:
        node_input (str, int): Either a string of a web URL containing the word 'node' and its ID, or an integer ID (or a string representation of an integer).
        target_lang (str): Target language, default = 'empty' (treated as English).
    Returns:
        str: New converted URL
    Examples:
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'empty')
        'https://www.unep.org/en/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans')
        'https://www.unep.org/zh-hans/node/30010'
        >>> find_from_nodeLink(30010, 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('30010', 'fr')
        'https://www.unep.org/fr/node/30010'
    """
    if isinstance(node_input, str) and node_input.isdigit():
        node_input = int(node_input)
    if isinstance(node_input, int):
        node_url = f'https://www.unep.org/{target_lang}/node/{node_input}'
    elif isinstance(node_input, str):
        node_url = node_input
    else:
        raise ValueError("Error: Provide either a string URL or an integer ID (or a string representation of an integer)")
    pattern = r"https://www\.unep\.org/[a-z]*-?[a-z]*/?node/(\d+)"
    if target_lang == "empty":
        target_lang = "en"
    if target_lang in ["ch", 'zh', 'cn']:
        target_lang = "zh-hans"
    if target_lang in ['pt', 'pt-pt']:
        target_lang = "pt-br"
    if target_lang in UNEP_LANG_CODES:
        if re.findall(pattern, node_url):
            # Replace the language part in the URL
            new_url = re.sub(pattern, r"https://www.unep.org/{}/node/\1".format(target_lang), node_url)
            return new_url
        else:
            raise ValueError("Error: URL not found, or website blocked by a firewall/CloudFlare")
    else:
        raise ValueError("Error: Provide a language code among these: 'ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'zh-hans', 'en' or leave empty")
# Generic scraper
def get_HTML_generic(any_url: str) -> BeautifulSoup:
    """Generic website scraper: accesses any website and returns the parsed HTML.
    Args:
        any_url (str): String of any web URL.
    Returns:
        BeautifulSoup: HTML parsed with BeautifulSoup
    Example:
        soup = get_HTML_generic('https://www.unep.org')
    """
    req = urllib.request.Request(any_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason} when accessing {any_url}")
        # You can raise a custom exception or handle the error in any other way
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason} when accessing {any_url}")
        # Handle other URL-related errors
    except Exception as e:
        print(f"An unexpected error occurred: {e} when accessing {any_url}")
        # Handle other unexpected errors
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")
        r = response.read().decode('utf-8')
        if r:
            # Convert the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup
            #print(soup)

# Example usage with an integer ID provided as a string
#print(find_from_nodeLink('30010', 'fr'))
#print(find_from_nodeLink(30010, 'fr'))
#print(find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans'))
def try_lang_switcher(switcher_soup, lang_code: str, base_url) -> str:
    """Looks for a 'language-switcher' <ul> in the parsed page and returns the
    absolute URL of the first link whose href contains lang_code, or None."""
    # Find the <ul> element whose class starts with "language-switcher"
    #language_switcher_ul = switcher_soup.find('ul', class_='language-switcher')
    language_switcher_ul = switcher_soup.find('ul', class_=lambda value: value and value.startswith('language-switcher'))
    # Extract href values from <a> elements within the <ul>
    if language_switcher_ul:
        href_values = [a['href'] for a in language_switcher_ul.find_all('a')]
        for i, element in enumerate(href_values):
            if lang_code in element:
                new_link = urljoin(base_url, href_values[i])
                return new_link
    return None
# Function to concatenate absolute path segments if the URL cannot be accessed
def concatenate_missing_segments(arg1, arg2):
    """
    Concatenates the missing URL segments from arg1 onto arg2.
    Args:
        arg1 (str): The URL containing the missing segments (the longer URL), like
            "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
        arg2 (str): The target URL (the shorter URL), like
            "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"
    Returns:
        str: The concatenated URL, like
            "https://www.unep.org/interactive/explore-ecosystems/mountains/ar/index.php#/mountain-intro"
    """
    if len(arg1) > len(arg2):
        missing_segment = arg1[len(arg2):]
        return arg2 + missing_segment
    # Nothing to append: return the target URL unchanged
    return arg2

# Example usage:
#arg1 = "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
#arg2 = "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"
#result = concatenate_missing_segments(arg1, arg2)
#print(result)
def convert_URL_anyWebsite(any_web_url: str, lang_code) -> str:
    """Tries several generic strategies (language switcher, data-sf-role, hreflang,
    language-link) to find the lang_code version of any webpage."""
    # Access the URL to get the HTML with BeautifulSoup --> soup object
    sauce_html = get_HTML_generic(any_web_url)
    print(type(sauce_html))
    if sauce_html:
        # Search for the language_switcher HTML tag and get the link matching the language code
        switcher_link = try_lang_switcher(sauce_html, lang_code.lower(), any_web_url)
        if switcher_link and get_HTML_generic(switcher_link):
            return switcher_link
        elif switcher_link:
            return concatenate_missing_segments(any_web_url, switcher_link)
        elif sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code):  # working for WHO news
            print("trying WHO")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code)
            if matching_tags:
                print(matching_tags)
                return matching_tags[0]['value']
        elif sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code):
            print("trying hreflang")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code)
            if matching_tags:
                return matching_tags[0]['href']
        else:
            print("trying language_link")  # working for UNESCO
            lang_tag = sauce_html.find("a", class_="language-link", hreflang=lang_code)
            #print(lang_tag)
            if lang_tag is not None:
                return urljoin(any_web_url, lang_tag['href'])
            else:
                return None

#output_li = convert_URL_anyWebsite("[email protected]", "es")
#print(output_li)
def weDocs_short(weDocs_url) -> str:
    """Replaces a language-specific WeDocs link with its landing page
    Args:
        weDocs_url (str): String of a web URL from wedocs.unep.org
    Returns:
        str: Landing page of the document, so it is not language specific.
    Example:
        >>> weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
        'https://wedocs.unep.org/handle/20.500.11822/43104'
    """
    return re.sub(r"https://wedocs.unep.org/(bitstream/)?handle/([\w.-]+/\d+).+", r"https://wedocs.unep.org/handle/\2", weDocs_url)
# WeDocs link converter: accesses a short WeDocs link and returns a language-specific URL (pdf)
def convert_WeDocs_href(url: str, target_lang: str = 'English') -> str:
    """WeDocs link converter: accesses a short WeDocs link
    and returns a language-specific URL (pdf).
    Args:
        url (str): String of a web URL from wedocs.unep.org
        target_lang (str): Language name of the document to find, e.g. 'Spanish'.
    Returns:
        str: Download link of the PDF in the language requested.
    Example:
        >>> convert_WeDocs_href('https://wedocs.unep.org/handle/20.500.11822/43104', 'Chinese')
        'https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/PracticalGuide_ZH.pdf?sequence=5&isAllowed=y'
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, verify=False)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            pattern = re.compile(r".*{}.*".format(re.escape(target_lang.capitalize())))  # TODO normalize against the dictionary keys, in case the user enters RU instead of Russian
            soup = BeautifulSoup(response.text, 'html.parser')
            #print(soup.prettify())
            # Find the <a> tag whose text contains the entered language name (e.g. "Spanish")
            # and extract its href attribute value
            lang_link = soup.find(string=pattern).parent['href']
            #print(lang_link)
            if lang_link:
                # Merge the domain and the PDF name to create the complete link
                clean_link = "https://wedocs.unep.org" + lang_link
                return clean_link
            else:
                return f"No link with '{target_lang}' text found."
        else:
            return "Failed to retrieve the URL."
    except Exception as e:
        return str(e)

#spanish_href = convert_WeDocs_href(url, "Spanish")
#portuguese_href = convert_WeDocs_href(url, "Portuguese")
#ch_href = convert_WeDocs_href(url, "Chinese")
#print(spanish_href)
#print(portuguese_href)
#print(ch_href)
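
# A minimal sketch chaining the two WeDocs helpers above (URLs taken from their docstrings):
#short_url = weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
#print(convert_WeDocs_href(short_url, 'Chinese'))  # expected to resolve to the _ZH.pdf bitstream link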
def access_un_library_by_id(user_input_id):
    """Searches digitallibrary.un.org for a document ID and returns the URL of the first result."""
    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"
        # Construct the URL with the user-provided ID
        url = f"{base_url}ln=fr&p={user_input_id}&f=&c=Resource%20Type&c=UN%20Bodies&sf=&so=d&rg=50&fti=0"
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the <div> with class="result-title"
            result_title_div = soup.find('div', class_='result-title')
            if result_title_div:
                # Find the first <a> tag within the result-title div and get its href value
                result_title_a = result_title_div.find('a', href=True)
                if result_title_a:
                    href_value = result_title_a['href']
                    return f"https://digitallibrary.un.org{href_value}"
                else:
                    print("No <a> tag found inside result-title.")
            else:
                print("No result-title div found in the HTML.")
            return None
        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Get user input for the ID
#user_input_id = input("Enter the ID: ")
# Call the function with user input
#resultado = access_un_library_by_id(user_input_id)
#print(resultado)
# Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
def access_un_library_byResourceURL(landing_url: str) -> BeautifulSoup:
    """Accesses a digitallibrary.un.org record page and returns the parsed HTML."""
    req = urllib.request.Request(landing_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        # You can raise a custom exception or handle the error in any other way
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        # Handle other URL-related errors
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Handle other unexpected errors
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")
        r = response.read().decode('utf-8')
        if r:
            # Convert the HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            return soup
        else:
            # HTML error
            raise ValueError("Error in parsing the website content in HTML")
def extract_info_UNdocLink(url, lang2_code):
    """
    Rewrites a given undocs.org URL to point to the requested language version.
    Args:
        url (str): The undocs.org URL, e.g. https://undocs.org/en/UNEP/EA.5/28/Corr.1
        lang2_code (str): Two-letter target language code, e.g. 'fr'.
    Returns:
        str: The undocs.org URL in the requested language, or None if the URL does not match.
    """
    # Define a regex pattern to match the components in the URL
    # https://undocs.org/en/UNEP/EA.5/28/Corr.1
    pattern = r'https://undocs\.org/([a-z]{2})?/?([A-Z]+)/(.*?)/(\d+)/(.*?)$'
    # Use regex to find the components in the URL
    match = re.match(pattern, url)
    if match:
        symbol = match.group(2)
        doc_type = match.group(3)
        unga = match.group(4)
        resolution_id = match.group(5)
        # match.group(1) is the optional original language code, which is simply replaced
        return f"https://undocs.org/{lang2_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}"
    else:
        return None

# Example usage:
#url = "https://undocs.org/en/UNEP/EA.5/28/Corr.1"
#result = extract_info_UNdocLink(url, "fr")
#print(result)
# Note: the global language_dict is defined above; find_lang_UNdoc defines its own
# UN_languages_dict for the six official UN document languages.
#input_language = "Russian"
# 1.7 UN Docs
def get_jobID_undocs(url):
    """
    Extracts the job ID from a given URL of the ny.un.org website.
    Args:
        url (str): The URL of the document on ny.un.org.
    Returns:
        str: The extracted job ID.
    """
    # Define a regex pattern to match the job ID in the URL
    pattern = r'dds-ny.*/([A-Za-z0-9]+)\.pdf'
    # Use regex to find the job ID in the URL
    match = re.search(pattern, url)
    # Return the matched job ID or None if not found
    return match.group(1) if match else None
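
# Example of the job ID extraction (same dds-ny URL shape as in the tests at the bottom of this file):
#print(get_jobID_undocs("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement"))  # -> 'N0651207'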
# Extract the `value` attribute of <option> tags matching the specified regex pattern
def find_lang_UNdoc(un_docs_link, input_language):
    un_library_url = un_docs_link
    # Define the language dictionary for the six official UN document languages
    UN_languages_dict = {
        "Spanish": "es",
        "French": "fr",
        "English": "en",
        "Chinese": "ch",
        "Russian": "ru",
        "Arabic": "ar"
    }
    if "undocs.org" in un_docs_link:
        # Map the language name to its two-letter code (fall back to the input if it is already a code)
        return extract_info_UNdocLink(un_docs_link, UN_languages_dict.get(input_language, input_language))
    elif "dds-ny" in un_docs_link:
        # Extract the job ID from the dds-ny link
        un_library_url_ID = get_jobID_undocs(un_docs_link)
        print(un_library_url_ID)
        # Get the UN Library URL from the ID
        un_library_url = access_un_library_by_id(un_library_url_ID)
        print(un_library_url)
    elif "digitallibrary.un.org" in un_docs_link:
        un_library_url = un_docs_link
    try:
        # Get the HTML from the UN Library URL
        my_soup = access_un_library_byResourceURL(un_library_url)
        if my_soup is None:
            return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # Define the regex pattern
        regex_pattern = r"-(\w{2})\.pdf"
        # Find all <option> tags
        options = my_soup.find_all('option', value=re.compile(regex_pattern))
        output_links = []
        # Extract the `value` attribute values
        for option in options:
            value = option['value']
            match = re.search(regex_pattern, value)
            if match:
                language_code = match.group(1)
                # Check if the language code is in the language dictionary
                language = next((k for k, v in UN_languages_dict.items() if v.startswith(language_code.lower())), 'Unknown')
                #print(f"Option Value: {value}, Language Code: {language_code}, Language: {language}")
                # Prepare the direct link for the requested language
                if language == input_language:
                    output_links = [value]
                    # Define a regular expression pattern with capture groups
                    pattern = r"https://digitallibrary.un.org/record/(\d+)/files/([A-Z]+)_([A-Z]+)_([\d]+)_([\d]+)-(\w{2})\.pdf"
                    # Use re.search to find matches and capture groups
                    match = re.search(pattern, value)
                    if match:
                        # Extract capture group values
                        record_id = match.group(1)
                        symbol = match.group(2)         # A
                        doc_type = match.group(3)       # RES
                        unga = match.group(4)           # 61
                        resolution_id = match.group(5)  # 295
                        language_code = match.group(6)  # es
                        # Construct the output strings, e.g. https://undocs.org/es/A/RES/61/295
                        output_links.append(f"https://undocs.org/{symbol}/{doc_type}/{unga}/{resolution_id}")
                        output_links.append(f"https://undocs.org/{language_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}")
                    else:
                        print("No match found for the input string.")
        # Output is a list of 3 links:
        # 1 is UN Library: https://digitallibrary.un.org/record/606782/files/A_RES_61_295-ZH.pdf
        # 2 is UN Docs multilingual shortlink: https://undocs.org/A/RES/61/295
        # 3 is UN Docs MONO-lingual shortlink: https://undocs.org/zh/A/RES/61/295
        return output_links

# Call the function to extract and print the option values
#print(find_lang_UNdoc("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "Russian"))
#print(get_language_code("fr"))
#print(find_lang_UNdoc("https://www.ohchr.org/en/documents/thematic-reports/ahrc3917-report-special-rapporteur-rights-indigenous-peoples", get_language_code("fr")))
def convert_Intl_Day(url, language_code):
    """
    Converts the language code in a UN observance URL to the specified language.
    Args:
        url (str): The UN URL.
        language_code (str): The target language code.
    Returns:
        str: The modified URL with the specified language code.
    """
    # Use regex to replace the language code in the URL
    if language_code.lower() == "ch":
        return re.sub(r'/([a-z]{2})/observances', '/zh/observances', url)
    else:
        return re.sub(r'/([a-z]{2})/observances', f'/{language_code}/observances', url)

# Example usage:
#url = "https://www.un.org/es/observances/cities-day"
#modified_url = convert_Intl_Day(url, "ch")
#print(modified_url)
def convert_URLendingBy_langEqualsCode(url, language_code):
    """
    Converts the language code in a URL with the pattern ?lang=[A-Z]{2} to the specified language.
    No URL validation.
    Args:
        url (str): The URL.
        language_code (str): The target language code.
    Returns:
        str: The modified URL with the specified language code.
    """
    if language_code.lower() == "ch":
        return re.sub(r'(\?lang=)[A-Z]{2}', r'\1ZH', url)
    else:
        # Use regex to replace the language code in the URL
        return re.sub(r'(\?lang=)[A-Z]{2}', fr'\1{language_code.upper()}', url)

# Example usage:
#url = "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES"
#modified_url = convert_URLendingBy_langEqualsCode(url, "ch")
#print(modified_url)
# Ultimate finder function
def localize_URL(mi_URL: str, lengua: str = "en") -> str:
    '''Applies all the functions above to try to find a language version of the input webpage
    in the provided language code.
    '''
    resulting_link = None

    def is_email(string):
        print(f"Validating if {string} is an email:")
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        return bool(email_pattern.match(string))

    # Check that the URL is not an email
    if is_email(mi_URL):
        print(f"{mi_URL} is an email")
        return None
    else:
        # Try UN Docs
        # TODO find a way to scrape this search engine https://documents.un.org/prod/ods.nsf/home.xsp
        # or how to download the PDF, access the symbol tag and join the URL to undocs.org/
        print("Trying find_lang_UNdoc for ", mi_URL)
        resulting_link = find_lang_UNdoc(mi_URL, get_language_code(lengua))
        if resulting_link:
            return resulting_link[-1]
        # International Days
        if "/observances/" in mi_URL and "un.org/" in mi_URL:
            print("Trying convert_Intl_Day")
            resulting_link = convert_Intl_Day(mi_URL, lengua)
            return resulting_link
        # WeDocs UNEP
        if "wedocs.unep.org" in mi_URL:
            print("Trying convert_WeDocs_href")
            short_weDocs_url = weDocs_short(mi_URL)
            resulting_link = convert_WeDocs_href(short_weDocs_url, get_language_code(lengua))
            return resulting_link
        # Try UNEP articles
        if "unep.org" in mi_URL and "wedocs" not in mi_URL:
            print("Trying convert_UNEP_url")
            resulting_link = convert_UNEP_url(mi_URL, lengua)
            return resulting_link
        elif ".pdf" not in mi_URL:
            print("Trying convert_URL_anyWebsite")
            resulting_link = convert_URL_anyWebsite(mi_URL, lengua)
            print(resulting_link)
            if resulting_link is not None:
                return resulting_link
            else:
                return None

#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement", "fr"))
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/G16/015/38/PDF/G1601538.pdf?OpenElement", "fr"))
#print(localize_URL("https://undocs.org/FCCC/CP/2015/10/Add.1", "fr"))
#print(localize_URL("https://www.un.org/en/observances/environment-in-war-protection-day", "fr"))
#print(localize_URL(url5, "fr"))
def convert_docx_to_html(docx_file_path):
    output = pypandoc.convert_file(docx_file_path, 'html')
    return output

def extract_href_attributes(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Creates a list of all href values found in <a> tags
    href_values = [a['href'] for a in soup.find_all('a', href=True)]
    return href_values
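
# Quick illustration of extract_href_attributes on an inline snippet (illustrative HTML, not from a real page):
#print(extract_href_attributes('<p><a href="https://www.unep.org">UNEP</a> and <a href="https://www.un.org">UN</a></p>'))
# -> ['https://www.unep.org', 'https://www.un.org']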
def generate_table_URLs_from_Docx(docx_path, lang_code):
    # Open the document
    document = Document(docx_path)
    # Extract hyperlinks
    input_urls = []
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            hyperlink = run.hyperlink
            if hyperlink is not None:
                input_urls.append(hyperlink.address)
    #input_urls
    data = []
    # Initialize lists to store data for the DataFrame
    index_list = []
    original_url_list = []
    localized_url_list = []
    # Apply localize_URL to each URL in the list
    for index, url in enumerate(input_urls):
        localized_url = localize_URL(url, lang_code)  # Replace 'en' with the desired language code
        index_list.append(index)
        original_url_list.append(url)
        localized_url_list.append(localized_url)
    # Create a DataFrame
    df_docx = pd.DataFrame({
        'index': index_list,
        'url': original_url_list,
        'localized_url': localized_url_list
    })
    # Export the DataFrame to a CSV file
    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")
    # Display the DataFrame
    return df_docx
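
# Example usage (hypothetical file path; run from a notebook cell):
#df = generate_table_URLs_from_Docx("my_report.docx", "fr")
#df.head()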

#language_code = "es"
#UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"
def extract_content_by_language(soup):
    # Find the div with id="field_body"
    field_body_div = soup.find('div', id='field_body')
    if field_body_div:
        # Helper function to recursively clean div tags deeper than direct children
        def clean_div_tags(tag):
            for child in tag.children:
                if child.name == 'div':
                    clean_div_tags(child)
                else:
                    content.append(str(child))
        # Ignore secondary div tags and extract their children tags (except div tags)
        content = []
        for tag in field_body_div.find_all(recursive=False):
            if tag.name == 'div':
                # Clean div tags deeper than direct children
                clean_div_tags(tag)
            else:
                # Include children tags (except div tags)
                content.append(str(tag))
        return ''.join(content).strip()
    else:
        print("Div with id='field_body' not found in the HTML.")
        return None
# Filter video iframes and image HTML tags
def transform_html_content(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Transform iframe tags with "youtu" in the src attribute into oembed tags
    for iframe_tag in soup.find_all('iframe', src=lambda x: x and 'youtu' in x):
        src_attribute = iframe_tag['src']
        video_id = src_attribute.split('/')[-1]  # Extract the video ID from the src attribute
        oembed_tag = soup.new_tag('oembed')
        oembed_tag.string = f'https://www.youtube.com/watch?v={video_id}'
        iframe_tag.replace_with(oembed_tag)
    # Merge figure tags and their children into a single img tag
    for figure_tag in soup.find_all('figure'):
        img_tag = figure_tag.find('img')
        if img_tag:
            # Create a new img tag with merged attributes
            new_img_tag = soup.new_tag('img')
            new_img_tag.attrs = img_tag.attrs
            figcaption_tag = figure_tag.find('figcaption')
            if figcaption_tag:
                # Extract the content of the figcaption tag for the data-caption attribute
                new_img_tag['data-caption'] = str(figcaption_tag.contents[0])
            figure_tag.replace_with(new_img_tag)
    # Return the modified HTML content
    return soup
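
# Minimal sketch of the transformation on an illustrative snippet (not from a real page):
#demo_html = '<figure><img src="photo.jpg"/><figcaption>A caption</figcaption></figure>'
#print(transform_html_content(demo_html))  # -> <img data-caption="A caption" src="photo.jpg"/>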
# Link replacer for HTML
def localize_UNEP_html(language_code, unep_url):
    """
    Downloads a UNEP webpage and localizes the href attributes of its <a> tags based on the given language code.
    Args:
        language_code (str): The language code used for URL localization.
        unep_url (str): The URL of the UNEP webpage to download and process.
    Returns:
        str: The modified HTML content with localized href attributes.
    Example:
        language_code = "es"
        modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)
        print(modified_html)
    """
    # Access the URL
    print(f"Accessing the URL, type: {type(unep_url)}")
    soup = get_HTML_generic(unep_url)
    print(f"Accessing parsed HTML: {type(soup)}")
    # Filter only translatable content
    soup = extract_content_by_language(soup)
    print(f"Filtered HTML: {type(soup)}")
    # Transform images and embedded YouTube videos
    soup = transform_html_content(soup)
    print(f"Transformed IMG and IFRAME tags: {type(soup)}")
    # Find all <a> tags in the HTML content
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')
        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language_code)
        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url
    # Return the modified HTML content
    return str(soup)

# Code created by Nelson JAIMES-QUINTERO
# -------------------- ## -------------------- ## -------------------- #
# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #

# DOC-HTML
def docx2_bitable(docx_path: str, output_lang: str):
    """Takes an input doc/docx file and creates a CSV file with 3 columns:
    list number, URL found in the file, localized URL in the requested language.
    """
    if not docx_path.lower().endswith((".doc", ".docx")):
        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
    input_docx_path = docx_path  # document
    # Name the output file based on the docx's name
    last_slash_index = input_docx_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
        #print(extracted_string)
    else:
        #print("No '/' found in the URL.")
        extracted_string = input_docx_path
        extracted_string = extracted_string.replace("#", "")
    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    #output_csv_path = f"output_{output_lang}_{docx_path[0:len(docx_path)//2]}.html"
    # Convert DOCX to HTML
    html_content = convert_docx_to_html(input_docx_path)
    print("Doc converted into html successfully.")
    # Write HTML content to a file
    #with open(output_html_path, "w", encoding="utf-8") as html_file:
    #    html_file.write(html_content)
    #print("Conversion complete. HTML file saved at:", output_html_path)
    # Extract href attributes
    href_attributes = extract_href_attributes(html_content)
    #print("Extracted href attributes:", href_attributes)
    output_urls = [localize_URL(url, output_lang) for url in href_attributes]
    # Create a pandas DataFrame
    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})
    # Export the DataFrame to a CSV file
    if not df.empty:
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        df.to_csv(output_csv_path, index=False, encoding="utf-8")
    # Display the DataFrame
    return df
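
# Example usage (hypothetical path uploaded to the Colab /content folder):
#docx2_bitable("/content/my_report.docx", "es")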
# From PDF file -------------------- ##
# NEEDS FITZ (PyMuPDF)
def pdf2_bitable(pdf_path: str, output_lang: str):
    if not pdf_path.lower().endswith("pdf"):
        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
        return None
    # Create a document object
    doc = fitz.open(pdf_path)  # or fitz.Document(filename)
    # Collect the rows for the pandas DataFrame
    data = []
    # Get the links on all pages
    for i in range(doc.page_count):
        page = doc.load_page(i)
        links = page.get_links()
        if links:
            for item in links:
                input_url = item.get('uri')
                if input_url is not None:
                    localized_url = localize_URL(input_url, output_lang)
                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})
    # Create a pandas DataFrame
    df_pdf = pd.DataFrame(data)
    # Name the file based on the pdf's name
    last_slash_index = pdf_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
        #print(extracted_string)
    else:
        #print("No '/' found in the URL.")
        extracted_string = pdf_path
        extracted_string = extracted_string.replace("#", "")
    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    if not df_pdf.empty:
        # Export the DataFrame to a CSV file
        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        return df_pdf
    else:
        print("ERROR: No hyperlinks were found in the PDF. Verify the input file.")
        return None
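
# Example usage (hypothetical path uploaded to the Colab /content folder):
#pdf2_bitable("/content/my_report.pdf", "fr")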
# DOCX REPLACER -------------------- ##
# Replace links in a Docx with Spire.Doc
def docx2docx_replacer(my_chemin_docx: str, my_langue):
    # Create a Document object
    doc = Document()
    # Load a Word file
    doc.LoadFromFile(my_chemin_docx)
    # Find all hyperlinks in the document
    hyperlinks = []
    for i in range(doc.Sections.Count):
        section = doc.Sections.get_Item(i)
        for j in range(section.Body.ChildObjects.Count):
            sec = section.Body.ChildObjects.get_Item(j)
            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
                paragraph = sec if isinstance(sec, Paragraph) else None
                for k in range(paragraph.ChildObjects.Count):
                    para = paragraph.ChildObjects.get_Item(k)
                    if para.DocumentObjectType == DocumentObjectType.Field:
                        field = para if isinstance(para, Field) else None
                        if field.Type == FieldType.FieldHyperlink:
                            hyperlinks.append(field)
    # Iterate through the hyperlinks and update them
    for hyperlink in hyperlinks:
        # Get the current display text and URL
        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
        if match:
            current_url = match.group(1)
        current_display_text = hyperlink.FieldText
        localized_url = localize_URL(current_url, my_langue)
        if localized_url:
            # Update the URL of the hyperlink
            #hyperlink.FieldText = "NEW DISPLAY TEXT"  # Replace with your new display text
            hyperlink.Code = f'HYPERLINK "{localized_url}"'
    if len(hyperlinks) > 0:
        # Naming the output file
        last_slash_index = my_chemin_docx.rfind('/')
        if last_slash_index != -1:
            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
            extracted_string = extracted_string.replace("#", "")
            #print(extracted_string)
        else:
            #print("No '/' found in the URL.")
            extracted_string = my_chemin_docx
            extracted_string = extracted_string.replace("#", "")
        output_directory = '/content'
        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"
        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)
        # Save the document to a docx file
        print("\n\nSaving the output file:")
        doc.SaveToFile(output_path, FileFormat.Docx)
        print(f"Output file saved successfully in your content folder as:\n\t{output_path}")
        doc.Close()
    else:
        print(f"ERROR on processing the file: {my_chemin_docx}")
# 6. HTML downloader and link replacer -------------------- ##
def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
    """Takes an input link from the UNEP website. It downloads the webpage's
    translatable content, replaces its links with the localized versions and
    exports a .txt file with the HTML tags, ready to be used in any CAT tool
    for human translation.
    """
    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)
    if not modified_html:
        print("ERROR: The input URL might not be accessible, or is not a URL.")
        raise ValueError("The input URL might not be accessible, or is not a URL.")
    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")
    # Name the file based on the webpage's name
    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
        extracted_string = extracted_string.replace("#", "")
        #print(extracted_string)
    else:
        #print("No '/' found in the URL.")
        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
        extracted_string = extracted_string.replace("#", "")
    # Save the modified HTML content to a .txt file in the current folder
    with open(extracted_string, 'w', encoding='utf-8') as file:
        print(type(modified_html))
        print(modified_html)
        file.write(modified_html)
    print(f"File {extracted_string} exported successfully")
    # Force the download in Google Colab
    try:
        from google.colab import files
        files.download(extracted_string)
    except ImportError:
        pass
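
# Example usage (press-release URL taken from the commented constant above):
#link2_html_converter("https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#", "es")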
# Install necessary libraries
#!pip install gradio
import gradio as gr

# Define the custom function for the Gradio app
def render_html(htmltext, language):
    soup = BeautifulSoup(htmltext, 'html.parser')
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')
        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language)
        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url
    # Return the modified HTML content
    output = str(soup)
    return output

# Create the Gradio interface
with gr.Blocks() as demo:
    html_input = gr.Textbox(label="Enter HTML Code", lines=10, placeholder="Paste your HTML code here. You can convert a Word file's content into HTML by using html-cleaner.com")
    language_dropdown = gr.Dropdown(label="Select Language", choices=['es', 'fr', 'sw', 'en', 'zh-hans', 'pt-br', 'ru', 'ar'], value='es')
    html_output = gr.HTML(label="Rendered HTML")
    run_button = gr.Button("Find links!")
    run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

# Launch the Gradio app with debug=True and share=True
demo.launch(debug=True, share=True)