# Standard library
import email.utils
import os
import re
import sys
import urllib
import urllib.error    # bug fix: urllib.error is used below but was never imported
import urllib.parse
import urllib.request  # bug fix: urllib.request is used below but was never imported
from urllib.parse import urlparse, urljoin

# Third party
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib3 import disable_warnings
from urllib3.exceptions import InsecureRequestWarning

# Silence the warnings urllib3 emits for HTTPS requests made with
# certificate verification disabled.
disable_warnings(InsecureRequestWarning)

# Bidirectional lookup table mapping language display names to the language
# codes used in UNEP URLs.
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar",
}


def get_language_code(query):
    """Search for a value given a key or a key given a value in language_dict.

    The comparison is case-insensitive in both directions.

    Args:
        query (str): The language name (key) or language code (value)
            to search for.

    Returns:
        str | None: The corresponding value (for a name) or key (for a code),
        or None when nothing matches.
    """
    needle = query.lower()
    for name, code in language_dict.items():
        if needle == name.lower():
            return code
        if needle == code.lower():
            return name
    return None


# Example usage:
# get_language_code("Spanish")  -> "es"
# get_language_code("fr")       -> "French"


# Extract node's number from UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """Access the input URL, find the language version of the webpage, and
    return the URL's node that is common to all UNEP languages.

    Args:
        unep_full_link (str): Full web URL in the UNEP website.

    Returns:
        str: The URL's node.

    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Send the URL request with browser-like headers to bypass CloudFlare,
    # as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        # You can raise a custom exception or handle the error in any other way
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        # Handle other URL-related errors
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Handle other unexpected errors
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")
        # Bug fix: reuse the response already opened in the try block instead
        # of issuing a second, identical HTTP request via urlopen(req).
        r = response.read().decode('utf-8')
        if r:
            # Convert html into BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            #print(soup)
            # Find the