nelsonjq committed on
Commit ebf2da8 · verified · 1 Parent(s): 6a59b4e

Create app.py

Files changed (1)
  1. app.py +1250 -0
app.py ADDED
@@ -0,0 +1,1250 @@
import re
import requests
import sys
import os
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse, urljoin
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import email.utils
import pandas as pd
import pypandoc
import fitz  # PyMuPDF
from docx import Document as DocxDocument  # python-docx, aliased to avoid clashing with Spire's Document
from spire.doc import *
from spire.doc.common import *

disable_warnings(InsecureRequestWarning)

def get_language_code(query):
    """
    Search for a value given a key or search for a key given a value in the language_dict.

    Args:
        query (str): The key or value to search for.

    Returns:
        str: The corresponding value or key.
    """
    for key, value in language_dict.items():
        if query.lower() == key.lower():
            return value
        elif query.lower() == value.lower():
            return key

    return None

# Example usage:
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar"
}

#result_key = get_language_code("Spanish")
#result_value = get_language_code("fr")

#print(result_key)    # Output: "es"
#print(result_value)  # Output: "French"
#print(type(result_value))

# Extract node's number from UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """find_UNEP_node accesses the input URL, finds the language version
    of the webpage and returns the URL's node that is common to all UNEP languages.

    Args:
        unep_full_link (str): Full web URL in the UNEP website.

    Returns:
        str: URL's node

    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')

            # Find the <ul> element with class 'links'
            ul_element = soup.find('ul', class_='links')

            # Find the <li> element whose class ends with 'is-active'
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))

            # Extract the value of the 'data-drupal-link-system-path' attribute (e.g. 'node/34817')
            attribute_value = li_element.get('data-drupal-link-system-path')
            return attribute_value.split('node/')[1]

# test
#print(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'))
#print(type(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')))

# Main function: finds the language version of a web article in the UNEP website.

def convert_UNEP_url(unep_full_link: str, target_lang: str = 'en') -> str:
    """convert_UNEP_url accesses the input URL, finds the URL of the translated version
    of the webpage in the input language and returns that URL.

    Args:
        unep_full_link (str): Full web URL in the UNEP website.
        target_lang (str): Target language, default = 'en'.

    Returns:
        str: New converted URL

    Examples:
        >>> convert_UNEP_url('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts', 'es')
        'https://www.unep.org/es/noticias-y-reportajes/reportajes/los-pueblos-indigenas-recurren-los-tribunales-ante-la-crisis'
    """
    # Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')

            # Look for the link in the target language, whose class is "language-link"
            lenguas = soup.find("a", class_="language-link", hreflang=target_lang)
            #print(lenguas)
            if lenguas:
                if lenguas['href'].endswith('/node'):
                    return f"https://www.unep.org{lenguas['href'][0:-5]}"
                return f"https://www.unep.org{lenguas['href']}"
            else:
                # Find the <ul> element with class 'links'
                ul_element = soup.find('ul', class_='links')
                if ul_element:
                    # Find the <li> element whose class ends with 'is-active'
                    li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))

                    # Extract the value of the 'data-drupal-link-system-path' attribute
                    node_value = li_element.get('data-drupal-link-system-path')
                    return find_from_nodeLink(int(node_value.split("/")[1]), target_lang)
                    #return f"https://www.unep.org/{node_value}"
                else:
                    raise ValueError("Error: Webpage accessed but the tag 'a', class_='language-link' was not found. Probably because the website was blocked by firewall/CloudFlare")
        else:
            print("\n<-- Error code. The programme could not access the webpage, forbidden")
            return None

# test
#input_url = input("Enter your UNEP url:")
#input_url = 'https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'
#input_url = "https://www.unep.org/ru"
#print(convert_UNEP_url(input_url, 'es'))
#print(convert_UNEP_url(input_url, 'fr'))

UNEP_LANG_CODES = ['ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'ch', 'zh', 'zh-hans', 'en']

def find_from_nodeLink(node_input, target_lang='empty'):
    """Replaces a node link with the corresponding language version.

    Args:
        node_input (str, int): Either a string of a web URL containing the word 'node' and its ID, or an integer ID (or a string representation of an integer).
        target_lang (str): Target language, default = 'empty' (treated as English).

    Returns:
        str: New converted URL

    Examples:
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'empty')
        'https://www.unep.org/en/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans')
        'https://www.unep.org/zh-hans/node/30010'
        >>> find_from_nodeLink(30010, 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('30010', 'fr')
        'https://www.unep.org/fr/node/30010'
    """
    if isinstance(node_input, str) and node_input.isdigit():
        node_input = int(node_input)

    if isinstance(node_input, int):
        node_url = f'https://www.unep.org/{target_lang}/node/{node_input}'
    elif isinstance(node_input, str):
        node_url = node_input
    else:
        raise ValueError("Error: Provide either a string URL or an integer ID (or a string representation of an integer)")

    pattern = r"https://www\.unep\.org/[a-z]*-?[a-z]*/?node/(\d+)"

    if target_lang == "empty":
        target_lang = "en"
    if target_lang in ["ch", 'zh', 'cn']:
        target_lang = "zh-hans"
    if target_lang in ['pt', 'pt-pt']:
        target_lang = "pt-br"
    if target_lang in UNEP_LANG_CODES:
        if re.findall(pattern, node_url):
            # Replace the language part in the URL
            new_url = re.sub(pattern, r"https://www.unep.org/{}/node/\1".format(target_lang), node_url)
            return new_url
        else:
            raise ValueError("Error: URL not found, or website blocked by firewall/CloudFlare")
    else:
        raise ValueError("Error: Provide a language code among these: 'ar','es','fr','ru','sw','pt-br','zh-hans', 'en' or leave empty")

# Generic scraper

def get_HTML_generic(any_url: str) -> BeautifulSoup:
    """Generic website fetcher: accesses the URL and returns the parsed HTML.

    Args:
        any_url (str): String of any web URL.

    Returns:
        BeautifulSoup: parsed HTML, or None if the page could not be accessed.
    """
    req = urllib.request.Request(any_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason} when accessing {any_url}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason} when accessing {any_url}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e} when accessing {any_url}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup

# Example usage with an integer ID provided as a string
#print(find_from_nodeLink('30010', 'fr'))
#print(find_from_nodeLink(30010, 'fr'))
#print(find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans'))

def try_lang_switcher(switcher_soup, lang_code: str, base_url) -> str:
    """Looks for a 'language-switcher' <ul> in the parsed page and returns the link
    whose href contains the requested language code, resolved against base_url."""
    # Find the <ul> element whose class starts with "language-switcher"
    language_switcher_ul = switcher_soup.find('ul', class_=lambda value: value and value.startswith('language-switcher'))

    # Extract href values from <a> elements within the <ul>
    if language_switcher_ul:
        href_values = [a['href'] for a in language_switcher_ul.find_all('a')]

        for i, element in enumerate(href_values):
            if lang_code in element:
                new_link = urljoin(base_url, href_values[i])
                return new_link
    return None

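# Illustrative sketch (not in the original file; variable names are hypothetical): given
#   page_soup = get_HTML_generic(base)
# the call try_lang_switcher(page_soup, 'es', base) returns the absolute URL of the Spanish
# entry in the page's language switcher, or None when no such <ul> exists.
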
# Function to concatenate absolute path segments if the switcher URL cannot be accessed directly

def concatenate_missing_segments(arg1, arg2):
    """
    Concatenates the URL segments missing from Arg2 using Arg1.

    Args:
        arg1 (str): The URL containing the missing segments, i.e. the longer URL, like
                    "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
        arg2 (str): The target URL, i.e. the shorter URL, like
                    "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"

    Returns:
        str: The concatenated URL, like
             "https://www.unep.org/interactive/explore-ecosystems/mountains/ar/index.php#/mountain-intro"
    """
    if len(arg1) > len(arg2):
        missing_segment = arg1[len(arg2):]
        return arg2 + missing_segment

# Example usage:
#arg1 = "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
#arg2 = "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"

#result = concatenate_missing_segments(arg1, arg2)
#print(result)

def convert_URL_anyWebsite(any_web_url: str, lang_code) -> str:
    """Tries several heuristics to find the language version of any webpage."""
    # Access the URL to get the HTML with BeautifulSoup --> soup object
    sauce_html = get_HTML_generic(any_web_url)
    print(type(sauce_html))
    if sauce_html:
        # Search the language-switcher HTML tag and get the language-specific link
        switcher_link = try_lang_switcher(sauce_html, lang_code.lower(), any_web_url)
        if switcher_link and get_HTML_generic(switcher_link):
            return switcher_link
        elif switcher_link:
            return concatenate_missing_segments(any_web_url, switcher_link)
        elif sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code):  # working for WHO news
            print("trying WHO")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code)
            if matching_tags:
                print(matching_tags)
                return matching_tags[0]['value']
        elif sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code):
            print("trying hreflang")
            matching_tags = sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code)
            if matching_tags:
                return matching_tags[0]['href']
        else:
            print("trying language_link")  # working for UNESCO
            lang_tag = sauce_html.find("a", class_="language-link", hreflang=lang_code)
            if lang_tag is not None:
                return urljoin(any_web_url, lang_tag['href'])
    else:
        return None

#output_li = convert_URL_anyWebsite("[email protected]", "es")
#print(output_li)

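# Illustrative call (not in the original file; URL taken from the concatenate_missing_segments
# example above, with no guarantee that the page exposes a language switcher):
#   convert_URL_anyWebsite("https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro", "ar")
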
def weDocs_short(weDocs_url) -> str:
    """Replaces a language-specific WeDocs link with the document's landing page.

    Args:
        weDocs_url (str): String of a web URL from wedocs.unep.org

    Returns:
        str: Landing page of the document, so it is not language specific.

    Example:
        >>> weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
        'https://wedocs.unep.org/handle/20.500.11822/43104'
    """
    return re.sub(r"https://wedocs.unep.org/(bitstream/)?handle/([\w.-]+/\d+).+", r"https://wedocs.unep.org/handle/\2", weDocs_url)

# WeDocs link converter: accesses a short WeDocs link and returns a language-specific URL (pdf)

def convert_WeDocs_href(url: str, target_lang: str = 'English') -> str:
    """WeDocs link converter: accesses a short WeDocs link
    and returns a language-specific URL (pdf).

    Args:
        url (str): String of a web URL from wedocs.unep.org
        target_lang (str): Language name of the document to find.

    Returns:
        str: Download link of the PDF in the requested language.

    Example:
        >>> convert_WeDocs_href('https://wedocs.unep.org/handle/20.500.11822/43104', 'Chinese')
        'https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/PracticalGuide_ZH.pdf?sequence=5&isAllowed=y'
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, verify=False)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            pattern = re.compile(r".*{}.*".format(re.escape(target_lang.capitalize())))  # TODO normalize to take into account the dictionary's key, in case the user enters RU instead of Russian
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the <a> tag with the entered language name in its text
            # and extract its href attribute value
            lang_link = soup.find(string=pattern).parent['href']

            if lang_link:
                # Merge the domain and PDF name to create the complete link
                clean_link = "https://wedocs.unep.org" + lang_link
                return clean_link
            else:
                return f"No link with '{target_lang}' text found."
        else:
            return "Failed to retrieve the URL."

    except Exception as e:
        return str(e)

# Example usage:
#url = 'https://wedocs.unep.org/handle/20.500.11822/43104'
#spanish_href = convert_WeDocs_href(url, "Spanish")
#portuguese_href = convert_WeDocs_href(url, "Portuguese")
#ch_href = convert_WeDocs_href(url, "Chinese")
#print(spanish_href)
#print(portuguese_href)
#print(ch_href)

def access_un_library_by_id(user_input_id):
    """Searches digitallibrary.un.org for a document (job) ID and returns the first record URL."""
    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"

        # Construct the URL with the user-provided ID
        url = f"{base_url}ln=fr&p={user_input_id}&f=&c=Resource%20Type&c=UN%20Bodies&sf=&so=d&rg=50&fti=0"

        # Send an HTTP GET request to the URL
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the <div> with class="result-title"
            result_title_div = soup.find('div', class_='result-title')

            if result_title_div:
                # Find the first <a> tag within the result-title div and get its href value
                result_title_a = result_title_div.find('a', href=True)
                if result_title_a:
                    href_value = result_title_a['href']
                    return f"https://digitallibrary.un.org{href_value}"
                else:
                    print("No <a> tag found inside result-title.")
            else:
                print("No result-title div found in the HTML.")
            return None

        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Get user input for the ID
#user_input_id = input("Enter the ID: ")

# Call the function with user input
#resultado = access_un_library_by_id(user_input_id)
#print(resultado)

# Send a URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
def access_un_library_byResourceURL(landing_url: str) -> BeautifulSoup:
    """Fetches a digitallibrary.un.org record page and returns the parsed HTML."""
    req = urllib.request.Request(landing_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # If no exception occurred, continue text processing
        print("Scraping successful")

        r = response.read().decode('utf-8')
        if r:
            # Convert HTML into a BeautifulSoup object
            soup = BeautifulSoup(r, 'html.parser')
            return soup
        else:
            # HTML error
            raise ValueError("Error in parsing the website content in HTML")

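# Illustrative usage (not in the original file; the record URL is derived from the
# digitallibrary example cited in the comments of find_lang_UNdoc below):
#   record_soup = access_un_library_byResourceURL("https://digitallibrary.un.org/record/606782")
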
def extract_info_UNdocLink(url, lang2_code):
    """
    Rewrites a undocs.org URL so that it points to the requested language.

    Args:
        url (str): The UNDocs URL, e.g. https://undocs.org/en/UNEP/EA.5/28/Corr.1
        lang2_code (str): Two-letter UN language code (es, fr, en, ru, ar, ch).

    Returns:
        str: The rewritten URL, or None if the URL does not match the expected pattern.
    """
    # Define a regex pattern to match the components in the URL
    pattern = r'https://undocs\.org/([a-z]{2})?/?([A-Z]+)/(.*?)/(\d+)/(.*?)$'

    # Use regex to find the components in the URL
    match = re.match(pattern, url)

    if match:
        symbol = match.group(2)
        doc_type = match.group(3)
        unga = match.group(4)
        resolution_id = match.group(5)
        language_code = match.group(1) if match.group(1) else None  # Optional language code in the original URL
        return f"https://undocs.org/{lang2_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}"
    else:
        return None

# Example usage:
#url = "https://undocs.org/en/UNEP/EA.5/28/Corr.1"
#result = extract_info_UNdocLink(url, "fr")
#print(result)

# NOTE: the module-level language_dict defined near the top of this file is the one used
# by get_language_code(); redefining it here with only the six UN languages would break
# the Swahili, Portuguese and zh-hans lookups, so the UN-specific mapping is kept local
# to find_lang_UNdoc() below (UN_languages_dict).

#input_language = "Russian"

# 1.7 UN Docs
def get_jobID_undocs(url):
    """
    Extracts the job ID from a given URL of the documents-dds-ny.un.org website.

    Args:
        url (str): The URL of the document on dds-ny.un.org.

    Returns:
        str: The extracted job ID, or None if not found.
    """
    # Define a regex pattern to match the job ID in the URL
    pattern = r'dds-ny.*/([A-Za-z0-9]+)\.pdf'

    # Use regex to find the job ID in the URL
    match = re.search(pattern, url)

    # Return the matched job ID or None if not found
    return match.group(1) if match else None

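# Illustrative sketch (not in the original file): get_jobID_undocs only parses the URL string,
# so for the dds-ny link used in the localize_URL tests further below it would return the
# trailing file name without the .pdf extension, e.g.
#   get_jobID_undocs("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement")
#   -> 'N0651207'
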
# Extract the `value` attribute of <option> tags with the specified regex pattern
def find_lang_UNdoc(un_docs_link, input_language):
    """Finds the language versions of a UN document link (undocs.org, documents-dds-ny.un.org
    or digitallibrary.un.org) and returns a list of links for the requested language."""
    un_library_url = un_docs_link

    # Define the language dictionary (UN official languages)
    UN_languages_dict = {
        "Spanish": "es",
        "French": "fr",
        "English": "en",
        "Chinese": "ch",
        "Russian": "ru",
        "Arabic": "ar"
    }

    if "undocs.org" in un_docs_link:
        # input_language is expected to be a language name (e.g. "French"); map it to the
        # two-letter code expected by extract_info_UNdocLink, falling back to the raw value.
        return extract_info_UNdocLink(un_docs_link, UN_languages_dict.get(input_language, input_language))
    elif "dds-ny" in un_docs_link:
        # Extract the job ID
        un_library_url_ID = get_jobID_undocs(un_docs_link)
        print(un_library_url_ID)
        # Get URL from ID
        un_library_url = access_un_library_by_id(un_library_url_ID)
        print(un_library_url)
    elif "digitallibrary.un.org" in un_docs_link:
        un_library_url = un_docs_link

    try:
        # Get HTML from the UN Library URL
        my_soup = access_un_library_byResourceURL(un_library_url)
        if my_soup is None:
            return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # Define the regex pattern
        regex_pattern = r"-(\w{2})\.pdf"

        # Find all <option> tags
        options = my_soup.find_all('option', value=re.compile(regex_pattern))

        # Collect the `value` attribute values for the requested language
        output_links = None
        for option in options:
            value = option['value']
            match = re.search(regex_pattern, value)
            if match:
                language_code = match.group(1)
                # Check if the language code is in UN_languages_dict
                language = next((k for k, v in UN_languages_dict.items() if v.startswith(language_code.lower())), 'Unknown')

                # Prepare the direct link for the requested language
                if language == input_language:
                    output_links = [value]

                    # Define a regular expression pattern with capture groups
                    pattern = r"https://digitallibrary.un.org/record/(\d+)/files/([A-Z]+)_([A-Z]+)_([\d]+)_([\d]+)-(\w{2})\.pdf"

                    # Use re.search to find matches and capture groups
                    match = re.search(pattern, value)

                    if match:
                        # Extract capture group values
                        record_id = match.group(1)
                        symbol = match.group(2)          # A
                        doc_type = match.group(3)        # RES
                        unga = match.group(4)            # 61
                        resolution_id = match.group(5)   # 295
                        language_code = match.group(6)   # es

                        # Construct the output strings  # e.g. https://undocs.org/es/A/RES/61/295
                        output_links.append(f"https://undocs.org/{symbol}/{doc_type}/{unga}/{resolution_id}")
                        output_links.append(f"https://undocs.org/{language_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}")
                    else:
                        print("No match found for the input string.")

        # Output is a list of 3 links:
        # 1 is UN Library: https://digitallibrary.un.org/record/606782/files/A_RES_61_295-ZH.pdf
        # 2 is UN Docs multilingual shortlink: https://undocs.org/A/RES/61/295
        # 3 is UN Docs MONO-lingual shortlink: https://undocs.org/zh/A/RES/61/295
        return output_links

# Call the function to extract and print the option values
#print(find_lang_UNdoc("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "Russian"))
#print(get_language_code("fr"))
#print(find_lang_UNdoc("https://www.ohchr.org/en/documents/thematic-reports/ahrc3917-report-special-rapporteur-rights-indigenous-peoples", get_language_code("fr")))

def convert_Intl_Day(url, language_code):
    """
    Converts the language code in a un.org International Day (observance) URL to the specified language.

    Args:
        url (str): The UN URL.
        language_code (str): The target language code.

    Returns:
        str: The modified URL with the specified language code.
    """
    # Use regex to replace the language code in the URL
    if language_code.lower() == "ch":
        return re.sub(r'/([a-z]{2})/observances', '/zh/observances', url)
    else:
        return re.sub(r'/([a-z]{2})/observances', f'/{language_code}/observances', url)

# Example usage:
#url = "https://www.un.org/es/observances/cities-day"
#modified_url = convert_Intl_Day(url, "ch")
#print(modified_url)

def convert_URLendingBy_langEqualsCode(url, language_code):
    """
    Converts the language code in a URL with the pattern ?lang=[A-Z]{2} to the specified language.
    No URL validation.

    Args:
        url (str): The URL.
        language_code (str): The target language code.

    Returns:
        str: The modified URL with the specified language code.
    """
    if language_code.lower() == "ch":
        return re.sub(r'(\?lang=)[A-Z]{2}', r'\1ZH', url)
    else:
        # Use regex to replace the language code in the URL
        return re.sub(r'(\?lang=)[A-Z]{2}', fr'\1{language_code.upper()}', url)

# Example usage:
#url = "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES"
#modified_url = convert_URLendingBy_langEqualsCode(url, "ch")
#print(modified_url)

# Ultimate finder function

def localize_URL(mi_URL: str, lengua: str = "en") -> str:
    '''Apply all functions to try to find a language version of the input webpage
    in the provided language code.
    '''
    resulting_link = None

    def is_email(string):
        print(f"Validating if {string} is an email:")
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        return bool(email_pattern.match(string))

    # Check if the URL is not an email
    if is_email(mi_URL):
        print(f"{mi_URL} is an email")
        return None
    else:
        # try UN Docs
        # TODO find a way to scrape this search engine https://documents.un.org/prod/ods.nsf/home.xsp
        # or how to download the PDF, access the symbol tag and join the url to undocs.org/
        print("Trying find_lang_UNdoc for ", mi_URL)
        resulting_link = find_lang_UNdoc(mi_URL, get_language_code(lengua))
        if resulting_link:
            return resulting_link[-1]

        # International Days
        if "/observances/" in mi_URL and "un.org/" in mi_URL:
            print("Trying convert_Intl_Day")
            resulting_link = convert_Intl_Day(mi_URL, lengua)
            return resulting_link

        # WeDocs UNEP
        if "wedocs.unep.org" in mi_URL:
            print("Trying convert_WeDocs_href")
            short_weDocs_url = weDocs_short(mi_URL)
            resulting_link = convert_WeDocs_href(short_weDocs_url, get_language_code(lengua))
            return resulting_link

        # try UNEP articles
        if "unep.org" in mi_URL and "wedocs" not in mi_URL:
            print("Trying convert_UNEP_url")
            resulting_link = convert_UNEP_url(mi_URL, lengua)
            return resulting_link

        elif ".pdf" not in mi_URL:
            print("Trying convert_URL_anyWebsite")
            resulting_link = convert_URL_anyWebsite(mi_URL, lengua)
            print(resulting_link)
            if resulting_link is not None:
                return resulting_link
            else:
                return None

#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement", "fr"))
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/G16/015/38/PDF/G1601538.pdf?OpenElement", "fr"))
#print(localize_URL("https://undocs.org/FCCC/CP/2015/10/Add.1", "fr"))
#print(localize_URL("https://www.un.org/en/observances/environment-in-war-protection-day", "fr"))
#print(localize_URL(url5, "fr"))

def convert_docx_to_html(docx_file_path):
    """Converts a .docx file to an HTML string using pypandoc."""
    output = pypandoc.convert_file(docx_file_path, 'html')
    return output

def extract_href_attributes(html_content):
    """Returns the list of href values of all <a> tags in the given HTML string."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # creates a list
    href_values = [a['href'] for a in soup.find_all('a', href=True)]
    return href_values

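# Illustrative sketch (not in the original file): extract_href_attributes works on any HTML string,
# e.g.
#   extract_href_attributes('<p><a href="https://www.unep.org/">UNEP</a> and <a>no link</a></p>')
#   -> ['https://www.unep.org/']
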
def generate_table_URLs_from_Docx(docx_path, lang_code):
    """Extracts hyperlinks from a .docx file (via python-docx), localizes them and exports a CSV table."""
    # Open the document with python-docx (aliased as DocxDocument at the top of this file)
    document = DocxDocument(docx_path)

    # Extract hyperlinks
    # NOTE: run.hyperlink assumes a python-docx version that exposes hyperlinks on runs;
    # if your version does not, this loop will need to be adapted.
    input_urls = []
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            hyperlink = run.hyperlink
            if hyperlink is not None:
                input_urls.append(hyperlink.address)

    # Initialize lists to store data for the DataFrame
    index_list = []
    original_url_list = []
    localized_url_list = []

    # Apply localize_URL to each URL in the list
    for index, url in enumerate(input_urls):
        localized_url = localize_URL(url, lang_code)
        index_list.append(index)
        original_url_list.append(url)
        localized_url_list.append(localized_url)

    # Create a DataFrame
    df_docx = pd.DataFrame({
        'index': index_list,
        'url': original_url_list,
        'localized_url': localized_url_list
    })

    # Export the DataFrame to a CSV file
    df_docx.to_csv(f"output_{lang_code}_{docx_path}", index=False, encoding="utf-8")

    # Display the DataFrame
    return df_docx

#language_code = "es"
#UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"

def extract_content_by_language(soup):
    """Extracts the translatable body content (div id='field_body') from a parsed UNEP page."""
    # Find the div with id="field_body"
    field_body_div = soup.find('div', id='field_body')

    if field_body_div:
        # Helper function to recursively clean div tags deeper than direct children
        def clean_div_tags(tag):
            for child in tag.children:
                if child.name == 'div':
                    clean_div_tags(child)
                else:
                    content.append(str(child))

        # Ignore secondary div tags and extract their children tags (except div tags)
        content = []
        for tag in field_body_div.find_all(recursive=False):
            if tag.name == 'div':
                # Clean div tags deeper than direct children
                clean_div_tags(tag)
            else:
                # Include children tags (except div tags)
                content.append(str(tag))

        return ''.join(content).strip()
    else:
        print("Div with id='field_body' not found in the HTML.")
        return None

# Filter video frames and images HTML tags

def transform_html_content(html_content):
    """Converts YouTube <iframe> embeds to <oembed> tags and flattens <figure> blocks into single <img> tags."""
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Transform iframe tags with "youtu" in the src attribute into oembed tags
    for iframe_tag in soup.find_all('iframe', src=lambda x: x and 'youtu' in x):
        src_attribute = iframe_tag['src']
        video_id = src_attribute.split('/')[-1]  # Extract the video ID from the src attribute
        oembed_tag = soup.new_tag('oembed')
        oembed_tag.string = f'https://www.youtube.com/watch?v={video_id}'
        iframe_tag.replace_with(oembed_tag)

    # Merge figure tags and their children into a single img tag
    for figure_tag in soup.find_all('figure'):
        img_tag = figure_tag.find('img')
        if img_tag:
            # Create a new img tag with merged attributes
            new_img_tag = soup.new_tag('img')
            new_img_tag.attrs = img_tag.attrs
            figcaption_tag = figure_tag.find('figcaption')
            if figcaption_tag:
                # Extract the content of the figcaption tag for the data-caption attribute
                new_img_tag['data-caption'] = str(figcaption_tag.contents[0])
            figure_tag.replace_with(new_img_tag)

    # Return the modified HTML content
    return soup

# Link Replacer for HTML

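# Illustrative sketch (not in the original file; VIDEOID and a.jpg are placeholders):
#   '<iframe src="https://www.youtube.com/embed/VIDEOID"></iframe>'
# comes back roughly as '<oembed>https://www.youtube.com/watch?v=VIDEOID</oembed>', and
#   '<figure><img src="a.jpg"/><figcaption>Cap</figcaption></figure>'
# comes back roughly as '<img src="a.jpg" data-caption="Cap"/>'.
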
def localize_UNEP_html(language_code, page_url):
    """
    Downloads a UNEP article, keeps only its translatable body, transforms media tags
    and localizes the href attributes of its <a> tags based on the given language code.

    Args:
        language_code (str): The language code used for URL localization.
        page_url (str): The URL of the UNEP article to download and process.

    Returns:
        str: The modified HTML content with localized href attributes, or None on failure.

    Example:
        language_code = "en"
        modified_html = localize_UNEP_html(language_code, "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#")
        print(modified_html)
    """
    # Access the URL
    print(f"Accessing the URL, type: {type(page_url)}")
    soup = get_HTML_generic(page_url)
    print(f"Accessing parsed HTML: {type(soup)}")
    if soup is None:
        # Could not download the page; let the caller handle the error
        return None

    # Filter only translatable content
    soup = extract_content_by_language(soup)
    print(f"Filtered HTML: {type(soup)}")
    if soup is None:
        return None

    # Transform images and embedded YouTube videos
    soup = transform_html_content(soup)
    print(f"Transformed IMG and IFRAME tags: {type(soup)}")

    # Find all <a> tags in the HTML content
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')

        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language_code)

        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url

    # Return the modified HTML content
    return str(soup)

# Code created by Nelson JAIMES-QUINTERO
# -------------------- ## -------------------- ## -------------------- #
# FUNCTIONS FOR LAUNCHING THE DOCUMENT/LINK PROCESSING #

# DOC-HTML
def docx2_bitable(docx_path: str, output_lang: str):
    """Takes an input doc/docx file and creates a CSV file with 3 columns:
    list number, URL found in the file, localized URL in the input language.
    """
    if not docx_path.lower().endswith((".doc", ".docx")):
        print("ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above.")
        return "ERROR: File not found or is not .DOC nor .DOCX. Verify the input_path field above."
    input_docx_path = docx_path  # document

    # Name the output file based on the docx's name
    last_slash_index = input_docx_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{input_docx_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = input_docx_path
        extracted_string = extracted_string.replace("#", "")

    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Convert DOCX to HTML
    html_content = convert_docx_to_html(input_docx_path)
    print("Doc converted into html successfully.")

    # Write HTML content to a file (optional)
    #with open(output_html_path, "w", encoding="utf-8") as html_file:
    #    html_file.write(html_content)
    #print("Conversion complete. HTML file saved at:", output_html_path)

    # Extract href attributes
    href_attributes = extract_href_attributes(html_content)
    #print("Extracted href attributes:", href_attributes)

    output_urls = [localize_URL(url, output_lang) for url in href_attributes]

    # Create a pandas DataFrame
    df = pd.DataFrame({'index': range(1, len(href_attributes) + 1), 'input_url': href_attributes, 'output_url': output_urls})

    # Export the DataFrame to a CSV file
    if not df.empty:
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        df.to_csv(output_csv_path, index=False, encoding="utf-8")

    # Display the DataFrame
    return df

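# Illustrative usage (not in the original file; the path is hypothetical and assumes a file
# uploaded to the /content folder):
#   df_links = docx2_bitable("/content/my_report.docx", "fr")
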
# From PDF file -------------------- ##
# NEEDS FITZ (PyMuPDF)
def pdf2_bitable(pdf_path: str, output_lang: str):
    """Extracts the hyperlinks of a PDF file, localizes them and exports a CSV table."""
    if not pdf_path.lower().endswith("pdf"):
        print(f"ERROR: File not found or is not .pdf. Verify the input_path field above: {pdf_path}")
        return None
    # Create a document object
    doc = fitz.open(pdf_path)  # or fitz.Document(filename)

    # Collect rows for the DataFrame
    data = []

    # Get the links on all pages
    for i in range(doc.page_count):
        page = doc.load_page(i)
        links = page.get_links()
        if links:
            for item in links:
                input_url = item.get('uri')
                if input_url is not None:
                    localized_url = localize_URL(input_url, output_lang)
                    data.append({'index': len(data) + 1, 'Page': i, 'input_url': input_url, 'localized_url': localized_url})

    # Create a pandas DataFrame
    df_pdf = pd.DataFrame(data)

    # Name the file based on the pdf's name
    last_slash_index = pdf_path.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{pdf_path[last_slash_index + 1:]}"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = pdf_path
        extracted_string = extracted_string.replace("#", "")

    # Naming the output file
    output_directory = '/content'
    output_csv_path = f"{output_directory}/output_{output_lang}_{extracted_string[0:len(extracted_string)//2]}.csv"

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    if not df_pdf.empty:
        # Export the DataFrame to a CSV file
        df_pdf.to_csv(output_csv_path, index=False, encoding="utf-8")
        print("Check your exported csv file at the left on the Folder icon\nwith the name", output_csv_path)
        return df_pdf
    else:
        print("ERROR: No hyperlinks were found in the PDF.")
        return None

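# Illustrative usage (not in the original file; hypothetical path):
#   df_pdf_links = pdf2_bitable("/content/my_report.pdf", "es")
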
# DOCX REPLACER -------------------- ##

# Replace links in a Docx with Spire.Doc

def docx2docx_replacer(my_chemin_docx: str, my_langue):
    """Loads a Word file, replaces its hyperlinks with localized versions and saves a new .docx."""
    # Create a Document object (Spire.Doc)
    doc = Document()

    # Load a Word file
    doc.LoadFromFile(my_chemin_docx)

    # Find all hyperlinks in the document
    hyperlinks = []
    for i in range(doc.Sections.Count):
        section = doc.Sections.get_Item(i)
        for j in range(section.Body.ChildObjects.Count):
            sec = section.Body.ChildObjects.get_Item(j)
            if sec.DocumentObjectType == DocumentObjectType.Paragraph:
                for k in range((sec if isinstance(sec, Paragraph) else None).ChildObjects.Count):
                    para = (sec if isinstance(sec, Paragraph) else None).ChildObjects.get_Item(k)
                    if para.DocumentObjectType == DocumentObjectType.Field:
                        field = para if isinstance(para, Field) else None
                        if field.Type == FieldType.FieldHyperlink:
                            hyperlinks.append(field)

    # Iterate through hyperlinks and update them
    for hyperlink in hyperlinks:
        # Get the current display text and URL
        current_url = hyperlink.Code.replace('HYPERLINK "', '').replace('"', '')
        match = re.search(r'HYPERLINK "(.*?)"', hyperlink.Code)
        if match:
            current_url = match.group(1)

        current_display_text = hyperlink.FieldText
        localized_url = localize_URL(current_url, my_langue)
        if localized_url:
            # Update the URL of the hyperlink (the display text is kept)
            #hyperlink.FieldText = "NEW DISPLAY TEXT"  # Replace with your new display text
            hyperlink.Code = f'HYPERLINK "{localized_url}"'

    if len(hyperlinks) > 0:
        # Naming the output file
        last_slash_index = my_chemin_docx.rfind('/')
        if last_slash_index != -1:
            extracted_string = f"{my_chemin_docx[last_slash_index + 1:]}"
            extracted_string = extracted_string.replace("#", "")
        else:
            #print("No '/' found in the URL.")
            extracted_string = my_chemin_docx
            extracted_string = extracted_string.replace("#", "")

        output_directory = '/content'
        output_path = f"{output_directory}/output_{my_langue}_{extracted_string[0:len(extracted_string)//2]}.docx"

        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Save the document to a docx file
        print("\n\nSaving the output file:")
        doc.SaveToFile(output_path, FileFormat.Docx)
        print(f"Output file saved successfully in your content folder as:\n\t{output_path}")
        doc.Close()
    else:
        print(f"ERROR on processing the file: {my_chemin_docx}")

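# Illustrative usage (not in the original file; hypothetical path):
#   docx2docx_replacer("/content/my_report.docx", "ru")
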
# 6. HTML downloader and link replacer -------------------- ##

def link2_html_converter(UNEP_URL_DOWNREPLACE: str, language_code: str):
    """Takes an input link from the UNEP website. It downloads the webpage's
    translatable content, replaces its links with the localized versions and
    exports a .txt file with the HTML tags, ready to be used in any CAT tool
    for human translation.
    """
    modified_html = localize_UNEP_html(language_code, UNEP_URL_DOWNREPLACE)

    if not modified_html:
        print("ERROR: The input URL might not be accessible, or is not a URL.")
        raise ValueError("The input URL might not be accessible, or is not a URL.")

    print(f"\nFile to be exported in your folder, or\n\n\t\tcopy the result from below :\n\n\n{modified_html}")

    # Name the file based on the webpage's name
    last_slash_index = UNEP_URL_DOWNREPLACE.rfind('/')
    if last_slash_index != -1:
        extracted_string = f"{UNEP_URL_DOWNREPLACE[last_slash_index + 1:]}_replacedURLs_{language_code}.txt"
        extracted_string = extracted_string.replace("#", "")
    else:
        #print("No '/' found in the URL.")
        extracted_string = UNEP_URL_DOWNREPLACE + ".txt"
        extracted_string = extracted_string.replace("#", "")

    # Save the modified HTML content to a .txt file in the current folder
    with open(extracted_string, 'w', encoding='utf-8') as file:
        file.write(modified_html)
        print(f"File {extracted_string} exported successfully")

    # Force download in Google Colab (ignored elsewhere)
    try:
        from google.colab import files
        files.download(extracted_string)
    except ImportError:
        pass

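# Illustrative usage (not in the original file; URL taken from the commented UNEP_URL_DOWNREPLACE
# example above):
#   link2_html_converter("https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#", "es")
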
# Install necessary libraries
#!pip install gradio

import gradio as gr

# Define the function wired to the Gradio interface
def render_html(htmltext, language):
    """Replaces the href of every <a> tag in the pasted HTML with its localized version."""
    soup = BeautifulSoup(htmltext, 'html.parser')
    for a_tag in soup.find_all('a'):
        # Get the current href attribute value
        current_href = a_tag.get('href', '')

        # Localize the URL using the provided language code
        localized_url = localize_URL(current_href, language)

        # Update the href attribute with the localized URL
        if localized_url is not None:
            a_tag['href'] = localized_url

    # Return the modified HTML content
    output = str(soup)
    return output

# Create the Gradio interface
with gr.Blocks() as demo:

    html_input = gr.Textbox(label="Enter HTML Code", lines=10, placeholder="Paste your HTML code here. You can convert a Word file's content into HTML by using html-cleaner.com")
    language_dropdown = gr.Dropdown(label="Select Language", choices=['es', 'fr', 'sw', 'en', 'zh-hans', 'pt-br', 'ru', 'ar'], value='es')
    html_output = gr.HTML(label="Rendered HTML")
    run_button = gr.Button("Find links!")

    run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)

# Launch the Gradio app with debug=True and share=True
demo.launch(debug=True, share=True)