"""Download web pages and feed their visible text into the data collector."""

import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup

import extensions.superboogav2.parameters as parameters

from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Fetch a single URL and return the raw response body."""
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f"Failed to download URL {url} (status code {response.status_code})")
def _download_urls(urls, threads=1):
    """Download all URLs concurrently, yielding (progress string, results so far) as pages arrive."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = []
        for url in urls:
            future = executor.submit(_download_single, url)
            futures.append(future)

        results = []
        i = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                results.append(result)
                i += 1
                yield f"{i}/{len(urls)}", results
            except Exception:
                # A failed download is silently skipped; the counter only tracks successes.
                pass

        yield "Done", results
def feed_url_into_collector(urls, collector):
    """Download the newline-separated URLs, strip their HTML, and add the combined text to the collector."""
    all_text = ''
    cumulative = ''

    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield cumulative
    for update, contents in _download_urls(urls, threads=parameters.get_num_threads()):
        yield cumulative + update

    # `contents` now holds the full list of downloaded pages from the final yield above.
    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Remove script and style tags so that only visible text remains.
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Strong cleanup keeps only strings that look like prose (a letter followed by a space).
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join([s.strip() for s in strings])
        all_text += text

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))
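
# Usage sketch (illustrative only): this module is normally driven by the superboogav2
# Gradio UI, which consumes feed_url_into_collector() as a generator and streams each
# yielded status string back to the browser. The collector construction below is an
# assumption for illustration; the actual helper and its name live elsewhere in the
# extension and may differ:
#
#     from extensions.superboogav2.chromadb import make_collector  # hypothetical helper
#
#     collector = make_collector()
#     urls = "https://example.com\nhttps://example.org"
#     for status in feed_url_into_collector(urls, collector):
#         print(status)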