"""Download web pages and feed their visible text into the data collector."""

import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup

import extensions.superboogav2.parameters as parameters

from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Fetch a single URL and return the raw response body."""
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f"Failed to download URL {url} (status code {response.status_code})")
def _download_urls(urls, threads=1):
    """Download all URLs concurrently, yielding (progress string, results so far) as pages arrive."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = []
        for url in urls:
            future = executor.submit(_download_single, url)
            futures.append(future)

        results = []
        i = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                results.append(result)
                i += 1
                yield f"{i}/{len(urls)}", results
            except Exception:
                # A failed download is silently skipped; the counter only tracks successes.
                pass

        yield "Done", results
def feed_url_into_collector(urls, collector):
    """Download the newline-separated URLs, strip their HTML, and add the combined text to the collector."""
    all_text = ''
    cumulative = ''

    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield cumulative
    for update, contents in _download_urls(urls, threads=parameters.get_num_threads()):
        yield cumulative + update

    # `contents` now holds the full list of downloaded pages from the final yield above.
    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Remove script and style tags so that only visible text remains.
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Strong cleanup keeps only strings that look like prose (a letter followed by a space).
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join([s.strip() for s in strings])
        all_text += text

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))
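
# Usage sketch (illustrative only): this module is normally driven by the superboogav2
# Gradio UI, which consumes feed_url_into_collector() as a generator and streams each
# yielded status string back to the browser. The collector construction below is an
# assumption for illustration; the actual helper and its name live elsewhere in the
# extension and may differ:
#
#     from extensions.superboogav2.chromadb import make_collector  # hypothetical helper
#
#     collector = make_collector()
#     urls = "https://example.com\nhttps://example.org"
#     for status in feed_url_into_collector(urls, collector):
#         print(status)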