# Gradio app: fetch a list of URLs, extract their visible text, and
# package the results as a downloadable JSON dataset.
import json
import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import gradio as gr
def fetch_content(url):
    """Download the raw HTML body of *url*.

    Returns the response text on success, or None when the request
    fails (connection error, timeout, or a non-2xx status).
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort: report the failure and let the caller skip this URL.
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text
def extract_text(html):
    """Strip markup from *html* and return the visible text.

    <script> and <style> subtrees are removed entirely; each remaining
    line is split on single spaces and every non-empty token is emitted
    on its own line of the returned string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Drop executable / presentational content before extracting text.
    for node in soup(["script", "style"]):
        node.decompose()
    tokens = []
    for line in soup.get_text().splitlines():
        for piece in line.strip().split(" "):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
    return '\n'.join(tokens)
def create_dataset(urls):
    """Fetch every URL and return a list of {"url", "content"} records.

    URLs whose download fails are skipped.  A one-second pause follows
    each URL (success or failure) so we don't hammer the servers.
    """
    records = []
    for url in tqdm(urls, desc="Fetching URLs"):
        page = fetch_content(url)
        if page:
            records.append({
                "url": url,
                "content": extract_text(page),
            })
        time.sleep(1)  # Be polite to the server
    return records
def process_urls(url_input):
    """Gradio callback: turn a comma-separated URL string into dataset.json.

    Parameters
    ----------
    url_input : str
        Comma-separated URLs; surrounding whitespace is ignored.

    Returns
    -------
    str
        Path of the JSON file written ('dataset.json'), consumed by gr.File.
    """
    # Drop empty entries so trailing or doubled commas don't trigger
    # bogus requests to "".
    urls = [u.strip() for u in url_input.split(',') if u.strip()]
    dataset = create_dataset(urls)
    # Save the dataset as JSON; explicit UTF-8 and ensure_ascii=False keep
    # non-ASCII page text readable instead of \uXXXX-escaped.
    with open('dataset.json', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)
    return 'dataset.json'
# Build the Gradio UI: a multi-line textbox feeding process_urls, whose
# returned file path is exposed as a downloadable artifact.
iface = gr.Interface(
    title="URL to Dataset Converter",
    description="Enter a list of comma-separated URLs to create a dataset for AI training.",
    fn=process_urls,
    inputs=gr.Textbox(lines=5, placeholder="Enter comma-separated URLs here..."),
    outputs=gr.File(label="Download Dataset"),
)

# Start the web app (blocks until the server is stopped).
iface.launch()