# Gradio app: fetch a list of URLs, extract their visible text, and
# package the results as a downloadable JSON dataset.
import json
import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import gradio as gr
def fetch_content(url):
    """Download the raw HTML body of *url*.

    Returns the response text on success, or None when the request
    fails (connection error, timeout, or a non-2xx status).
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort: report the failure and let the caller skip this URL.
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text
def extract_text(html):
    """Strip markup from *html* and return the visible text.

    <script> and <style> subtrees are removed entirely; each remaining
    line is split on single spaces and every non-empty token is emitted
    on its own line of the returned string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Drop executable / presentational content before extracting text.
    for node in soup(["script", "style"]):
        node.decompose()
    tokens = []
    for line in soup.get_text().splitlines():
        for piece in line.strip().split(" "):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
    return '\n'.join(tokens)
def create_dataset(urls):
    """Fetch every URL and return a list of {"url", "content"} records.

    URLs whose download fails are skipped.  A one-second pause follows
    each URL (success or failure) so we don't hammer the servers.
    """
    records = []
    for url in tqdm(urls, desc="Fetching URLs"):
        page = fetch_content(url)
        if page:
            records.append({
                "url": url,
                "content": extract_text(page),
            })
        time.sleep(1)  # Be polite to the server
    return records
def process_urls(url_input):
    """Gradio callback: turn a comma-separated URL string into dataset.json.

    Parameters
    ----------
    url_input : str
        Comma-separated URLs; surrounding whitespace is ignored.

    Returns
    -------
    str
        Path of the JSON file written ('dataset.json'), consumed by gr.File.
    """
    # Drop empty entries so trailing or doubled commas don't trigger
    # bogus requests to "".
    urls = [u.strip() for u in url_input.split(',') if u.strip()]
    dataset = create_dataset(urls)
    # Save the dataset as JSON; explicit UTF-8 and ensure_ascii=False keep
    # non-ASCII page text readable instead of \uXXXX-escaped.
    with open('dataset.json', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)
    return 'dataset.json'
# Build the Gradio UI: a multi-line textbox feeding process_urls, whose
# returned file path is exposed as a downloadable artifact.
iface = gr.Interface(
    title="URL to Dataset Converter",
    description="Enter a list of comma-separated URLs to create a dataset for AI training.",
    fn=process_urls,
    inputs=gr.Textbox(lines=5, placeholder="Enter comma-separated URLs here..."),
    outputs=gr.File(label="Download Dataset"),
)

# Start the web app (blocks until the server is stopped).
iface.launch()