# urld / app.py
# acecalisto3 -- "Create app.py" -- commit 60a25ab (verified) -- 1.74 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
import json
import time
from tqdm import tqdm
def fetch_content(url):
    """Download ``url`` and return the response body as text.

    Issues a GET request with a 10-second timeout. Any
    ``requests.RequestException`` -- including the HTTP error raised by
    ``raise_for_status()`` -- is reported on stdout and results in ``None``
    being returned instead of the body.
    """
    body = None
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        body = reply.text
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
    return body
def extract_text(html):
    """Return the visible text of an HTML document, one phrase per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each line of the extracted text is stripped, broken into
    phrases wherever two consecutive spaces occur, and the non-empty
    phrases are joined with newlines.

    Args:
        html: HTML markup to parse. Empty or ``None`` input yields ``""``.

    Returns:
        The cleaned text as a single newline-separated string.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    # Script and style bodies are code, not readable content.
    for tag in soup(["script", "style"]):
        tag.decompose()

    phrases = []
    for line in soup.get_text().splitlines():
        # Split on a double space only: splitting on a single space would put
        # every word on its own line and destroy the sentence structure.
        for phrase in line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                phrases.append(phrase)
    return '\n'.join(phrases)
def create_dataset(urls, delay=1.0):
    """Fetch each URL and collect its extracted text.

    Args:
        urls: Iterable of URL strings to fetch, in order.
        delay: Seconds to wait between consecutive requests so the remote
            servers are not hammered. Defaults to one second; a value of
            zero or less disables the pause.

    Returns:
        A list of ``{"url": ..., "content": ...}`` dicts, one per URL whose
        fetch returned non-empty HTML. URLs that fail to download are
        skipped.
    """
    dataset = []
    for index, url in enumerate(tqdm(urls, desc="Fetching URLs")):
        # Pause only *between* requests: sleeping after the final URL would
        # just delay the result without protecting any server.
        if index and delay > 0:
            time.sleep(delay)
        html = fetch_content(url)
        if html:
            dataset.append({
                "url": url,
                "content": extract_text(html),
            })
    return dataset
def process_urls(url_input):
    """Build a dataset from user-supplied URLs and save it as JSON.

    Args:
        url_input: Text containing URLs separated by commas and/or
            newlines. Surrounding whitespace is ignored and blank entries
            (e.g. from a trailing comma or empty input) are dropped.

    Returns:
        The path of the written file, ``'dataset.json'``. The file is
        overwritten on every call.
    """
    # The input box is multi-line, so treat line breaks as separators too.
    raw_entries = (url_input or "").replace("\n", ",").split(",")
    # Drop blanks so an empty string is never passed on as a URL to fetch.
    urls = [entry.strip() for entry in raw_entries if entry.strip()]
    dataset = create_dataset(urls)
    # Save the dataset as JSON
    with open('dataset.json', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)
    return 'dataset.json'
# Gradio Interface: a multi-line text box for the URLs in, a downloadable
# file out, wired to process_urls.
_url_box = gr.Textbox(
    lines=5,
    placeholder="Enter comma-separated URLs here...",
)
_dataset_file = gr.File(label="Download Dataset")
iface = gr.Interface(
    process_urls,
    inputs=_url_box,
    outputs=_dataset_file,
    title="URL to Dataset Converter",
    description="Enter a list of comma-separated URLs to create a dataset for AI training.",
)

# Start serving the interface.
iface.launch()