import gradio as gr
import requests
from bs4 import BeautifulSoup
import json
import time
import zipfile
import os
import tempfile
import mimetypes
from tqdm import tqdm


def fetch_content(url):
    """Fetch HTML content from a given URL, returning None on failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def extract_text(html):
    """Extract readable text from HTML, dropping script and style blocks."""
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text()
    # Strip each line, break lines holding multiple phrases (separated by
    # double spaces) into individual chunks, and drop empty chunks. Splitting
    # on a single space would put every word on its own line.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)
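
# Illustrative example (assumed behavior; exact output depends on the HTML's
# own whitespace):
#   extract_text("<p>Hello</p>\n<script>x()</script>\n<p>World</p>")
#   -> "Hello\nWorld"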


def process_urls(urls):
    """Fetch each URL and return a list of records with the extracted text."""
    dataset = []
    for url in tqdm(urls, desc="Fetching URLs"):
        html = fetch_content(url)
        if html:
            text = extract_text(html)
            dataset.append({
                "source": "url",
                "url": url,
                "content": text
            })
        time.sleep(1)  # Be polite to the server
    return dataset


def process_file(file):
    """Process an uploaded file (including zip archives) and extract text."""
    dataset = []
    # Depending on the Gradio version, the upload may arrive as a file-like
    # object with a .name attribute or as a plain filepath string.
    upload_path = file.name if hasattr(file, 'name') else file
    with tempfile.TemporaryDirectory() as temp_dir:
        if zipfile.is_zipfile(upload_path):
            with zipfile.ZipFile(upload_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            # Process each extracted file
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    filepath = os.path.join(root, filename)
                    mime_type, _ = mimetypes.guess_type(filepath)
                    if mime_type and mime_type.startswith('text'):
                        with open(filepath, 'r', errors='ignore') as f:
                            content = f.read()
                        dataset.append({
                            "source": "file",
                            "filename": filename,
                            "content": content
                        })
                    else:
                        # For non-text files, just store the filename
                        dataset.append({
                            "source": "file",
                            "filename": filename,
                            "content": "Binary file - content not extracted"
                        })
        else:
            mime_type, _ = mimetypes.guess_type(upload_path)
            if mime_type and mime_type.startswith('text'):
                # Read from the path rather than the upload object, whose
                # stream may already be closed or positioned at end-of-file.
                with open(upload_path, 'r', errors='ignore') as f:
                    content = f.read()
                dataset.append({
                    "source": "file",
                    "filename": os.path.basename(upload_path),
                    "content": content
                })
            else:
                # For non-text files, just store the filename
                dataset.append({
                    "source": "file",
                    "filename": os.path.basename(upload_path),
                    "content": "Binary file - content not extracted"
                })
    return dataset
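
# Record shapes produced above, for reference (filenames illustrative only):
#   {"source": "file", "filename": "notes.txt", "content": "<file text>"}
#   {"source": "file", "filename": "img.png",
#    "content": "Binary file - content not extracted"}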


def process_text(text):
    """Wrap raw text input in a single dataset record."""
    return [{
        "source": "text_input",
        "content": text
    }]


def create_dataset(urls, file, text_input):
    """Create a combined dataset from URLs, uploaded files, and text input."""
    dataset = []
    if urls:
        dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
    if file:
        dataset.extend(process_file(file))
    if text_input:
        dataset.extend(process_text(text_input))
    # Save the combined dataset as JSON
    output_file = 'combined_dataset.json'
    with open(output_file, 'w') as f:
        json.dump(dataset, f, indent=2)
    return output_file
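
# A minimal sketch of calling create_dataset directly, outside the Gradio UI
# (the URL and text are illustrative only; network access is assumed):
#   path = create_dataset("https://example.com", None, "some pasted notes")
#   # -> 'combined_dataset.json' with one record per URL, file, and text input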


# Gradio Interface
iface = gr.Interface(
    fn=create_dataset,
    inputs=[
        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
        gr.File(label="Upload file (including zip files)"),
        gr.Textbox(lines=10, label="Enter or paste large text")
    ],
    outputs=gr.File(label="Download Combined Dataset"),
    title="URL, File, and Text to Dataset Converter",
    description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
)

# Launch the interface
iface.launch()
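
# Note: iface.launch(share=True) would generate a temporary public URL, which
# can help when the app runs in a notebook or behind a firewall.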