File size: 3,080 Bytes
60a25ab
 
 
 
 
 
1b5b9ce
 
 
 
 
60a25ab
 
1b5b9ce
 
 
 
 
 
 
60a25ab
 
1b5b9ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60a25ab
1b5b9ce
 
 
 
 
 
 
 
 
60a25ab
 
1b5b9ce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import requests
from bs4 import BeautifulSoup
import json
import time
from tqdm import tqdm
import zipfile
import io
import os
import tempfile
import mimetypes

def fetch_content(url):
    """Download the raw HTML of *url*; return None on any request failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text

def extract_text(html):
    """Return the visible text of an HTML document, one non-empty chunk per line.

    Script and style elements are removed before the text is collected.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    raw = soup.get_text()
    stripped = (ln.strip() for ln in raw.splitlines())
    pieces = (part.strip() for ln in stripped for part in ln.split("  "))
    return '\n'.join(piece for piece in pieces if piece)

def process_urls(urls):
    """Fetch each URL in *urls* and return a list of dataset records.

    URLs that fail to fetch are skipped. A one-second pause follows each
    request so the remote server is not hammered.
    """
    records = []
    for url in tqdm(urls, desc="Fetching URLs"):
        html = fetch_content(url)
        if html:
            records.append({
                "source": "url",
                "url": url,
                "content": extract_text(html)
            })
        time.sleep(1)  # Be polite to the server
    return records

def process_file(file):
    """Convert an uploaded file (optionally a zip archive) into dataset records.

    Each record is a dict with keys "source", "filename", "content". Text
    files (per mimetypes) contribute their full content; all other files are
    recorded by name with a placeholder content string. Zip archives are
    expanded into a temporary directory and every member is processed.

    *file* is a Gradio-style upload object exposing ``.name`` (a path) and,
    for the non-zip branch, ``.read()`` returning bytes.
    """
    dataset = []
    with tempfile.TemporaryDirectory() as temp_dir:
        if zipfile.is_zipfile(file.name):
            with zipfile.ZipFile(file.name, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    filepath = os.path.join(root, filename)
                    mime_type, _ = mimetypes.guess_type(filepath)
                    if mime_type and mime_type.startswith('text'):
                        # Explicit UTF-8: the platform default encoding is not
                        # portable; errors='ignore' tolerates stray bytes.
                        with open(filepath, 'r', encoding='utf-8',
                                  errors='ignore') as f:
                            content = f.read()
                        dataset.append({
                            "source": "file",
                            "filename": filename,
                            "content": content
                        })
                    else:
                        # For non-text files, just store the filename
                        dataset.append({
                            "source": "file",
                            "filename": filename,
                            "content": "Binary file - content not extracted"
                        })
        else:
            mime_type, _ = mimetypes.guess_type(file.name)
            if mime_type and mime_type.startswith('text'):
                content = file.read().decode('utf-8', errors='ignore')
                dataset.append({
                    "source": "file",
                    "filename": os.path.basename(file.name),
                    "content": content
                })
            else:
                # For non-text files, just store the filename
                dataset.append({
                    "source": "file",
                    "filename": os.path.basename(file.name),
                    "content": "Binary file - content not extracted"
                })
    return dataset

def process_text(text):
    """Wrap pasted text as a single-record dataset list."""
    record = {"source": "text_input", "content": text}
    return [record]

def create_dataset(urls, file, text_input):
    """Combine URL scrapes, an uploaded file, and pasted text into one dataset.

    Any of the three inputs may be empty/None and is then skipped. The
    combined records are written to 'combined_dataset.json' in the working
    directory and that filename is returned for Gradio to serve as a download.
    """
    dataset = []
    if urls:
        # Tolerate whitespace around commas and skip empty entries
        # (e.g. a trailing comma) so process_urls sees clean URLs.
        url_list = [u.strip() for u in urls.split(',') if u.strip()]
        dataset.extend(process_urls(url_list))
    if file:
        dataset.extend(process_file(file))
    if text_input:
        dataset.extend(process_text(text_input))

    # Save the dataset as JSON
    with open('combined_dataset.json', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)

    return 'combined_dataset.json'
# Gradio Interface
# (the two header lines below L356/L369 in the scraped source lost their '#'
# and were bare-text SyntaxErrors; restored here as comments)
iface = gr.Interface(
    fn=create_dataset,
    inputs=[
        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
        gr.File(label="Upload file (including zip files)"),
        gr.Textbox(lines=10, label="Enter or paste large text")
    ],
    outputs=gr.File(label="Download Combined Dataset"),
    title="URL, File, and Text to Dataset Converter",
    description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
)

# Launch the interface
iface.launch()