acecalisto3 committed · Commit 1b5b9ce · verified · 1 Parent(s): 57f4381

Update app.py

Files changed (1): app.py (+110, -48)
app.py CHANGED
@@ -1,61 +1,123 @@
-
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import json
 import time
 from tqdm import tqdm

 def fetch_content(url):
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        print(f"Error fetching {url}: {e}")
-        return None

 def extract_text(html):
-    soup = BeautifulSoup(html, 'html.parser')
-    for script in soup(["script", "style"]):
-        script.decompose()
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-    return text
-
-def create_dataset(urls):
-    dataset = []
-    for url in tqdm(urls, desc="Fetching URLs"):
-        html = fetch_content(url)
-        if html:
-            text = extract_text(html)
-            dataset.append({
-                "url": url,
-                "content": text
-            })
-        time.sleep(1) # Be polite to the server
-    return dataset
-
-def process_urls(url_input):
-    urls = [url.strip() for url in url_input.split(',')]
-    dataset = create_dataset(urls)
-
-    # Save the dataset as JSON
-    with open('dataset.json', 'w') as f:
-        json.dump(dataset, f, indent=2)
-
-    return 'dataset.json'
-
-# Gradio Interface
 iface = gr.Interface(
-    fn=process_urls,
-    inputs=gr.Textbox(lines=5, placeholder="Enter comma-separated URLs here..."),
-    outputs=gr.File(label="Download Dataset"),
-    title="URL to Dataset Converter",
-    description="Enter a list of comma-separated URLs to create a dataset for AI training.",
 )

-# Launch the interface
-iface.launch()
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import json
 import time
 from tqdm import tqdm
+import zipfile
+import io
+import os
+import tempfile
+import mimetypes

 def fetch_content(url):
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None

 def extract_text(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    for script in soup(["script", "style"]):
+        script.decompose()
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+    text = '\n'.join(chunk for chunk in chunks if chunk)
+    return text
+
+def process_urls(urls):
+    dataset = []
+    for url in tqdm(urls, desc="Fetching URLs"):
+        html = fetch_content(url)
+        if html:
+            text = extract_text(html)
+            dataset.append({
+                "source": "url",
+                "url": url,
+                "content": text
+            })
+        time.sleep(1) # Be polite to the server
+    return dataset
+
+def process_file(file):
+    dataset = []
+    with tempfile.TemporaryDirectory() as temp_dir:
+        if zipfile.is_zipfile(file.name):
+            with zipfile.ZipFile(file.name, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    mime_type, _ = mimetypes.guess_type(filepath)
+                    if mime_type and mime_type.startswith('text'):
+                        with open(filepath, 'r', errors='ignore') as f:
+                            content = f.read()
+                        dataset.append({
+                            "source": "file",
+                            "filename": filename,
+                            "content": content
+                        })
+                    else:
+                        # For non-text files, just store the filename
+                        dataset.append({
+                            "source": "file",
+                            "filename": filename,
+                            "content": "Binary file - content not extracted"
+                        })
+        else:
+            mime_type, _ = mimetypes.guess_type(file.name)
+            if mime_type and mime_type.startswith('text'):
+                content = file.read().decode('utf-8', errors='ignore')
+                dataset.append({
+                    "source": "file",
+                    "filename": os.path.basename(file.name),
+                    "content": content
+                })
+            else:
+                # For non-text files, just store the filename
+                dataset.append({
+                    "source": "file",
+                    "filename": os.path.basename(file.name),
+                    "content": "Binary file - content not extracted"
+                })
+    return dataset
+
+def process_text(text):
+    return [{
+        "source": "text_input",
+        "content": text
+    }]
+
+def create_dataset(urls, file, text_input):
+    dataset = []
+    if urls:
+        dataset.extend(process_urls(urls.split(',')))
+    if file:
+        dataset.extend(process_file(file))
+    if text_input:
+        dataset.extend(process_text(text_input))
+
+    # Save the dataset as JSON
+    with open('combined_dataset.json', 'w') as f:
+        json.dump(dataset, f, indent=2)
+
+    return 'combined_dataset.json'
+
+# Gradio Interface
 iface = gr.Interface(
+    fn=create_dataset,
+    inputs=[
+        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
+        gr.File(label="Upload file (including zip files)"),
+        gr.Textbox(lines=10, label="Enter or paste large text")
+    ],
+    outputs=gr.File(label="Download Combined Dataset"),
+    title="URL, File, and Text to Dataset Converter",
+    description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
 )

+# Launch the interface
+iface.launch()
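
A quick way to sanity-check the new multi-input flow is to call create_dataset directly, without launching the Gradio UI. The sketch below is not part of the commit: the sample URL, the throwaway zip, and the SimpleNamespace stand-in for Gradio's uploaded-file object are placeholders for illustration only.

import json
import zipfile
from types import SimpleNamespace

# Build a throwaway zip with one text file and one binary file so that both
# branches of process_file are exercised (file names are illustrative).
with zipfile.ZipFile("sample_upload.zip", "w") as zf:
    zf.writestr("notes.txt", "hello from inside the zip")
    zf.writestr("logo.png", b"\x89PNG\r\n")

# On the zip path, process_file only reads the .name attribute, so a plain
# namespace object is enough to stand in for Gradio's file wrapper here.
upload = SimpleNamespace(name="sample_upload.zip")

output_path = create_dataset(
    urls="https://example.com",                      # placeholder URL
    file=upload,
    text_input="Some pasted text for the dataset.",  # placeholder text
)

with open(output_path) as f:
    records = json.load(f)
print(f"Wrote {len(records)} records to {output_path}")

If the placeholder URL is reachable, combined_dataset.json should end up with four records: one for the URL, one for each of the two files inside the zip, and one for the pasted text.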