acecalisto3 committed on
Commit
60a25ab
·
verified ·
1 Parent(s): aeab835

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import json
6
+ import time
7
+ from tqdm import tqdm
8
+
9
def fetch_content(url):
    """Download *url* and return the response body as text.

    The request is given a 10 second timeout. Any ``requests`` failure
    (including an HTTP error status) is printed and reported to the
    caller as ``None`` instead of being raised.
    """
    body = None
    try:
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx statuses into exceptions so they share the error path.
        reply.raise_for_status()
        body = reply.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return body
17
+
18
def extract_text(html):
    """Return the visible text of an HTML document, one phrase per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Every line is stripped, split into phrases on double
    spaces, and empty pieces are discarded.

    Args:
        html: HTML markup as a string.

    Returns:
        The cleaned text, with the remaining phrases joined by newlines.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()

    lines = (line.strip() for line in soup.get_text().splitlines())
    # Split on a double space, not a single one: a single-space split puts
    # every word on its own output line and destroys sentence structure,
    # while a double space only separates widely spaced phrases.
    phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(phrase for phrase in phrases if phrase)
27
+
28
def create_dataset(urls):
    """Fetch every URL in *urls* and collect the extracted page text.

    Returns a list of ``{"url": ..., "content": ...}`` dicts, one for each
    URL whose fetch returned a non-empty body. URLs that fail are left out.
    A one second pause follows every request, whether it succeeded or not.
    """
    records = []
    for link in tqdm(urls, desc="Fetching URLs"):
        page = fetch_content(link)
        if page:
            records.append({"url": link, "content": extract_text(page)})
        time.sleep(1)  # Be polite to the server
    return records
40
+
41
def process_urls(url_input):
    """Build a dataset from user-entered URLs and save it as JSON.

    Args:
        url_input: Text containing URLs separated by commas and/or
            newlines. Surrounding whitespace and blank entries are ignored.

    Returns:
        The path of the written file, ``'dataset.json'``, relative to the
        current working directory.
    """
    # The input box is multi-line, so treat a newline like a comma. Blank
    # entries (trailing comma, empty input) are dropped so they do not
    # trigger a pointless failing request.
    candidates = url_input.replace('\n', ',').split(',')
    urls = [url.strip() for url in candidates if url.strip()]
    dataset = create_dataset(urls)

    # Save the dataset as JSON
    with open('dataset.json', 'w') as f:
        json.dump(dataset, f, indent=2)

    return 'dataset.json'
50
+
51
# Gradio Interface: a multi-line textbox in, a downloadable file out.
url_box = gr.Textbox(lines=5, placeholder="Enter comma-separated URLs here...")
dataset_file = gr.File(label="Download Dataset")

iface = gr.Interface(
    fn=process_urls,
    inputs=url_box,
    outputs=dataset_file,
    title="URL to Dataset Converter",
    description="Enter a list of comma-separated URLs to create a dataset for AI training.",
)

# Launch the interface
iface.launch()