acecalisto3 committed on
Commit 52b6878 · verified · 1 Parent(s): 1b5b9ce

Update app.py

Files changed (1)
  1. app.py +103 -96
app.py CHANGED
@@ -3,121 +3,128 @@ import requests
  from bs4 import BeautifulSoup
  import json
  import time
- from tqdm import tqdm
  import zipfile
- import io
  import os
  import tempfile
  import mimetypes

  def fetch_content(url):
-     try:
-         response = requests.get(url, timeout=10)
-         response.raise_for_status()
-         return response.text
-     except requests.RequestException as e:
-         print(f"Error fetching {url}: {e}")
-         return None

  def extract_text(html):
-     soup = BeautifulSoup(html, 'html.parser')
-     for script in soup(["script", "style"]):
-         script.decompose()
-     text = soup.get_text()
-     lines = (line.strip() for line in text.splitlines())
-     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-     text = '\n'.join(chunk for chunk in chunks if chunk)
-     return text

  def process_urls(urls):
-     dataset = []
-     for url in tqdm(urls, desc="Fetching URLs"):
-         html = fetch_content(url)
-         if html:
-             text = extract_text(html)
-             dataset.append({
-                 "source": "url",
-                 "url": url,
-                 "content": text
-             })
-         time.sleep(1) # Be polite to the server
-     return dataset

  def process_file(file):
-     dataset = []
-     with tempfile.TemporaryDirectory() as temp_dir:
-         if zipfile.is_zipfile(file.name):
-             with zipfile.ZipFile(file.name, 'r') as zip_ref:
-                 zip_ref.extractall(temp_dir)
-             for root, _, files in os.walk(temp_dir):
-                 for filename in files:
-                     filepath = os.path.join(root, filename)
-                     mime_type, _ = mimetypes.guess_type(filepath)
-                     if mime_type and mime_type.startswith('text'):
-                         with open(filepath, 'r', errors='ignore') as f:
-                             content = f.read()
-                         dataset.append({
-                             "source": "file",
-                             "filename": filename,
-                             "content": content
-                         })
-                     else:
-                         # For non-text files, just store the filename
-                         dataset.append({
-                             "source": "file",
-                             "filename": filename,
-                             "content": "Binary file - content not extracted"
-                         })
-         else:
-             mime_type, _ = mimetypes.guess_type(file.name)
-             if mime_type and mime_type.startswith('text'):
-                 content = file.read().decode('utf-8', errors='ignore')
-                 dataset.append({
-                     "source": "file",
-                     "filename": os.path.basename(file.name),
-                     "content": content
-                 })
-             else:
-                 # For non-text files, just store the filename
-                 dataset.append({
-                     "source": "file",
-                     "filename": os.path.basename(file.name),
-                     "content": "Binary file - content not extracted"
-                 })
-     return dataset

  def process_text(text):
-     return [{
-         "source": "text_input",
-         "content": text
-     }]

  def create_dataset(urls, file, text_input):
-     dataset = []
-     if urls:
-         dataset.extend(process_urls(urls.split(',')))
-     if file:
-         dataset.extend(process_file(file))
-     if text_input:
-         dataset.extend(process_text(text_input))

-     # Save the dataset as JSON
-     with open('combined_dataset.json', 'w') as f:
-         json.dump(dataset, f, indent=2)

-     return 'combined_dataset.json'
- Gradio Interface
  iface = gr.Interface(
-     fn=create_dataset,
-     inputs=[
-         gr.Textbox(lines=5, label="Enter comma-separated URLs"),
-         gr.File(label="Upload file (including zip files)"),
-         gr.Textbox(lines=10, label="Enter or paste large text")
-     ],
-     outputs=gr.File(label="Download Combined Dataset"),
-     title="URL, File, and Text to Dataset Converter",
-     description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
  )

- Launch the interface
  iface.launch()
 
  from bs4 import BeautifulSoup
  import json
  import time
  import zipfile
  import os
  import tempfile
  import mimetypes
+ from tqdm import tqdm

  def fetch_content(url):
+     """Fetch content from a given URL."""
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         return response.text
+     except requests.RequestException as e:
+         print(f"Error fetching {url}: {e}")
+         return None

  def extract_text(html):
+     """Extract text from HTML content."""
+     soup = BeautifulSoup(html, 'html.parser')
+     for script in soup(["script", "style"]):
+         script.decompose()
+     text = soup.get_text()
+     lines = (line.strip() for line in text.splitlines())
+     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+     return '\n'.join(chunk for chunk in chunks if chunk)

  def process_urls(urls):
+     """Process a list of URLs and return their extracted text."""
+     dataset = []
+     for url in tqdm(urls, desc="Fetching URLs"):
+         html = fetch_content(url)
+         if html:
+             text = extract_text(html)
+             dataset.append({
+                 "source": "url",
+                 "url": url,
+                 "content": text
+             })
+         time.sleep(1) # Be polite to the server
+     return dataset

  def process_file(file):
+     """Process uploaded files (including zip files) and extract text."""
+     dataset = []
+     with tempfile.TemporaryDirectory() as temp_dir:
+         if zipfile.is_zipfile(file.name):
+             with zipfile.ZipFile(file.name, 'r') as zip_ref:
+                 zip_ref.extractall(temp_dir)
+             # Process each extracted file
+             for root, _, files in os.walk(temp_dir):
+                 for filename in files:
+                     filepath = os.path.join(root, filename)
+                     mime_type, _ = mimetypes.guess_type(filepath)
+                     if mime_type and mime_type.startswith('text'):
+                         with open(filepath, 'r', errors='ignore') as f:
+                             content = f.read()
+                         dataset.append({
+                             "source": "file",
+                             "filename": filename,
+                             "content": content
+                         })
+                     else:
+                         # For non-text files, just store the filename
+                         dataset.append({
+                             "source": "file",
+                             "filename": filename,
+                             "content": "Binary file - content not extracted"
+                         })
+         else:
+             mime_type, _ = mimetypes.guess_type(file.name)
+             if mime_type and mime_type.startswith('text'):
+                 content = file.read().decode('utf-8', errors='ignore')
+                 dataset.append({
+                     "source": "file",
+                     "filename": os.path.basename(file.name),
+                     "content": content
+                 })
+             else:
+                 # For non-text files, just store the filename
+                 dataset.append({
+                     "source": "file",
+                     "filename": os.path.basename(file.name),
+                     "content": "Binary file - content not extracted"
+                 })
+     return dataset

  def process_text(text):
+     """Process raw text input."""
+     return [{
+         "source": "text_input",
+         "content": text
+     }]

  def create_dataset(urls, file, text_input):
+     """Create a combined dataset from URLs, uploaded files, and text input."""
+     dataset = []
+     if urls:
+         dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
+     if file:
+         dataset.extend(process_file(file))
+     if text_input:
+         dataset.extend(process_text(text_input))
+
+     # Save the dataset as JSON
+     output_file = 'combined_dataset.json'
+     with open(output_file, 'w') as f:
+         json.dump(dataset, f, indent=2)

+     return output_file

+ # Gradio Interface
  iface = gr.Interface(
+     fn=create_dataset,
+     inputs=[
+         gr.Textbox(lines=5, label="Enter comma-separated URLs"),
+         gr.File(label="Upload file (including zip files)"),
+         gr.Textbox(lines=10, label="Enter or paste large text")
+     ],
+     outputs=gr.File(label="Download Combined Dataset"),
+     title="URL, File, and Text to Dataset Converter",
+     description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
  )

+ # Launch the interface
  iface.launch()
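
For reference, a minimal sketch of exercising the updated pipeline without the Gradio UI. It assumes the functions from the new app.py above are already defined in the current Python session (importing app.py directly would also run iface.launch()); the URL and text below are illustrative placeholders, not part of the commit.

# Sketch only: assumes create_dataset and its helpers from the commit above
# are defined in this session; the inputs are placeholder values.
import json

output_path = create_dataset(
    urls="https://example.com",                     # comma-separated string, as typed into the URL Textbox
    file=None,                                      # no uploaded file
    text_input="Some pasted text for the dataset."  # routed through process_text
)

with open(output_path) as f:
    records = json.load(f)
print(f"{len(records)} records written to {output_path}")  # up to two records: one 'url', one 'text_input'

The returned path is the same 'combined_dataset.json' that the Gradio interface offers for download, so the JSON structure (source/url/filename/content fields) matches what the UI produces.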