Spaces:
Running
Running
Update app2.py
Browse files
app2.py
CHANGED
@@ -4,23 +4,23 @@ import re
|
|
4 |
import time
|
5 |
import logging
|
6 |
import mimetypes
|
|
|
|
|
|
|
7 |
import tempfile
|
8 |
from datetime import datetime
|
|
|
9 |
from pathlib import Path
|
10 |
from urllib.parse import urlparse
|
11 |
-
|
12 |
import requests
|
13 |
import validators
|
14 |
import gradio as gr
|
15 |
from diskcache import Cache
|
16 |
from bs4 import BeautifulSoup
|
17 |
from fake_useragent import UserAgent
|
|
|
18 |
from cleantext import clean
|
19 |
-
import qrcode
|
20 |
-
if sys.version_info >= (3, 6):
|
21 |
-
import zipfile
|
22 |
-
else:
|
23 |
-
import zipfile36 as zipfile
|
24 |
|
25 |
# Setup logging with detailed configuration
|
26 |
logging.basicConfig(
|
@@ -45,12 +45,13 @@ class URLProcessor:
|
|
45 |
'Connection': 'keep-alive',
|
46 |
'Upgrade-Insecure-Requests': '1'
|
47 |
})
|
48 |
-
|
49 |
def advanced_text_cleaning(self, text: str) -> str:
|
50 |
"""Robust text cleaning with version compatibility"""
|
51 |
try:
|
52 |
cleaned_text = clean(
|
53 |
text,
|
|
|
54 |
to_ascii=True,
|
55 |
lower=True,
|
56 |
no_line_breaks=True,
|
@@ -149,6 +150,14 @@ class URLProcessor:
|
|
149 |
# Extract main content
|
150 |
main_content = soup.find('main') or soup.find('article') or soup.body
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
# Clean and structure content
|
153 |
text_content = main_content.get_text(separator='\n', strip=True)
|
154 |
cleaned_content = self.advanced_text_cleaning(text_content)
|
@@ -202,15 +211,29 @@ class FileProcessor:
|
|
202 |
|
203 |
return dataset
|
204 |
|
205 |
-
def
|
206 |
-
"""
|
207 |
-
|
208 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
209 |
-
zip_ref.extractall(
|
210 |
-
for
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
def _process_single_file(self, file) -> List[Dict]:
|
215 |
try:
|
216 |
file_stat = os.stat(file.name)
|
@@ -247,14 +270,17 @@ def _process_single_file(self, file) -> List[Dict]:
|
|
247 |
logger.error(f"File processing error: {e}")
|
248 |
return []
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
|
|
|
|
|
|
|
|
256 |
|
257 |
-
def create_interface():
|
258 |
"""Create a comprehensive Gradio interface with advanced features"""
|
259 |
|
260 |
css = """
|
@@ -286,31 +312,12 @@ def create_interface():
|
|
286 |
placeholder="Paste your text here..."
|
287 |
)
|
288 |
|
289 |
-
with gr.Tab("JSON Editor"):
|
290 |
-
json_editor = gr.Textbox(
|
291 |
-
label="JSON Editor",
|
292 |
-
lines=20,
|
293 |
-
placeholder="View and edit your JSON data here...",
|
294 |
-
interactive=True,
|
295 |
-
elem_id="json-editor" # Optional: for custom styling
|
296 |
-
)
|
297 |
-
|
298 |
-
with gr.Tab("Scratchpad"):
|
299 |
-
scratchpad = gr.Textbox(
|
300 |
-
label="Scratchpad",
|
301 |
-
lines=10,
|
302 |
-
placeholder="Quick notes or text collections...",
|
303 |
-
interactive=True
|
304 |
-
)
|
305 |
-
|
306 |
process_btn = gr.Button("Process Input", variant="primary")
|
307 |
-
qr_btn = gr.Button("Generate QR Code", variant="secondary")
|
308 |
|
309 |
output_text = gr.Textbox(label="Processing Results", interactive=False)
|
310 |
output_file = gr.File(label="Processed Output")
|
311 |
-
qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
|
312 |
|
313 |
-
def process_all_inputs(urls, file, text
|
314 |
"""Process all input types with progress tracking"""
|
315 |
try:
|
316 |
processor = URLProcessor()
|
@@ -357,31 +364,19 @@ def create_interface():
|
|
357 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
358 |
|
359 |
summary = f"Processed {len(results)} items successfully!"
|
360 |
-
|
361 |
-
return str(output_path), summary
|
362 |
else:
|
363 |
-
return None, "No valid content to process."
|
364 |
|
365 |
except Exception as e:
|
366 |
logger.error(f"Processing error: {e}")
|
367 |
-
return None, f"Error: {str(e)}"
|
368 |
-
|
369 |
-
def generate_qr(json_data):
|
370 |
-
"""Generate QR code from JSON data and return the file path."""
|
371 |
-
if json_data:
|
372 |
-
return generate_qr_code(json_data)
|
373 |
-
return None
|
374 |
|
375 |
process_btn.click(
|
376 |
process_all_inputs,
|
377 |
-
inputs=[url_input, file_input, text_input
|
378 |
-
outputs=[output_file, output_text
|
379 |
-
)
|
380 |
-
|
381 |
-
qr_btn.click(
|
382 |
-
generate_qr,
|
383 |
-
inputs=json_editor,
|
384 |
-
outputs=qr_output
|
385 |
)
|
386 |
|
387 |
gr.Markdown("""
|
@@ -389,8 +384,6 @@ def create_interface():
|
|
389 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
390 |
- **File Input**: Upload text files or ZIP archives
|
391 |
- **Text Input**: Direct text processing
|
392 |
-
- **JSON Editor**: View and edit your JSON data
|
393 |
-
- **Scratchpad**: Quick notes or text collections
|
394 |
- Advanced cleaning and validation included
|
395 |
""")
|
396 |
|
@@ -412,6 +405,5 @@ def main():
|
|
412 |
inbrowser=True,
|
413 |
debug=True
|
414 |
)
|
415 |
-
|
416 |
if __name__ == "__main__":
|
417 |
main()
|
|
|
4 |
import time
|
5 |
import logging
|
6 |
import mimetypes
|
7 |
+
import concurrent.futures
|
8 |
+
import string
|
9 |
+
import zipfile
|
10 |
import tempfile
|
11 |
from datetime import datetime
|
12 |
+
from typing import List, Dict, Optional, Union
|
13 |
from pathlib import Path
|
14 |
from urllib.parse import urlparse
|
15 |
+
|
16 |
import requests
|
17 |
import validators
|
18 |
import gradio as gr
|
19 |
from diskcache import Cache
|
20 |
from bs4 import BeautifulSoup
|
21 |
from fake_useragent import UserAgent
|
22 |
+
from ratelimit import limits, sleep_and_retry
|
23 |
from cleantext import clean
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Setup logging with detailed configuration
|
26 |
logging.basicConfig(
|
|
|
45 |
'Connection': 'keep-alive',
|
46 |
'Upgrade-Insecure-Requests': '1'
|
47 |
})
|
48 |
+
|
49 |
def advanced_text_cleaning(self, text: str) -> str:
|
50 |
"""Robust text cleaning with version compatibility"""
|
51 |
try:
|
52 |
cleaned_text = clean(
|
53 |
text,
|
54 |
+
fix_unicode=True,
|
55 |
to_ascii=True,
|
56 |
lower=True,
|
57 |
no_line_breaks=True,
|
|
|
150 |
# Extract main content
|
151 |
main_content = soup.find('main') or soup.find('article') or soup.body
|
152 |
|
153 |
+
if main_content is None:
|
154 |
+
logger.warning(f"No main content found for URL: {url}")
|
155 |
+
return {
|
156 |
+
'content': '',
|
157 |
+
'content_type': response.headers.get('Content-Type', ''),
|
158 |
+
'timestamp': datetime.now().isoformat()
|
159 |
+
}
|
160 |
+
|
161 |
# Clean and structure content
|
162 |
text_content = main_content.get_text(separator='\n', strip=True)
|
163 |
cleaned_content = self.advanced_text_cleaning(text_content)
|
|
|
211 |
|
212 |
return dataset
|
213 |
|
214 |
+
def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
|
215 |
+
"""Process ZIP file contents"""
|
216 |
+
results = []
|
217 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
218 |
+
zip_ref.extractall(temp_dir)
|
219 |
+
for root, _, files in os.walk(temp_dir):
|
220 |
+
for filename in files:
|
221 |
+
filepath = os.path.join(root, filename)
|
222 |
+
if self.is_text_file(filepath):
|
223 |
+
try:
|
224 |
+
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
225 |
+
content = f.read()
|
226 |
+
if content.strip():
|
227 |
+
results.append({
|
228 |
+
"source": "file",
|
229 |
+
"filename": filename,
|
230 |
+
"content": content,
|
231 |
+
"timestamp": datetime.now().isoformat()
|
232 |
+
})
|
233 |
+
except Exception as e:
|
234 |
+
logger.error(f"Error reading file {filename}: {str(e)}")
|
235 |
+
return results
|
236 |
+
|
237 |
def _process_single_file(self, file) -> List[Dict]:
|
238 |
try:
|
239 |
file_stat = os.stat(file.name)
|
|
|
270 |
logger.error(f"File processing error: {e}")
|
271 |
return []
|
272 |
|
273 |
+
import qrcode # Import the qrcode library
|
274 |
+
|
275 |
+
def generate_qr(json_data):
|
276 |
+
"""Generate QR code from JSON data and return the file path."""
|
277 |
+
if json_data:
|
278 |
+
qr = qrcode.make(json_data)
|
279 |
+
qr_path = f"output/qr_code_{int(time.time())}.png"
|
280 |
+
qr.save(qr_path)
|
281 |
+
return qr_path
|
282 |
+
return None
|
283 |
|
|
|
284 |
"""Create a comprehensive Gradio interface with advanced features"""
|
285 |
|
286 |
css = """
|
|
|
312 |
placeholder="Paste your text here..."
|
313 |
)
|
314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
process_btn = gr.Button("Process Input", variant="primary")
|
|
|
316 |
|
317 |
output_text = gr.Textbox(label="Processing Results", interactive=False)
|
318 |
output_file = gr.File(label="Processed Output")
|
|
|
319 |
|
320 |
+
def process_all_inputs(urls, file, text):
|
321 |
"""Process all input types with progress tracking"""
|
322 |
try:
|
323 |
processor = URLProcessor()
|
|
|
364 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
365 |
|
366 |
summary = f"Processed {len(results)} items successfully!"
|
367 |
+
# Convert Path object to string here
|
368 |
+
return str(output_path), summary
|
369 |
else:
|
370 |
+
return None, "No valid content to process."
|
371 |
|
372 |
except Exception as e:
|
373 |
logger.error(f"Processing error: {e}")
|
374 |
+
return None, f"Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
process_btn.click(
|
377 |
process_all_inputs,
|
378 |
+
inputs=[url_input, file_input, text_input],
|
379 |
+
outputs=[output_file, output_text]
|
|
|
|
|
|
|
|
|
|
|
|
|
380 |
)
|
381 |
|
382 |
gr.Markdown("""
|
|
|
384 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
385 |
- **File Input**: Upload text files or ZIP archives
|
386 |
- **Text Input**: Direct text processing
|
|
|
|
|
387 |
- Advanced cleaning and validation included
|
388 |
""")
|
389 |
|
|
|
405 |
inbrowser=True,
|
406 |
debug=True
|
407 |
)
|
|
|
408 |
if __name__ == "__main__":
|
409 |
main()
|