Update app.py

app.py CHANGED
@@ -1,31 +1,19 @@
-
 import json
 import os
 import re
 import time
-try:
-    import matplotlib
-except ImportError:
-    import subprocess
-    subprocess.run(["pip", "install", "matplotlib"])
-    import matplotlib
 import logging
 import mimetypes
-import concurrent.futures
-import string
 import zipfile
 import tempfile
 from datetime import datetime
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional
 from pathlib import Path
-from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
-from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 
 # Setup logging with detailed configuration
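The block removed above installed matplotlib by shelling out to pip at import time. For reference, a minimal sketch of a guard that fails loudly instead, assuming the dependency belongs in the Space's requirements.txt (this sketch is not part of the commit):

import importlib.util

# Verify an optional dependency without invoking pip at import time.
# "matplotlib" mirrors the package the deleted try/except handled.
if importlib.util.find_spec("matplotlib") is None:
    raise ImportError(
        "matplotlib is missing; declare it in requirements.txt so the "
        "Space installs it at build time."
    )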
@@ -44,7 +32,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10 # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
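One detail worth flagging in the hunk above: the new key 'User -Agent' (note the embedded space) no longer overrides the session's default User-Agent, because requests.Session ships a default 'User-Agent: python-requests/…' entry that update() only replaces on an exact key match. The randomized agent is therefore silently dropped. A short sketch with the conventional spelling, using the fake_useragent import already present:

import requests
from fake_useragent import UserAgent

session = requests.Session()
# An exact key match replaces the default python-requests User-Agent;
# a misspelled key such as 'User -Agent' would be sent alongside it instead.
session.headers.update({'User-Agent': UserAgent().random})
print(session.headers['User-Agent'])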
@@ -92,15 +80,10 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
-            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
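For orientation, a minimal usage sketch of the dispatcher above; it assumes only the class name and method signature visible in this diff, and the URL is a hypothetical example:

processor = URLProcessor()

# Substring checks route Drive and Calendar URLs to their dedicated
# handlers; everything else goes through the generic HTML fetcher.
result = processor.fetch_content("https://example.com/article")
if result is not None:
    print(sorted(result.keys()))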
@@ -115,7 +98,7 @@ class URLProcessor:
             return None
 
         direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-        response = self.session.get(direct_url, timeout=self.timeout)
+        response = self.session.get(direct_url, timeout = self.timeout)
         response.raise_for_status()
 
         return {
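The file_id match referenced on the unchanged line sits outside this hunk's context, so the pattern below is a plausible reconstruction rather than the committed code; the Drive URL is hypothetical:

import re

url = "https://drive.google.com/file/d/1AbCdEfGhIjK/view"  # hypothetical URL
# Assumed pattern: capture the ID segment that follows /file/d/.
file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
if file_id:
    direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
    print(direct_url)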
@@ -149,14 +132,11 @@ class URLProcessor:
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
         for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
             element.decompose()
 
-        # Extract main content
         main_content = soup.find('main') or soup.find('article') or soup.body
 
-        # Clean and structure content
         text_content = main_content.get_text(separator='\n', strip=True)
         cleaned_content = self.advanced_text_cleaning(text_content)
 
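One edge case in the extraction above: with html.parser, a fragment or non-HTML response can leave soup.body as None, and since <main> and <article> may be absent too, main_content.get_text() would then raise AttributeError. A defensive sketch, not part of the commit:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>fragment with no body tag</p>", "html.parser")
main_content = soup.find("main") or soup.find("article") or soup.body
if main_content is None:
    main_content = soup  # fall back to the whole parsed document
print(main_content.get_text(separator="\n", strip=True))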
@@ -336,7 +316,6 @@ def create_interface():
                 json.dump(results, f, ensure_ascii=False, indent=2)
 
             summary = f"Processed {len(results)} items successfully!"
-            # Convert Path object to string here
             return str(output_path), summary
         else:
             return None, "No valid content to process."
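The deleted comment explained the surviving str() call: Gradio file outputs pass through JSON serialization, and plain string paths have historically been safer there than pathlib.Path objects (a general observation; the diff itself only shows the conversion). A minimal sketch of the returned shape, with a hypothetical path:

import tempfile
from pathlib import Path

output_path = Path(tempfile.gettempdir()) / "results.json"  # hypothetical
# The tuple feeds a (file, text) pair of Gradio outputs; note the str().
outputs = (str(output_path), "summary text")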