Update app.py

app.py CHANGED
@@ -13,10 +13,11 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-import
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
+from cleantext import clean

 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -30,17 +31,13 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

 class URLProcessor:
-        self.timeout = timeout
-        self.max_retries = max_retries
-        self.concurrent_requests = concurrent_requests
-        self.ua = UserAgent()  # Initialize UserAgent
-        #
-        self.url_cache =
-        self.content_cache =
+    def __init__(self, timeout=15, max_retries=3, concurrent_requests=5, cache_dir='cache'):
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.concurrent_requests = concurrent_requests
+        self.ua = UserAgent()  # Initialize UserAgent
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)

+        # Persistent disk-based caches
+        self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
+        self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)

         self.session = requests.Session()
         self.session.headers.update({
@@ -97,63 +94,64 @@ class URLProcessor:

     @sleep_and_retry
     @limits(calls=20, period=60)  # Refined rate limiting
-    def fetch_content(self, url: str) -> Optional[str]:
-        ...
-        self.content_cache[url] = cleaned_text
-        return cleaned_text
-        ...
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        cached = self.content_cache.get(url)
+        if cached:
+            return cached
+
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract structured elements
+            title = soup.title.text.strip() if soup.title else ''
+            meta_desc = soup.find('meta', {'name': 'description'})
+            description = meta_desc['content'].strip() if meta_desc else ''
+
+            headings = [{'level': h.name, 'text': h.text.strip()}
+                        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+
+            links = [a['href'] for a in soup.find_all('a', href=True)
+                     if validators.url(a['href'])]
+
+            # Main content extraction
+            for element in soup(['script', 'style', 'nav', 'footer']):
+                element.decompose()
+
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            text = main_content.get_text(separator=' ') if main_content else ''
+
+            structured_data = {
+                'title': title,
+                'description': description,
+                'headings': headings,
+                'links': links,
+                'content': self.advanced_text_cleaning(text),
+                'status_code': response.status_code,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+
+            self.content_cache.set(url, structured_data, expire=3600)
+            return structured_data
+        except Exception as e:
+            logger.error(f"Error fetching {url}: {e}")
+            return None

     def advanced_text_cleaning(self, text: str) -> str:
-        ...
-        # Remove HTML entities
-        text = re.sub(r'&[a-zA-Z]+;', '', text)
-
-        # Normalize quotation marks
-        text = text.replace('“', '"').replace('”', '"')
-        text = text.replace('‘', "'").replace('’', "'")
-
-        # Remove excessive punctuation
-        text = re.sub(r'([.,!?]){2,}', r'\1', text)
-
-        return text.strip()
+        return clean(text,
+                     fix_unicode=True,
+                     to_ascii=True,
+                     lower=True,
+                     no_line_breaks=True,
+                     no_urls=True,
+                     no_emails=True,
+                     no_phone_numbers=True,
+                     no_numbers=False,
+                     no_digits=False,
+                     no_currency_symbols=True,
+                     no_punct=False
+                     ).strip()

 class FileProcessor:
     """Class to handle file processing"""
@@ -170,6 +168,8 @@ class FileProcessor:
         except Exception:
             return False

+
+
     def process_file(self, file) -> List[Dict]:
        """Process uploaded file with enhanced error handling"""
        if not file:
@@ -218,20 +218,23 @@ class FileProcessor:
         return results

     def _process_single_file(self, file) -> List[Dict]:
-        """Process single file"""
-        results = []
         try:
+            file_stat = os.stat(file.name)
             content = file.read().decode('utf-8', errors='ignore')
-            ...
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
-            logger.error(f"
+            logger.error(f"File processing error: {e}")
+            return []

 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
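A note on the new persistent caches: diskcache's Cache keeps entries on disk with an optional per-key expiry and an overall size cap, and get() returns None on a miss, which is what the early-return check at the top of fetch_content relies on. A minimal sketch of those semantics (the path below is illustrative):

    from diskcache import Cache

    # Same construction pattern as in __init__; the directory is created on demand.
    cache = Cache('cache/content_cache', size_limit=2**30)  # cap total size at ~1 GiB

    # Entries written with expire= are evicted after that many seconds.
    cache.set('https://example.com', {'title': 'Example'}, expire=3600)

    print(cache.get('https://example.com'))      # {'title': 'Example'} until expiry
    print(cache.get('https://missing.example'))  # None -> treated as a cache miss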
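For reference, clean-text with the flags used above lowercases, ASCII-folds, collapses line breaks, and substitutes placeholder tokens for URLs, e-mails, and phone numbers. A rough sketch (the exact placeholder tokens are the library's defaults, quoted from memory):

    from cleantext import clean

    sample = 'Café menu — visit https://cafe.example or mail us at hi@cafe.example!'
    print(clean(sample, fix_unicode=True, to_ascii=True, lower=True,
                no_line_breaks=True, no_urls=True, no_emails=True))
    # roughly: 'cafe menu - visit <url> or mail us at <email>!'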
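Finally, an end-to-end sketch of the updated fetch path, assuming this app.py is importable and that diskcache and clean-text are listed in the Space's requirements.txt (the module name app is an assumption):

    from app import URLProcessor  # assumed module name for this Space's app.py

    processor = URLProcessor(timeout=15, cache_dir='cache')

    # First call fetches over HTTP, parses with BeautifulSoup, and caches
    # the structured dict for an hour.
    result = processor.fetch_content('https://example.com')
    if result:
        print(result['title'], result['status_code'])
        print(result['content'][:200])  # cleaned via cleantext.clean

    # A repeat call within the hour is answered from the on-disk cache
    # instead of the network (the rate limiter still counts the call,
    # since @limits wraps the whole method).
    again = processor.fetch_content('https://example.com')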