Update app.py
app.py CHANGED
@@ -30,11 +30,26 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


+# Add these imports at the top
+from config import Config
+from proxy_handler import ProxyHandler
+from robots_handler import RobotsHandler
+import asyncio
+import aiohttp
+from tqdm import tqdm
+
 class URLProcessor:
     def __init__(self):
-        self.
-        self.
-        self.
+        self.config = Config()
+        self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
+        self.robots_handler = RobotsHandler()
+        self.session = self._create_session()
+
+    def _create_session(self):
+        session = requests.Session()
+        if self.config.get('USE_PROXY'):
+            session.proxies = self.proxy_handler.get_proxy_config()
+        session.headers.update({
             'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
@@ -42,133 +57,86 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+        return session

-    def
-        """Robust text cleaning with version compatibility"""
-        try:
-            cleaned_text = clean(
-                text,
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
-            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
-            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-            return text.strip()
-
-    def validate_url(self, url: str) -> Dict:
-        """Validate URL format and accessibility"""
+    def _fetch_with_selenium(self, url: str) -> Optional[str]:
         try:
-
-
-
-
-
-
-
-
-                    response.raise_for_status()
-                except (requests.exceptions.RequestException, Exception) as e:
-                    logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
-                    # Try with GET request if HEAD fails
-                    response = self.session.get(url, timeout=self.timeout, stream=True)
-                    response.raise_for_status()
-                    # Close the connection to avoid downloading the entire content
-                    response.close()
-
-                return {'is_valid': True, 'message': 'URL is valid and accessible'}
-            except requests.exceptions.ConnectionError as e:
-                if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
-                    logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
-                    time.sleep(1)  # Wait a bit before retrying
-                    continue
-                else:
-                    raise
-            except Exception as e:
-                raise
-            # If we get here, all attempts failed
-            return {'is_valid': False,
-                    'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-        except Exception as e:
-            logger.error(f"URL validation failed for {url}: {str(e)}")
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+            chrome_options = Options()
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.common.exceptions import TimeoutException
+            import time

-
-        """Universal content fetcher with special case handling"""
-        try:
-            logger.info(f"Fetching content from: {url}")
-
-            # Google Drive document handling
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            # Google Calendar ICS handling
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            # Try standard HTML processing first
-            result = self._fetch_html_content(url)
-
-            # If standard processing failed or returned minimal content, try with Selenium
-            if not result or len(result.get('content', '')) < 100:
-                logger.info(
-                    f"Standard processing failed or returned minimal content for {url}, trying Selenium")
-                selenium_html = self._fetch_with_selenium(url)
-                if selenium_html:
-                    # Process the Selenium HTML
-                    soup = BeautifulSoup(selenium_html, 'html.parser')
-                    # Remove unwanted elements
-                    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                        element.decompose()
+            logger.info(f"Attempting to fetch {url} with Selenium")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Set up Chrome options
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument(
+                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            # Initialize the driver
+            driver = webdriver.Chrome(options=chrome_options)
+
+            try:
+                # Navigate to the URL
+                driver.get(url)
+
+                # Wait for the page to load
+                WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+
+                # Simulate pressing ESC key to dismiss overlays
+                from selenium.webdriver.common.keys import Keys
+                action_chains = webdriver.ActionChains(driver)
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()  # Clear actions
+
+                # try again
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()
+
+                # Get the page source
+                page_source = driver.page_source
+
+                # Save the Selenium HTML for debugging
+                debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                logger.info(f"Saved Selenium HTML to {debug_path}")
+
+                return page_source
+            finally:
+                driver.quit()
+
+        except ImportError:
+            logger.error("Selenium is not installed. Cannot use browser automation.")
+            return None
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Selenium processing failed for {url}: {e}")
             return None

+    async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for url in urls:
+                if self.config.get('RESPECT_ROBOTS'):
+                    if not self.robots_handler.can_fetch(url, self.session.headers['User-Agent']):
+                        logger.warning(f"Skipping {url} due to robots.txt restrictions")
+                        continue
+                tasks.append(self.fetch_content_async(session, url))
+            return await asyncio.gather(*tasks)
+
     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
@@ -682,6 +650,7 @@ def create_interface():
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

@@ -805,4 +774,4 @@ def main():

 if __name__ == "__main__":
     main()
-
+
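Note: the new __init__ and _create_session rely on three helper modules (config.py, proxy_handler.py, robots_handler.py) that are not part of this diff, so their real interfaces are unknown. A minimal sketch of the shape the new code appears to assume, purely hypothetical and not the Space's actual implementation:

# Hypothetical sketch only; the real config.py / proxy_handler.py / robots_handler.py are not shown in this diff.
import os
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

class Config:
    """Assumed: simple key/value lookup, e.g. backed by environment variables."""
    def get(self, key, default=None):
        return os.environ.get(key, default)

class ProxyHandler:
    """Assumed: wraps a single proxy URL into a requests-style proxies mapping."""
    def __init__(self, proxy_url=None):
        self.proxy_url = proxy_url

    def get_proxy_config(self):
        # An empty dict means "no proxy" to requests.Session
        return {'http': self.proxy_url, 'https': self.proxy_url} if self.proxy_url else {}

class RobotsHandler:
    """Assumed: answers can_fetch() by reading the site's robots.txt."""
    def can_fetch(self, url, user_agent):
        parsed = urlparse(url)
        parser = RobotFileParser()
        parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
        try:
            parser.read()
        except Exception:
            return True  # fail open if robots.txt cannot be fetched
        return parser.can_fetch(user_agent, url)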
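Note: fetch_urls_async gathers self.fetch_content_async(session, url), but that coroutine is not defined anywhere in this diff. A rough sketch of what it would need to look like, assuming it simply fetches raw HTML with the shared aiohttp session (names and returned fields here are illustrative only):

# Hypothetical sketch; fetch_content_async is referenced above but is not part of this change.
async def fetch_content_async(self, session: aiohttp.ClientSession, url: str) -> Dict:
    try:
        async with session.get(url,
                               headers=dict(self.session.headers),
                               timeout=aiohttp.ClientTimeout(total=30)) as response:
            response.raise_for_status()
            html = await response.text()
            return {'url': url, 'content': html, 'status': response.status}
    except Exception as e:
        logger.error(f"Async fetch failed for {url}: {e}")
        return {'url': url, 'content': '', 'error': str(e)}

Because fetch_urls_async is a coroutine, a synchronous caller (for example a Gradio handler) would drive it with something like results = asyncio.run(processor.fetch_urls_async(urls)).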