Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -49,10 +49,19 @@ def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
|
|
49 |
href = link.get('href')
|
50 |
if href:
|
51 |
full_url = urljoin(base_url, href)
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
except Exception as e:
|
57 |
logger.error(f"Error processing {url}: {e}")
|
58 |
|
|
|
49 |
href = link.get('href')
|
50 |
if href:
|
51 |
full_url = urljoin(base_url, href)
|
52 |
+
parsed_full_url = urlparse(full_url)
|
53 |
+
parsed_base_url = urlparse(base_url)
|
54 |
+
|
55 |
+
# Check if the URL is one level deeper
|
56 |
+
if (parsed_full_url.scheme == parsed_base_url.scheme and
|
57 |
+
parsed_full_url.netloc == parsed_base_url.netloc and
|
58 |
+
parsed_full_url.path.startswith(parsed_base_url.path) and
|
59 |
+
parsed_full_url.path.count('/') == parsed_base_url.path.count('/') + 1):
|
60 |
+
|
61 |
+
if full_url not in visited:
|
62 |
+
pages.extend(get_subdirectory_pages(full_url, full_url, visited, max_pages))
|
63 |
+
if len(visited) >= max_pages:
|
64 |
+
break
|
65 |
except Exception as e:
|
66 |
logger.error(f"Error processing {url}: {e}")
|
67 |
|