from fastapi import FastAPI, HTTPException, Query
from playwright.async_api import async_playwright
from urllib.parse import urlparse
from typing import List, Set
import re

app = FastAPI()

# In-memory cache of JS file URLs that have already been fetched, so the same
# script is not downloaded again when it appears on multiple pages
js_cache: Set[str] = set()

# Extract API-like URLs (paths containing /api/ or a dynamic-looking extension)
def extract_possible_endpoints(text: str) -> List[str]:
    pattern = re.compile(r'https?://[^\s"\'<>]+')
    urls = pattern.findall(text)
    return list(set([
        url for url in urls if '/api/' in url or re.search(r'\.(json|php|xml|ajax|aspx|jsp)', url)
    ]))
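
# A minimal illustration (hypothetical example, not part of the original module):
# URLs containing /api/ or a dynamic-looking extension are kept, while plain
# asset URLs are dropped.
#
#   extract_possible_endpoints(
#       'fetch("https://example.com/api/users"); logo = "https://example.com/logo.png";'
#   )
#   # -> ['https://example.com/api/users']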

# Extract all internal links from a page
async def extract_internal_links(page, base_url: str) -> List[str]:
    anchors = await page.eval_on_selector_all('a[href]', 'els => els.map(el => el.href)')
    domain = urlparse(base_url).netloc
    internal_links = [
        link for link in anchors
        if urlparse(link).netloc == domain
    ]
    return list(set(internal_links))


async def scrape_page_for_endpoints(page, url: str) -> List[str]:
    found_endpoints = []

    # --- Log network requests ---
    # Register the listener before navigating so that requests fired during
    # the initial page load are captured as well.
    network_logs = []

    def handle_request(req):
        network_logs.append(req.url)

    page.on("request", handle_request)

    try:
        await page.goto(url, timeout=60000)
        await page.wait_for_timeout(2000)

        # --- JS File Endpoint Extraction ---
        js_urls = await page.eval_on_selector_all(
            'script[src]',
            "elements => elements.map(el => el.src)"
        )

        js_based_endpoints = []

        for js_url in js_urls:
            if js_url in js_cache:
                continue
            js_cache.add(js_url)

            try:
                response = await page.request.get(js_url)
                if response.ok:
                    body = await response.text()
                    js_based_endpoints.extend(extract_possible_endpoints(body))
            except Exception:
                # Skip JS files that cannot be fetched (CORS, 404, timeouts)
                continue

        # Extract from network requests
        network_endpoints = extract_possible_endpoints('\n'.join(network_logs))

        # Combine all
        found_endpoints = list(set(js_based_endpoints + network_endpoints))

    except Exception as e:
        print(f"[!] Failed to scrape {url}: {e}")
    finally:
        # Detach the listener so handlers do not pile up when the same page
        # object is reused for multiple URLs
        page.remove_listener("request", handle_request)

    return found_endpoints


@app.get("/scrape-api-endpoints")
async def scrape_api_endpoints(
    website: str = Query(..., description="Website URL to scrape"),
    max_depth: int = Query(1, description="Max crawl depth (0 = base page only, 1 = base page plus its internal links)")
):
    try:
        visited = set()
        all_endpoints = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            queue = [(website, 0)]

            while queue:
                current_url, depth = queue.pop(0)
                if current_url in visited or depth > max_depth:
                    continue
                visited.add(current_url)

                print(f"[+] Scraping: {current_url}")
                endpoints = await scrape_page_for_endpoints(page, current_url)
                all_endpoints.extend(endpoints)

                if depth < max_depth:
                    try:
                        internal_links = await extract_internal_links(page, website)
                        for link in internal_links:
                            if link not in visited:
                                queue.append((link, depth + 1))
                    except Exception as e:
                        print(f"[!] Link extraction failed: {e}")

            await browser.close()

        return {
            "website": website,
            "pages_visited": len(visited),
            "total_endpoints_found": len(set(all_endpoints)),
            "api_endpoints": list(set(all_endpoints)),
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
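

# A sketch of how this service might be run and queried locally. The uvicorn
# invocation, module name, and host/port below are assumptions for illustration,
# not part of the original module:
#
#   uvicorn main:app --host 127.0.0.1 --port 8000
#
#   curl "http://127.0.0.1:8000/scrape-api-endpoints?website=https://example.com&max_depth=1"

if __name__ == "__main__":
    # Optional entry point so the module can also be started with `python main.py`
    # (assumes the file is named main.py and uvicorn is installed)
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=8000)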