Spaces:
Running
Running
import argparse
import atexit
import ipaddress
import json
import os
import re
import socket
import time
from typing import Dict
from urllib.parse import urlparse

import uvicorn
from DrissionPage import ChromiumPage, ChromiumOptions
from fastapi import FastAPI, HTTPException, Response
from pydantic import BaseModel
from pyvirtualdisplay import Display

from CloudflareBypasser import CloudflareBypasser
# Runtime configuration, read from the environment once at import time.
# DOCKERMODE=true switches on the container-specific browser flags and the
# virtual display used further down in this module.
DOCKER_MODE = os.getenv("DOCKERMODE", "false").lower() == "true"
# TCP port passed to uvicorn (SERVER_PORT, default 8000).
SERVER_PORT = int(os.getenv("SERVER_PORT", "8000"))
# Default for the module-level `log` flag that the endpoints forward to the
# Cloudflare bypasser. The __main__ block overwrites it from --nolog; defining
# it here keeps the endpoints from hitting a NameError when the module is
# imported by an ASGI server instead of being run as a script.
log = True
# Command-line switches intended for the Chromium process.
# NOTE(review): nothing in the visible part of this module passes this list
# to ChromiumOptions - confirm whether it is consumed elsewhere.
arguments = [
    # "--remote-debugging-port=9222" can be added here for remote debugging.
    "-no-first-run",
    "-force-color-profile=srgb",
    "-metrics-recording-only",
    "-password-store=basic",
    "-use-mock-keychain",
    "-export-tagged-pdf",
    "-no-default-browser-check",
    "-disable-background-mode",
    "-enable-features=NetworkService,NetworkServiceInProcess,LoadCryptoTokenExtension,PermuteTLSExtensions",
    "-disable-features=FlashDeprecationWarning,EnablePasswordsAccountStorage",
    "-deny-permission-prompts",
    "-disable-gpu",
    "-accept-lang=en-US",
    # "-incognito" can be added here to open the browser in incognito mode.
]
# Path of the Chrome/Chromium executable handed to ChromiumOptions.set_paths().
# Defaults to the usual Linux location; set CHROME_PATH to override it on
# systems where the browser lives elsewhere (an empty value falls back to the
# default).
browser_path = os.getenv("CHROME_PATH") or "/usr/bin/google-chrome"
# ASGI application object; it is handed to uvicorn.run() in the __main__ block.
app: FastAPI = FastAPI()
class CookieResponse(BaseModel):
    """Response body returned by ``get_cookies``."""

    # Cookie name -> cookie value, as read from the browser session.
    cookies: Dict[str, str]
    # User-Agent string reported by the browser that obtained the cookies.
    user_agent: str
def _parse_ip_literal(host: str):
    """Return *host* as an ``ipaddress`` object, or ``None`` if it is not an IP literal."""
    try:
        return ipaddress.ip_address(host)
    except ValueError:
        pass
    # Browsers also accept legacy IPv4 spellings such as "127.1",
    # "2130706433" or "0x7f000001"; inet_aton understands the same forms.
    try:
        return ipaddress.IPv4Address(socket.inet_aton(host))
    except (OSError, ValueError):
        return None


def is_safe_url(url: str) -> bool:
    """Return True if *url* may be opened by the browser on behalf of a client.

    A URL is accepted only when all of the following hold:

    * the scheme is ``http`` or ``https`` (this rejects ``file:``, browser
      internal schemes, and scheme-less input such as ``/etc/passwd`` or
      ``localhost:8000/x``);
    * it has a host, and that host is not ``localhost`` / ``*.localhost``;
    * if the host is an IP literal, the address is globally routable and not
      multicast (loopback, private, link-local, unspecified and reserved
      ranges are rejected, including IPv4-mapped IPv6 forms).

    Malformed URLs return False instead of raising.

    NOTE: host names are not resolved here, so a public DNS name that points
    at an internal address is not detected by this check.
    """
    try:
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
    except ValueError:
        # e.g. an unterminated "[...]" IPv6 host
        return False

    if parsed_url.scheme not in ("http", "https") or not hostname:
        return False

    # A trailing dot names the same host ("localhost." == "localhost").
    host = hostname.rstrip(".").lower()
    if not host or host == "localhost" or host.endswith(".localhost"):
        return False

    ip = _parse_ip_literal(host)
    if ip is None:
        # A ':' can only come from a bracketed IPv6 literal; if it did not
        # parse (e.g. it carries a zone id) refuse it rather than guess.
        return ":" not in host

    # ::ffff:a.b.c.d must be judged by the embedded IPv4 address.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped

    return ip.is_global and not ip.is_multicast
def verify_page_loaded(driver: "ChromiumPage") -> bool:
    """Return True if the page currently open in *driver* has real content.

    The ``<body>`` element is looked up with a 10 second timeout, and the page
    counts as loaded when that element's HTML is longer than 100 characters.
    Any ordinary error raised during the lookup is treated as "not loaded".
    """
    try:
        body = driver.ele("tag:body", timeout=10)
        return len(body.html) > 100
    except Exception:
        # Deliberately broad, but no longer a bare ``except:`` - so
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return False
def _build_chromium_options(proxy: str = None) -> ChromiumOptions:
    """Build the ChromiumOptions used for one browser launch."""
    options = ChromiumOptions().auto_port()
    if DOCKER_MODE:
        # NOTE(review): a fixed --remote-debugging-port is set here although
        # auto_port() was requested above - confirm the two do not conflict.
        options.set_argument("--auto-open-devtools-for-tabs", "true")
        options.set_argument("--remote-debugging-port=9222")
        options.set_argument("--no-sandbox")  # Necessary for Docker
        options.set_argument("--disable-gpu")  # Optional, helps in some cases
    options.set_paths(browser_path=browser_path).headless(False)
    if proxy:
        options.set_proxy(proxy)
    return options


def bypass_cloudflare(url: str, retries: int, log: bool, proxy: str = None) -> ChromiumPage:
    """Open *url* in a fresh browser and run the Cloudflare bypass on it.

    Up to three attempts are made. Each attempt launches a new browser, loads
    the page, waits 5 seconds, checks it with ``verify_page_loaded`` and then
    runs ``CloudflareBypasser(driver, retries, log).bypass()``. A failed
    attempt quits its browser exactly once and, unless it was the last one,
    is retried after a 3 second pause.

    Args:
        url: Page to open.
        retries: Forwarded to ``CloudflareBypasser``.
        log: Forwarded to ``CloudflareBypasser``.
        proxy: Optional proxy passed to ``ChromiumOptions.set_proxy``.

    Returns:
        The still-open ``ChromiumPage``; the caller is responsible for
        calling ``quit()`` on it.

    Raises:
        Exception: the error of the final attempt - either the generic
            "Failed to load page properly after multiple attempts" error or
            whatever the browser / bypasser raised.
    """
    max_load_retries = 3
    retry_delay_seconds = 3

    for load_attempt in range(max_load_retries):
        is_last_attempt = load_attempt == max_load_retries - 1
        driver = ChromiumPage(addr_or_opts=_build_chromium_options(proxy))
        try:
            driver.get(url)
            # Wait for initial page load
            time.sleep(5)
            if not verify_page_loaded(driver):
                raise Exception("Failed to load page properly after multiple attempts")
            CloudflareBypasser(driver, retries, log).bypass()
            return driver
        except Exception:
            # Single cleanup point, so the browser is never quit twice.
            driver.quit()
            if is_last_attempt:
                raise
        time.sleep(retry_delay_seconds)
# NOTE(review): no route decorator is visible on this handler in this view of
# the file - confirm it is registered on `app`.
async def get_cookies(url: str, retries: int = 5, proxy: str = None):
    """Return the cookies and user agent obtained after bypassing Cloudflare.

    Args:
        url: Target page; must pass ``is_safe_url``.
        retries: Forwarded to ``bypass_cloudflare``.
        proxy: Optional proxy forwarded to ``bypass_cloudflare``.

    Returns:
        A ``CookieResponse`` with the cookie name/value mapping and the
        browser's user agent.

    Raises:
        HTTPException: 400 if the URL is rejected, 500 (with the error text
            as detail) if anything fails while driving the browser.

    NOTE(review): ``bypass_cloudflare`` is a blocking call (it sleeps with
    ``time.sleep``) and is invoked directly from this coroutine.
    """
    if not is_safe_url(url):
        raise HTTPException(status_code=400, detail="Invalid URL")
    try:
        # `log` is the module-level logging flag.
        driver = bypass_cloudflare(url, retries, log, proxy)
        try:
            cookies = {cookie.get("name", ""): cookie.get("value", " ") for cookie in driver.cookies()}
            user_agent = driver.user_agent
        finally:
            # Always close the browser, even if reading from it failed.
            driver.quit()
        return CookieResponse(cookies=cookies, user_agent=user_agent)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): no route decorator is visible on this handler in this view of
# the file - confirm it is registered on `app`.
async def get_html(url: str, retries: int = 5, proxy: str = None):
    """Return the page HTML obtained after bypassing Cloudflare.

    Args:
        url: Target page; must pass ``is_safe_url``.
        retries: Forwarded to ``bypass_cloudflare``.
        proxy: Optional proxy forwarded to ``bypass_cloudflare``.

    Returns:
        A ``text/html`` ``Response`` whose body is the page HTML. The cookies
        (JSON-encoded name/value mapping) and the browser's user agent are
        attached as the ``cookies`` and ``user_agent`` response headers.

    Raises:
        HTTPException: 400 if the URL is rejected, 500 (with the error text
            as detail) if anything fails while driving the browser.

    NOTE(review): ``bypass_cloudflare`` is a blocking call (it sleeps with
    ``time.sleep``) and is invoked directly from this coroutine.
    """
    if not is_safe_url(url):
        raise HTTPException(status_code=400, detail="Invalid URL")
    try:
        # `log` is the module-level logging flag.
        driver = bypass_cloudflare(url, retries, log, proxy)
        try:
            html = driver.html
            cookies_json = {cookie.get("name", ""): cookie.get("value", " ") for cookie in driver.cookies()}
            user_agent = driver.user_agent
        finally:
            # Always close the browser, even if reading from it failed.
            driver.quit()
        response = Response(content=html, media_type="text/html")
        response.headers["cookies"] = json.dumps(cookies_json)
        response.headers["user_agent"] = user_agent
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# Script entry point: parse the CLI flags, start a virtual display when
# needed, then serve the API with uvicorn.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Cloudflare bypass api")
    cli.add_argument("--nolog", action="store_true", help="Disable logging")
    cli.add_argument("--headless", action="store_true", help="Run in headless mode")
    args = cli.parse_args()

    # The browser itself is launched with headless(False); "headless" here
    # means rendering into an off-screen virtual display instead.
    display = None
    if args.headless or DOCKER_MODE:
        display = Display(visible=0, size=(1920, 1080))
        display.start()
        # Stop the virtual display when the interpreter exits.
        atexit.register(display.stop)

    # Module-level flag read by the endpoints and passed to the bypasser.
    log = not args.nolog

    uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)