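"""HTML Content Extractor: a small Gradio app that fetches the raw HTML of a
given URL with requests (no JavaScript execution) and prettifies it with
BeautifulSoup before displaying it."""
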
import gradio as gr
import requests
from bs4 import BeautifulSoup # For pretty-printing HTML

# --- Function to extract HTML ---
def get_html_content(url: str):
    if not url:
        return "Please enter a URL.", "Status: No URL provided."
    
    original_url_for_error = url # Keep original input for error messages

    # Add https:// if no scheme is present, as requests requires it.
    if not (url.startswith("http://") or url.startswith("https://")):
        # Try https first as it's more common and secure
        url_https = "https://" + url
        url_http = "http://" + url
        
        # Briefly check if HTTPS is responsive before defaulting to it for the main request
        try:
            print(f"No scheme provided for '{original_url_for_error}', trying to determine scheme (HTTPS first)...")
            # Using a HEAD request to be lighter, with a short timeout
            response_head = requests.head(url_https, timeout=5, allow_redirects=True, headers={'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'})
            if response_head.status_code < 400:
                url = url_https
                print(f"HTTPS seems responsive for '{original_url_for_error}'. Proceeding with {url}")
            else:
                # If HTTPS gives an error or non-success, try HTTP
                print(f"HTTPS check for '{original_url_for_error}' returned {response_head.status_code}. Trying HTTP.")
                response_head_http = requests.head(url_http, timeout=5, allow_redirects=True, headers={'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'})
                if response_head_http.status_code < 400:
                    url = url_http
                    print(f"HTTP seems responsive for '{original_url_for_error}'. Proceeding with {url}")
                else:
                    # If both fail, default to HTTPS for the main GET request to provide a potentially more useful error from the GET
                    print(f"HTTP check for '{original_url_for_error}' also returned {response_head_http.status_code}. Defaulting to HTTPS for the main fetch attempt.")
                    url = url_https # Stick with HTTPS for the main request
        except requests.RequestException as e:
            print(f"Error during scheme probing for '{original_url_for_error}': {e}. Defaulting to HTTPS for the main fetch attempt: {url_https}")
            url = url_https

    status_message = f"Attempting to fetch HTML from: {url}"
    print(status_message)

    try:
        # It's good practice to set a User-Agent. Some sites may block default Python/requests UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 HuggingFaceSpaceHTMLScraper/1.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate'  # requests decodes these natively; advertising 'br' would require the brotli package
        }
        
        # Allow a reasonable timeout for the request
        response = requests.get(url, headers=headers, timeout=20) # 20 seconds timeout
        
        # This will raise an HTTPError if the HTTP request returned an unsuccessful status code (4xx or 5xx)
        response.raise_for_status()

        # Use BeautifulSoup to parse and prettify the HTML for better readability
        # response.content is used instead of response.text to let BS handle encoding detection better
        soup = BeautifulSoup(response.content, 'html.parser')
        pretty_html = soup.prettify()
        
        status_message = f"Successfully fetched and parsed HTML from {url} (Status: {response.status_code})."
        print(status_message)
        return pretty_html, status_message
        
    except requests.exceptions.HTTPError as e:
        error_detail = f"HTTP Error: {e.response.status_code} for URL: {url}."
        if e.response.text:  # Include a short preview of the response body when one is available
            error_detail += f" Response preview: {e.response.text[:200]}"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.ConnectionError as e:
        error_detail = f"Connection Error: Could not connect to {url}. The server may be down or the domain name incorrect. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.Timeout as e:
        error_detail = f"Timeout Error: The request to {url} timed out. The server might be too slow or unreachable. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.RequestException as e: # Catch any other requests-related errors
        error_detail = f"Request Error: An error occurred while trying to fetch {url}. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except Exception as e:
        # Catch any other unexpected errors, including potential BeautifulSoup errors during parsing
        error_detail = f"An unexpected error occurred during processing: {str(e)}"
        print(error_detail)
        return f"Error processing HTML: {error_detail}", error_detail

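# Quick sanity check for running this module outside Gradio (hypothetical
# usage; the target URL is only illustrative):
#
#   html, status = get_html_content("https://example.com")
#   print(status)      # e.g. "Successfully fetched and parsed HTML from ..."
#   print(html[:300])  # first few hundred characters of the prettified HTML
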
# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_html_content,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="e.g., https://www.example.com or example.com"
    ),
    outputs=[
        gr.Textbox(label="Extracted HTML Code", lines=25, show_copy_button=True, interactive=False),
        gr.Textbox(label="Status", interactive=False)
    ],
    title="HTML Content Extractor 🌐",
    description=(
        "Enter a website URL to extract its raw HTML content. "
        "The tool fetches the HTML as served by the server and uses BeautifulSoup to prettify it. "
        "It will **not** execute JavaScript. For websites that build their content dynamically with JavaScript, "
        "the extracted HTML will be the initial source before JavaScript execution. "
        "Please be respectful of website terms of service and robots.txt."
    ),
    examples=[
        ["en.wikipedia.org/wiki/Python_(programming_language)"],
        ["httpbin.org/html"],
        ["example.com"]
    ],
    cache_examples=False,  # Fetch example URLs live instead of caching their output
    flagging_mode="never",  # Hide the Flag button (older Gradio versions use allow_flagging="never")
    css=".gradio-container {max-width: 1024px !important; margin: auto !important;}" 
)

# --- Main launch ---
if __name__ == "__main__":
    print("Starting Gradio HTML Extractor application...")
    iface.launch(ssr_mode=False)