# Hugging Face Space: HTML Content Extractor (the "Spaces: Running" banner was page residue from the Space listing, not code)
import gradio as gr
import requests
from bs4 import BeautifulSoup  # For pretty-printing HTML
| # --- Function to extract HTML --- | |
| def get_html_content(url: str): | |
| if not url: | |
| return "Please enter a URL.", "Status: No URL provided." | |
| original_url_for_error = url # Keep original input for error messages | |
| # Add https:// if no scheme is present, as requests requires it. | |
| if not (url.startswith("http://") or url.startswith("https://")): | |
| # Try https first as it's more common and secure | |
| url_https = "https://" + url | |
| url_http = "http://" + url | |
| # Briefly check if HTTPS is responsive before defaulting to it for the main request | |
| try: | |
| print(f"No scheme provided for '{original_url_for_error}', trying to determine scheme (HTTPS first)...") | |
| # Using a HEAD request to be lighter, with a short timeout | |
| response_head = requests.head(url_https, timeout=5, allow_redirects=True, headers={'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'}) | |
| if response_head.status_code < 400: | |
| url = url_https | |
| print(f"HTTPS seems responsive for '{original_url_for_error}'. Proceeding with {url}") | |
| else: | |
| # If HTTPS gives an error or non-success, try HTTP | |
| print(f"HTTPS check for '{original_url_for_error}' returned {response_head.status_code}. Trying HTTP.") | |
| response_head_http = requests.head(url_http, timeout=5, allow_redirects=True, headers={'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'}) | |
| if response_head_http.status_code < 400: | |
| url = url_http | |
| print(f"HTTP seems responsive for '{original_url_for_error}'. Proceeding with {url}") | |
| else: | |
| # If both fail, default to HTTPS for the main GET request to provide a potentially more useful error from the GET | |
| print(f"HTTP check for '{original_url_for_error}' also returned {response_head_http.status_code}. Defaulting to HTTPS for the main fetch attempt.") | |
| url = url_https # Stick with HTTPS for the main request | |
| except requests.RequestException as e: | |
| print(f"Error during scheme probing for '{original_url_for_error}': {e}. Defaulting to HTTPS for the main fetch attempt: {url_https}") | |
| url = url_https | |
| status_message = f"Attempting to fetch HTML from: {url}" | |
| print(status_message) | |
| try: | |
| # It's good practice to set a User-Agent. Some sites may block default Python/requests UAs. | |
| headers = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 HuggingFaceSpaceHTMLScraper/1.0', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', | |
| 'Accept-Language': 'en-US,en;q=0.9', | |
| 'Accept-Encoding': 'gzip, deflate, br' # Requests handles decompression | |
| } | |
| # Allow a reasonable timeout for the request | |
| response = requests.get(url, headers=headers, timeout=20) # 20 seconds timeout | |
| # This will raise an HTTPError if the HTTP request returned an unsuccessful status code (4xx or 5xx) | |
| response.raise_for_status() | |
| # Use BeautifulSoup to parse and prettify the HTML for better readability | |
| # response.content is used instead of response.text to let BS handle encoding detection better | |
| soup = BeautifulSoup(response.content, 'html.parser') | |
| pretty_html = soup.prettify() | |
| status_message = f"Successfully fetched and parsed HTML from {url} (Status: {response.status_code})." | |
| print(status_message) | |
| return pretty_html, status_message | |
| except requests.exceptions.HTTPError as e: | |
| error_detail = f"HTTP Error: {e.response.status_code} for URL: {url}." | |
| if e.response.text: # Include some of the response body if available and error is client/server | |
| error_detail += f" Response preview: {e.response.text[:200]}" | |
| print(error_detail) | |
| return f"Error fetching HTML: {error_detail}", error_detail | |
| except requests.exceptions.ConnectionError as e: | |
| error_detail = f"Connection Error: Could not connect to {url}. The server may be down or the domain name incorrect. (Details: {e})" | |
| print(error_detail) | |
| return f"Error fetching HTML: {error_detail}", error_detail | |
| except requests.exceptions.Timeout as e: | |
| error_detail = f"Timeout Error: The request to {url} timed out. The server might be too slow or unreachable. (Details: {e})" | |
| print(error_detail) | |
| return f"Error fetching HTML: {error_detail}", error_detail | |
| except requests.exceptions.RequestException as e: # Catch any other requests-related errors | |
| error_detail = f"Request Error: An error occurred while trying to fetch {url}. (Details: {e})" | |
| print(error_detail) | |
| return f"Error fetching HTML: {error_detail}", error_detail | |
| except Exception as e: | |
| # Catch any other unexpected errors, including potential BeautifulSoup errors during parsing | |
| error_detail = f"An unexpected error occurred during processing: {str(e)}" | |
| print(error_detail) | |
| return f"Error processing HTML: {error_detail}", error_detail | |
| # --- Gradio Interface --- | |
| iface = gr.Interface( | |
| fn=get_html_content, | |
| inputs=gr.Textbox( | |
| label="Website URL", | |
| placeholder="e.g., https://www.example.com or example.com" | |
| ), | |
| outputs=[ | |
| gr.Textbox(label="Extracted HTML Code", lines=25, show_copy_button=True, interactive=False), | |
| gr.Textbox(label="Status", interactive=False) | |
| ], | |
| title="HTML Content Extractor π", | |
| description=( | |
| "Enter a website URL to extract its raw HTML content. " | |
| "The tool fetches the HTML as served by the server and uses BeautifulSoup to prettify it. " | |
| "It will **not** execute JavaScript. For websites that build their content dynamically with JavaScript, " | |
| "the extracted HTML will be the initial source before JavaScript execution. " | |
| "Please be respectful of website terms of service and robots.txt." | |
| ), | |
| examples=[ | |
| ["https://gradio.app"], | |
| ["httpbin.org/html"], | |
| ["example.com"] | |
| ], | |
| flagging_options=None, # Updated from allow_flagging="never" | |
| css=".gradio-container {max-width: 1024px !important; margin: auto !important;}" # Optional: for better layout | |
| ) | |
| # --- Main launch --- | |
| if __name__ == "__main__": | |
| print("Starting Gradio HTML Extractor application...") | |
| # Disable SSR (Server-Side Rendering) as it's experimental and can cause issues | |
| iface.launch(ssr_mode=False) | |