# NOTE: file-viewer chrome (Hugging Face Spaces status lines, commit hashes,
# and line-number gutter) was captured with this source; preserved here as a
# comment. Reported file size: 6,822 bytes.
import gradio as gr
import requests
from bs4 import BeautifulSoup # For pretty-printing HTML
# --- Function to extract HTML ---
def _resolve_scheme(bare_url: str, original_url: str) -> str:
    """Return *bare_url* prefixed with a working scheme (HTTPS preferred).

    Probes the host with lightweight HEAD requests (5 s timeout) — HTTPS
    first, then HTTP. If neither probe succeeds, defaults to HTTPS so the
    subsequent GET produces a potentially more useful error.

    Args:
        bare_url: URL entered by the user, without "http://"/"https://".
        original_url: The user's original input, used only in log messages.

    Returns:
        A fully-qualified URL string beginning with "https://" or "http://".
    """
    url_https = "https://" + bare_url
    url_http = "http://" + bare_url
    probe_headers = {'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'}
    try:
        print(f"No scheme provided for '{original_url}', trying to determine scheme (HTTPS first)...")
        # HEAD is lighter than GET for a pure reachability probe.
        head_https = requests.head(url_https, timeout=5, allow_redirects=True,
                                   headers=probe_headers)
        if head_https.status_code < 400:
            print(f"HTTPS seems responsive for '{original_url}'. Proceeding with {url_https}")
            return url_https
        # HTTPS gave an error or non-success status; fall back to probing HTTP.
        print(f"HTTPS check for '{original_url}' returned {head_https.status_code}. Trying HTTP.")
        head_http = requests.head(url_http, timeout=5, allow_redirects=True,
                                  headers=probe_headers)
        if head_http.status_code < 400:
            print(f"HTTP seems responsive for '{original_url}'. Proceeding with {url_http}")
            return url_http
        # Both probes failed: default to HTTPS so the main GET surfaces the error.
        print(f"HTTP check for '{original_url}' also returned {head_http.status_code}. Defaulting to HTTPS for the main fetch attempt.")
        return url_https
    except requests.RequestException as e:
        # Probe itself blew up (DNS, timeout, ...): default to HTTPS.
        print(f"Error during scheme probing for '{original_url}': {e}. Defaulting to HTTPS for the main fetch attempt: {url_https}")
        return url_https


def get_html_content(url: str):
    """Fetch a URL and return its prettified HTML plus a status message.

    Args:
        url: Website address; a scheme is probed and added if missing.

    Returns:
        A 2-tuple of strings ``(html_or_error, status_message)`` — the Gradio
        interface binds these to its two output textboxes. Errors are reported
        in-band (never raised) so the UI always receives two strings.
    """
    if not url:
        return "Please enter a URL.", "Status: No URL provided."
    original_url_for_error = url  # Keep original input for error messages
    # requests refuses URLs without a scheme, so determine one if absent.
    if not (url.startswith("http://") or url.startswith("https://")):
        url = _resolve_scheme(url, original_url_for_error)
    status_message = f"Attempting to fetch HTML from: {url}"
    print(status_message)
    try:
        # Some sites block default Python/requests UAs, so mimic a browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 HuggingFaceSpaceHTMLScraper/1.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br'  # Requests handles decompression
        }
        response = requests.get(url, headers=headers, timeout=20)  # 20 seconds timeout
        # Raises HTTPError on a 4xx/5xx status, handled below.
        response.raise_for_status()
        # Parse raw bytes (response.content) so BeautifulSoup can detect the
        # encoding itself, then prettify for readability.
        soup = BeautifulSoup(response.content, 'html.parser')
        pretty_html = soup.prettify()
        status_message = f"Successfully fetched and parsed HTML from {url} (Status: {response.status_code})."
        print(status_message)
        return pretty_html, status_message
    except requests.exceptions.HTTPError as e:
        error_detail = f"HTTP Error: {e.response.status_code} for URL: {url}."
        if e.response.text:  # Include a preview of the response body if available
            error_detail += f" Response preview: {e.response.text[:200]}"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.ConnectionError as e:
        error_detail = f"Connection Error: Could not connect to {url}. The server may be down or the domain name incorrect. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.Timeout as e:
        error_detail = f"Timeout Error: The request to {url} timed out. The server might be too slow or unreachable. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.RequestException as e:
        # Catch-all for any other requests-level failure.
        error_detail = f"Request Error: An error occurred while trying to fetch {url}. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except Exception as e:
        # Unexpected errors, including potential BeautifulSoup parse failures.
        error_detail = f"An unexpected error occurred during processing: {str(e)}"
        print(error_detail)
        return f"Error processing HTML: {error_detail}", error_detail
# --- Gradio Interface ---

# Long UI strings pulled out of the constructor call for readability.
_DESCRIPTION = (
    "Enter a website URL to extract its raw HTML content. "
    "The tool fetches the HTML as served by the server and uses BeautifulSoup to prettify it. "
    "It will **not** execute JavaScript. For websites that build their content dynamically with JavaScript, "
    "the extracted HTML will be the initial source before JavaScript execution. "
    "Please be respectful of website terms of service and robots.txt."
)

_EXAMPLES = [
    ["en.wikipedia.org/wiki/Python_(programming_language)"],
    ["httpbin.org/html"],
    ["example.com"],
]

iface = gr.Interface(
    fn=get_html_content,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="e.g., https://www.example.com or example.com",
    ),
    # Two outputs: the prettified HTML and a human-readable status line.
    outputs=[
        gr.Textbox(label="Extracted HTML Code", lines=25, show_copy_button=True, interactive=False),
        gr.Textbox(label="Status", interactive=False),
    ],
    title="HTML Content Extractor π",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    cache_examples=False,  # Disable example caching
    flagging_options=None,
    css=".gradio-container {max-width: 1024px !important; margin: auto !important;}",
)
# --- Main launch ---
if __name__ == "__main__":
    # Announce startup, then block on the Gradio server loop.
    print("Starting Gradio HTML Extractor application...")
    iface.launch(ssr_mode=False)