Spaces:

Felguk
/

Felguk-url-to-text

Sleeping

File size: 5,132 Bytes

80fb263
 
0eaed2f
 
80fb263
 
f7d9daf
80fb263
 
f7d9daf
80fb263
 
 
0eaed2f
f7d9daf
0eaed2f
 
 
 
 
f7d9daf
0eaed2f
 
f7d9daf
0eaed2f
 
f7d9daf
0eaed2f
 
 
 
 
 
a682e5d
 
 
 
 
 
 
 
 
80fb263
 
a682e5d
80fb263
 
f7d9daf
48bc3a2
 
 
 
f7d9daf
48bc3a2
f7d9daf
 
 
48bc3a2
66c6476
f7d9daf
66c6476
 
 
 
f7d9daf
0eaed2f
 
a682e5d
 
 
 
 
48bc3a2
a682e5d
48bc3a2
f7d9daf
3c69552
 
 
 
 
f7d9daf
3c69552
f7d9daf
3c69552
 
 
f7d9daf
3c69552
 
f7d9daf
66c6476
80fb263
f7d9daf
66c6476
f7d9daf
 
48bc3a2
 
f7d9daf
48bc3a2
 
f7d9daf
 
48bc3a2
66c6476
f7d9daf
 
48bc3a2
f7d9daf
0eaed2f
 
 
a682e5d
 
 
 
 
0eaed2f
 
f7d9daf
 
 
 
0eaed2f
f7d9daf
 
0eaed2f
f7d9daf
 
a682e5d
 
 
 
 
 
80fb263
f7d9daf
48bc3a2

import gradio as gr
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: The candidate URL, normally a string typed by the user.

    Returns:
        True if ``urlparse`` yields a non-empty scheme and netloc (for
        example ``https://example.com``); False for relative paths, bare
        host names, empty input, malformed URLs, or non-string values.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed authority such as an unterminated IPv6 bracket.
        # TypeError/AttributeError: input that is not str/bytes (e.g. an int).
        # A bare ``except`` would also hide KeyboardInterrupt/SystemExit.
        return False
    return bool(parsed.scheme and parsed.netloc)

def extract_additional_resources(url, timeout=10):
    """Fetch *url* and collect absolute links to its CSS, JS, and images.

    Args:
        url: Address of the HTML page to inspect.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        A 3-tuple ``(css_links, js_links, img_links)`` of lists of URLs,
        each resolved against *url* with ``urljoin``. On any failure
        (network error, HTTP error status, unparsable markup, malformed
        link) three empty lists are returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # <link rel="stylesheet" href="..."> elements
        css_links = [
            urljoin(url, link["href"])
            for link in soup.find_all("link", rel="stylesheet")
            if "href" in link.attrs
        ]

        # <script src="..."> elements (inline scripts have no src and are skipped)
        js_links = [
            urljoin(url, script["src"])
            for script in soup.find_all("script")
            if "src" in script.attrs
        ]

        # <img src="..."> elements
        img_links = [
            urljoin(url, img["src"])
            for img in soup.find_all("img")
            if "src" in img.attrs
        ]

        return css_links, js_links, img_links
    except Exception:
        # Deliberate best-effort: resource discovery is optional, so any
        # fetch or parse problem degrades to "no resources found".
        return [], [], []

def fetch_file_content(url, timeout=10):
    """Download a text resource (CSS, JS, etc.) and return its body.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response body, or the literal string
        ``"Failed to fetch content."`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except (requests.exceptions.RequestException, ValueError):
        # RequestException covers connection, timeout, HTTP-status and
        # invalid-URL errors; ValueError covers malformed host names.
        # Interpreter-level signals (KeyboardInterrupt, SystemExit) are
        # no longer swallowed as they were by the bare ``except``.
        return "Failed to fetch content."

def convert_to_text(url):
    """Fetch *url* and return its text plus details of linked resources.

    Args:
        url: The page address entered by the user.

    Returns:
        An 8-tuple
        ``(results, text, file_path, css_links, js_links, img_links,
        css_content, js_content)``:

        * ``results``: status line and character count, or an
          ``"Error: ..."`` message.
        * ``text``: the decoded response body (``""`` on error).
        * ``file_path``: path of a ``downloaded_content.txt`` file holding
          the text (``None`` on error).
        * ``css_links`` / ``js_links`` / ``img_links``: lists of resource
          URLs found on the page.
        * ``css_content`` / ``js_content``: lists with the fetched body of
          each CSS / JS link, in the same order as the link lists.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    if not is_valid_url(url):
        return "Error: Please enter a valid URL.", "", None, [], [], [], [], []  # Return error message and empty data

    # Upper bound (seconds) so an unresponsive host cannot hang the request.
    timeout = 10

    try:
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors (e.g., 404, 500)

        # ``response.text`` decodes the payload on every access; do it once.
        page_text = response.text

        # Summarize the request
        status = f"Request status: {response.status_code}"
        content_length = f"Content size: {len(page_text)} characters"
        results = f"{status}\n{content_length}"

        # Save the text to a per-request directory so concurrent requests
        # do not overwrite each other's file; the file name is unchanged.
        file_path = os.path.join(
            tempfile.mkdtemp(prefix="url_to_text_"), "downloaded_content.txt"
        )
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(page_text)

        # Extract additional resources
        css_links, js_links, img_links = extract_additional_resources(url)

        # Fetch CSS and JS content
        css_content = [fetch_file_content(link) for link in css_links]
        js_content = [fetch_file_content(link) for link in js_links]

        return results, page_text, file_path, css_links, js_links, img_links, css_content, js_content
    except (requests.exceptions.RequestException, OSError) as e:
        # Network/HTTP failures and file-system errors are both reported to
        # the user instead of surfacing as an unhandled exception.
        return f"Error: {e}", "", None, [], [], [], [], []  # Return error message and empty data

# HTML for the "Copy Code" button.
# The handler lives in the button's ``onclick`` attribute rather than in a
# separate <script> element: markup injected into the page as HTML does not
# execute <script> tags, so a function defined there would never exist when
# the button is clicked. The selector targets the textarea inside the
# component whose elem_id is "output-text".
copy_button_html = """
<button onclick="
    const box = document.querySelector('#output-text textarea');
    navigator.clipboard.writeText(box ? box.value : '')
        .then(() => alert('Text copied to clipboard!'))
        .catch(() => alert('Failed to copy text.'));
">Copy Code</button>
"""

# Link to the CSS file
# NOTE(review): this value is handed unchanged to gr.Blocks(css=...) below.
# Whether Gradio reads it as a file path or as literal CSS text depends on
# the installed Gradio version — TODO confirm against the release in use.
css = "app.css"

# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("## URL to Text Converter")
    gr.Markdown("Enter a URL to fetch its text content and download it as a .txt file.")

    with gr.Row():
        url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")

    with gr.Row():
        results_output = gr.Textbox(label="Request Results", interactive=False)
        # elem_id is the hook used by the "Copy Code" button's selector.
        text_output = gr.Textbox(label="Text Content", interactive=True, elem_id="output-text")

    with gr.Row():
        gr.HTML(copy_button_html)  # Add the "Copy Code" button
        file_output = gr.File(label="Download File", visible=False)  # Hidden file download component

    submit_button = gr.Button("Fetch Content")

    # Add an Accordion to show/hide additional resources
    with gr.Accordion("Show/Hide Additional Resources", open=False):
        gr.Markdown("### CSS Files")
        css_output = gr.Textbox(label="CSS Files", interactive=False)

        gr.Markdown("### JS Files")
        js_output = gr.Textbox(label="JS Files", interactive=False)

        gr.Markdown("### Images")
        img_output = gr.Textbox(label="Images", interactive=False)

        gr.Markdown("### CSS Content")
        css_content_output = gr.Textbox(label="CSS Content", interactive=True)

        gr.Markdown("### JS Content")
        js_content_output = gr.Textbox(label="JS Content", interactive=True)

    # Register the handler only after every output component exists, and
    # target the Accordion textboxes. Creating fresh gr.Textbox instances
    # inside ``outputs`` rendered five stray fields below the button and left
    # the Accordion fields permanently empty.
    # The order must match the 8-tuple returned by convert_to_text.
    submit_button.click(
        fn=convert_to_text,
        inputs=url_input,
        outputs=[
            results_output, text_output, file_output,
            css_output, js_output, img_output,
            css_content_output, js_content_output,
        ],
    )

# Launch the interface
# This call sits at module top level with no `if __name__ == "__main__"`
# guard, so it runs whenever the module is executed or imported.
demo.launch()