File size: 18,910 Bytes
80fb263
 
0eaed2f
 
4f25906
80fb263
7a18d26
 
 
 
 
 
 
 
 
 
 
 
 
 
aff9dae
80fb263
f7d9daf
80fb263
 
f7d9daf
80fb263
 
 
4f25906
 
 
 
 
 
 
 
 
aff9dae
4f25906
f7d9daf
0eaed2f
4f25906
0eaed2f
 
4e43700
 
 
0eaed2f
4e43700
 
0eaed2f
4e43700
 
0eaed2f
4e43700
 
0eaed2f
4e43700
 
 
 
 
 
 
 
4f25906
 
a682e5d
4f25906
827719f
 
 
 
80fb263
a682e5d
80fb263
 
f7d9daf
48bc3a2
 
 
4f25906
f7d9daf
48bc3a2
f7d9daf
 
 
48bc3a2
66c6476
f7d9daf
66c6476
 
 
 
f7d9daf
4f25906
a682e5d
 
48bc3a2
a682e5d
48bc3a2
aff9dae
d81c533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d06a8cb
 
aff9dae
d06a8cb
 
8b34ee3
 
 
d06a8cb
 
 
 
 
aff9dae
d06a8cb
aff9dae
d06a8cb
 
 
 
aff9dae
d06a8cb
aff9dae
8b34ee3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe55a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e19a5af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7d9daf
7a18d26
704e96a
 
aff9dae
 
 
 
 
 
 
827719f
aff9dae
 
 
 
 
 
704e96a
aff9dae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d06a8cb
aff9dae
 
 
d06a8cb
aff9dae
d81c533
 
 
 
aff9dae
704e96a
aff9dae
 
704e96a
aff9dae
d81c533
 
 
 
 
 
aff9dae
d06a8cb
 
 
aff9dae
80fb263
8b34ee3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe55a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e19a5af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7d9daf
48bc3a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import asyncio
import tempfile
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup

# HTML and JavaScript for the "Copy Code" button.
# Injected once at the top of the Blocks layout via gr.HTML; copyCode(textareaId)
# looks up the <textarea> inside the Gradio component whose elem_id matches and
# writes its value to the clipboard, alerting on success or failure.
copy_button_html = """
<script>
function copyCode(textareaId) {
    const text = document.querySelector(`#${textareaId} textarea`).value;
    navigator.clipboard.writeText(text).then(() => {
        alert("Text copied to clipboard!");
    }).catch(() => {
        alert("Failed to copy text.");
    });
}
</script>
"""

# Common functions
def is_valid_url(url):
    """Return True if *url* is a well-formed absolute URL.

    Args:
        url: Candidate URL string.

    Returns:
        bool: True when the URL has both a scheme (e.g. "https") and a
        network location (e.g. "example.com"), False otherwise.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. invalid ports);
        # the previous bare `except:` also hid unrelated bugs.
        return False
    return bool(parts.scheme and parts.netloc)

async def fetch_file_content(url):
    """Fetch the text body of a resource (CSS, JS, etc.) from a URL.

    Args:
        url: Absolute URL of the file to download.

    Returns:
        str: The response text, or a fixed failure message on any request error.
    """
    try:
        # requests is synchronous; run it in a worker thread so the asyncio
        # event loop stays responsive.
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        # Covers connection errors, timeouts, and HTTP 4xx/5xx; the previous
        # bare `except:` also masked programming errors.
        return "Failed to fetch content."

# URL to Text Converter
async def extract_additional_resources(url):
    """Collect CSS/JS/image resource links (max 5 each) referenced by a page.

    Args:
        url: Page URL to scan.

    Returns:
        tuple: (css_links, js_links, img_links, css_content, js_content).
        For a non-HTML response the raw body is returned as the single entry of
        css_content; on any error five empty lists are returned (best-effort).
    """
    try:
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()

        if 'text/html' not in response.headers.get('Content-Type', ''):
            # Not an HTML page: treat the body itself as the fetched content.
            return [], [], [], [response.text], []

        soup = BeautifulSoup(response.text, "html.parser")

        # Resolve relative links against the page URL; cap each list at 5.
        css_links = [urljoin(url, tag["href"])
                     for tag in soup.find_all("link", rel="stylesheet")
                     if "href" in tag.attrs][:5]
        js_links = [urljoin(url, tag["src"])
                    for tag in soup.find_all("script")
                    if "src" in tag.attrs][:5]
        img_links = [urljoin(url, tag["src"])
                     for tag in soup.find_all("img")
                     if "src" in tag.attrs][:5]

        # Download stylesheet and script bodies concurrently.
        css_content = await asyncio.gather(*(fetch_file_content(link) for link in css_links))
        js_content = await asyncio.gather(*(fetch_file_content(link) for link in js_links))

        return css_links, js_links, img_links, css_content, js_content
    except Exception:
        # Best-effort helper: any failure degrades to "no extra resources"
        # (the previously-bound exception variable was unused).
        return [], [], [], [], []

async def convert_to_text(url):
    """Download a page, summarize the response, and save the body to a file.

    Args:
        url: Page URL; a leading "view-source:" prefix is stripped first.

    Returns:
        tuple: (summary, body_text, file_path, css_links, js_links, img_links,
        css_content, js_content). On failure the summary holds the error text
        and the remaining slots are empty.
    """
    # Allow users to paste "view-source:" URLs straight from the browser.
    if url.startswith("view-source:"):
        url = url[len("view-source:"):]

    if not is_valid_url(url):
        return "Error: Please enter a valid URL.", "", None, [], [], [], [], []

    try:
        # Some sites reject requests that lack a browser-like User-Agent.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=5)
        response.raise_for_status()  # surface HTTP errors (e.g. 404, 500)

        status = f"Request status: {response.status_code}"
        content_length = f"Content size: {len(response.text)} characters"
        results = f"{status}\n{content_length}"

        # Write to a unique temp file: the previous fixed filename
        # ("downloaded_content.txt") was clobbered by concurrent users.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8"
        ) as tmp:
            tmp.write(response.text)
            file_path = tmp.name

        # Extract additional resources referenced by the page.
        css_links, js_links, img_links, css_content, js_content = await extract_additional_resources(url)

        return results, response.text, file_path, css_links, js_links, img_links, css_content, js_content
    except requests.exceptions.RequestException as e:
        return f"Error: {e}", "", None, [], [], [], [], []

# Model to Text Converter
async def fetch_model_info(model_url):
    """Fetch a model's description and installation instructions.

    Supports Hugging Face model pages and GitHub repositories.

    Args:
        model_url: URL of the model repository.

    Returns:
        tuple[str, str]: (description, install_instructions). Unsupported hosts
        yield ("Unsupported repository.", ""); errors yield ("Error: ...", "").
    """
    try:
        if "huggingface.co" in model_url:
            # Fetch the rendered model card page.
            response = await asyncio.to_thread(requests.get, model_url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # The model card body is rendered inside a div.prose element.
            prose = soup.find("div", {"class": "prose"})
            description = prose.get_text(strip=True) if prose else "No description available."

            # Hugging Face repo ids include the namespace ("user/model");
            # taking only the last URL segment produced ids that
            # from_pretrained() could not resolve for namespaced models.
            model_name = urlparse(model_url).path.strip("/")
            install_instructions = f"To install this model, run:\n```bash\npip install transformers\n```\nThen load the model in Python:\n```python\nfrom transformers import AutoModel, AutoTokenizer\nmodel = AutoModel.from_pretrained('{model_name}')\ntokenizer = AutoTokenizer.from_pretrained('{model_name}')\n```"

            return description, install_instructions
        elif "github.com" in model_url:
            # Raw README from the default "main" branch.
            readme_url = f"{model_url}/raw/main/README.md"
            response = await asyncio.to_thread(requests.get, readme_url, timeout=5)
            response.raise_for_status()

            description = response.text if response.text else "No description available."
            install_instructions = f"To install this model, clone the repository:\n```bash\ngit clone {model_url}.git\ncd {model_url.split('/')[-1]}\n```"

            return description, install_instructions
        else:
            return "Unsupported repository.", ""
    except Exception as e:
        return f"Error: {e}", ""

async def fetch_model_file_content(model_url, file_path):
    """Fetch one file's raw content from a Hugging Face or GitHub repository.

    Args:
        model_url: Repository URL (Hugging Face model or GitHub repo).
        file_path: Path of the file inside the repository, e.g. "config.json".

    Returns:
        str: The file content, or an "Error: ..." message.
    """
    try:
        # Both hosts expose raw files at {repo}/raw/main/{file_path}.
        if "huggingface.co" in model_url:
            # Strip "/blob/main/" in case the user pasted a file-view URL
            # (str.replace is a no-op when the substring is absent).
            model_url = model_url.replace("/blob/main/", "/")
            full_url = f"{model_url}/raw/main/{file_path}"
        elif "github.com" in model_url:
            full_url = f"{model_url}/raw/main/{file_path}"
        else:
            return "Error: Unsupported repository."

        # Fetch the file content off the event loop.
        response = await asyncio.to_thread(requests.get, full_url, timeout=5)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error: {e}"

# Space to Text Converter
async def fetch_space_file_content(space_url, file_path):
    """Return the raw text of *file_path* inside a Hugging Face Space.

    Args:
        space_url: Space URL, e.g. "https://huggingface.co/spaces/user/space".
        file_path: File inside the Space repo, e.g. "app.py".

    Returns:
        str: File content on success, otherwise an "Error: ..." message.
    """
    # Only Hugging Face Spaces are supported; they serve raw files under
    # {space_url}/raw/main/{file_path}.
    if "huggingface.co/spaces" not in space_url:
        return "Error: Unsupported repository. Please provide a Hugging Face Space URL."

    try:
        raw_url = f"{space_url}/raw/main/{file_path}"
        response = await asyncio.to_thread(requests.get, raw_url, timeout=5)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error: {e}"

# CodePen to Text Converter
async def fetch_codepen_project(codepen_url):
    """Scrape the HTML, CSS, and JavaScript panels from a CodePen page.

    Args:
        codepen_url: URL of the pen, e.g. "https://codepen.io/user/pen/id".

    Returns:
        tuple[str, str, str]: (html, css, js) panel contents; empty strings for
        missing panels, or ("Error: ...", "", "") on failure.
    """
    # Reject anything that is not a CodePen URL up front.
    if "codepen.io" not in codepen_url:
        return "Error: Please enter a valid CodePen URL.", "", ""

    try:
        response = await asyncio.to_thread(requests.get, codepen_url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        def panel_text(panel_id):
            # Each editor panel is served as a <textarea> with a known id.
            node = soup.find("textarea", {"id": panel_id})
            return node.text if node else ""

        return panel_text("html-input"), panel_text("css-input"), panel_text("js-input")
    except Exception as e:
        return f"Error: {e}", "", ""

# Web Data Extractor
async def extract_web_data(url):
    """Extract page metadata: description, first image, colors, fonts, a markup
    snippet, embedded videos, and downloadable file links.

    Args:
        url: Page URL to inspect.

    Returns:
        tuple: (description, image_preview, colors, fonts, similar_code,
        videos, files); on any error ("Error: ...", "", [], [], "", [], []).
    """
    try:
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # <meta name="description"> content if present, else a placeholder.
        meta_tag = soup.find("meta", attrs={"name": "description"})
        description = meta_tag["content"] if meta_tag else "No description available."

        # The first <img> on the page serves as the preview.
        first_img = soup.find("img")
        image_preview = first_img["src"] if first_img else "No image preview available."

        # Hex color tokens from inline <style> blocks (up to 5 unique values).
        color_tokens = []
        for style_tag in soup.find_all("style"):
            color_tokens.extend(tok for tok in style_tag.text.split() if tok.startswith("#"))
        colors = list(set(color_tokens))[:5]

        # Google Fonts stylesheet links (up to 5 unique values).
        font_links = [
            tag["href"]
            for tag in soup.find_all("link", attrs={"href": True})
            if "fonts.googleapis.com" in tag["href"]
        ]
        fonts = list(set(font_links))[:5]

        # A short snippet of the parsed markup.
        similar_code = str(soup)[:1000]

        # Embedded players are <iframe> elements carrying a src attribute.
        videos = [frame["src"] for frame in soup.find_all("iframe") if "src" in frame.attrs]

        # Anchors pointing at common downloadable document types.
        files = [
            anchor["href"]
            for anchor in soup.find_all("a", attrs={"href": True})
            if anchor["href"].endswith((".pdf", ".zip", ".doc", ".docx", ".xls", ".xlsx"))
        ]

        return description, image_preview, colors, fonts, similar_code, videos, files
    except Exception as e:
        return f"Error: {e}", "", [], [], "", [], []

# Create the Gradio interface.
# NOTE: all event handlers are async; Gradio awaits them natively.
with gr.Blocks() as demo:
    gr.HTML(copy_button_html)  # Inject the shared "Copy Code" script once

    with gr.Tabs():
        # Tab 1: URL to Text Converter
        with gr.Tab("URL to Text Converter"):
            gr.Markdown("## URL to Text Converter")
            gr.Markdown("Enter a URL to fetch its text content and download it as a .txt file.")

            with gr.Row():
                url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com or view-source:https://example.com")

            with gr.Row():
                results_output = gr.Textbox(label="Request Results", interactive=False)
                text_output = gr.Textbox(label="Text Content", interactive=True, elem_id="output-text")

            with gr.Row():
                gr.HTML("<button onclick='copyCode(\"output-text\")'>Copy Code</button>")  # "Copy Code" button
                file_output = gr.File(label="Download File", visible=False)  # Hidden file download component

            submit_button = gr.Button("Fetch Content")

            # Accordion holding the additional-resource displays. These are the
            # components the click handler writes into.
            with gr.Accordion("Show/Hide Additional Resources", open=False):
                gr.Markdown("### CSS Files")
                css_output = gr.Textbox(label="CSS Files", interactive=False)

                gr.Markdown("### JS Files")
                js_output = gr.Textbox(label="JS Files", interactive=False)

                gr.Markdown("### Images")
                img_output = gr.Textbox(label="Images", interactive=False)

                gr.Markdown("### CSS Content")
                css_content_output = gr.Textbox(label="CSS Content", interactive=True)

                gr.Markdown("### JS Content")
                js_content_output = gr.Textbox(label="JS Content", interactive=True)

            # Wire the click to the accordion components. Previously the outputs
            # were throwaway inline Textboxes, so the accordion never updated.
            submit_button.click(
                fn=convert_to_text,
                inputs=url_input,
                outputs=[
                    results_output, text_output, file_output,
                    css_output, js_output, img_output,
                    css_content_output, js_content_output,
                ],
            )

        # Tab 2: Model to Text Converter
        with gr.Tab("Model to Text Converter"):
            gr.Markdown("## Model to Text Converter")
            gr.Markdown("Enter a link to a model on Hugging Face or GitHub, and specify the file path.")

            with gr.Row():
                model_url_input = gr.Textbox(label="Model URL", placeholder="https://huggingface.co/... or https://github.com/...")
                file_path_input = gr.Textbox(label="File Path", placeholder="e.g., config.json or README.md")

            with gr.Row():
                model_description_output = gr.Textbox(label="Model Description", interactive=False)
                install_instructions_output = gr.Textbox(label="Installation Instructions", interactive=False)

            with gr.Row():
                model_content_output = gr.Textbox(label="File Content", interactive=True, elem_id="model-content-output")

            with gr.Row():
                gr.HTML("<button onclick='copyCode(\"model-content-output\")'>Copy Code</button>")  # "Copy Code" button

            # One button drives two independent handlers: model info + file content.
            submit_model_button = gr.Button("Fetch Model Info and File Content")
            submit_model_button.click(
                fn=fetch_model_info,
                inputs=[model_url_input],
                outputs=[model_description_output, install_instructions_output]
            )
            submit_model_button.click(
                fn=fetch_model_file_content,
                inputs=[model_url_input, file_path_input],
                outputs=[model_content_output]
            )

        # Tab 3: Space to Text Converter
        with gr.Tab("Space to Text Converter"):
            gr.Markdown("## Space to Text Converter")
            gr.Markdown("Enter a link to a Hugging Face Space and specify the file path to fetch its content.")

            with gr.Row():
                space_url_input = gr.Textbox(label="Space URL", placeholder="https://huggingface.co/spaces/...")
                space_file_path_input = gr.Textbox(label="File Path", placeholder="e.g., app.py or README.md")

            with gr.Row():
                space_content_output = gr.Textbox(label="File Content", interactive=True, elem_id="space-content-output")

            with gr.Row():
                gr.HTML("<button onclick='copyCode(\"space-content-output\")'>Copy Code</button>")  # "Copy Code" button

            submit_space_button = gr.Button("Fetch File Content")
            submit_space_button.click(
                fn=fetch_space_file_content,
                inputs=[space_url_input, space_file_path_input],
                outputs=[space_content_output]
            )

        # Tab 4: CodePen to Text Converter
        with gr.Tab("CodePen to Text Converter"):
            gr.Markdown("## CodePen to Text Converter")
            gr.Markdown("Enter a CodePen project URL to fetch its HTML, CSS, and JavaScript content.")

            with gr.Row():
                codepen_url_input = gr.Textbox(label="CodePen URL", placeholder="https://codepen.io/.../pen/...")

            # Renamed from css_output/js_output to avoid shadowing Tab 1's
            # accordion components of the same name.
            with gr.Row():
                codepen_html_output = gr.Textbox(label="HTML Content", interactive=True, elem_id="html-output")
                codepen_css_output = gr.Textbox(label="CSS Content", interactive=True, elem_id="css-output")
                codepen_js_output = gr.Textbox(label="JavaScript Content", interactive=True, elem_id="js-output")

            with gr.Row():
                gr.HTML("<button onclick='copyCode(\"html-output\")'>Copy HTML</button>")
                gr.HTML("<button onclick='copyCode(\"css-output\")'>Copy CSS</button>")
                gr.HTML("<button onclick='copyCode(\"js-output\")'>Copy JS</button>")

            submit_codepen_button = gr.Button("Fetch CodePen Content")
            submit_codepen_button.click(
                fn=fetch_codepen_project,
                inputs=[codepen_url_input],
                outputs=[codepen_html_output, codepen_css_output, codepen_js_output]
            )

        # Tab 5: Web Data Extractor
        with gr.Tab("Web Data Extractor"):
            gr.Markdown("## Web Data Extractor")
            gr.Markdown("Enter a URL to extract additional web data like description, image preview, colors, fonts, similar code, videos, and files.")

            with gr.Row():
                web_data_url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")

            with gr.Row():
                description_output = gr.Textbox(label="Site Description", interactive=False)
                image_preview_output = gr.Image(label="Image Preview", interactive=False)

            with gr.Row():
                colors_output = gr.Textbox(label="Colors", interactive=False)
                fonts_output = gr.Textbox(label="Fonts", interactive=False)

            with gr.Row():
                similar_code_output = gr.Textbox(label="Similar Code", interactive=True, elem_id="similar-code-output")

            with gr.Row():
                videos_output = gr.Textbox(label="Videos", interactive=False)
                files_output = gr.Textbox(label="Files", interactive=False)

            with gr.Row():
                gr.HTML("<button onclick='copyCode(\"similar-code-output\")'>Copy Code</button>")  # "Copy Code" button

            submit_web_data_button = gr.Button("Extract Web Data")
            submit_web_data_button.click(
                fn=extract_web_data,
                inputs=[web_data_url_input],
                outputs=[description_output, image_preview_output, colors_output, fonts_output, similar_code_output, videos_output, files_output]
            )

# Launch the interface
demo.launch()