'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from transformers import pipeline

# Initialize the summarization pipeline once at startup instead of on every request
summarizer = pipeline('summarization')


# Validate that a URL is absolute (has both a scheme and a network location)
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


# Collect either the page's visible text or links matching a given media type
def finder(url, soup, media_type):
    files = []

    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())

    # Find links
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if file and media_type in file:
                file_url = file
                # Resolve relative links against the page URL
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files


# Fetch a page, save its text content, and return a summary of it
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let HTTP errors propagate so checker() can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Add text files to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '

    # Nothing to summarize; let checker() report the empty result
    if not full_text.strip():
        return None

    # Summarize the content, truncating input that exceeds the model's context window
    summary = summarizer(full_text, max_length=200, min_length=50, do_sample=False, truncation=True)

    # Extract the summary text
    summary_text = summary[0]['summary_text']

    return summary_text


# Validate the input and surface scraping errors as user-facing messages
def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")

    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    if not summary_text:
        raise Exception("Found no text.")

    print(f"Returning summarized text from {url} ...")
    return summary_text


with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')

    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,  # read-only output field
            )

    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )

app.launch()