import gradio as gr
import os
import warnings
from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url  

warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

global_output_dir = ""

def scrape_website(url, site_name, site_description="", site_category="General", 
                  max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
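    """
    Create a WebsiteScraper with the given settings, run it, and return its result dict.

    Based on how the result is consumed in process_scrape below, it is expected to include
    keys such as "success", "pages_scraped", "duration", "output_dir", and (on failure)
    "error"; the exact contract is defined by WebsiteScraper.start().
    """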
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()

with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save as markdown files, and merge into a PDF with viewer and downloadable link.")
    
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL", 
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name", 
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )
    
    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)", 
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio", 
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )
    
    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )
    
    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )
    
    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")
    
    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results", 
            lines=10, 
            max_lines=20,
            info="Scraping progress summary and results appear here once the run completes"
        )
    
    gr.Markdown("## PDF Generation & Viewer")
    
    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")
        
    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results", 
                lines=5, 
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )
            
            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )
            
            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )
        
        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )
    gr.Markdown("## Related Video Demo")
    youtube_embed = gr.HTML(
        value="""
        <div style='text-align: center;'>
            <iframe width='560' height='315' src='https://www.youtube.com/embed/Wf2CqjQgOcI' 
            frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' 
            allowfullscreen></iframe>
        </div>
        """,
        label="Tutorial Video",
        visible=True
    )
    def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
        """
        The function `process_scrape` takes in parameters related to website scraping, performs the
        scraping operation, and returns a success message or an error message based on the result.
        
        :param url: The `url` parameter is the URL of the website that you want to scrape
        :param site_name: The `site_name` parameter is a string that represents the name of the website
        being scraped. It is one of the required parameters for the `process_scrape` function
        :param site_description: The `site_description` parameter in the `process_scrape` function is
        used to provide a description of the website being scraped. It is a text description that helps
        in identifying and describing the content or purpose of the website. This information can be
        used for various purposes such as categorizing the website,
        :param site_category: The `site_category` parameter in the `process_scrape` function is used to
        specify the category of the website being scraped. It is one of the inputs required for the
        scraping process
        :param max_pages: The `max_pages` parameter in the `process_scrape` function represents the
        maximum number of pages to scrape on the website. It is an integer value that determines the
        limit for the number of pages that will be scraped during the process
        :param max_depth: The `max_depth` parameter in the `process_scrape` function represents the
        maximum depth of links to follow during the website scraping process. It determines how many
        levels deep the scraper will navigate through the website's links starting from the initial URL.
        This parameter helps control the extent of the scraping process and
        :param delay: The `delay` parameter in the `process_scrape` function represents the time delay
        (in seconds) between consecutive requests made during the scraping process. This delay is useful
        for preventing overwhelming the target website with too many requests in a short period, which
        could lead to being blocked or flagged as suspicious activity
        :param external_links: The `external_links` parameter in the `process_scrape` function is a
        boolean flag that determines whether external links should be scraped along with the internal
        links of the website. If `external_links` is set to `True`, the scraper will also follow and
        scrape external links found on the website
        :return: The function `process_scrape` returns a tuple containing a message string, and three
        `None` values. The message string can vary depending on the outcome of the scraping process. If
        the scraping is successful, it returns a success message with details such as the number of
        pages scraped, duration, output directory, and a list of files created. If the scraping fails,
        it returns an error message indicating
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None
        
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )
            
            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f"  • Individual page files (.md)\n"
                    f"  • scraping_summary.md\n"
                    f"  • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None

    def process_merge_to_pdf():
        """
        The function `process_merge_to_pdf` merges Markdown files into a PDF and provides download
        options for the generated PDF.
        :return: The `process_merge_to_pdf` function returns a tuple containing four elements:
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.", 
                   None, None, gr.update(visible=False))
        
        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )
            
            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]
                
                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )
                
                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                        Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''
                
                return (
                    message,
                    gr.update(value=local_pdf_path, visible=True),
                    gr.update(value=download_html, visible=True),
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}", 
                    None, 
                    None, 
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}", 
                None, 
                None, 
                gr.update(visible=False)
            )

    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )
    
    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )
    
    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions
        
        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)
        
        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable
        
        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)
    
    gr.Markdown("""
    ---
    Important Notes:
    - Always respect website terms of service and robots.txt
    - Use reasonable delays to avoid overwhelming servers
    - Some sites may block automated scraping
    - Consider the website's bandwidth and server load
    - The merged PDF is uploaded to a public link for easy sharing
    - PDF viewer works best with modern browsers that support PDF display
    """)

if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)


# @https://google.github.io/adk-docs/get-started/installation/
# Use process_scrape to extract the link above with max_pages set to 2, then merge the
# result into a PDF using process_merge_to_pdf.
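
# A minimal sketch of the workflow described above, run programmatically rather than
# through the UI. The module name "app" is hypothetical; adjust it to this file's name.
# Importing the module builds the Blocks UI but does not launch it, so the helper
# functions can be called directly.
#
# from app import process_scrape, process_merge_to_pdf
#
# status, *_ = process_scrape(
#     url="https://google.github.io/adk-docs/get-started/installation/",
#     site_name="ADK Installation Docs",
#     site_description="",
#     site_category="Documentation",
#     max_pages=2,
#     max_depth=3,
#     delay=2,
#     external_links=False,
# )
# print(status)
#
# pdf_status, _pdf_file, _download_html, _viewer_update = process_merge_to_pdf()
# print(pdf_status)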