n0v33n committed · ff3a25c
Parent(s): 777a5e5

Create required file for this space
- .gitignore +1 -0
- DockerFile +26 -0
- WebScraper.py +355 -0
- app.py +315 -0
- merge_md.py +263 -0
- requirements.txt +12 -0
.gitignore
ADDED
@@ -0,0 +1 @@
.env
DockerFile
ADDED
@@ -0,0 +1,26 @@
FROM python:3.12-slim

# Install system dependencies for Chrome/Chromium
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    unzip \
    curl \
    chromium \
    chromium-driver \
    && rm -rf /var/lib/apt/lists/*

# Set Chrome path for Selenium
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["python", "app.py"]
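
Note: the image installs Chromium and chromium-driver at /usr/bin/chromium and /usr/bin/chromedriver, the paths that setup_driver in WebScraper.py below expects, and it exposes port 7860, the port app.py serves on.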
WebScraper.py
ADDED
@@ -0,0 +1,355 @@
import time
import os
import re
import urllib.parse
from datetime import datetime
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Selenium not available. Some features may not work.")

class WebsiteScraper:
    def __init__(self, base_url, site_name, site_description="", site_category="General",
                 output_dir=None, max_depth=3, max_pages=50, delay=2, headless=True,
                 scrape_external_links=False, content_selectors=None):
        """
        Initialize the website scraper.

        Args:
            base_url (str): Starting URL to scrape
            site_name (str): Name of the website
            site_description (str): Description of the website
            site_category (str): Category of the website
            output_dir (str): Directory to save files (auto-generated if None)
            max_depth (int): Maximum depth to crawl
            max_pages (int): Maximum number of pages to scrape
            delay (float): Delay between requests in seconds
            headless (bool): Run browser in headless mode
            scrape_external_links (bool): Whether to follow external links
            content_selectors (list): CSS selectors to find main content
        """
        parsed_url = urllib.parse.urlparse(base_url)
        self.base_url = base_url
        self.base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
        self.domain_name = parsed_url.netloc
        self.site_name = site_name
        self.site_description = site_description
        self.site_category = site_category
        self.scrape_external_links = scrape_external_links
        self.content_selectors = content_selectors or [
            'main', 'article', '.content', '#content', '.main-content',
            '.post-content', '.entry-content', '.page-content', 'body'
        ]
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.delay = delay
        self.visited_links = set()
        self.page_count = 0
        self.start_time = datetime.now()
        if output_dir is None:
            domain_safe = self.domain_name.replace(".", "_").replace(":", "_")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.output_dir = f"{site_name}_{domain_safe}_{timestamp}"
        else:
            self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.log_path = os.path.join(self.output_dir, "scraping_log.txt")
        with open(self.log_path, "w", encoding="utf-8") as log_file:
            log_file.write(f"Website scraping started at: {self.start_time}\n")
            log_file.write(f"Website: {self.site_name}\n")
            log_file.write(f"Description: {self.site_description}\n")
            log_file.write(f"Category: {self.site_category}\n")
            log_file.write(f"Base URL: {self.base_url}\n")
            log_file.write(f"Domain: {self.domain_name}\n")
            log_file.write(f"Max depth: {self.max_depth}\n")
            log_file.write(f"Max pages: {self.max_pages}\n")
            log_file.write(f"External links: {self.scrape_external_links}\n\n")
        self.setup_driver(headless)
        self.documents = []

    def setup_driver(self, headless):
        """Setup Chrome driver with options."""
        try:
            chrome_options = Options()
            if headless:
                chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-logging")
            chrome_options.add_argument("--log-level=3")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--allow-running-insecure-content")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
            chrome_options.binary_location = "/usr/bin/chromium"

            chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
            chrome_options.add_experimental_option('useAutomationExtension', False)
            try:
                self.driver = webdriver.Chrome(
                    executable_path="/usr/bin/chromedriver",
                    options=chrome_options
                )
            except:
                from webdriver_manager.chrome import ChromeDriverManager
                self.driver = webdriver.Chrome(
                    ChromeDriverManager().install(),
                    options=chrome_options
                )

            self.log_message("Chrome driver initialized successfully")
        except Exception as e:
            self.log_message(f"Error setting up Chrome driver: {e}")
            raise

    def log_message(self, message):
        """Write message to console and log file."""
        print(message)
        with open(self.log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"{message}\n")

    def is_valid_url(self, url):
        """Check if URL should be scraped."""
        if not self.scrape_external_links and not url.startswith(self.base_domain):
            return False
        if re.search(r"\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|svg|ico|css|js|xml|json|zip|tar|gz|rar|7z|exe|dmg|mp3|mp4|avi|mov|wmv)$", url, re.IGNORECASE):
            return False
        if "#" in url:
            url = url.split("#")[0]
        if url in self.visited_links:
            return False
        skip_patterns = [
            '/login', '/register', '/signup', '/sign-up', '/signin', '/sign-in',
            '/logout', '/password', '/forgot', '/reset',
            '/admin', '/dashboard', '/account', '/profile',
            '/cart', '/checkout', '/payment', '/billing',
            '/terms', '/privacy', '/legal', '/disclaimer',
            '/sitemap', '/robots.txt', '/favicon'
        ]
        url_lower = url.lower()
        for pattern in skip_patterns:
            if pattern in url_lower:
                return False
        spam_patterns = ['popup', 'advertisement', 'tracking', 'analytics']
        for pattern in spam_patterns:
            if pattern in url_lower:
                return False
        return True

    def sanitize_filename(self, text):
        """Convert text to safe filename."""
        if not text or len(text.strip()) == 0:
            return f"page_{self.page_count}"
        safe_name = re.sub(r'[^\w\s()-]', "_", text)
        safe_name = re.sub(r'\s+', "_", safe_name)
        safe_name = safe_name.strip("_")
        return safe_name[:100] if len(safe_name) > 100 else safe_name

    def extract_links(self):
        """Extract valid links from current page."""
        links = self.driver.find_elements(By.TAG_NAME, "a")
        valid_links = []
        for link in links:
            try:
                href = link.get_attribute("href")
                if href:
                    if href.startswith('/'):
                        href = self.base_domain + href
                    elif href.startswith('./') or not href.startswith('http'):
                        current_url = self.driver.current_url
                        base_path = '/'.join(current_url.split('/')[:-1])
                        href = base_path + '/' + href.lstrip('./')
                    if self.is_valid_url(href) and href not in self.visited_links:
                        valid_links.append(href)
            except Exception:
                continue
        return list(set(valid_links))

    def extract_main_content(self, soup):
        """Extract main content using various selectors."""
        content_element = None
        for selector in self.content_selectors:
            try:
                if selector.startswith('.') or selector.startswith('#'):
                    elements = soup.select(selector)
                else:
                    elements = soup.find_all(selector)
                if elements:
                    content_element = elements[0]
                    break
            except:
                continue
        if not content_element:
            content_element = soup.find('body')
        return content_element

    def extract_clean_text(self, soup):
        """Extract and clean text from BeautifulSoup object."""
        unwanted_tags = [
            "script", "style", "nav", "footer", "header", "aside",
            "advertisement", "ads", "popup", "modal", "cookie-notice"
        ]
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        unwanted_classes = [
            "sidebar", "menu", "navigation", "nav", "footer", "header",
            "advertisement", "ad", "ads", "popup", "modal", "cookie",
            "social", "share", "comment", "related", "recommended"
        ]
        for class_name in unwanted_classes:
            for element in soup.find_all(class_=re.compile(class_name, re.I)):
                element.decompose()
            for element in soup.find_all(id=re.compile(class_name, re.I)):
                element.decompose()
        main_content = self.extract_main_content(soup)
        if main_content:
            text = main_content.get_text(separator=" ", strip=True)
        else:
            text = soup.get_text(separator=" ", strip=True)
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        cleaned_text = '\n'.join(lines)
        cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)
        cleaned_text = re.sub(r' +', ' ', cleaned_text)
        return cleaned_text

    def scrape_page(self, url):
        """Scrape content from a single page and save as markdown."""
        if url in self.visited_links:
            return []
        self.page_count += 1
        self.visited_links.add(url)
        status = f"Scraping [{self.page_count}/{self.max_pages}]: {url}"
        self.log_message(status)
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(self.delay)
            try:
                page_title = self.driver.title or f"Page_{self.page_count}"
            except:
                page_title = f"Page_{self.page_count}"
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            cleaned_text = self.extract_clean_text(soup)
            if len(cleaned_text.strip()) < 50:
                self.log_message(f"Skipping {url}: insufficient content")
                return self.extract_links()
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")
            doc = {
                "text": cleaned_text,
                "metadata": {
                    "source": url,
                    "title": page_title,
                    "site_name": self.site_name,
                    "site_description": self.site_description,
                    "site_category": self.site_category,
                    "meta_description": meta_desc,
                    "domain": self.domain_name,
                    "scraped_at": datetime.now().isoformat()
                }
            }
            self.documents.append(doc)
            safe_filename = self.sanitize_filename(page_title)
            file_path = os.path.join(self.output_dir, f"{safe_filename}.md")
            counter = 1
            original_path = file_path
            while os.path.exists(file_path):
                base, ext = os.path.splitext(original_path)
                file_path = f"{base}_{counter}{ext}"
                counter += 1
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(f"# {page_title}\n\n")
                file.write(f"**URL:** {url}\n")
                file.write(f"**Site:** {self.site_name}\n")
                file.write(f"**Category:** {self.site_category}\n")
                if meta_desc:
                    file.write(f"**Description:** {meta_desc}\n")
                file.write(f"**Scraped:** {datetime.now()}\n\n")
                file.write("---\n\n")
                file.write(cleaned_text)
            self.log_message(f"Saved: {os.path.basename(file_path)}")
            new_links = self.extract_links()
            self.log_message(f"Found {len(new_links)} new links")
            return new_links
        except Exception as e:
            self.log_message(f"Error scraping {url}: {str(e)}")
            return []

    def create_summary(self):
        """Create a summary of the scraped content."""
        summary_path = os.path.join(self.output_dir, "scraping_summary.md")
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(f"# Scraping Summary: {self.site_name}\n\n")
            f.write(f"**Website:** {self.site_name}\n")
            f.write(f"**URL:** {self.base_url}\n")
            f.write(f"**Domain:** {self.domain_name}\n")
            f.write(f"**Category:** {self.site_category}\n")
            f.write(f"**Description:** {self.site_description}\n\n")
            f.write(f"**Scraping Details:**\n")
            f.write(f"- Start time: {self.start_time}\n")
            f.write(f"- End time: {datetime.now()}\n")
            f.write(f"- Duration: {datetime.now() - self.start_time}\n")
            f.write(f"- Pages scraped: {len(self.documents)}\n")
            f.write(f"- Max pages allowed: {self.max_pages}\n")
            f.write(f"- Max depth: {self.max_depth}\n")
            f.write(f"- External links allowed: {self.scrape_external_links}\n\n")
            if self.documents:
                f.write("**Scraped Pages:**\n")
                for i, doc in enumerate(self.documents, 1):
                    f.write(f"{i}. [{doc['metadata']['title']}]({doc['metadata']['source']})\n")

    def start(self):
        """Start the website scraping process."""
        try:
            self.log_message(f"Starting website scraping for {self.site_name}")
            self.log_message(f"Target: {self.base_url}")
            self.log_message(f"Limits: max_depth={self.max_depth}, max_pages={self.max_pages}")
            urls_to_scrape = [(self.base_url, 0)]
            while urls_to_scrape and self.page_count < self.max_pages:
                current_url, current_depth = urls_to_scrape.pop(0)
                if current_url in self.visited_links or current_depth > self.max_depth:
                    continue
                new_links = self.scrape_page(current_url)
                if current_depth + 1 <= self.max_depth:
                    for link in new_links:
                        if link not in self.visited_links:
                            urls_to_scrape.append((link, current_depth + 1))
            self.create_summary()
            self.driver.quit()
            end_time = datetime.now()
            duration = end_time - self.start_time
            self.log_message(f"Scraping completed for {self.site_name}")
            self.log_message(f"Total pages scraped: {self.page_count}")
            self.log_message(f"Duration: {duration}")
            return {
                "success": True,
                "pages_scraped": self.page_count,
                "duration": str(duration),
                "output_dir": self.output_dir
            }
        except Exception as e:
            self.driver.quit()
            self.log_message(f"Scraping failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "pages_scraped": self.page_count,
                "duration": "0",
                "output_dir": self.output_dir
            }
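
For reference, a minimal usage sketch of the class added above (not part of the commit; the URL and limits are illustrative, and a local Chromium/chromedriver install is assumed):

from WebScraper import WebsiteScraper

scraper = WebsiteScraper(
    base_url="https://example.com",  # illustrative starting URL
    site_name="Example",
    max_pages=5,   # keep the first run small
    max_depth=1,
    delay=2,
)
result = scraper.start()
# start() returns the dict built at the end of start(): success, pages_scraped, duration, output_dir
print(result["success"], result["pages_scraped"], result["output_dir"])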
app.py
ADDED
@@ -0,0 +1,315 @@
import gradio as gr
import os
import warnings
from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url

warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

global_output_dir = ""

def scrape_website(url, site_name, site_description="", site_category="General",
                   max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()

with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save as markdown files, and merge into a PDF with viewer and downloadable link.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name",
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )

    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)",
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio",
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )

    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )

    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )

    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")

    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results",
            lines=10,
            max_lines=20,
            info="Real-time scraping progress and results will appear here"
        )

    gr.Markdown("## PDF Generation & Viewer")

    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results",
                lines=5,
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )

            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )

            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )

        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )

    def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
        """
        Validate the inputs, run the scraper, and return a status message.

        :param url: URL of the website to scrape
        :param site_name: Name of the website being scraped
        :param site_description: Description of the website content
        :param site_category: Category of the website being scraped
        :param max_pages: Maximum number of pages to scrape
        :param max_depth: Maximum depth of links to follow from the starting URL
        :param delay: Delay in seconds between consecutive requests, to avoid overwhelming
            the target website or being blocked
        :param external_links: Whether to also follow and scrape links outside the original domain
        :return: A tuple of (message, None, None, None). On success the message lists the number
            of pages scraped, the duration, the output directory, and the files created; on
            failure it contains the error.
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None

        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )

            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f" • Individual page files (.md)\n"
                    f" • scraping_summary.md\n"
                    f" • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None

    def process_merge_to_pdf():
        """
        Merge the scraped Markdown files into a PDF and provide download options.

        :return: A tuple of (message, local PDF path, download HTML, gr.update for the PDF viewer).
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.",
                    None, None, gr.update(visible=False))

        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )

            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]

                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )

                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                    Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''

                return (
                    message,
                    local_pdf_path,
                    download_html,
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}",
                    None,
                    None,
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}",
                None,
                None,
                gr.update(visible=False)
            )

    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )

    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )

    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions

        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)

        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable

        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)

    gr.Markdown("""
    ---
    Important Notes:
    - Always respect website terms of service and robots.txt
    - Use reasonable delays to avoid overwhelming servers
    - Some sites may block automated scraping
    - Consider the website's bandwidth and server load
    - The merged PDF is uploaded to a public link for easy sharing
    - PDF viewer works best with modern browsers that support PDF display
    """)

if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)
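
Note: demo.launch() serves on port 7860, matching the EXPOSE 7860 and CMD ["python", "app.py"] in the DockerFile above.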
merge_md.py
ADDED
@@ -0,0 +1,263 @@
import os
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from datetime import datetime
import markdown2
from mistralai import Mistral
from pathlib import Path
from urllib.parse import urlparse
import convertapi
import requests
from dotenv import load_dotenv
import re

load_dotenv()

convertapi.api_credentials = os.getenv("CONVERTAPI_TOKEN")
if not convertapi.api_credentials:
    raise ValueError("CONVERTAPI_TOKEN environment variable is required")

SUPPORTED_FORMATS = ["pdf", "docx", "txt"]
MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 100 * 1024 * 1024))
# TEMP_DIR = os.getenv("TEMP_DIR", "temp")
# In merge_md.py, update temp directory handling
TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/scraper_temp")
# Ensure temp directory exists
os.makedirs(TEMP_DIR, exist_ok=True)

def upload_to_service(file_path: str) -> str:
    """
    Mock function to simulate uploading a file to a cloud service.
    Args:
        file_path (str): Path to the file to upload.
    Returns:
        str: Mock public URL or error message.
    """
    try:
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"
        return f"https://mock-cloud-service.com/{os.path.basename(file_path)}"
    except Exception as e:
        return f"Error uploading file: {str(e)}"

def convert_from_url(document_url: str, output_format: str) -> str:
    """
    Convert a document from a URL to a different format using ConvertAPI.
    Args:
        document_url (str): The URL of the input file.
        output_format (str): The format to convert the file to.
    Returns:
        str: The path to the converted file or an error message.
    """
    try:
        if not document_url or not document_url.lower().startswith(("http://", "https://")):
            return "Invalid or unsupported URL format."
        if output_format not in SUPPORTED_FORMATS:
            return f"Unsupported output format: {output_format}"

        result = convertapi.convert(output_format, {"File": document_url})
        input_filename = Path(urlparse(document_url).path).stem or "converted_file"
        output_filename = f"{input_filename}.{output_format}"
        output_path = Path(TEMP_DIR) / output_filename
        output_path.parent.mkdir(exist_ok=True)
        result.file.save(str(output_path))
        return str(output_path)
    except Exception as e:
        return f"Error converting file from URL: {str(e)}"

def merge_md_to_pdf(output_dir, site_name, site_description="", site_category="General"):
    """
    Merge all Markdown files in the output directory into a single PDF using reportlab after processing with Mistral AI.
    Args:
        output_dir (str): Directory containing Markdown files.
        site_name (str): Name of the site for the PDF title.
        site_description (str): Description of the site.
        site_category (str): Category of the site.
    Returns:
        dict: Result containing success status, output PDF path, and message.
    """
    try:
        api_key = os.getenv("MISTRAL_API_KEY")
        if not api_key:
            return {
                "success": False,
                "error": "MISTRAL_API_KEY environment variable not set",
                "output_pdf": None,
                "pages_merged": 0
            }

        client = Mistral(api_key=api_key)
        model = "mistral-large-latest"

        if not os.path.exists(output_dir):
            return {
                "success": False,
                "error": f"Output directory {output_dir} does not exist",
                "output_pdf": None,
                "pages_merged": 0
            }

        md_files = [
            f for f in os.listdir(output_dir)
            if f.endswith('.md') and f not in ['scraping_summary.md', 'scraping_log.txt']
        ]

        if not md_files:
            return {
                "success": False,
                "error": "No Markdown files found in the output directory",
                "output_pdf": None,
                "pages_merged": 0
            }

        pdf_output_path = os.path.join(output_dir, f"{site_name}_merged.pdf")
        doc = SimpleDocTemplate(
            pdf_output_path,
            pagesize=A4,
            rightMargin=inch,
            leftMargin=inch,
            topMargin=inch,
            bottomMargin=inch
        )
        styles = getSampleStyleSheet()

        title_style = ParagraphStyle(name='Title', fontSize=24, leading=28, alignment=1, spaceAfter=20)
        heading_style = ParagraphStyle(name='Heading2', fontSize=18, leading=22, spaceAfter=15)
        body_style = ParagraphStyle(name='Body', fontSize=12, leading=14, spaceAfter=10)

        story = [
            Paragraph(f"{site_name}", title_style),
            Spacer(1, 0.2 * inch),
            Paragraph(f"Description: {site_description}", body_style),
            Paragraph(f"Category: {site_category}", body_style),
            Paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", body_style),
            PageBreak(),
            Paragraph("Table of Contents", heading_style),
            Spacer(1, 0.2 * inch)
        ]

        toc_entries = []
        for idx, md_file in enumerate(sorted(md_files), 1):
            file_path = os.path.join(output_dir, md_file)
            with open(file_path, 'r', encoding='utf-8') as f:
                md_content = f.read()

            title = md_content.split('\n')[0].strip('#').strip() or f"Page {idx}"
            try:
                prompt = f"""
                You are an expert content editor. Below is the content of a Markdown file. Please enhance the content by making it more detailed, well-structured, and polished while preserving the original meaning. Ensure the output is in plain text suitable for inclusion in a PDF. Avoid adding Markdown or HTML formatting in the response.
                If there are HTML tags like <p><strong>Agents-MCP-Hackathon (Agents-MCP-Hackathon)</strong></p>, convert them to plain text like Agents-MCP-Hackathon (Agents-MCP-Hackathon).
                Original content:
                {md_content}

                Enhanced content:
                """
                response = client.chat.complete(
                    model=model,
                    messages=[{"role": "user", "content": prompt}]
                )
                enhanced_content = response.choices[0].message.content.strip()
            except Exception as e:
                print(f"Warning: Failed to process {md_file} with Mistral AI: {str(e)}. Using original content.")
                enhanced_content = md_content

            html_content = markdown2.markdown(enhanced_content, extras=['fenced-code-blocks', 'tables'])
            text_content = re.sub(r'<[^>]+>', '', html_content)
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            lines = text_content.split('\n')

            toc_entries.append(Paragraph(f"{idx}. {title}", body_style))

            story.append(Paragraph(title, heading_style))
            story.append(Spacer(1, 0.1 * inch))
            for line in lines:
                if line.strip():
                    story.append(Paragraph(line.strip(), body_style))
            story.append(PageBreak())

        story[6:6] = toc_entries + [PageBreak()]
        doc.build(story)

        return {
            "success": True,
            "output_pdf": pdf_output_path,
            "pages_merged": len(md_files),
            "message": f"Successfully merged {len(md_files)} Markdown files into {pdf_output_path} after processing with Mistral AI"
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to merge Markdown files into PDF: {str(e)}",
            "output_pdf": None,
            "pages_merged": 0
        }

def merge_md_to_pdf_and_convert_to_url(output_dir, site_name, site_description="", site_category="General", output_format="pdf"):
    """
    Merge Markdown files into a PDF, upload it to a service, and optionally convert to another format.
    Args:
        output_dir (str): Directory containing Markdown files.
        site_name (str): Name of the site for the PDF title.
        site_description (str): Description of the site.
        site_category (str): Category of the site.
        output_format (str): Optional format to convert the PDF to (e.g., 'docx', 'txt').
    Returns:
        dict: Result containing success status, output URL, and message.
    """
    try:
        merge_result = merge_md_to_pdf(output_dir, site_name, site_description, site_category)
        if not merge_result["success"]:
            return {
                "success": False,
                "error": merge_result["error"],
                "output_url": None,
                "converted_path": None
            }

        pdf_path = merge_result["output_pdf"]
        if not pdf_path or not os.path.exists(pdf_path):
            return {
                "success": False,
                "error": "Generated PDF not found",
                "output_url": None,
                "converted_path": None
            }

        pdf_url = upload_to_service(pdf_path)
        if not pdf_url.startswith("http"):
            return {
                "success": False,
                "error": f"Failed to obtain URL: {pdf_url}",
                "output_url": None,
                "converted_path": None
            }

        converted_path = pdf_path
        if output_format != "pdf":
            converted_path = convert_from_url(pdf_url, output_format)
            if not converted_path.startswith(TEMP_DIR):
                return {
                    "success": False,
                    "error": f"Conversion failed: {converted_path}",
                    "output_url": pdf_url,
                    "converted_path": None
                }

        return {
            "success": True,
            "output_url": pdf_url,
            "converted_path": converted_path,
            "message": f"Successfully merged {merge_result['pages_merged']} Markdown files into PDF and uploaded to {pdf_url}",
            "pages_merged": merge_result["pages_merged"]
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Error in merging or uploading: {str(e)}",
            "output_url": None,
            "converted_path": None
        }
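
For reference, a minimal usage sketch of the merge helper added above (not part of the commit; assumes CONVERTAPI_TOKEN and MISTRAL_API_KEY are set in the environment and that a scrape output directory already exists; the directory name is illustrative):

from merge_md import merge_md_to_pdf_and_convert_to_url

result = merge_md_to_pdf_and_convert_to_url(
    output_dir="Example_example_com_20240101_000000",  # illustrative scrape output directory
    site_name="Example",
    output_format="pdf",
)
if result["success"]:
    # output_url comes from the mock upload_to_service; converted_path is the local PDF
    print(result["output_url"], result["converted_path"])
else:
    print(result["error"])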
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio
selenium
beautifulsoup4
requests
reportlab
markdown2
mistralai
convertapi
python-dotenv
pathlib
urllib3
webdriver-manager