import gradio as gr
import os
import warnings
from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url
warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
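# Directory of the most recent scrape; set by process_scrape and read by process_merge_to_pdf.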
global_output_dir = ""
def scrape_website(url, site_name, site_description="", site_category="General",
max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
scraper = WebsiteScraper(
base_url=url,
site_name=site_name,
site_description=site_description,
site_category=site_category,
max_pages=max_pages,
max_depth=max_depth,
delay=delay,
scrape_external_links=scrape_external_links
)
return scraper.start()
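# A minimal headless sketch (defined but never called) of how this wrapper can be used
# outside the Gradio UI. It assumes WebsiteScraper.start() returns a dict with at least
# the keys "success", "pages_scraped", "duration", "output_dir" (and "error" on failure),
# which is how the callbacks below read the result; the real WebsiteScraper API may
# expose more than this.
def _example_headless_scrape():
    result = scrape_website(
        url="https://example.com",  # illustrative target, not from the original app
        site_name="Example Site",
        max_pages=5,
        max_depth=1,
        delay=1,
    )
    if result.get("success"):
        print(f"Scraped {result['pages_scraped']} pages into {result['output_dir']}")
    else:
        print(f"Scraping failed: {result.get('error')}")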
with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
gr.Markdown("# General Website Scraper")
gr.Markdown("Scrape content from any website, save it as Markdown files, and merge them into a single PDF with an inline viewer and a download link.")
with gr.Row():
url_input = gr.Textbox(
label="Website URL",
placeholder="e.g., https://example.com or https://blog.example.com",
info="Enter the starting URL to scrape"
)
site_name_input = gr.Textbox(
label="Site Name",
placeholder="e.g., Example Blog",
info="A descriptive name for the website"
)
with gr.Row():
site_description_input = gr.Textbox(
label="Site Description (Optional)",
placeholder="e.g., A technology blog about AI and programming",
info="Brief description of the website content"
)
site_category_input = gr.Dropdown(
label="Site Category",
choices=[
"General", "Blog", "News", "E-commerce", "Portfolio",
"Company", "Documentation", "Forum", "Social Media",
"Education", "Technology", "Entertainment", "Health",
"Finance", "Travel", "Food", "Sports", "Art", "Other"
],
value="General",
info="Select the most appropriate category"
)
with gr.Row():
max_pages_input = gr.Number(
label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
info="Maximum number of pages to scrape"
)
max_depth_input = gr.Number(
label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
info="How many clicks deep to follow links"
)
delay_input = gr.Number(
label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
info="Delay between requests to avoid overwhelming the server"
)
with gr.Row():
external_links_input = gr.Checkbox(
label="Include External Links", value=False,
info="Scrape links that go outside the original domain (use with caution)"
)
scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")
with gr.Row():
output = gr.Textbox(
label="Scraping Results",
lines=10,
max_lines=20,
info="Real-time scraping progress and results will appear here"
)
gr.Markdown("## PDF Generation & Viewer")
with gr.Row():
merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")
with gr.Row():
with gr.Column(scale=1):
pdf_output = gr.Textbox(
label="PDF Merge Results",
lines=5,
max_lines=10,
info="Results of merging Markdown files to PDF"
)
pdf_download = gr.File(
label="Download Merged PDF (Local File)",
file_types=[".pdf"],
visible=False
)
pdf_url_output = gr.HTML(
label="PDF Download Link",
visible=False
)
with gr.Column(scale=2):
pdf_viewer = gr.File(
label="PDF Viewer - View Merged Content",
file_types=[".pdf"],
visible=False,
interactive=False
)
gr.Markdown("## Related Video Demo")
youtube_embed = gr.HTML(
value="""
<div style='text-align: center;'>
<iframe width='560' height='315' src='https://www.youtube.com/embed/Wf2CqjQgOcI'
frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture'
allowfullscreen></iframe>
</div>
""",
label="Tutorial Video",
visible=True
)
def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
"""
Run the scraper with the values collected from the UI and report the outcome.

:param url: Starting URL of the website to scrape. If no scheme is given, "https://" is prepended.
:param site_name: Descriptive name for the website (required).
:param site_description: Optional short description of the website's content.
:param site_category: Category label for the website (e.g. "Blog", "Documentation").
:param max_pages: Maximum number of pages to scrape.
:param max_depth: Maximum link depth to follow from the starting URL.
:param delay: Delay in seconds between requests, to avoid overloading the target server.
:param external_links: If True, links outside the original domain are also scraped.
:return: A 4-tuple of (message, None, None, None). On success the message lists the
number of pages scraped, the duration, the output directory, and the files created;
on failure it contains the error.
"""
global global_output_dir
if not url or not site_name:
return "Please provide both URL and Site Name", None, None, None
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
try:
result = scrape_website(
url=url,
site_name=site_name,
site_description=site_description,
site_category=site_category,
max_pages=int(max_pages),
max_depth=int(max_depth),
delay=float(delay),
scrape_external_links=external_links
)
if result["success"]:
global_output_dir = result['output_dir']
return (
f"Successfully scraped {result['pages_scraped']} pages!\n"
f"Duration: {result['duration']}\n"
f"Files saved to: {result['output_dir']}\n\n"
f"Files created:\n"
f" • Individual page files (.md)\n"
f" • scraping_summary.md\n"
f" • scraping_log.txt\n\n"
f"Ready to merge into PDF - click the 'Merge to PDF and Get Link' button below."
), None, None, None
else:
return f"Scraping failed: {result['error']}", None, None, None
except Exception as e:
return f"Error: {str(e)}", None, None, None
def process_merge_to_pdf():
"""
Merge the Markdown files from the most recent scrape into a single PDF and expose
it in the UI for viewing and download.

:return: A 4-tuple of (message, update for the download file, update for the
download-link HTML, update for the PDF viewer). On failure the message describes
the error and the remaining outputs are cleared or hidden.
"""
global global_output_dir
if not global_output_dir:
return ("No scraping output directory found. Please scrape a website first.",
None, None, gr.update(visible=False))
try:
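# NOTE: the metadata passed below is fixed; the site name, description, and
# category the user entered in the UI are not forwarded to the merge step.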
result = merge_md_to_pdf_and_convert_to_url(
output_dir=global_output_dir,
site_name="Scraped Website",
site_description="Scraped content from website",
site_category="Technology",
output_format="pdf"
)
if result["success"]:
pdf_url = result["output_url"]
local_pdf_path = result["converted_path"]
message = (
f"{result['message']}\n\n"
f"PDF created successfully!\n"
f"Local file: {local_pdf_path}\n"
f"Download URL: {pdf_url}\n\n"
f"View the PDF in the viewer on the right."
)
download_html = f'''
<div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
<h4>Download Options:</h4>
<p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
Click here to download PDF from web link
</a></p>
<p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
</div>
'''
return (
message,
# return updates so the hidden download file and link components become visible
gr.update(value=local_pdf_path, visible=True),
gr.update(value=download_html, visible=True),
gr.update(value=local_pdf_path, visible=True)
)
else:
return (
f"PDF merge failed: {result['error']}",
None,
None,
gr.update(visible=False)
)
except Exception as e:
return (
f"Error during PDF merge: {str(e)}",
None,
None,
gr.update(visible=False)
)
scrape_btn.click(
process_scrape,
inputs=[
url_input, site_name_input, site_description_input, site_category_input,
max_pages_input, max_depth_input, delay_input, external_links_input
],
outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
)
merge_pdf_btn.click(
process_merge_to_pdf,
inputs=[],
outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
)
with gr.Accordion("Example Usage & Tips", open=False):
gr.Markdown("""
### Common Use Cases:
- News Websites: `https://techcrunch.com` - scrape latest tech news articles
- Blogs: `https://blog.openai.com` - scrape all blog posts and updates
- Company Sites: `https://company.com/products` - scrape product pages and documentation
- Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
- Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
- E-commerce: `https://shop.com/category` - scrape product listings and descriptions
### Tips for Better Results:
- Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
- Use reasonable limits: Start with 10-20 pages to test, then increase if needed
- Respect rate limits: Use 2-3 second delays for most sites
- External links: Only enable for trusted sites to avoid scraping the entire internet
- Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)
### Output Files Explained:
- Individual .md files: Each scraped page saved as markdown
- scraping_summary.md: Overview of all scraped content with links
- scraping_log.txt: Detailed log of the scraping process
- Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable
### PDF Features:
- Inline Viewer: View the merged PDF directly in the interface
- Download Options: Download via direct file or web link
- Multiple Formats: Local file and web-hosted version available
""")
gr.Markdown("""
---
Important Notes:
- Always respect website terms of service and robots.txt
- Use reasonable delays to avoid overwhelming servers
- Some sites may block automated scraping
- Consider the website's bandwidth and server load
- The merged PDF is uploaded to a public link for easy sharing
- PDF viewer works best with modern browsers that support PDF display
""")
if __name__ == "__main__":
demo.launch(mcp_server=True, share=True, server_port=7860)
# Example: use process_scrape on https://google.github.io/adk-docs/get-started/installation/
# with max_pages=2, then merge the result to a PDF with process_merge_to_pdf.
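# A hedged sketch of the run described in the comment above: scrape the ADK
# installation page with max_pages=2, then merge the output to a PDF. The URL is
# taken from the comment; the site name, description, and depth/delay values are
# illustrative assumptions. The function is defined but never called here, so the
# Gradio app's behaviour is unchanged; invoke it manually if needed.
def _example_adk_docs_run():
    status, *_ = process_scrape(
        url="https://google.github.io/adk-docs/get-started/installation/",
        site_name="ADK Docs",
        site_description="Agent Development Kit installation guide",  # assumed description
        site_category="Documentation",
        max_pages=2,
        max_depth=1,
        delay=2,
        external_links=False,
    )
    print(status)
    pdf_message, pdf_file_update, _, _ = process_merge_to_pdf()
    print(pdf_message)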