import gradio as gr
import os
import warnings

from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url

warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

global_output_dir = ""


def scrape_website(url, site_name, site_description="", site_category="General",
                   max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()
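
# A minimal usage sketch (not wired into the UI): calling scrape_website directly.
# The URL, site name, and limits below are placeholder assumptions for illustration,
# not values used elsewhere in this app.
#
#   result = scrape_website(
#       url="https://example.com",
#       site_name="Example Site",
#       max_pages=5,
#       max_depth=1,
#       delay=2,
#   )
#   if result["success"]:
#       print(result["pages_scraped"], result["output_dir"])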


with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save as markdown files, and merge into a PDF with viewer and downloadable link.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name",
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )

    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)",
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio",
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )

    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )

    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )

    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")

    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results",
            lines=10,
            max_lines=20,
            info="Real-time scraping progress and results will appear here"
        )

    gr.Markdown("## PDF Generation & Viewer")

    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results",
                lines=5,
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )
            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )
            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )
        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )
gr.Markdown("## Related Video Demo") | |
youtube_embed = gr.HTML( | |
value=""" | |
<div style='text-align: center;'> | |
<iframe width='560' height='315' src='https://www.youtube.com/embed/Wf2CqjQgOcI' | |
frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' | |
allowfullscreen></iframe> | |
</div> | |
""", | |
label="Tutorial Video", | |
visible=True | |
) | |

    def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
        """Run the scraper with the values from the UI and report the outcome.

        :param url: URL of the website to scrape.
        :param site_name: Descriptive name of the website being scraped.
        :param site_description: Optional text describing the website's content or purpose.
        :param site_category: Category of the website (e.g., Blog, News, Documentation).
        :param max_pages: Maximum number of pages to scrape.
        :param max_depth: Maximum link depth to follow from the starting URL.
        :param delay: Delay in seconds between requests, to avoid overwhelming the target server.
        :param external_links: If True, also follow and scrape links outside the original domain.
        :return: A tuple of (message, None, None, None). The message reports success details
            (pages scraped, duration, output directory, files created) or the error that occurred.
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )
            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f" • Individual page files (.md)\n"
                    f" • scraping_summary.md\n"
                    f" • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None

    def process_merge_to_pdf():
        """Merge the scraped Markdown files into a PDF and prepare download options.

        :return: A tuple of (message, local_pdf_path, download_html, viewer_update).
            On failure, the path and HTML are None and the viewer is hidden.
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.",
                    None, None, gr.update(visible=False))
        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )
            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]
                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )
                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                        Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''
                return (
                    message,
                    local_pdf_path,
                    download_html,
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}",
                    None,
                    None,
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}",
                None,
                None,
                gr.update(visible=False)
            )
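
    # A minimal sketch (not part of the UI flow): merge_md_to_pdf_and_convert_to_url
    # can also be called directly once a scrape has produced an output directory.
    # The directory path and site name below are placeholder assumptions.
    #
    #   merge_result = merge_md_to_pdf_and_convert_to_url(
    #       output_dir="path/to/scraped_output",
    #       site_name="Example Site",
    #       site_description="Scraped content from website",
    #       site_category="General",
    #       output_format="pdf",
    #   )
    #   if merge_result["success"]:
    #       print(merge_result["output_url"], merge_result["converted_path"])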

    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )
    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )

    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions

        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`) - see the sketch below
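
        A minimal sketch of the robots.txt check using Python's standard `urllib.robotparser`
        (the URL here is a placeholder):

        ```python
        from urllib.robotparser import RobotFileParser

        rp = RobotFileParser()
        rp.set_url("https://example.com/robots.txt")
        rp.read()
        # True if the default user agent is allowed to fetch this page
        print(rp.can_fetch("*", "https://example.com/some/page"))
        ```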

        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable

        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)
gr.Markdown(""" | |
--- | |
Important Notes: | |
- Always respect website terms of service and robots.txt | |
- Use reasonable delays to avoid overwhelming servers | |
- Some sites may block automated scraping | |
- Consider the website's bandwidth and server load | |
- The merged PDF is uploaded to a public link for easy sharing | |
- PDF viewer works best with modern browsers that support PDF display | |
""") | |


if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)

# @https://google.github.io/adk-docs/get-started/installation/
# Example request: use process_scrape to scrape the link above with max_pages=2,
# then merge the result into a PDF using process_merge_to_pdf.
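#
# A minimal sketch of that workflow called directly from Python (values mirror the
# comment above; the site name, category, depth, and delay are placeholder assumptions):
#
#   message, *_ = process_scrape(
#       "https://google.github.io/adk-docs/get-started/installation/",
#       "ADK Docs", "", "Documentation", 2, 1, 2, False,
#   )
#   pdf_message, pdf_path, link_html, viewer_update = process_merge_to_pdf()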