import gradio as gr
import os
import warnings
from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url  

warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

global_output_dir = ""

def scrape_website(url, site_name, site_description="", site_category="General", 
                  max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
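    """
    Create a WebsiteScraper with the given settings, run it, and return its result dict.

    Based on how the result is consumed in process_scrape below, it is expected to include
    keys such as "success", "pages_scraped", "duration", "output_dir", and (on failure)
    "error"; the exact contract is defined by WebsiteScraper.start().
    """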
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()

with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save as markdown files, and merge into a PDF with viewer and downloadable link.")
    
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL", 
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name", 
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )
    
    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)", 
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio", 
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )
    
    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )
    
    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )
    
    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")
    
    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results", 
            lines=10, 
            max_lines=20,
            info="Scraping progress summary and results appear here once the run completes"
        )
    
    gr.Markdown("## PDF Generation & Viewer")
    
    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")
        
    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results", 
                lines=5, 
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )
            
            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )
            
            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )
        
        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )
    gr.Markdown("## Related Video Demo")
    youtube_embed = gr.HTML(
        value="""
        <div style='text-align: center;'>
            <iframe width='560' height='315' src='https://www.youtube.com/embed/Wf2CqjQgOcI' 
            frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' 
            allowfullscreen></iframe>
        </div>
        """,
        label="Tutorial Video",
        visible=True
    )
    def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
        """
        The function `process_scrape` takes in parameters related to website scraping, performs the
        scraping operation, and returns a success message or an error message based on the result.
        
        :param url: The `url` parameter is the URL of the website that you want to scrape
        :param site_name: The `site_name` parameter is a string that represents the name of the website
        being scraped. It is one of the required parameters for the `process_scrape` function
        :param site_description: The `site_description` parameter in the `process_scrape` function is
        used to provide a description of the website being scraped. It is a text description that helps
        in identifying and describing the content or purpose of the website. This information can be
        used for various purposes such as categorizing the website,
        :param site_category: The `site_category` parameter in the `process_scrape` function is used to
        specify the category of the website being scraped. It is one of the inputs required for the
        scraping process
        :param max_pages: The `max_pages` parameter in the `process_scrape` function represents the
        maximum number of pages to scrape on the website. It is an integer value that determines the
        limit for the number of pages that will be scraped during the process
        :param max_depth: The `max_depth` parameter in the `process_scrape` function represents the
        maximum depth of links to follow during the website scraping process. It determines how many
        levels deep the scraper will navigate through the website's links starting from the initial URL.
        This parameter helps control the extent of the scraping process and
        :param delay: The `delay` parameter in the `process_scrape` function represents the time delay
        (in seconds) between consecutive requests made during the scraping process. This delay is useful
        for preventing overwhelming the target website with too many requests in a short period, which
        could lead to being blocked or flagged as suspicious activity
        :param external_links: The `external_links` parameter in the `process_scrape` function is a
        boolean flag that determines whether external links should be scraped along with the internal
        links of the website. If `external_links` is set to `True`, the scraper will also follow and
        scrape external links found on the website
        :return: The function `process_scrape` returns a tuple containing a message string, and three
        `None` values. The message string can vary depending on the outcome of the scraping process. If
        the scraping is successful, it returns a success message with details such as the number of
        pages scraped, duration, output directory, and a list of files created. If the scraping fails,
        it returns an error message indicating
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None
        
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )
            
            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f"  • Individual page files (.md)\n"
                    f"  • scraping_summary.md\n"
                    f"  • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None

    def process_merge_to_pdf():
        """
        The function `process_merge_to_pdf` merges Markdown files into a PDF and provides download
        options for the generated PDF.
        :return: The `process_merge_to_pdf` function returns a tuple containing four elements:
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.", 
                   None, None, gr.update(visible=False))
        
        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )
            
            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]
                
                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )
                
                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                        Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''
                
                return (
                    message,
                    gr.update(value=local_pdf_path, visible=True),
                    gr.update(value=download_html, visible=True),
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}", 
                    None, 
                    None, 
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}", 
                None, 
                None, 
                gr.update(visible=False)
            )

    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )
    
    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )
    
    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions
        
        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)
        
        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable
        
        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)
    
    gr.Markdown("""
    ---
    Important Notes:
    - Always respect website terms of service and robots.txt
    - Use reasonable delays to avoid overwhelming servers
    - Some sites may block automated scraping
    - Consider the website's bandwidth and server load
    - The merged PDF is uploaded to a public link for easy sharing
    - PDF viewer works best with modern browsers that support PDF display
    """)

if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)


# @https://google.github.io/adk-docs/get-started/installation/
# Use process_scrape to extract the link above with max_pages set to 2, then merge the
# result into a PDF using process_merge_to_pdf.
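
# A minimal sketch of the workflow described above, run programmatically rather than
# through the UI. The module name "app" is hypothetical; adjust it to this file's name.
# Importing the module builds the Blocks UI but does not launch it, so the helper
# functions can be called directly.
#
# from app import process_scrape, process_merge_to_pdf
#
# status, *_ = process_scrape(
#     url="https://google.github.io/adk-docs/get-started/installation/",
#     site_name="ADK Installation Docs",
#     site_description="",
#     site_category="Documentation",
#     max_pages=2,
#     max_depth=3,
#     delay=2,
#     external_links=False,
# )
# print(status)
#
# pdf_status, _pdf_file, _download_html, _viewer_update = process_merge_to_pdf()
# print(pdf_status)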