n0v33n committed
Commit ff3a25c · 1 Parent(s): 777a5e5

Create required files for this space

Files changed (6):
  1. .gitignore +1 -0
  2. DockerFile +26 -0
  3. WebScraper.py +355 -0
  4. app.py +315 -0
  5. merge_md.py +263 -0
  6. requirements.txt +12 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
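
The ignored `.env` file is where the secrets read elsewhere in this commit live: merge_md.py calls `load_dotenv()` and reads `MISTRAL_API_KEY`, `CONVERTAPI_TOKEN`, and optionally `MAX_FILE_SIZE` and `TEMP_DIR`. A quick local sanity check, assuming python-dotenv from requirements.txt, might look like this (not part of the committed code):

```python
import os
from dotenv import load_dotenv

# Load the .env file that this .gitignore keeps out of version control,
# then report which of the expected keys are present.
load_dotenv()
for key in ("MISTRAL_API_KEY", "CONVERTAPI_TOKEN", "MAX_FILE_SIZE", "TEMP_DIR"):
    print(key, "is set" if os.getenv(key) else "is missing")
```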
DockerFile ADDED
@@ -0,0 +1,26 @@
+ FROM python:3.12-slim
+
+ # Install system dependencies for Chrome/Chromium
+ RUN apt-get update && apt-get install -y \
+     wget \
+     gnupg \
+     unzip \
+     curl \
+     chromium \
+     chromium-driver \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set Chrome path for Selenium
+ ENV CHROME_BIN=/usr/bin/chromium
+ ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
WebScraper.py ADDED
@@ -0,0 +1,355 @@
+ import time
+ import os
+ import re
+ import urllib.parse
+ from datetime import datetime
+ from bs4 import BeautifulSoup
+ try:
+     from selenium import webdriver
+     from selenium.webdriver.chrome.options import Options
+     from selenium.webdriver.chrome.service import Service
+     from selenium.webdriver.common.by import By
+     from selenium.webdriver.support.ui import WebDriverWait
+     from selenium.webdriver.support import expected_conditions as EC
+     SELENIUM_AVAILABLE = True
+ except ImportError:
+     SELENIUM_AVAILABLE = False
+     print("Selenium not available. Some features may not work.")
+
+ class WebsiteScraper:
+     def __init__(self, base_url, site_name, site_description="", site_category="General",
+                  output_dir=None, max_depth=3, max_pages=50, delay=2, headless=True,
+                  scrape_external_links=False, content_selectors=None):
+         """
+         Initialize the website scraper.
+
+         Args:
+             base_url (str): Starting URL to scrape
+             site_name (str): Name of the website
+             site_description (str): Description of the website
+             site_category (str): Category of the website
+             output_dir (str): Directory to save files (auto-generated if None)
+             max_depth (int): Maximum depth to crawl
+             max_pages (int): Maximum number of pages to scrape
+             delay (float): Delay between requests in seconds
+             headless (bool): Run browser in headless mode
+             scrape_external_links (bool): Whether to follow external links
+             content_selectors (list): CSS selectors to find main content
+         """
+         parsed_url = urllib.parse.urlparse(base_url)
+         self.base_url = base_url
+         self.base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
+         self.domain_name = parsed_url.netloc
+         self.site_name = site_name
+         self.site_description = site_description
+         self.site_category = site_category
+         self.scrape_external_links = scrape_external_links
+         self.content_selectors = content_selectors or [
+             'main', 'article', '.content', '#content', '.main-content',
+             '.post-content', '.entry-content', '.page-content', 'body'
+         ]
+         self.max_depth = max_depth
+         self.max_pages = max_pages
+         self.delay = delay
+         self.visited_links = set()
+         self.page_count = 0
+         self.start_time = datetime.now()
+         if output_dir is None:
+             domain_safe = self.domain_name.replace(".", "_").replace(":", "_")
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             self.output_dir = f"{site_name}_{domain_safe}_{timestamp}"
+         else:
+             self.output_dir = output_dir
+         if not os.path.exists(self.output_dir):
+             os.makedirs(self.output_dir)
+         self.log_path = os.path.join(self.output_dir, "scraping_log.txt")
+         with open(self.log_path, "w", encoding="utf-8") as log_file:
+             log_file.write(f"Website scraping started at: {self.start_time}\n")
+             log_file.write(f"Website: {self.site_name}\n")
+             log_file.write(f"Description: {self.site_description}\n")
+             log_file.write(f"Category: {self.site_category}\n")
+             log_file.write(f"Base URL: {self.base_url}\n")
+             log_file.write(f"Domain: {self.domain_name}\n")
+             log_file.write(f"Max depth: {self.max_depth}\n")
+             log_file.write(f"Max pages: {self.max_pages}\n")
+             log_file.write(f"External links: {self.scrape_external_links}\n\n")
+         self.setup_driver(headless)
+         self.documents = []
+
+     def setup_driver(self, headless):
+         """Setup Chrome driver with options."""
+         try:
+             chrome_options = Options()
+             if headless:
+                 chrome_options.add_argument("--headless")
+             chrome_options.add_argument("--no-sandbox")
+             chrome_options.add_argument("--disable-dev-shm-usage")
+             chrome_options.add_argument("--disable-logging")
+             chrome_options.add_argument("--log-level=3")
+             chrome_options.add_argument("--disable-extensions")
+             chrome_options.add_argument("--disable-gpu")
+             chrome_options.add_argument("--window-size=1920,1080")
+             chrome_options.add_argument("--disable-web-security")
+             chrome_options.add_argument("--allow-running-insecure-content")
+             chrome_options.add_argument("--disable-features=VizDisplayCompositor")
+             chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+             chrome_options.binary_location = os.environ.get("CHROME_BIN", "/usr/bin/chromium")
+
+             chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
+             chrome_options.add_experimental_option('useAutomationExtension', False)
+             try:
+                 # Selenium 4 passes the driver path through a Service object
+                 service = Service(os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver"))
+                 self.driver = webdriver.Chrome(service=service, options=chrome_options)
+             except Exception:
+                 # Fall back to webdriver-manager if the system chromedriver is unusable
+                 from webdriver_manager.chrome import ChromeDriverManager
+                 service = Service(ChromeDriverManager().install())
+                 self.driver = webdriver.Chrome(service=service, options=chrome_options)
+
+             self.log_message("Chrome driver initialized successfully")
+         except Exception as e:
+             self.log_message(f"Error setting up Chrome driver: {e}")
+             raise
+
+     def log_message(self, message):
+         """Write message to console and log file."""
+         print(message)
+         with open(self.log_path, "a", encoding="utf-8") as log_file:
+             log_file.write(f"{message}\n")
+
+     def is_valid_url(self, url):
+         """Check if URL should be scraped."""
+         if not self.scrape_external_links and not url.startswith(self.base_domain):
+             return False
+         if re.search(r"\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|svg|ico|css|js|xml|json|zip|tar|gz|rar|7z|exe|dmg|mp3|mp4|avi|mov|wmv)$", url, re.IGNORECASE):
+             return False
+         if "#" in url:
+             url = url.split("#")[0]
+         if url in self.visited_links:
+             return False
+         skip_patterns = [
+             '/login', '/register', '/signup', '/sign-up', '/signin', '/sign-in',
+             '/logout', '/password', '/forgot', '/reset',
+             '/admin', '/dashboard', '/account', '/profile',
+             '/cart', '/checkout', '/payment', '/billing',
+             '/terms', '/privacy', '/legal', '/disclaimer',
+             '/sitemap', '/robots.txt', '/favicon'
+         ]
+         url_lower = url.lower()
+         for pattern in skip_patterns:
+             if pattern in url_lower:
+                 return False
+         spam_patterns = ['popup', 'advertisement', 'tracking', 'analytics']
+         for pattern in spam_patterns:
+             if pattern in url_lower:
+                 return False
+         return True
+
+     def sanitize_filename(self, text):
+         """Convert text to safe filename."""
+         if not text or len(text.strip()) == 0:
+             return f"page_{self.page_count}"
+         safe_name = re.sub(r'[^\w\s()-]', "_", text)
+         safe_name = re.sub(r'\s+', "_", safe_name)
+         safe_name = safe_name.strip("_")
+         return safe_name[:100] if len(safe_name) > 100 else safe_name
+
+     def extract_links(self):
+         """Extract valid links from current page."""
+         links = self.driver.find_elements(By.TAG_NAME, "a")
+         valid_links = []
+         for link in links:
+             try:
+                 href = link.get_attribute("href")
+                 if href:
+                     if href.startswith('/'):
+                         href = self.base_domain + href
+                     elif href.startswith('./') or not href.startswith('http'):
+                         current_url = self.driver.current_url
+                         base_path = '/'.join(current_url.split('/')[:-1])
+                         href = base_path + '/' + href.lstrip('./')
+                     if self.is_valid_url(href) and href not in self.visited_links:
+                         valid_links.append(href)
+             except Exception:
+                 continue
+         return list(set(valid_links))
+
+     def extract_main_content(self, soup):
+         """Extract main content using various selectors."""
+         content_element = None
+         for selector in self.content_selectors:
+             try:
+                 if selector.startswith('.') or selector.startswith('#'):
+                     elements = soup.select(selector)
+                 else:
+                     elements = soup.find_all(selector)
+                 if elements:
+                     content_element = elements[0]
+                     break
+             except Exception:
+                 continue
+         if not content_element:
+             content_element = soup.find('body')
+         return content_element
+
+     def extract_clean_text(self, soup):
+         """Extract and clean text from BeautifulSoup object."""
+         unwanted_tags = [
+             "script", "style", "nav", "footer", "header", "aside",
+             "advertisement", "ads", "popup", "modal", "cookie-notice"
+         ]
+         for tag in unwanted_tags:
+             for element in soup.find_all(tag):
+                 element.decompose()
+         unwanted_classes = [
+             "sidebar", "menu", "navigation", "nav", "footer", "header",
+             "advertisement", "ad", "ads", "popup", "modal", "cookie",
+             "social", "share", "comment", "related", "recommended"
+         ]
+         for class_name in unwanted_classes:
+             for element in soup.find_all(class_=re.compile(class_name, re.I)):
+                 element.decompose()
+             for element in soup.find_all(id=re.compile(class_name, re.I)):
+                 element.decompose()
+         main_content = self.extract_main_content(soup)
+         if main_content:
+             text = main_content.get_text(separator=" ", strip=True)
+         else:
+             text = soup.get_text(separator=" ", strip=True)
+         lines = [line.strip() for line in text.split('\n') if line.strip()]
+         cleaned_text = '\n'.join(lines)
+         cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)
+         cleaned_text = re.sub(r' +', ' ', cleaned_text)
+         return cleaned_text
+
+     def scrape_page(self, url):
+         """Scrape content from a single page and save as markdown."""
+         if url in self.visited_links:
+             return []
+         self.page_count += 1
+         self.visited_links.add(url)
+         status = f"Scraping [{self.page_count}/{self.max_pages}]: {url}"
+         self.log_message(status)
+         try:
+             self.driver.get(url)
+             WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+             time.sleep(self.delay)
+             try:
+                 page_title = self.driver.title or f"Page_{self.page_count}"
+             except Exception:
+                 page_title = f"Page_{self.page_count}"
+             soup = BeautifulSoup(self.driver.page_source, "html.parser")
+             cleaned_text = self.extract_clean_text(soup)
+             if len(cleaned_text.strip()) < 50:
+                 self.log_message(f"Skipping {url}: insufficient content")
+                 return self.extract_links()
+             meta_desc = ""
+             meta_tag = soup.find("meta", attrs={"name": "description"})
+             if meta_tag:
+                 meta_desc = meta_tag.get("content", "")
+             doc = {
+                 "text": cleaned_text,
+                 "metadata": {
+                     "source": url,
+                     "title": page_title,
+                     "site_name": self.site_name,
+                     "site_description": self.site_description,
+                     "site_category": self.site_category,
+                     "meta_description": meta_desc,
+                     "domain": self.domain_name,
+                     "scraped_at": datetime.now().isoformat()
+                 }
+             }
+             self.documents.append(doc)
+             safe_filename = self.sanitize_filename(page_title)
+             file_path = os.path.join(self.output_dir, f"{safe_filename}.md")
+             counter = 1
+             original_path = file_path
+             while os.path.exists(file_path):
+                 base, ext = os.path.splitext(original_path)
+                 file_path = f"{base}_{counter}{ext}"
+                 counter += 1
+             with open(file_path, "w", encoding="utf-8") as file:
+                 file.write(f"# {page_title}\n\n")
+                 file.write(f"**URL:** {url}\n")
+                 file.write(f"**Site:** {self.site_name}\n")
+                 file.write(f"**Category:** {self.site_category}\n")
+                 if meta_desc:
+                     file.write(f"**Description:** {meta_desc}\n")
+                 file.write(f"**Scraped:** {datetime.now()}\n\n")
+                 file.write("---\n\n")
+                 file.write(cleaned_text)
+             self.log_message(f"Saved: {os.path.basename(file_path)}")
+             new_links = self.extract_links()
+             self.log_message(f"Found {len(new_links)} new links")
+             return new_links
+         except Exception as e:
+             self.log_message(f"Error scraping {url}: {str(e)}")
+             return []
+
+     def create_summary(self):
+         """Create a summary of the scraped content."""
+         summary_path = os.path.join(self.output_dir, "scraping_summary.md")
+         with open(summary_path, "w", encoding="utf-8") as f:
+             f.write(f"# Scraping Summary: {self.site_name}\n\n")
+             f.write(f"**Website:** {self.site_name}\n")
+             f.write(f"**URL:** {self.base_url}\n")
+             f.write(f"**Domain:** {self.domain_name}\n")
+             f.write(f"**Category:** {self.site_category}\n")
+             f.write(f"**Description:** {self.site_description}\n\n")
+             f.write(f"**Scraping Details:**\n")
+             f.write(f"- Start time: {self.start_time}\n")
+             f.write(f"- End time: {datetime.now()}\n")
+             f.write(f"- Duration: {datetime.now() - self.start_time}\n")
+             f.write(f"- Pages scraped: {len(self.documents)}\n")
+             f.write(f"- Max pages allowed: {self.max_pages}\n")
+             f.write(f"- Max depth: {self.max_depth}\n")
+             f.write(f"- External links allowed: {self.scrape_external_links}\n\n")
+             if self.documents:
+                 f.write("**Scraped Pages:**\n")
+                 for i, doc in enumerate(self.documents, 1):
+                     f.write(f"{i}. [{doc['metadata']['title']}]({doc['metadata']['source']})\n")
+
+     def start(self):
+         """Start the website scraping process."""
+         try:
+             self.log_message(f"Starting website scraping for {self.site_name}")
+             self.log_message(f"Target: {self.base_url}")
+             self.log_message(f"Limits: max_depth={self.max_depth}, max_pages={self.max_pages}")
+             urls_to_scrape = [(self.base_url, 0)]
+             while urls_to_scrape and self.page_count < self.max_pages:
+                 current_url, current_depth = urls_to_scrape.pop(0)
+                 if current_url in self.visited_links or current_depth > self.max_depth:
+                     continue
+                 new_links = self.scrape_page(current_url)
+                 if current_depth + 1 <= self.max_depth:
+                     for link in new_links:
+                         if link not in self.visited_links:
+                             urls_to_scrape.append((link, current_depth + 1))
+             self.create_summary()
+             self.driver.quit()
+             end_time = datetime.now()
+             duration = end_time - self.start_time
+             self.log_message(f"Scraping completed for {self.site_name}")
+             self.log_message(f"Total pages scraped: {self.page_count}")
+             self.log_message(f"Duration: {duration}")
+             return {
+                 "success": True,
+                 "pages_scraped": self.page_count,
+                 "duration": str(duration),
+                 "output_dir": self.output_dir
+             }
+         except Exception as e:
+             self.driver.quit()
+             self.log_message(f"Scraping failed: {str(e)}")
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "pages_scraped": self.page_count,
+                 "duration": "0",
+                 "output_dir": self.output_dir
+             }
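
For reference, a minimal sketch of driving `WebsiteScraper` directly (outside the Gradio app), assuming Chromium and chromedriver are available at the paths set in the Dockerfile; the target URL and limits below are placeholders:

```python
from WebScraper import WebsiteScraper

# Placeholder URL and conservative limits; adjust to the site being scraped.
scraper = WebsiteScraper(
    base_url="https://example.com",
    site_name="Example",
    site_category="Documentation",
    max_pages=5,
    max_depth=1,
    delay=2,
)
result = scraper.start()
if result["success"]:
    print(f"Scraped {result['pages_scraped']} pages into {result['output_dir']}")
else:
    print(f"Scraping failed: {result['error']}")
```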
app.py ADDED
@@ -0,0 +1,315 @@
+ import gradio as gr
+ import os
+ import warnings
+ from WebScraper import WebsiteScraper
+ from merge_md import merge_md_to_pdf_and_convert_to_url
+
+ warnings.filterwarnings("ignore")
+ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+ global_output_dir = ""
+
+ def scrape_website(url, site_name, site_description="", site_category="General",
+                    max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
+     scraper = WebsiteScraper(
+         base_url=url,
+         site_name=site_name,
+         site_description=site_description,
+         site_category=site_category,
+         max_pages=max_pages,
+         max_depth=max_depth,
+         delay=delay,
+         scrape_external_links=scrape_external_links
+     )
+     return scraper.start()
+
+ with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# General Website Scraper")
+     gr.Markdown("Scrape content from any website, save it as Markdown files, and merge it into a PDF with an in-app viewer and a downloadable link.")
+
+     with gr.Row():
+         url_input = gr.Textbox(
+             label="Website URL",
+             placeholder="e.g., https://example.com or https://blog.example.com",
+             info="Enter the starting URL to scrape"
+         )
+         site_name_input = gr.Textbox(
+             label="Site Name",
+             placeholder="e.g., Example Blog",
+             info="A descriptive name for the website"
+         )
+
+     with gr.Row():
+         site_description_input = gr.Textbox(
+             label="Site Description (Optional)",
+             placeholder="e.g., A technology blog about AI and programming",
+             info="Brief description of the website content"
+         )
+         site_category_input = gr.Dropdown(
+             label="Site Category",
+             choices=[
+                 "General", "Blog", "News", "E-commerce", "Portfolio",
+                 "Company", "Documentation", "Forum", "Social Media",
+                 "Education", "Technology", "Entertainment", "Health",
+                 "Finance", "Travel", "Food", "Sports", "Art", "Other"
+             ],
+             value="General",
+             info="Select the most appropriate category"
+         )
+
+     with gr.Row():
+         max_pages_input = gr.Number(
+             label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
+             info="Maximum number of pages to scrape"
+         )
+         max_depth_input = gr.Number(
+             label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
+             info="How many clicks deep to follow links"
+         )
+         delay_input = gr.Number(
+             label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
+             info="Delay between requests to avoid overwhelming the server"
+         )
+
+     with gr.Row():
+         external_links_input = gr.Checkbox(
+             label="Include External Links", value=False,
+             info="Scrape links that go outside the original domain (use with caution)"
+         )
+
+     scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")
+
+     with gr.Row():
+         output = gr.Textbox(
+             label="Scraping Results",
+             lines=10,
+             max_lines=20,
+             info="Real-time scraping progress and results will appear here"
+         )
+
+     gr.Markdown("## PDF Generation & Viewer")
+
+     with gr.Row():
+         merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             pdf_output = gr.Textbox(
+                 label="PDF Merge Results",
+                 lines=5,
+                 max_lines=10,
+                 info="Results of merging Markdown files to PDF"
+             )
+
+             pdf_download = gr.File(
+                 label="Download Merged PDF (Local File)",
+                 file_types=[".pdf"],
+                 visible=False
+             )
+
+             pdf_url_output = gr.HTML(
+                 label="PDF Download Link",
+                 visible=False
+             )
+
+         with gr.Column(scale=2):
+             pdf_viewer = gr.File(
+                 label="PDF Viewer - View Merged Content",
+                 file_types=[".pdf"],
+                 visible=False,
+                 interactive=False
+             )
+
+     def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
+         """
+         Run the scraper with the values entered in the UI.
+
+         Validates the URL and site name, launches WebsiteScraper, and returns a
+         4-tuple matching the Gradio outputs: a status message plus placeholder
+         values for the PDF download, link, and viewer components.
+         """
+         global global_output_dir
+         if not url or not site_name:
+             return "Please provide both URL and Site Name", None, None, None
+
+         if not url.startswith(('http://', 'https://')):
+             url = 'https://' + url
+
+         try:
+             result = scrape_website(
+                 url=url,
+                 site_name=site_name,
+                 site_description=site_description,
+                 site_category=site_category,
+                 max_pages=int(max_pages),
+                 max_depth=int(max_depth),
+                 delay=float(delay),
+                 scrape_external_links=external_links
+             )
+
+             if result["success"]:
+                 global_output_dir = result['output_dir']
+                 return (
+                     f"Successfully scraped {result['pages_scraped']} pages!\n"
+                     f"Duration: {result['duration']}\n"
+                     f"Files saved to: {result['output_dir']}\n\n"
+                     f"Files created:\n"
+                     f"  • Individual page files (.md)\n"
+                     f"  • scraping_summary.md\n"
+                     f"  • scraping_log.txt\n\n"
+                     f"Ready to merge into PDF - click 'Merge to PDF' button below."
+                 ), None, None, None
+             else:
+                 return f"Scraping failed: {result['error']}", None, None, None
+         except Exception as e:
+             return f"Error: {str(e)}", None, None, None
+
+     def process_merge_to_pdf():
+         """
+         Merge the scraped Markdown files into a PDF and expose download options.
+
+         Returns a 4-tuple matching the Gradio outputs: a status message, an
+         update for the local PDF download, HTML with a download link, and an
+         update for the PDF viewer component.
+         """
+         global global_output_dir
+         if not global_output_dir:
+             return ("No scraping output directory found. Please scrape a website first.",
+                     None, None, gr.update(visible=False))
+
+         try:
+             result = merge_md_to_pdf_and_convert_to_url(
+                 output_dir=global_output_dir,
+                 site_name="Scraped Website",
+                 site_description="Scraped content from website",
+                 site_category="Technology",
+                 output_format="pdf"
+             )
+
+             if result["success"]:
+                 pdf_url = result["output_url"]
+                 local_pdf_path = result["converted_path"]
+
+                 message = (
+                     f"{result['message']}\n\n"
+                     f"PDF created successfully!\n"
+                     f"Local file: {local_pdf_path}\n"
+                     f"Download URL: {pdf_url}\n\n"
+                     f"View the PDF in the viewer on the right."
+                 )
+
+                 download_html = f'''
+                 <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
+                     <h4>Download Options:</h4>
+                     <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
+                         Click here to download PDF from web link
+                     </a></p>
+                     <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
+                 </div>
+                 '''
+
+                 return (
+                     message,
+                     # reveal the components that were created with visible=False
+                     gr.update(value=local_pdf_path, visible=True),
+                     gr.update(value=download_html, visible=True),
+                     gr.update(value=local_pdf_path, visible=True)
+                 )
+             else:
+                 return (
+                     f"PDF merge failed: {result['error']}",
+                     None,
+                     None,
+                     gr.update(visible=False)
+                 )
+         except Exception as e:
+             return (
+                 f"Error during PDF merge: {str(e)}",
+                 None,
+                 None,
+                 gr.update(visible=False)
+             )
+
+     scrape_btn.click(
+         process_scrape,
+         inputs=[
+             url_input, site_name_input, site_description_input, site_category_input,
+             max_pages_input, max_depth_input, delay_input, external_links_input
+         ],
+         outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
+     )
+
+     merge_pdf_btn.click(
+         process_merge_to_pdf,
+         inputs=[],
+         outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
+     )
+
+     with gr.Accordion("Example Usage & Tips", open=False):
+         gr.Markdown("""
+         ### Common Use Cases:
+         - News Websites: `https://techcrunch.com` - scrape latest tech news articles
+         - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
+         - Company Sites: `https://company.com/products` - scrape product pages and documentation
+         - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
+         - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
+         - E-commerce: `https://shop.com/category` - scrape product listings and descriptions
+
+         ### Tips for Better Results:
+         - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
+         - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
+         - Respect rate limits: Use 2-3 second delays for most sites
+         - External links: Only enable for trusted sites to avoid scraping the entire internet
+         - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)
+
+         ### Output Files Explained:
+         - Individual .md files: Each scraped page saved as markdown
+         - scraping_summary.md: Overview of all scraped content with links
+         - scraping_log.txt: Detailed log of the scraping process
+         - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable
+
+         ### PDF Features:
+         - Inline Viewer: View the merged PDF directly in the interface
+         - Download Options: Download via direct file or web link
+         - Multiple Formats: Local file and web-hosted version available
+         """)
+
+     gr.Markdown("""
+     ---
+     Important Notes:
+     - Always respect website terms of service and robots.txt
+     - Use reasonable delays to avoid overwhelming servers
+     - Some sites may block automated scraping
+     - Consider the website's bandwidth and server load
+     - The merged PDF is uploaded to a public link for easy sharing
+     - PDF viewer works best with modern browsers that support PDF display
+     """)
+
+ if __name__ == "__main__":
+     # Bind to 0.0.0.0 so the app is reachable from outside the Docker container
+     demo.launch(mcp_server=True, share=True, server_name="0.0.0.0", server_port=7860)
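
The tips above tell users to check robots.txt before scraping; the app itself does not do this. A small pre-flight check with Python's standard `urllib.robotparser` could look like the following hypothetical helper (not part of app.py):

```python
from urllib.parse import urlparse
from urllib import robotparser

def allowed_to_scrape(start_url: str, user_agent: str = "*") -> bool:
    # Fetch the site's robots.txt and ask whether this URL may be crawled.
    parsed = urlparse(start_url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, start_url)

print(allowed_to_scrape("https://example.com/docs"))  # placeholder URL
```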
merge_md.py ADDED
@@ -0,0 +1,263 @@
+ import os
+ from reportlab.lib.pagesizes import A4
+ from reportlab.lib.units import inch
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+ from datetime import datetime
+ import markdown2
+ from mistralai import Mistral
+ from pathlib import Path
+ from urllib.parse import urlparse
+ import convertapi
+ import requests
+ from dotenv import load_dotenv
+ import re
+
+ load_dotenv()
+
+ convertapi.api_credentials = os.getenv("CONVERTAPI_TOKEN")
+ if not convertapi.api_credentials:
+     raise ValueError("CONVERTAPI_TOKEN environment variable is required")
+
+ SUPPORTED_FORMATS = ["pdf", "docx", "txt"]
+ MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 100 * 1024 * 1024))
+ # Temp directory for converted files (configurable via TEMP_DIR); ensure it exists
+ TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/scraper_temp")
+ os.makedirs(TEMP_DIR, exist_ok=True)
+
+ def upload_to_service(file_path: str) -> str:
+     """
+     Mock function to simulate uploading a file to a cloud service.
+     Args:
+         file_path (str): Path to the file to upload.
+     Returns:
+         str: Mock public URL or error message.
+     """
+     try:
+         if not os.path.exists(file_path):
+             return f"File not found: {file_path}"
+         return f"https://mock-cloud-service.com/{os.path.basename(file_path)}"
+     except Exception as e:
+         return f"Error uploading file: {str(e)}"
+
+ def convert_from_url(document_url: str, output_format: str) -> str:
+     """
+     Convert a document from a URL to a different format using ConvertAPI.
+     Args:
+         document_url (str): The URL of the input file.
+         output_format (str): The format to convert the file to.
+     Returns:
+         str: The path to the converted file or an error message.
+     """
+     try:
+         if not document_url or not document_url.lower().startswith(("http://", "https://")):
+             return "Invalid or unsupported URL format."
+         if output_format not in SUPPORTED_FORMATS:
+             return f"Unsupported output format: {output_format}"
+
+         result = convertapi.convert(output_format, {"File": document_url})
+         input_filename = Path(urlparse(document_url).path).stem or "converted_file"
+         output_filename = f"{input_filename}.{output_format}"
+         output_path = Path(TEMP_DIR) / output_filename
+         output_path.parent.mkdir(exist_ok=True)
+         result.file.save(str(output_path))
+         return str(output_path)
+     except Exception as e:
+         return f"Error converting file from URL: {str(e)}"
+
+ def merge_md_to_pdf(output_dir, site_name, site_description="", site_category="General"):
+     """
+     Merge all Markdown files in the output directory into a single PDF using reportlab after processing with Mistral AI.
+     Args:
+         output_dir (str): Directory containing Markdown files.
+         site_name (str): Name of the site for the PDF title.
+         site_description (str): Description of the site.
+         site_category (str): Category of the site.
+     Returns:
+         dict: Result containing success status, output PDF path, and message.
+     """
+     try:
+         api_key = os.getenv("MISTRAL_API_KEY")
+         if not api_key:
+             return {
+                 "success": False,
+                 "error": "MISTRAL_API_KEY environment variable not set",
+                 "output_pdf": None,
+                 "pages_merged": 0
+             }
+
+         client = Mistral(api_key=api_key)
+         model = "mistral-large-latest"
+
+         if not os.path.exists(output_dir):
+             return {
+                 "success": False,
+                 "error": f"Output directory {output_dir} does not exist",
+                 "output_pdf": None,
+                 "pages_merged": 0
+             }
+
+         md_files = [
+             f for f in os.listdir(output_dir)
+             if f.endswith('.md') and f not in ['scraping_summary.md', 'scraping_log.txt']
+         ]
+
+         if not md_files:
+             return {
+                 "success": False,
+                 "error": "No Markdown files found in the output directory",
+                 "output_pdf": None,
+                 "pages_merged": 0
+             }
+
+         pdf_output_path = os.path.join(output_dir, f"{site_name}_merged.pdf")
+         doc = SimpleDocTemplate(
+             pdf_output_path,
+             pagesize=A4,
+             rightMargin=inch,
+             leftMargin=inch,
+             topMargin=inch,
+             bottomMargin=inch
+         )
+         styles = getSampleStyleSheet()
+
+         title_style = ParagraphStyle(name='Title', fontSize=24, leading=28, alignment=1, spaceAfter=20)
+         heading_style = ParagraphStyle(name='Heading2', fontSize=18, leading=22, spaceAfter=15)
+         body_style = ParagraphStyle(name='Body', fontSize=12, leading=14, spaceAfter=10)
+
+         story = [
+             Paragraph(f"{site_name}", title_style),
+             Spacer(1, 0.2 * inch),
+             Paragraph(f"Description: {site_description}", body_style),
+             Paragraph(f"Category: {site_category}", body_style),
+             Paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", body_style),
+             PageBreak(),
+             Paragraph("Table of Contents", heading_style),
+             Spacer(1, 0.2 * inch)
+         ]
+
+         toc_entries = []
+         for idx, md_file in enumerate(sorted(md_files), 1):
+             file_path = os.path.join(output_dir, md_file)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 md_content = f.read()
+
+             title = md_content.split('\n')[0].strip('#').strip() or f"Page {idx}"
+             try:
+                 prompt = f"""
+ You are an expert content editor. Below is the content of a Markdown file. Please enhance the content by making it more detailed, well-structured, and polished while preserving the original meaning. Ensure the output is in plain text suitable for inclusion in a PDF. Avoid adding Markdown or HTML formatting in the response.
+ If there are HTML tags like <p><strong>Agents-MCP-Hackathon (Agents-MCP-Hackathon)</strong></p>, convert them to plain text like Agents-MCP-Hackathon (Agents-MCP-Hackathon).
+ Original content:
+ {md_content}
+
+ Enhanced content:
+ """
+                 response = client.chat.complete(
+                     model=model,
+                     messages=[{"role": "user", "content": prompt}]
+                 )
+                 enhanced_content = response.choices[0].message.content.strip()
+             except Exception as e:
+                 print(f"Warning: Failed to process {md_file} with Mistral AI: {str(e)}. Using original content.")
+                 enhanced_content = md_content
+
+             html_content = markdown2.markdown(enhanced_content, extras=['fenced-code-blocks', 'tables'])
+             text_content = re.sub(r'<[^>]+>', '', html_content)
+             text_content = re.sub(r'\s+', ' ', text_content).strip()
+             lines = text_content.split('\n')
+
+             toc_entries.append(Paragraph(f"{idx}. {title}", body_style))
+
+             story.append(Paragraph(title, heading_style))
+             story.append(Spacer(1, 0.1 * inch))
+             for line in lines:
+                 if line.strip():
+                     story.append(Paragraph(line.strip(), body_style))
+             story.append(PageBreak())
+
+         # Insert the TOC entries after the "Table of Contents" heading and its spacer
+         story[8:8] = toc_entries + [PageBreak()]
+         doc.build(story)
+
+         return {
+             "success": True,
+             "output_pdf": pdf_output_path,
+             "pages_merged": len(md_files),
+             "message": f"Successfully merged {len(md_files)} Markdown files into {pdf_output_path} after processing with Mistral AI"
+         }
+
+     except Exception as e:
+         return {
+             "success": False,
+             "error": f"Failed to merge Markdown files into PDF: {str(e)}",
+             "output_pdf": None,
+             "pages_merged": 0
+         }
+
+ def merge_md_to_pdf_and_convert_to_url(output_dir, site_name, site_description="", site_category="General", output_format="pdf"):
+     """
+     Merge Markdown files into a PDF, upload it to a service, and optionally convert to another format.
+     Args:
+         output_dir (str): Directory containing Markdown files.
+         site_name (str): Name of the site for the PDF title.
+         site_description (str): Description of the site.
+         site_category (str): Category of the site.
+         output_format (str): Optional format to convert the PDF to (e.g., 'docx', 'txt').
+     Returns:
+         dict: Result containing success status, output URL, and message.
+     """
+     try:
+         merge_result = merge_md_to_pdf(output_dir, site_name, site_description, site_category)
+         if not merge_result["success"]:
+             return {
+                 "success": False,
+                 "error": merge_result["error"],
+                 "output_url": None,
+                 "converted_path": None
+             }
+
+         pdf_path = merge_result["output_pdf"]
+         if not pdf_path or not os.path.exists(pdf_path):
+             return {
+                 "success": False,
+                 "error": "Generated PDF not found",
+                 "output_url": None,
+                 "converted_path": None
+             }
+
+         pdf_url = upload_to_service(pdf_path)
+         if not pdf_url.startswith("http"):
+             return {
+                 "success": False,
+                 "error": f"Failed to obtain URL: {pdf_url}",
+                 "output_url": None,
+                 "converted_path": None
+             }
+
+         converted_path = pdf_path
+         if output_format != "pdf":
+             converted_path = convert_from_url(pdf_url, output_format)
+             if not converted_path.startswith(TEMP_DIR):
+                 return {
+                     "success": False,
+                     "error": f"Conversion failed: {converted_path}",
+                     "output_url": pdf_url,
+                     "converted_path": None
+                 }
+
+         return {
+             "success": True,
+             "output_url": pdf_url,
+             "converted_path": converted_path,
+             "message": f"Successfully merged {merge_result['pages_merged']} Markdown files into PDF and uploaded to {pdf_url}",
+             "pages_merged": merge_result["pages_merged"]
+         }
+
+     except Exception as e:
+         return {
+             "success": False,
+             "error": f"Error in merging or uploading: {str(e)}",
+             "output_url": None,
+             "converted_path": None
+         }
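
A minimal sketch of calling the merge pipeline above on its own; the directory name is a placeholder for whatever the scraper produced, and `MISTRAL_API_KEY` / `CONVERTAPI_TOKEN` must be set in the environment (via the ignored `.env`):

```python
from merge_md import merge_md_to_pdf_and_convert_to_url

# Placeholder output directory from a previous scraping run.
result = merge_md_to_pdf_and_convert_to_url(
    output_dir="Example_example_com_20240101_120000",
    site_name="Example",
    site_description="Scraped content",
    site_category="Documentation",
    output_format="pdf",
)
print(result["message"] if result["success"] else result["error"])
```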
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio
+ selenium
+ beautifulsoup4
+ requests
+ reportlab
+ markdown2
+ mistralai
+ convertapi
+ python-dotenv
+ pathlib
+ urllib3
+ webdriver-manager