n0v33n committed · ff3a25c
Parent(s): 777a5e5

Create required file for this space
- .gitignore +1 -0
- DockerFile +26 -0
- WebScraper.py +355 -0
- app.py +315 -0
- merge_md.py +263 -0
- requirements.txt +12 -0
.gitignore
ADDED
@@ -0,0 +1 @@
.env
DockerFile
ADDED
@@ -0,0 +1,26 @@
FROM python:3.12-slim

# Install system dependencies for Chrome/Chromium
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    unzip \
    curl \
    chromium \
    chromium-driver \
    && rm -rf /var/lib/apt/lists/*

# Set Chrome path for Selenium
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["python", "app.py"]
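
Note: the image installs Chromium and chromium-driver at /usr/bin/chromium and /usr/bin/chromedriver, the paths that setup_driver in WebScraper.py below expects, and it exposes port 7860, the port app.py serves on.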
WebScraper.py
ADDED
@@ -0,0 +1,355 @@
import time
import os
import re
import urllib.parse
from datetime import datetime
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Selenium not available. Some features may not work.")

class WebsiteScraper:
    def __init__(self, base_url, site_name, site_description="", site_category="General",
                 output_dir=None, max_depth=3, max_pages=50, delay=2, headless=True,
                 scrape_external_links=False, content_selectors=None):
        """
        Initialize the website scraper.

        Args:
            base_url (str): Starting URL to scrape
            site_name (str): Name of the website
            site_description (str): Description of the website
            site_category (str): Category of the website
            output_dir (str): Directory to save files (auto-generated if None)
            max_depth (int): Maximum depth to crawl
            max_pages (int): Maximum number of pages to scrape
            delay (float): Delay between requests in seconds
            headless (bool): Run browser in headless mode
            scrape_external_links (bool): Whether to follow external links
            content_selectors (list): CSS selectors to find main content
        """
        parsed_url = urllib.parse.urlparse(base_url)
        self.base_url = base_url
        self.base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
        self.domain_name = parsed_url.netloc
        self.site_name = site_name
        self.site_description = site_description
        self.site_category = site_category
        self.scrape_external_links = scrape_external_links
        self.content_selectors = content_selectors or [
            'main', 'article', '.content', '#content', '.main-content',
            '.post-content', '.entry-content', '.page-content', 'body'
        ]
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.delay = delay
        self.visited_links = set()
        self.page_count = 0
        self.start_time = datetime.now()
        if output_dir is None:
            domain_safe = self.domain_name.replace(".", "_").replace(":", "_")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.output_dir = f"{site_name}_{domain_safe}_{timestamp}"
        else:
            self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.log_path = os.path.join(self.output_dir, "scraping_log.txt")
        with open(self.log_path, "w", encoding="utf-8") as log_file:
            log_file.write(f"Website scraping started at: {self.start_time}\n")
            log_file.write(f"Website: {self.site_name}\n")
            log_file.write(f"Description: {self.site_description}\n")
            log_file.write(f"Category: {self.site_category}\n")
            log_file.write(f"Base URL: {self.base_url}\n")
            log_file.write(f"Domain: {self.domain_name}\n")
            log_file.write(f"Max depth: {self.max_depth}\n")
            log_file.write(f"Max pages: {self.max_pages}\n")
            log_file.write(f"External links: {self.scrape_external_links}\n\n")
        self.setup_driver(headless)
        self.documents = []

    def setup_driver(self, headless):
        """Setup Chrome driver with options."""
        try:
            chrome_options = Options()
            if headless:
                chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-logging")
            chrome_options.add_argument("--log-level=3")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--allow-running-insecure-content")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
            chrome_options.binary_location = "/usr/bin/chromium"

            chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
            chrome_options.add_experimental_option('useAutomationExtension', False)
            try:
                self.driver = webdriver.Chrome(
                    executable_path="/usr/bin/chromedriver",
                    options=chrome_options
                )
            except:
                from webdriver_manager.chrome import ChromeDriverManager
                self.driver = webdriver.Chrome(
                    ChromeDriverManager().install(),
                    options=chrome_options
                )

            self.log_message("Chrome driver initialized successfully")
        except Exception as e:
            self.log_message(f"Error setting up Chrome driver: {e}")
            raise

    def log_message(self, message):
        """Write message to console and log file."""
        print(message)
        with open(self.log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"{message}\n")

    def is_valid_url(self, url):
        """Check if URL should be scraped."""
        if not self.scrape_external_links and not url.startswith(self.base_domain):
            return False
        if re.search(r"\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|svg|ico|css|js|xml|json|zip|tar|gz|rar|7z|exe|dmg|mp3|mp4|avi|mov|wmv)$", url, re.IGNORECASE):
            return False
        if "#" in url:
            url = url.split("#")[0]
        if url in self.visited_links:
            return False
        skip_patterns = [
            '/login', '/register', '/signup', '/sign-up', '/signin', '/sign-in',
            '/logout', '/password', '/forgot', '/reset',
            '/admin', '/dashboard', '/account', '/profile',
            '/cart', '/checkout', '/payment', '/billing',
            '/terms', '/privacy', '/legal', '/disclaimer',
            '/sitemap', '/robots.txt', '/favicon'
        ]
        url_lower = url.lower()
        for pattern in skip_patterns:
            if pattern in url_lower:
                return False
        spam_patterns = ['popup', 'advertisement', 'tracking', 'analytics']
        for pattern in spam_patterns:
            if pattern in url_lower:
                return False
        return True

    def sanitize_filename(self, text):
        """Convert text to safe filename."""
        if not text or len(text.strip()) == 0:
            return f"page_{self.page_count}"
        safe_name = re.sub(r'[^\w\s()-]', "_", text)
        safe_name = re.sub(r'\s+', "_", safe_name)
        safe_name = safe_name.strip("_")
        return safe_name[:100] if len(safe_name) > 100 else safe_name

    def extract_links(self):
        """Extract valid links from current page."""
        links = self.driver.find_elements(By.TAG_NAME, "a")
        valid_links = []
        for link in links:
            try:
                href = link.get_attribute("href")
                if href:
                    if href.startswith('/'):
                        href = self.base_domain + href
                    elif href.startswith('./') or not href.startswith('http'):
                        current_url = self.driver.current_url
                        base_path = '/'.join(current_url.split('/')[:-1])
                        href = base_path + '/' + href.lstrip('./')
                    if self.is_valid_url(href) and href not in self.visited_links:
                        valid_links.append(href)
            except Exception:
                continue
        return list(set(valid_links))

    def extract_main_content(self, soup):
        """Extract main content using various selectors."""
        content_element = None
        for selector in self.content_selectors:
            try:
                if selector.startswith('.') or selector.startswith('#'):
                    elements = soup.select(selector)
                else:
                    elements = soup.find_all(selector)
                if elements:
                    content_element = elements[0]
                    break
            except:
                continue
        if not content_element:
            content_element = soup.find('body')
        return content_element

    def extract_clean_text(self, soup):
        """Extract and clean text from BeautifulSoup object."""
        unwanted_tags = [
            "script", "style", "nav", "footer", "header", "aside",
            "advertisement", "ads", "popup", "modal", "cookie-notice"
        ]
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        unwanted_classes = [
            "sidebar", "menu", "navigation", "nav", "footer", "header",
            "advertisement", "ad", "ads", "popup", "modal", "cookie",
            "social", "share", "comment", "related", "recommended"
        ]
        for class_name in unwanted_classes:
            for element in soup.find_all(class_=re.compile(class_name, re.I)):
                element.decompose()
            for element in soup.find_all(id=re.compile(class_name, re.I)):
                element.decompose()
        main_content = self.extract_main_content(soup)
        if main_content:
            text = main_content.get_text(separator=" ", strip=True)
        else:
            text = soup.get_text(separator=" ", strip=True)
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        cleaned_text = '\n'.join(lines)
        cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)
        cleaned_text = re.sub(r' +', ' ', cleaned_text)
        return cleaned_text

    def scrape_page(self, url):
        """Scrape content from a single page and save as markdown."""
        if url in self.visited_links:
            return []
        self.page_count += 1
        self.visited_links.add(url)
        status = f"Scraping [{self.page_count}/{self.max_pages}]: {url}"
        self.log_message(status)
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(self.delay)
            try:
                page_title = self.driver.title or f"Page_{self.page_count}"
            except:
                page_title = f"Page_{self.page_count}"
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            cleaned_text = self.extract_clean_text(soup)
            if len(cleaned_text.strip()) < 50:
                self.log_message(f"Skipping {url}: insufficient content")
                return self.extract_links()
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")
            doc = {
                "text": cleaned_text,
                "metadata": {
                    "source": url,
                    "title": page_title,
                    "site_name": self.site_name,
                    "site_description": self.site_description,
                    "site_category": self.site_category,
                    "meta_description": meta_desc,
                    "domain": self.domain_name,
                    "scraped_at": datetime.now().isoformat()
                }
            }
            self.documents.append(doc)
            safe_filename = self.sanitize_filename(page_title)
            file_path = os.path.join(self.output_dir, f"{safe_filename}.md")
            counter = 1
            original_path = file_path
            while os.path.exists(file_path):
                base, ext = os.path.splitext(original_path)
                file_path = f"{base}_{counter}{ext}"
                counter += 1
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(f"# {page_title}\n\n")
                file.write(f"**URL:** {url}\n")
                file.write(f"**Site:** {self.site_name}\n")
                file.write(f"**Category:** {self.site_category}\n")
                if meta_desc:
                    file.write(f"**Description:** {meta_desc}\n")
                file.write(f"**Scraped:** {datetime.now()}\n\n")
                file.write("---\n\n")
                file.write(cleaned_text)
            self.log_message(f"Saved: {os.path.basename(file_path)}")
            new_links = self.extract_links()
            self.log_message(f"Found {len(new_links)} new links")
            return new_links
        except Exception as e:
            self.log_message(f"Error scraping {url}: {str(e)}")
            return []

    def create_summary(self):
        """Create a summary of the scraped content."""
        summary_path = os.path.join(self.output_dir, "scraping_summary.md")
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(f"# Scraping Summary: {self.site_name}\n\n")
            f.write(f"**Website:** {self.site_name}\n")
            f.write(f"**URL:** {self.base_url}\n")
            f.write(f"**Domain:** {self.domain_name}\n")
            f.write(f"**Category:** {self.site_category}\n")
            f.write(f"**Description:** {self.site_description}\n\n")
            f.write(f"**Scraping Details:**\n")
            f.write(f"- Start time: {self.start_time}\n")
            f.write(f"- End time: {datetime.now()}\n")
            f.write(f"- Duration: {datetime.now() - self.start_time}\n")
            f.write(f"- Pages scraped: {len(self.documents)}\n")
            f.write(f"- Max pages allowed: {self.max_pages}\n")
            f.write(f"- Max depth: {self.max_depth}\n")
            f.write(f"- External links allowed: {self.scrape_external_links}\n\n")
            if self.documents:
                f.write("**Scraped Pages:**\n")
                for i, doc in enumerate(self.documents, 1):
                    f.write(f"{i}. [{doc['metadata']['title']}]({doc['metadata']['source']})\n")

    def start(self):
        """Start the website scraping process."""
        try:
            self.log_message(f"Starting website scraping for {self.site_name}")
            self.log_message(f"Target: {self.base_url}")
            self.log_message(f"Limits: max_depth={self.max_depth}, max_pages={self.max_pages}")
            urls_to_scrape = [(self.base_url, 0)]
            while urls_to_scrape and self.page_count < self.max_pages:
                current_url, current_depth = urls_to_scrape.pop(0)
                if current_url in self.visited_links or current_depth > self.max_depth:
                    continue
                new_links = self.scrape_page(current_url)
                if current_depth + 1 <= self.max_depth:
                    for link in new_links:
                        if link not in self.visited_links:
                            urls_to_scrape.append((link, current_depth + 1))
            self.create_summary()
            self.driver.quit()
            end_time = datetime.now()
            duration = end_time - self.start_time
            self.log_message(f"Scraping completed for {self.site_name}")
            self.log_message(f"Total pages scraped: {self.page_count}")
            self.log_message(f"Duration: {duration}")
            return {
                "success": True,
                "pages_scraped": self.page_count,
                "duration": str(duration),
                "output_dir": self.output_dir
            }
        except Exception as e:
            self.driver.quit()
            self.log_message(f"Scraping failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "pages_scraped": self.page_count,
                "duration": "0",
                "output_dir": self.output_dir
            }
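
For reference, a minimal usage sketch of the class added above (not part of the commit; the URL and limits are illustrative, and a local Chromium/chromedriver install is assumed):

from WebScraper import WebsiteScraper

scraper = WebsiteScraper(
    base_url="https://example.com",  # illustrative starting URL
    site_name="Example",
    max_pages=5,   # keep the first run small
    max_depth=1,
    delay=2,
)
result = scraper.start()
# start() returns the dict built at the end of start(): success, pages_scraped, duration, output_dir
print(result["success"], result["pages_scraped"], result["output_dir"])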
app.py
ADDED
@@ -0,0 +1,315 @@
import gradio as gr
import os
import warnings
from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url

warnings.filterwarnings("ignore")
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

global_output_dir = ""

def scrape_website(url, site_name, site_description="", site_category="General",
                   max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()

with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save as markdown files, and merge into a PDF with viewer and downloadable link.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name",
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )

    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)",
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio",
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )

    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )

    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )

    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")

    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results",
            lines=10,
            max_lines=20,
            info="Real-time scraping progress and results will appear here"
        )

    gr.Markdown("## PDF Generation & Viewer")

    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results",
                lines=5,
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )

            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )

            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )

        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )

    def process_scrape(url, site_name, site_description, site_category, max_pages, max_depth, delay, external_links):
        """
        Validate the inputs, run the scraper, and return a status message.

        :param url: URL of the website to scrape
        :param site_name: Name of the website being scraped
        :param site_description: Description of the website content
        :param site_category: Category of the website being scraped
        :param max_pages: Maximum number of pages to scrape
        :param max_depth: Maximum depth of links to follow from the starting URL
        :param delay: Delay in seconds between consecutive requests, to avoid overwhelming
            the target website or being blocked
        :param external_links: Whether to also follow and scrape links outside the original domain
        :return: A tuple of (message, None, None, None). On success the message lists the number
            of pages scraped, the duration, the output directory, and the files created; on
            failure it contains the error.
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None

        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )

            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f" • Individual page files (.md)\n"
                    f" • scraping_summary.md\n"
                    f" • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None

    def process_merge_to_pdf():
        """
        Merge the scraped Markdown files into a PDF and provide download options.

        :return: A tuple of (message, local PDF path, download HTML, gr.update for the PDF viewer).
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.",
                    None, None, gr.update(visible=False))

        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )

            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]

                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )

                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                    Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''

                return (
                    message,
                    local_pdf_path,
                    download_html,
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}",
                    None,
                    None,
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}",
                None,
                None,
                gr.update(visible=False)
            )

    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )

    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )

    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions

        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)

        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable

        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)

    gr.Markdown("""
    ---
    Important Notes:
    - Always respect website terms of service and robots.txt
    - Use reasonable delays to avoid overwhelming servers
    - Some sites may block automated scraping
    - Consider the website's bandwidth and server load
    - The merged PDF is uploaded to a public link for easy sharing
    - PDF viewer works best with modern browsers that support PDF display
    """)

if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)
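
Note: demo.launch() serves on port 7860, matching the EXPOSE 7860 and CMD ["python", "app.py"] in the DockerFile above.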
merge_md.py
ADDED
@@ -0,0 +1,263 @@
import os
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from datetime import datetime
import markdown2
from mistralai import Mistral
from pathlib import Path
from urllib.parse import urlparse
import convertapi
import requests
from dotenv import load_dotenv
import re

load_dotenv()

convertapi.api_credentials = os.getenv("CONVERTAPI_TOKEN")
if not convertapi.api_credentials:
    raise ValueError("CONVERTAPI_TOKEN environment variable is required")

SUPPORTED_FORMATS = ["pdf", "docx", "txt"]
MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 100 * 1024 * 1024))
# TEMP_DIR = os.getenv("TEMP_DIR", "temp")
# In merge_md.py, update temp directory handling
TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/scraper_temp")
# Ensure temp directory exists
os.makedirs(TEMP_DIR, exist_ok=True)

def upload_to_service(file_path: str) -> str:
    """
    Mock function to simulate uploading a file to a cloud service.
    Args:
        file_path (str): Path to the file to upload.
    Returns:
        str: Mock public URL or error message.
    """
    try:
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"
        return f"https://mock-cloud-service.com/{os.path.basename(file_path)}"
    except Exception as e:
        return f"Error uploading file: {str(e)}"

def convert_from_url(document_url: str, output_format: str) -> str:
    """
    Convert a document from a URL to a different format using ConvertAPI.
    Args:
        document_url (str): The URL of the input file.
        output_format (str): The format to convert the file to.
    Returns:
        str: The path to the converted file or an error message.
    """
    try:
        if not document_url or not document_url.lower().startswith(("http://", "https://")):
            return "Invalid or unsupported URL format."
        if output_format not in SUPPORTED_FORMATS:
            return f"Unsupported output format: {output_format}"

        result = convertapi.convert(output_format, {"File": document_url})
        input_filename = Path(urlparse(document_url).path).stem or "converted_file"
        output_filename = f"{input_filename}.{output_format}"
        output_path = Path(TEMP_DIR) / output_filename
        output_path.parent.mkdir(exist_ok=True)
        result.file.save(str(output_path))
        return str(output_path)
    except Exception as e:
        return f"Error converting file from URL: {str(e)}"

def merge_md_to_pdf(output_dir, site_name, site_description="", site_category="General"):
    """
    Merge all Markdown files in the output directory into a single PDF using reportlab after processing with Mistral AI.
    Args:
        output_dir (str): Directory containing Markdown files.
        site_name (str): Name of the site for the PDF title.
        site_description (str): Description of the site.
        site_category (str): Category of the site.
    Returns:
        dict: Result containing success status, output PDF path, and message.
    """
    try:
        api_key = os.getenv("MISTRAL_API_KEY")
        if not api_key:
            return {
                "success": False,
                "error": "MISTRAL_API_KEY environment variable not set",
                "output_pdf": None,
                "pages_merged": 0
            }

        client = Mistral(api_key=api_key)
        model = "mistral-large-latest"

        if not os.path.exists(output_dir):
            return {
                "success": False,
                "error": f"Output directory {output_dir} does not exist",
                "output_pdf": None,
                "pages_merged": 0
            }

        md_files = [
            f for f in os.listdir(output_dir)
            if f.endswith('.md') and f not in ['scraping_summary.md', 'scraping_log.txt']
        ]

        if not md_files:
            return {
                "success": False,
                "error": "No Markdown files found in the output directory",
                "output_pdf": None,
                "pages_merged": 0
            }

        pdf_output_path = os.path.join(output_dir, f"{site_name}_merged.pdf")
        doc = SimpleDocTemplate(
            pdf_output_path,
            pagesize=A4,
            rightMargin=inch,
            leftMargin=inch,
            topMargin=inch,
            bottomMargin=inch
        )
        styles = getSampleStyleSheet()

        title_style = ParagraphStyle(name='Title', fontSize=24, leading=28, alignment=1, spaceAfter=20)
        heading_style = ParagraphStyle(name='Heading2', fontSize=18, leading=22, spaceAfter=15)
        body_style = ParagraphStyle(name='Body', fontSize=12, leading=14, spaceAfter=10)

        story = [
            Paragraph(f"{site_name}", title_style),
            Spacer(1, 0.2 * inch),
            Paragraph(f"Description: {site_description}", body_style),
            Paragraph(f"Category: {site_category}", body_style),
            Paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", body_style),
            PageBreak(),
            Paragraph("Table of Contents", heading_style),
            Spacer(1, 0.2 * inch)
        ]

        toc_entries = []
        for idx, md_file in enumerate(sorted(md_files), 1):
            file_path = os.path.join(output_dir, md_file)
            with open(file_path, 'r', encoding='utf-8') as f:
                md_content = f.read()

            title = md_content.split('\n')[0].strip('#').strip() or f"Page {idx}"
            try:
                prompt = f"""
                You are an expert content editor. Below is the content of a Markdown file. Please enhance the content by making it more detailed, well-structured, and polished while preserving the original meaning. Ensure the output is in plain text suitable for inclusion in a PDF. Avoid adding Markdown or HTML formatting in the response.
                If there are HTML tags like <p><strong>Agents-MCP-Hackathon (Agents-MCP-Hackathon)</strong></p>, convert them to plain text like Agents-MCP-Hackathon (Agents-MCP-Hackathon).
                Original content:
                {md_content}

                Enhanced content:
                """
                response = client.chat.complete(
                    model=model,
                    messages=[{"role": "user", "content": prompt}]
                )
                enhanced_content = response.choices[0].message.content.strip()
            except Exception as e:
                print(f"Warning: Failed to process {md_file} with Mistral AI: {str(e)}. Using original content.")
                enhanced_content = md_content

            html_content = markdown2.markdown(enhanced_content, extras=['fenced-code-blocks', 'tables'])
            text_content = re.sub(r'<[^>]+>', '', html_content)
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            lines = text_content.split('\n')

            toc_entries.append(Paragraph(f"{idx}. {title}", body_style))

            story.append(Paragraph(title, heading_style))
            story.append(Spacer(1, 0.1 * inch))
            for line in lines:
                if line.strip():
                    story.append(Paragraph(line.strip(), body_style))
            story.append(PageBreak())

        story[6:6] = toc_entries + [PageBreak()]
        doc.build(story)

        return {
            "success": True,
            "output_pdf": pdf_output_path,
            "pages_merged": len(md_files),
            "message": f"Successfully merged {len(md_files)} Markdown files into {pdf_output_path} after processing with Mistral AI"
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to merge Markdown files into PDF: {str(e)}",
            "output_pdf": None,
            "pages_merged": 0
        }

def merge_md_to_pdf_and_convert_to_url(output_dir, site_name, site_description="", site_category="General", output_format="pdf"):
    """
    Merge Markdown files into a PDF, upload it to a service, and optionally convert to another format.
    Args:
        output_dir (str): Directory containing Markdown files.
        site_name (str): Name of the site for the PDF title.
        site_description (str): Description of the site.
        site_category (str): Category of the site.
        output_format (str): Optional format to convert the PDF to (e.g., 'docx', 'txt').
    Returns:
        dict: Result containing success status, output URL, and message.
    """
    try:
        merge_result = merge_md_to_pdf(output_dir, site_name, site_description, site_category)
        if not merge_result["success"]:
            return {
                "success": False,
                "error": merge_result["error"],
                "output_url": None,
                "converted_path": None
            }

        pdf_path = merge_result["output_pdf"]
        if not pdf_path or not os.path.exists(pdf_path):
            return {
                "success": False,
                "error": "Generated PDF not found",
                "output_url": None,
                "converted_path": None
            }

        pdf_url = upload_to_service(pdf_path)
        if not pdf_url.startswith("http"):
            return {
                "success": False,
                "error": f"Failed to obtain URL: {pdf_url}",
                "output_url": None,
                "converted_path": None
            }

        converted_path = pdf_path
        if output_format != "pdf":
            converted_path = convert_from_url(pdf_url, output_format)
            if not converted_path.startswith(TEMP_DIR):
                return {
                    "success": False,
                    "error": f"Conversion failed: {converted_path}",
                    "output_url": pdf_url,
                    "converted_path": None
                }

        return {
            "success": True,
            "output_url": pdf_url,
            "converted_path": converted_path,
            "message": f"Successfully merged {merge_result['pages_merged']} Markdown files into PDF and uploaded to {pdf_url}",
            "pages_merged": merge_result["pages_merged"]
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Error in merging or uploading: {str(e)}",
            "output_url": None,
            "converted_path": None
        }
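
For reference, a minimal usage sketch of the merge helper added above (not part of the commit; assumes CONVERTAPI_TOKEN and MISTRAL_API_KEY are set in the environment and that a scrape output directory already exists; the directory name is illustrative):

from merge_md import merge_md_to_pdf_and_convert_to_url

result = merge_md_to_pdf_and_convert_to_url(
    output_dir="Example_example_com_20240101_000000",  # illustrative scrape output directory
    site_name="Example",
    output_format="pdf",
)
if result["success"]:
    # output_url comes from the mock upload_to_service; converted_path is the local PDF
    print(result["output_url"], result["converted_path"])
else:
    print(result["error"])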
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio
selenium
beautifulsoup4
requests
reportlab
markdown2
mistralai
convertapi
python-dotenv
pathlib
urllib3
webdriver-manager