Update app.py
app.py CHANGED
@@ -7,87 +7,111 @@ import tempfile
import re
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
+import aiohttp
+from aiolimiter import AsyncLimiter
+import sqlite3

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+# Initialize SQLite database
+conn = sqlite3.connect('crawl_cache.db')
+c = conn.cursor()
+c.execute('''CREATE TABLE IF NOT EXISTS pages
+             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+conn.commit()
+
+# Rate limiter: 10 requests per second
+rate_limiter = AsyncLimiter(10, 1)
+
def clean_text(text):
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

-def get_page_content(url):
+async def get_page_content(session, url):
    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    content = []
+                    main_content = soup.find('article') or soup.find('main') or soup
+                    if main_content:
+                        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                            for element in main_content.find_all(tag):
+                                text = clean_text(element.get_text(strip=True))
+                                if text:
+                                    content.append(text)
+                    logger.info(f"Found {len(content)} content items for {url}")
+                    return content
+                else:
+                    logger.error(f"Error fetching {url}: HTTP {response.status}")
+                    return [f"Error fetching {url}: HTTP {response.status}"]
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]

-def get_links(url, base_url):
+async def get_links(session, url, base_url):
    try:
-
-
-
-
-
-
-
-
-
-
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    links = soup.find_all('a', href=True)
+                    valid_links = []
+                    for link in links:
+                        full_url = urljoin(url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+                    return valid_links
+                else:
+                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
+                    return []
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []

-def crawl_pages(base_url, max_depth):
+async def crawl_pages(base_url, max_depth):
    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

-
-        content = get_page_content(url)
-        logger.info(f"Processed page: {url} at depth {depth}")
-        return url, content, depth
-
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = []
+    async with aiohttp.ClientSession() as session:
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
-
+
+            # Check if page is already in the database
+            c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+            result = c.fetchone()
+            if result:
+                content = eval(result[0]) # Convert string back to list
+            else:
+                content = await get_page_content(session, current_url)
+                # Store in database
+                c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                conn.commit()
+
+            all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth}")

            if depth < max_depth:
-                links = get_links(current_url, base_url)
+                links = await get_links(session, current_url, base_url)
                for link in links:
                    if link not in visited:
                        to_visit.append((link, depth + 1))

-        for future in as_completed(futures):
-            url, content, depth = future.result()
-            all_pages.append((url, content))
-
    return all_pages

-def website_to_pdf(
-    logger.info(f"Starting
-    all_pages = crawl_pages(url, max_depth)
-    logger.info(f"Found {len(all_pages)} pages to process")
+def website_to_pdf(all_pages):
+    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
@@ -111,24 +135,23 @@ def website_to_pdf(url, max_depth):

    return pdf_path

-def process_url(url, depth):
+async def process_url(url, depth):
    try:
-
+        all_pages = await crawl_pages(url, depth)
+        pdf_file = website_to_pdf(all_pages)
        return pdf_file
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        return f"An error occurred: {str(e)}"

-def
-
-        future = executor.submit(process_url, url, depth)
-        return future.result()
+def run_async(url, depth):
+    return asyncio.run(process_url(url, depth))

iface = gr.Interface(
-    fn=
+    fn=run_async,
    inputs=[
        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
-        gr.Slider(minimum=1, maximum=
+        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Crawl Depth")
    ],
    outputs=gr.File(label="Download PDF"),
    title="Website to PDF Converter",
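
Note on the rate limiter: both fetch helpers above enter the shared AsyncLimiter before each session.get(), so at most 10 requests start per second across all in-flight tasks. A minimal sketch of that pattern in isolation; fetch() below is a hypothetical stand-in for the real aiohttp call:

import asyncio
from aiolimiter import AsyncLimiter

limiter = AsyncLimiter(10, 1)  # 10 permits per 1-second window

async def fetch(i):
    async with limiter:  # waits here once the current window is exhausted
        await asyncio.sleep(0)  # stand-in for an HTTP request
        return i

async def main():
    results = await asyncio.gather(*(fetch(i) for i in range(25)))
    print(f"{len(results)} tasks completed under the rate limit")

asyncio.run(main())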
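
Note on content extraction: get_page_content() prefers an article or main element, falls back to the whole document, and then collects the text of paragraph, heading, and list-item tags. A compact illustration of that logic on an invented HTML snippet:

from bs4 import BeautifulSoup

html = "<html><body><nav>menu</nav><main><h1>Title</h1><p>First paragraph.</p><li>Item</li></main></body></html>"
soup = BeautifulSoup(html, 'html.parser')
main_content = soup.find('article') or soup.find('main') or soup

content = []
for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
    for element in main_content.find_all(tag):
        text = element.get_text(strip=True)
        if text:
            content.append(text)

print(content)  # only text inside <main>; the <nav> text is skipped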
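
Note on link filtering: get_links() resolves each href against the page it was found on and keeps only URLs that stay under the crawl's base URL. A small demonstration of that rule; the hrefs below are made up for the example:

from urllib.parse import urljoin

base_url = "https://www.gradio.app/docs"
page_url = "https://www.gradio.app/docs/interface"

for href in ["/docs/blocks", "../guides/quickstart", "https://github.com/gradio-app/gradio"]:
    full_url = urljoin(page_url, href)
    in_scope = full_url.startswith(base_url) and full_url != page_url
    print(full_url, "->", "keep" if in_scope else "skip")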
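
Note on the crawl cache: crawl_pages() stores each page's extracted text as str(content) and rebuilds the list with eval() on a cache hit, which executes whatever string is in the database. If the cache format is ever revisited, a JSON round-trip does the same job more safely; the sketch below is a suggested alternative, not part of this commit, and reuses the table created above:

import json
import sqlite3

conn = sqlite3.connect('crawl_cache.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS pages
             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')

def cache_page(url, content, depth):
    # Serialize the list of strings as JSON instead of str()/eval()
    c.execute("INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
              (url, json.dumps(content), depth))
    conn.commit()

def cached_content(url):
    c.execute("SELECT content FROM pages WHERE url = ?", (url,))
    row = c.fetchone()
    return json.loads(row[0]) if row else None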
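
Note on the Gradio entry point: with fn=run_async the interface calls a plain synchronous function, and run_async drives the coroutine to completion with asyncio.run(), which creates and tears down a fresh event loop per request. A stripped-down sketch of the same bridge; crawl_and_convert is an illustrative stand-in for crawl_pages() plus PDF generation:

import asyncio

async def crawl_and_convert(url, depth):
    await asyncio.sleep(0)  # placeholder for crawling and PDF generation
    return f"{url} crawled to depth {depth}"

def run_sync(url, depth):
    return asyncio.run(crawl_and_convert(url, depth))

print(run_sync("https://example.com", 2))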