Update app.py

app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
+import asyncio
 
 def is_valid_url(url):
     """Checks if the string is a valid URL."""
@@ -11,36 +12,40 @@ def is_valid_url(url):
     except:
         return False
 
-def extract_additional_resources(url):
+async def fetch_file_content(url):
+    """Fetches the content of a file (CSS, JS, etc.) from a URL."""
+    try:
+        response = await asyncio.to_thread(requests.get, url, timeout=5)
+        response.raise_for_status()
+        return response.text
+    except:
+        return "Failed to fetch content."
+
+async def extract_additional_resources(url):
     """Extracts links to CSS, JS, and images from HTML code."""
     try:
-        response = requests.get(url)
+        response = await asyncio.to_thread(requests.get, url, timeout=5)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")
 
-        # Extract CSS links
-        css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs]
+        # Extract CSS links (limit to 5)
+        css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs][:5]
 
-        # Extract JS links
-        js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs]
+        # Extract JS links (limit to 5)
+        js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs][:5]
 
-        # Extract image links
-        img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs]
+        # Extract image links (limit to 5)
+        img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs][:5]
 
-        return css_links, js_links, img_links
-    except:
-        return [], [], []
+        # Fetch CSS and JS content asynchronously
+        css_content = await asyncio.gather(*[fetch_file_content(link) for link in css_links])
+        js_content = await asyncio.gather(*[fetch_file_content(link) for link in js_links])
 
-def fetch_file_content(url):
-    """Fetches the content of a file (CSS, JS, etc.) from a URL."""
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        return response.text
-    except:
-        return "Failed to fetch content."
+        return css_links, js_links, img_links, css_content, js_content
+    except Exception as e:
+        return [], [], [], [], []
 
-def convert_to_text(url):
+async def convert_to_text(url):
     if not is_valid_url(url):
         return "Error: Please enter a valid URL.", "", None, [], [], [], [], []  # Return error message and empty data
 
@@ -49,7 +54,7 @@ def convert_to_text(url):
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         }
-        response = requests.get(url, headers=headers)
+        response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=5)
         response.raise_for_status()  # Check for HTTP errors (e.g., 404, 500)
 
         # Return results
@@ -63,11 +68,7 @@ def convert_to_text(url):
            file.write(response.text)
 
        # Extract additional resources
-       css_links, js_links, img_links = extract_additional_resources(url)
-
-       # Fetch CSS and JS content
-       css_content = [fetch_file_content(link) for link in css_links]
-       js_content = [fetch_file_content(link) for link in js_links]
+       css_links, js_links, img_links, css_content, js_content = await extract_additional_resources(url)
 
        return results, response.text, file_path, css_links, js_links, img_links, css_content, js_content
    except requests.exceptions.RequestException as e:
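
A note on the pattern this commit applies: requests is a blocking HTTP client, so each call is handed off to a worker thread via asyncio.to_thread (available since Python 3.9), and the per-file downloads are started together with asyncio.gather instead of one at a time. A minimal, self-contained sketch of that pattern follows; the URLs and the fetch helper name are placeholders, not taken from the app.

import asyncio
import requests

async def fetch(url):
    try:
        # to_thread runs the blocking call off the event loop
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException:
        return "Failed to fetch content."

async def main():
    urls = ["https://example.com/a.css", "https://example.com/b.js"]
    # gather schedules every fetch before awaiting any of them,
    # so the 5-second timeouts overlap instead of adding up
    contents = await asyncio.gather(*[fetch(u) for u in urls])
    for url, text in zip(urls, contents):
        print(url, len(text))

asyncio.run(main())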
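
Because convert_to_text is now a coroutine, its caller has to await it. Recent Gradio versions accept async functions as event handlers directly, so the existing Interface/Blocks wiring should keep working unchanged; outside Gradio, the coroutine has to be driven by an event loop. A sketch with a stand-in coroutine rather than the real one:

import asyncio

async def convert_to_text(url):
    # stand-in body; the real function fetches and saves the page
    return f"fetched {url}"

# one-off call from synchronous code
print(asyncio.run(convert_to_text("https://example.com")))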