Felguk committed on
Commit
4f25906
·
verified ·
1 Parent(s): a682e5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -26
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import requests
3
  from urllib.parse import urlparse, urljoin
4
  from bs4 import BeautifulSoup
 
5
 
6
  def is_valid_url(url):
7
  """Checks if the string is a valid URL."""
@@ -11,36 +12,40 @@ def is_valid_url(url):
11
  except:
12
  return False
13
 
14
- def extract_additional_resources(url):
 
 
 
 
 
 
 
 
 
15
  """Extracts links to CSS, JS, and images from HTML code."""
16
  try:
17
- response = requests.get(url)
18
  response.raise_for_status()
19
  soup = BeautifulSoup(response.text, "html.parser")
20
 
21
- # Extract CSS links
22
- css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs]
23
 
24
- # Extract JS links
25
- js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs]
26
 
27
- # Extract image links
28
- img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs]
29
 
30
- return css_links, js_links, img_links
31
- except Exception as e:
32
- return [], [], []
33
 
34
- def fetch_file_content(url):
35
- """Fetches the content of a file (CSS, JS, etc.) from a URL."""
36
- try:
37
- response = requests.get(url)
38
- response.raise_for_status()
39
- return response.text
40
- except:
41
- return "Failed to fetch content."
42
 
43
- def convert_to_text(url):
44
  if not is_valid_url(url):
45
  return "Error: Please enter a valid URL.", "", None, [], [], [], [], [] # Return error message and empty data
46
 
@@ -49,7 +54,7 @@ def convert_to_text(url):
49
  headers = {
50
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
51
  }
52
- response = requests.get(url, headers=headers)
53
  response.raise_for_status() # Check for HTTP errors (e.g., 404, 500)
54
 
55
  # Return results
@@ -63,11 +68,7 @@ def convert_to_text(url):
63
  file.write(response.text)
64
 
65
  # Extract additional resources
66
- css_links, js_links, img_links = extract_additional_resources(url)
67
-
68
- # Fetch CSS and JS content
69
- css_content = [fetch_file_content(link) for link in css_links]
70
- js_content = [fetch_file_content(link) for link in js_links]
71
 
72
  return results, response.text, file_path, css_links, js_links, img_links, css_content, js_content
73
  except requests.exceptions.RequestException as e:
 
2
  import requests
3
  from urllib.parse import urlparse, urljoin
4
  from bs4 import BeautifulSoup
5
+ import asyncio
6
 
7
  def is_valid_url(url):
8
  """Checks if the string is a valid URL."""
 
12
  except:
13
  return False
14
 
15
async def fetch_file_content(url):
    """Fetch the text content of a resource (CSS, JS, etc.) from *url*.

    The blocking ``requests.get`` call is run in a worker thread via
    ``asyncio.to_thread`` so the event loop is not blocked.

    Returns:
        The response body as text on success, or the sentinel string
        "Failed to fetch content." when the request fails.
    """
    try:
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` so task cancellation
        # (asyncio.CancelledError) and programming errors are no longer
        # silently swallowed; network/HTTP failures still degrade to
        # the sentinel string callers expect.
        return "Failed to fetch content."
23
+
24
async def extract_additional_resources(url):
    """Extract CSS, JS, and image links from the HTML page at *url*.

    Each link list is resolved against *url* with ``urljoin`` and capped
    at 5 entries. CSS and JS file contents are fetched concurrently.

    Returns:
        A 5-tuple ``(css_links, js_links, img_links, css_content,
        js_content)``. On any failure the function degrades gracefully
        to five empty lists (best-effort behavior preserved).
    """
    try:
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Resolve relative URLs against the page URL; cap each list at 5.
        css_links = [urljoin(url, link["href"])
                     for link in soup.find_all("link", rel="stylesheet")
                     if "href" in link.attrs][:5]
        js_links = [urljoin(url, script["src"])
                    for script in soup.find_all("script")
                    if "src" in script.attrs][:5]
        img_links = [urljoin(url, img["src"])
                     for img in soup.find_all("img")
                     if "src" in img.attrs][:5]

        # Fetch all CSS and JS files in ONE gather so they download
        # concurrently (previously the JS batch only started after the
        # entire CSS batch had finished).
        contents = await asyncio.gather(
            *[fetch_file_content(link) for link in css_links + js_links]
        )
        css_content = contents[:len(css_links)]
        js_content = contents[len(css_links):]

        return css_links, js_links, img_links, css_content, js_content
    except Exception:
        # Best-effort: any parsing or network error yields empty results
        # (the unused `e` binding was removed).
        return [], [], [], [], []
 
 
 
 
 
47
 
48
+ async def convert_to_text(url):
49
  if not is_valid_url(url):
50
  return "Error: Please enter a valid URL.", "", None, [], [], [], [], [] # Return error message and empty data
51
 
 
54
  headers = {
55
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
56
  }
57
+ response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=5)
58
  response.raise_for_status() # Check for HTTP errors (e.g., 404, 500)
59
 
60
  # Return results
 
68
  file.write(response.text)
69
 
70
  # Extract additional resources
71
+ css_links, js_links, img_links, css_content, js_content = await extract_additional_resources(url)
 
 
 
 
72
 
73
  return results, response.text, file_path, css_links, js_links, img_links, css_content, js_content
74
  except requests.exceptions.RequestException as e: