KSh100 committed on
Commit
0deeeca
·
verified ·
1 Parent(s): b8a4185

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -10
app.py CHANGED
@@ -50,20 +50,21 @@ def extract_images(soup, base_url):
50
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
51
  return images
52
 
53
- def format_detailed_output(structured_data):
54
  """Formats the structured data into a Markdown string."""
55
- result = "### Structured Page Content\n\n"
56
- result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
57
- result += "**Links:**\n"
 
58
  if structured_data["Links"]:
59
- result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
60
  else:
61
- result += "No links found.\n"
62
- result += "**Images:**\n"
63
  if structured_data["Images"]:
64
- result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
65
  else:
66
- result += "No images found.\n"
67
  return result
68
 
69
  # Web Page Processing Function
@@ -75,12 +76,16 @@ def download_and_process_web_page(url):
75
  try:
76
  response = get(url)
77
  soup = response.soup()
 
 
 
 
78
  structured_data = {
79
  "Texts": extract_texts(soup),
80
  "Links": extract_links(soup, url),
81
  "Images": extract_images(soup, url)
82
  }
83
- return format_detailed_output(structured_data)
84
 
85
  except urllib3.exceptions.HTTPError as e:
86
  return f"Error: {e}"
 
50
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
51
  return images
52
 
53
+ def format_detailed_output(structured_data, page_title):
54
  """Formats the structured data into a Markdown string."""
55
+ result = f"## Page Title: {page_title}\n\n"
56
+ result += "### Texts\n\n"
57
+ result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
58
+ result += "\n\n### Links\n\n"
59
  if structured_data["Links"]:
60
+ result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
61
  else:
62
+ result += "No links found."
63
+ result += "\n\n### Images\n\n"
64
  if structured_data["Images"]:
65
+ result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
66
  else:
67
+ result += "No images found."
68
  return result
69
 
70
  # Web Page Processing Function
 
76
  try:
77
  response = get(url)
78
  soup = response.soup()
79
+
80
+ # Extract page title
81
+ page_title = soup.title.string if soup.title else "No Title Found"
82
+
83
  structured_data = {
84
  "Texts": extract_texts(soup),
85
  "Links": extract_links(soup, url),
86
  "Images": extract_images(soup, url)
87
  }
88
+ return format_detailed_output(structured_data, page_title)
89
 
90
  except urllib3.exceptions.HTTPError as e:
91
  return f"Error: {e}"