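"""Accountant Directory Scraper.

Scrapes accountant listings from a WordPress Business Directory (WPBDP)
category page, following the "next" pagination links, and exports the
results to PDF and Excel through a small Streamlit UI.

Run with:  streamlit run <this_file>.py
Requires:  requests, beautifulsoup4, fpdf (or fpdf2), streamlit, pandas,
openpyxl, and a DejaVuSans.ttf font file alongside the script.
"""
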
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF


def clean_text(text: str) -> str:
    """Collapse runs of whitespace (including newlines) into single spaces."""
    text = re.sub(r'\s+', ' ', text).strip()
    return text
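# For instance, clean_text("  Jane \n  Doe,  CPA ") returns "Jane Doe, CPA".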


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant listings from every page reachable from the given URL."""

    def field_text(entry, field_class: str, default: str) -> str:
        """Return the cleaned text of a WPBDP field's inner value div, or a default."""
        field = entry.find('div', class_=field_class)
        value = field.find('div', class_='value') if field else None
        return clean_text(value.get_text()) if value else default

    data = []
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
    }
    current_page = base_url
    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each listing is rendered by the WPBDP plugin as <div class="wpbdp-listing">.
        for entry in soup.find_all('div', class_='wpbdp-listing'):
            link = entry.find('a')
            tags_field = entry.find('div', class_='wpbdp-field-tags')
            data.append({
                'Name': clean_text(link.get_text()) if link else "No Name",
                'Address': field_text(entry, 'wpbdp-field-address', "No Address"),
                'Business Type': field_text(entry, 'wpbdp-field-business_type', "No Business Type"),
                'Location': field_text(entry, 'wpbdp-field-location', "No Location"),
                'Tags': (clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a')))
                         if tags_field else "No Tags"),
            })

        # Pagination: follow the link inside <span class="next">, if any.
        next_page_span = soup.find('span', class_='next')
        next_page_link = next_page_span.find('a')['href'] if next_page_span and next_page_span.find('a') else None
        # Resolve relative hrefs against the page we just fetched, not the start URL.
        current_page = urljoin(current_page, next_page_link) if next_page_link else None

    return data
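# Standalone usage sketch (hypothetical URL; the Streamlit UI passes one in):
#   rows = scrape_accountant_data("https://example.com/wpbdp_category/accountant/")
#   print(f"Scraped {len(rows)} listings")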


def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Register a Unicode-capable TrueType font; DejaVuSans.ttf must sit next to
    # this script (fpdf's built-in fonts only cover Latin-1).
    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
    pdf.set_font('DejaVu', '', 12)
    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)
    for entry in data:
        pdf.cell(0, 10, f"Name: {entry['Name']}", ln=True)
        pdf.cell(0, 10, f"Address: {entry['Address']}", ln=True)
        pdf.cell(0, 10, f"Business Type: {entry['Business Type']}", ln=True)
        pdf.cell(0, 10, f"Location: {entry['Location']}", ln=True)
        pdf.cell(0, 10, f"Tags: {entry['Tags']}", ln=True)
        pdf.ln(10)
    pdf.output(output_file)
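# Note: fpdf's cell() does not wrap text, so very long addresses can run past
# the right margin; multi_cell() is the usual wrapping alternative.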


def generate_excel(data: list, output_file: str):
    """Generate an Excel workbook from the scraped data."""
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False, engine='openpyxl')
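# pandas delegates the .xlsx writing to openpyxl here, so it must be installed.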


def main():
    st.title("Accountant Directory Scraper")
    base_url = st.text_input(
        "Enter the URL to scrape:",
        "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/"
    )

    # Text inputs for custom output file names
    pdf_filename = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_filename = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    # Append the extensions
    pdf_output_file = f"{pdf_filename}.pdf"
    excel_output_file = f"{excel_filename}.xlsx"

    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                st.error(f"Failed to fetch the page: {exc}")
                return
        if data:
            # Generate PDF and Excel files with the chosen names
            generate_pdf(data, pdf_output_file)
            generate_excel(data, excel_output_file)
            st.success(f"Scraping complete! Generated {pdf_output_file} and {excel_output_file}.")

            # Offer both files for download
            with open(pdf_output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=pdf_output_file,
                    mime="application/pdf"
                )
            with open(excel_output_file, "rb") as excel_file:
                st.download_button(
                    label="Download Excel",
                    data=excel_file,
                    file_name=excel_output_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
        else:
            st.error("No data found to scrape.")


if __name__ == "__main__":
    main()