|
import requests |
|
from bs4 import BeautifulSoup |
|
from fpdf import FPDF |
|
from urllib.parse import urljoin |
|
import re |
|
import streamlit as st |
|
import pandas as pd |
|
|
|
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*: collapse internal runs to single spaces and trim the ends."""
    return re.sub(r"\s+", " ", text.strip())
|
|
|
def _extract_field(entry, field_class: str, fallback: str) -> str:
    """Return the cleaned text of the 'value' div inside the given wpbdp field div.

    Guards against both a missing field div AND a field div without a
    'value' child; the original chained ``.find(...).find('div', class_='value')``
    raised AttributeError in the second case.
    """
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else fallback


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant listings from every page of the directory at *base_url*.

    Follows the 'next' pagination link until none remains, returning a list of
    dicts with keys: 'Name', 'Address', 'Business Type', 'Location', 'Tags'.
    Missing fields fall back to placeholder strings ("No Name", etc.).

    Raises:
        requests.HTTPError: if any page responds with an error status.
        requests.RequestException: on network failure/timeout.
    """
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    current_page = base_url
    visited = set()  # guard: a 'next' link pointing at an already-seen page would loop forever

    while current_page and current_page not in visited:
        visited.add(current_page)
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for entry in soup.find_all('div', class_='wpbdp-listing'):
            link = entry.find('a')
            name = clean_text(link.get_text()) if link else "No Name"

            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = (clean_text(", ".join(a.get_text() for a in tags_field.find_all('a')))
                    if tags_field else "No Tags")

            data.append({
                'Name': name,
                'Address': _extract_field(entry, 'wpbdp-field-address', "No Address"),
                'Business Type': _extract_field(entry, 'wpbdp-field-business_type', "No Business Type"),
                'Location': _extract_field(entry, 'wpbdp-field-location', "No Location"),
                'Tags': tags,
            })

        # Pagination: resolve the relative 'next' link against the base URL.
        next_span = soup.find('span', class_='next')
        next_link = next_span.find('a')['href'] if next_span and next_span.find('a') else None
        current_page = urljoin(base_url, next_link) if next_link else None

    return data
|
|
|
def generate_pdf(data: list, output_file: str):
    """Generate a PDF listing of the scraped entries.

    Args:
        data: list of dicts with keys 'Name', 'Address', 'Business Type',
              'Location', 'Tags' (as produced by scrape_accountant_data).
        output_file: path of the PDF file to write.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Prefer a Unicode-capable TrueType font (needed for non-Latin text in
    # the directory). Fall back to the built-in Helvetica instead of crashing
    # when DejaVuSans.ttf is not shipped alongside the script.
    try:
        pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
        pdf.set_font('DejaVu', '', 12)
    except (RuntimeError, FileNotFoundError, OSError):
        pdf.set_font('Helvetica', '', 12)

    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    for entry in data:
        # multi_cell wraps long values (addresses, tag lists) onto the next
        # line; a fixed-height cell() would overflow the page width.
        pdf.multi_cell(0, 10, f"Name: {entry['Name']}")
        pdf.multi_cell(0, 10, f"Address: {entry['Address']}")
        pdf.multi_cell(0, 10, f"Business Type: {entry['Business Type']}")
        pdf.multi_cell(0, 10, f"Location: {entry['Location']}")
        pdf.multi_cell(0, 10, f"Tags: {entry['Tags']}")
        pdf.ln(10)

    pdf.output(output_file)
|
|
|
def generate_excel(data: list, output_file: str):
    """Write the scraped entries to *output_file* as an Excel workbook (one row per entry)."""
    frame = pd.DataFrame(data)
    frame.to_excel(output_file, index=False, engine='openpyxl')
|
|
|
def main():
    """Streamlit entry point: collect inputs, scrape, export, and offer downloads."""
    st.title("Accountant Directory Scraper")

    target_url = st.text_input("Enter the URL to scrape:", "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")

    pdf_name = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_name = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    pdf_path = f"{pdf_name}.pdf"
    excel_path = f"{excel_name}.xlsx"

    # Nothing to do until the user clicks the button.
    if not st.button("Scrape Data"):
        return

    with st.spinner("Scraping data, please wait..."):
        records = scrape_accountant_data(target_url)

    if not records:
        st.error("No data found to scrape.")
        return

    generate_pdf(records, pdf_path)
    generate_excel(records, excel_path)

    st.success(f"Scraping complete! PDF and Excel files generated with names: {pdf_name}.pdf and {excel_name}.xlsx.")

    with open(pdf_path, "rb") as pdf_fh:
        st.download_button(
            label="Download PDF",
            data=pdf_fh,
            file_name=pdf_path,
            mime="application/pdf"
        )

    with open(excel_path, "rb") as excel_fh:
        st.download_button(
            label="Download Excel",
            data=excel_fh,
            file_name=excel_path,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
|
|
|
# Run the Streamlit app only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|