import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from urllib.parse import urljoin
import re
import streamlit as st
import pandas as pd
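# Pipeline: fetch each directory page with requests, parse the listings with
# BeautifulSoup, then export the collected records to PDF (fpdf) and Excel
# (pandas + openpyxl) for download from the Streamlit UI.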
def clean_text(text: str) -> str:
    """Collapse runs of whitespace (including newlines) into single spaces and trim."""
    return re.sub(r'\s+', ' ', text).strip()
def get_field_text(entry, field_class: str, default: str) -> str:
    """Return the cleaned text of a listing field's value div, or a default if the field or value is absent."""
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else default

def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from every page reachable via the 'next' pagination link."""
    data = []
    # A browser-like User-Agent helps avoid naive bot blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    current_page = base_url
    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Each directory listing is rendered as a div.wpbdp-listing block.
        entries = soup.find_all('div', class_='wpbdp-listing')
        for entry in entries:
            link = entry.find('a')
            name = clean_text(link.get_text()) if link else "No Name"
            address = get_field_text(entry, 'wpbdp-field-address', "No Address")
            business_type = get_field_text(entry, 'wpbdp-field-business_type', "No Business Type")
            location = get_field_text(entry, 'wpbdp-field-location', "No Location")
            tags_div = entry.find('div', class_='wpbdp-field-tags')
            tags = clean_text(", ".join(a.get_text() for a in tags_div.find_all('a'))) if tags_div else "No Tags"
data.append({
'Name': name,
'Address': address,
'Business Type': business_type,
'Location': location,
'Tags': tags
})
# Find the next page link within <span class="next">
next_page_span = soup.find('span', class_='next')
        next_page_link = next_page_span.find('a')['href'] if next_page_span and next_page_span.find('a') else None
        # Resolve relative links against the page they appeared on, not the starting URL.
        current_page = urljoin(current_page, next_page_link) if next_page_link else None
return data
def generate_pdf(data: list, output_file: str):
"""Generate a PDF from the scraped data."""
pdf = FPDF()
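    # Break to a fresh page automatically when content nears the bottom margin (15 mm).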
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
    # Register a Unicode-capable TrueType font so non-Latin characters render correctly.
    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)  # Requires DejaVuSans.ttf alongside this script
pdf.set_font('DejaVu', '', 12)
pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
pdf.ln(10)
for entry in data:
name = entry['Name']
address = entry['Address']
business_type = entry['Business Type']
location = entry['Location']
tags = entry['Tags']
pdf.cell(0, 10, f"Name: {name}", ln=True)
pdf.cell(0, 10, f"Address: {address}", ln=True)
pdf.cell(0, 10, f"Business Type: {business_type}", ln=True)
pdf.cell(0, 10, f"Location: {location}", ln=True)
pdf.cell(0, 10, f"Tags: {tags}", ln=True)
pdf.ln(10)
pdf.output(output_file)
def generate_excel(data: list, output_file: str):
"""Generate an Excel file from the scraped data."""
df = pd.DataFrame(data)
df.to_excel(output_file, index=False, engine='openpyxl')
def main():
st.title("Accountant Directory Scraper")
base_url = st.text_input("Enter the URL to scrape:", "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")
# Text input fields for custom file names
pdf_filename = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
excel_filename = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")
# Add extension to the filenames
pdf_output_file = f"{pdf_filename}.pdf"
excel_output_file = f"{excel_filename}.xlsx"
if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                st.error(f"Failed to fetch the directory: {exc}")
                return
if data:
# Generate PDF and Excel files with custom names
generate_pdf(data, pdf_output_file)
generate_excel(data, excel_output_file)
st.success(f"Scraping complete! PDF and Excel files generated with names: {pdf_filename}.pdf and {excel_filename}.xlsx.")
# Provide download links for both files
with open(pdf_output_file, "rb") as pdf_file:
st.download_button(
label="Download PDF",
data=pdf_file,
file_name=pdf_output_file,
mime="application/pdf"
)
with open(excel_output_file, "rb") as excel_file:
st.download_button(
label="Download Excel",
data=excel_file,
file_name=excel_output_file,
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
else:
st.error("No data found to scrape.")
if __name__ == "__main__":
main()