import re
from urllib.parse import urljoin

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF


def clean_text(text: str) -> str:
    """Clean extracted text by collapsing whitespace and stripping the ends."""
    return re.sub(r'\s+', ' ', text).strip()


def extract_field(entry, field_class: str, default: str) -> str:
    """Safely pull the text of a WPBDP field's 'value' div, falling back to a default.

    Guards against the case where the field div exists but has no 'value' child,
    which would otherwise raise an AttributeError.
    """
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else default


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from all pages of the given URL."""
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.124 Safari/537.36'
    }
    current_page = base_url
    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract accountant entries (WP Business Directory Plugin listings)
        entries = soup.find_all('div', class_='wpbdp-listing')
        for entry in entries:
            link = entry.find('a')
            name = clean_text(link.get_text()) if link else "No Name"
            address = extract_field(entry, 'wpbdp-field-address', "No Address")
            business_type = extract_field(entry, 'wpbdp-field-business_type', "No Business Type")
            location = extract_field(entry, 'wpbdp-field-location', "No Location")
            tags_div = entry.find('div', class_='wpbdp-field-tags')
            tags = (clean_text(", ".join(tag.get_text() for tag in tags_div.find_all('a')))
                    if tags_div else "No Tags")
            data.append({
                'Name': name,
                'Address': address,
                'Business Type': business_type,
                'Location': location,
                'Tags': tags
            })

        # Find the next page link within the pagination span; stop when there is none
        next_page_span = soup.find('span', class_='next')
        next_page_link = next_page_span.find('a')['href'] if next_page_span and next_page_span.find('a') else None
        current_page = urljoin(base_url, next_page_link) if next_page_link else None
    return data


def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Use a Unicode-compatible TrueType font; DejaVuSans.ttf must be
    # present in the working directory.
    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
    pdf.set_font('DejaVu', '', 12)

    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    for entry in data:
        pdf.cell(0, 10, f"Name: {entry['Name']}", ln=True)
        pdf.cell(0, 10, f"Address: {entry['Address']}", ln=True)
        pdf.cell(0, 10, f"Business Type: {entry['Business Type']}", ln=True)
        pdf.cell(0, 10, f"Location: {entry['Location']}", ln=True)
        pdf.cell(0, 10, f"Tags: {entry['Tags']}", ln=True)
        pdf.ln(10)

    pdf.output(output_file)


def generate_excel(data: list, output_file: str):
    """Generate an Excel file from the scraped data."""
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False, engine='openpyxl')


def main():
    st.title("Accountant Directory Scraper")
    base_url = st.text_input(
        "Enter the URL to scrape:",
        "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/"
    )

    # Text input fields for custom file names
    pdf_filename = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_filename = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    # Add extensions to the filenames
    pdf_output_file = f"{pdf_filename}.pdf"
    excel_output_file = f"{excel_filename}.xlsx"

    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                # Surface network/HTTP failures in the UI instead of crashing
                st.error(f"Failed to fetch the page: {exc}")
                return

        if data:
            # Generate PDF and Excel files with the custom names
            generate_pdf(data, pdf_output_file)
            generate_excel(data, excel_output_file)
            st.success(f"Scraping complete! Generated {pdf_output_file} and {excel_output_file}.")

            # Provide download links for both files
            with open(pdf_output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=pdf_output_file,
                    mime="application/pdf"
                )
            with open(excel_output_file, "rb") as excel_file:
                st.download_button(
                    label="Download Excel",
                    data=excel_file,
                    file_name=excel_output_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
        else:
            st.error("No data found to scrape.")


if __name__ == "__main__":
    main()