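"""Accountant Directory Scraper.

Scrapes accountant listings from a WordPress Business Directory (WPBDP)
category page, following the "next" pagination links, and exports the
results to PDF and Excel through a small Streamlit UI.

Run with:  streamlit run <this_file>.py
Requires:  requests, beautifulsoup4, fpdf (or fpdf2), streamlit, pandas,
openpyxl, and a DejaVuSans.ttf font file alongside the script.
"""
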
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF


def clean_text(text: str) -> str:
    """Collapse runs of whitespace (including newlines) into single spaces."""
    text = re.sub(r'\s+', ' ', text).strip()
    return text
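# For instance, clean_text("  Jane \n  Doe,  CPA ") returns "Jane Doe, CPA".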


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant listings from every page reachable from the given URL."""

    def field_text(entry, field_class: str, default: str) -> str:
        """Return the cleaned text of a WPBDP field's inner value div, or a default."""
        field = entry.find('div', class_=field_class)
        value = field.find('div', class_='value') if field else None
        return clean_text(value.get_text()) if value else default

    data = []
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
    }
    current_page = base_url
    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each listing is rendered by the WPBDP plugin as <div class="wpbdp-listing">.
        for entry in soup.find_all('div', class_='wpbdp-listing'):
            link = entry.find('a')
            tags_field = entry.find('div', class_='wpbdp-field-tags')
            data.append({
                'Name': clean_text(link.get_text()) if link else "No Name",
                'Address': field_text(entry, 'wpbdp-field-address', "No Address"),
                'Business Type': field_text(entry, 'wpbdp-field-business_type', "No Business Type"),
                'Location': field_text(entry, 'wpbdp-field-location', "No Location"),
                'Tags': (clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a')))
                         if tags_field else "No Tags"),
            })

        # Pagination: follow the link inside <span class="next">, if any.
        next_page_span = soup.find('span', class_='next')
        next_page_link = next_page_span.find('a')['href'] if next_page_span and next_page_span.find('a') else None
        # Resolve relative hrefs against the page we just fetched, not the start URL.
        current_page = urljoin(current_page, next_page_link) if next_page_link else None

    return data
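# Standalone usage sketch (hypothetical URL; the Streamlit UI passes one in):
#   rows = scrape_accountant_data("https://example.com/wpbdp_category/accountant/")
#   print(f"Scraped {len(rows)} listings")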


def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Register a Unicode-capable TrueType font; DejaVuSans.ttf must sit next to
    # this script (fpdf's built-in fonts only cover Latin-1).
    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
    pdf.set_font('DejaVu', '', 12)
    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)
    for entry in data:
        pdf.cell(0, 10, f"Name: {entry['Name']}", ln=True)
        pdf.cell(0, 10, f"Address: {entry['Address']}", ln=True)
        pdf.cell(0, 10, f"Business Type: {entry['Business Type']}", ln=True)
        pdf.cell(0, 10, f"Location: {entry['Location']}", ln=True)
        pdf.cell(0, 10, f"Tags: {entry['Tags']}", ln=True)
        pdf.ln(10)
    pdf.output(output_file)
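# Note: fpdf's cell() does not wrap text, so very long addresses can run past
# the right margin; multi_cell() is the usual wrapping alternative.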


def generate_excel(data: list, output_file: str):
    """Generate an Excel workbook from the scraped data."""
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False, engine='openpyxl')
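# pandas delegates the .xlsx writing to openpyxl here, so it must be installed.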


def main():
    st.title("Accountant Directory Scraper")
    base_url = st.text_input(
        "Enter the URL to scrape:",
        "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/"
    )

    # Text inputs for custom output file names
    pdf_filename = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_filename = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    # Append the extensions
    pdf_output_file = f"{pdf_filename}.pdf"
    excel_output_file = f"{excel_filename}.xlsx"

    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                st.error(f"Failed to fetch the page: {exc}")
                return
        if data:
            # Generate PDF and Excel files with the chosen names
            generate_pdf(data, pdf_output_file)
            generate_excel(data, excel_output_file)
            st.success(f"Scraping complete! Generated {pdf_output_file} and {excel_output_file}.")

            # Offer both files for download
            with open(pdf_output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=pdf_output_file,
                    mime="application/pdf"
                )
            with open(excel_output_file, "rb") as excel_file:
                st.download_button(
                    label="Download Excel",
                    data=excel_file,
                    file_name=excel_output_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
        else:
            st.error("No data found to scrape.")


if __name__ == "__main__":
    main()