import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from urllib.parse import urljoin
import re
import streamlit as st
import pandas as pd

def clean_text(text: str) -> str:
    """Clean extracted text by removing extra whitespace and special characters."""
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace and newlines
    return text
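
# Example: clean_text("  Dhaka \n Accountants  ") -> "Dhaka Accountants"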

def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from all pages of the given URL."""
    data = []
    # A browser-like User-Agent makes the request look like ordinary traffic;
    # some directories block the default python-requests identifier.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def field_text(entry, field_class: str, default: str) -> str:
        """Safely read the text of a listing field's 'value' div.

        Guards against listings where the field div or its 'value' child is
        missing, which would otherwise raise an AttributeError.
        """
        field = entry.find('div', class_=field_class)
        value = field.find('div', class_='value') if field else None
        return clean_text(value.get_text()) if value else default

    current_page = base_url
    visited = set()  # guards against pagination links that loop back

    while current_page and current_page not in visited:
        visited.add(current_page)
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract accountant entries on the current page
        for entry in soup.find_all('div', class_='wpbdp-listing'):
            name_link = entry.find('a')
            name = clean_text(name_link.get_text()) if name_link else "No Name"
            address = field_text(entry, 'wpbdp-field-address', "No Address")
            business_type = field_text(entry, 'wpbdp-field-business_type', "No Business Type")
            location = field_text(entry, 'wpbdp-field-location', "No Location")

            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a'))) if tags_field else "No Tags"

            data.append({
                'Name': name,
                'Address': address,
                'Business Type': business_type,
                'Location': location,
                'Tags': tags
            })

        # Follow the next-page link inside <span class="next">, if present
        next_page_span = soup.find('span', class_='next')
        next_page_link = next_page_span.find('a')['href'] if next_page_span and next_page_span.find('a') else None
        current_page = urljoin(base_url, next_page_link) if next_page_link else None

    return data
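
# Each scraped row is a flat dict; illustrative shape (values are made up):
#   {'Name': 'Example & Co.', 'Address': '12 Example Rd, Dhaka',
#    'Business Type': 'Accounting Firm', 'Location': 'Dhaka', 'Tags': 'audit, tax'}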

def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Register a Unicode-capable TrueType font so non-Latin characters render
    # correctly. DejaVuSans.ttf must be available in the working directory.
    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
    pdf.set_font('DejaVu', '', 12)

    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    for entry in data:
        name = entry['Name']
        address = entry['Address']
        business_type = entry['Business Type']
        location = entry['Location']
        tags = entry['Tags']

        pdf.cell(0, 10, f"Name: {name}", ln=True)
        pdf.cell(0, 10, f"Address: {address}", ln=True)
        pdf.cell(0, 10, f"Business Type: {business_type}", ln=True)
        pdf.cell(0, 10, f"Location: {location}", ln=True)
        pdf.cell(0, 10, f"Tags: {tags}", ln=True)
        pdf.ln(10)

    pdf.output(output_file)

def generate_excel(data: list, output_file: str):
    """Generate an Excel file from the scraped data."""
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False, engine='openpyxl')
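
def generate_csv(data: list, output_file: str):
    """Optional helper (not wired into the UI): write the scraped data as CSV.

    A minimal sketch using pandas; useful when openpyxl is not installed.
    """
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)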

def main():
    st.title("Accountant Directory Scraper")

    base_url = st.text_input("Enter the URL to scrape:", "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")

    # Text input fields for custom file names
    pdf_filename = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_filename = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    # Add extension to the filenames
    pdf_output_file = f"{pdf_filename}.pdf"
    excel_output_file = f"{excel_filename}.xlsx"

    if st.button("Scrape Data"):
        try:
            with st.spinner("Scraping data, please wait..."):
                data = scrape_accountant_data(base_url)
        except requests.RequestException as exc:
            # Surface network/HTTP errors in the UI instead of a raw traceback
            st.error(f"Failed to fetch data: {exc}")
            return

        if data:
            # Generate PDF and Excel files with custom names
            generate_pdf(data, pdf_output_file)
            generate_excel(data, excel_output_file)

            st.success(f"Scraping complete! PDF and Excel files generated with names: {pdf_filename}.pdf and {excel_filename}.xlsx.")
            
            # Provide download links for both files
            with open(pdf_output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=pdf_output_file,
                    mime="application/pdf"
                )

            with open(excel_output_file, "rb") as excel_file:
                st.download_button(
                    label="Download Excel",
                    data=excel_file,
                    file_name=excel_output_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
        else:
            st.error("No data found to scrape.")

if __name__ == "__main__":
    main()