|
import requests |
|
from bs4 import BeautifulSoup |
|
from fpdf import FPDF |
|
from urllib.parse import urljoin |
|
import re |
|
import streamlit as st |
|
import pandas as pd |
|
|
|
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*: collapse internal runs to single spaces and trim the ends."""
    return re.sub(r"\s+", " ", text.strip())
|
|
|
def _extract_field(entry, field_class: str, fallback: str) -> str:
    """Return the cleaned text of the 'value' div inside the given wpbdp field div.

    Guards against both a missing field div AND a field div without a
    'value' child; the original chained ``.find(...).find('div', class_='value')``
    raised AttributeError in the second case.
    """
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else fallback


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant listings from every page of the directory at *base_url*.

    Follows the 'next' pagination link until none remains, returning a list of
    dicts with keys: 'Name', 'Address', 'Business Type', 'Location', 'Tags'.
    Missing fields fall back to placeholder strings ("No Name", etc.).

    Raises:
        requests.HTTPError: if any page responds with an error status.
        requests.RequestException: on network failure/timeout.
    """
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    current_page = base_url
    visited = set()  # guard: a 'next' link pointing at an already-seen page would loop forever

    while current_page and current_page not in visited:
        visited.add(current_page)
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for entry in soup.find_all('div', class_='wpbdp-listing'):
            link = entry.find('a')
            name = clean_text(link.get_text()) if link else "No Name"

            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = (clean_text(", ".join(a.get_text() for a in tags_field.find_all('a')))
                    if tags_field else "No Tags")

            data.append({
                'Name': name,
                'Address': _extract_field(entry, 'wpbdp-field-address', "No Address"),
                'Business Type': _extract_field(entry, 'wpbdp-field-business_type', "No Business Type"),
                'Location': _extract_field(entry, 'wpbdp-field-location', "No Location"),
                'Tags': tags,
            })

        # Pagination: resolve the relative 'next' link against the base URL.
        next_span = soup.find('span', class_='next')
        next_link = next_span.find('a')['href'] if next_span and next_span.find('a') else None
        current_page = urljoin(base_url, next_link) if next_link else None

    return data
|
|
|
def generate_pdf(data: list, output_file: str):
    """Generate a PDF listing of the scraped entries.

    Args:
        data: list of dicts with keys 'Name', 'Address', 'Business Type',
              'Location', 'Tags' (as produced by scrape_accountant_data).
        output_file: path of the PDF file to write.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Prefer a Unicode-capable TrueType font (needed for non-Latin text in
    # the directory). Fall back to the built-in Helvetica instead of crashing
    # when DejaVuSans.ttf is not shipped alongside the script.
    try:
        pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
        pdf.set_font('DejaVu', '', 12)
    except (RuntimeError, FileNotFoundError, OSError):
        pdf.set_font('Helvetica', '', 12)

    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    for entry in data:
        # multi_cell wraps long values (addresses, tag lists) onto the next
        # line; a fixed-height cell() would overflow the page width.
        pdf.multi_cell(0, 10, f"Name: {entry['Name']}")
        pdf.multi_cell(0, 10, f"Address: {entry['Address']}")
        pdf.multi_cell(0, 10, f"Business Type: {entry['Business Type']}")
        pdf.multi_cell(0, 10, f"Location: {entry['Location']}")
        pdf.multi_cell(0, 10, f"Tags: {entry['Tags']}")
        pdf.ln(10)

    pdf.output(output_file)
|
|
|
def generate_excel(data: list, output_file: str):
    """Write the scraped entries to *output_file* as an Excel workbook (one row per entry)."""
    frame = pd.DataFrame(data)
    frame.to_excel(output_file, index=False, engine='openpyxl')
|
|
|
def main():
    """Streamlit entry point: collect inputs, scrape, export, and offer downloads."""
    st.title("Accountant Directory Scraper")

    target_url = st.text_input("Enter the URL to scrape:", "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")

    pdf_name = st.text_input("Enter the name for the PDF file (without extension):", "accountant_directory")
    excel_name = st.text_input("Enter the name for the Excel file (without extension):", "accountant_directory")

    pdf_path = f"{pdf_name}.pdf"
    excel_path = f"{excel_name}.xlsx"

    # Nothing to do until the user clicks the button.
    if not st.button("Scrape Data"):
        return

    with st.spinner("Scraping data, please wait..."):
        records = scrape_accountant_data(target_url)

    if not records:
        st.error("No data found to scrape.")
        return

    generate_pdf(records, pdf_path)
    generate_excel(records, excel_path)

    st.success(f"Scraping complete! PDF and Excel files generated with names: {pdf_name}.pdf and {excel_name}.xlsx.")

    with open(pdf_path, "rb") as pdf_fh:
        st.download_button(
            label="Download PDF",
            data=pdf_fh,
            file_name=pdf_path,
            mime="application/pdf"
        )

    with open(excel_path, "rb") as excel_fh:
        st.download_button(
            label="Download Excel",
            data=excel_fh,
            file_name=excel_path,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
|
|
|
# Run the Streamlit app only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|