import json
from typing import List, Set, Tuple, Type

from pydantic import BaseModel

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    LLMExtractionStrategy,
)

from utils import is_duplicated
from config import LLM_MODEL, API_TOKEN


def get_browser_config() -> BrowserConfig:
    """
    Returns the browser configuration for the crawler.

    Returns:
        BrowserConfig: The configuration settings for the browser.
    """
    # https://docs.crawl4ai.com/core/browser-crawler-config/
    return BrowserConfig(
        browser_type="chromium",  # Type of browser to simulate
        headless=True,  # Whether to run in headless mode (no GUI)
        verbose=True,  # Enable verbose logging
    )


def get_llm_strategy(
    llm_instructions: str, output_format: Type[BaseModel]
) -> LLMExtractionStrategy:
    """
    Returns the configuration for the language model extraction strategy.

    Args:
        llm_instructions (str): Instructions passed to the LLM.
        output_format (Type[BaseModel]): Pydantic model describing the expected output.

    Returns:
        LLMExtractionStrategy: The settings for how to extract data using the LLM.
    """
    # https://docs.crawl4ai.com/api/strategies/#llmextractionstrategy
    return LLMExtractionStrategy(
        provider=LLM_MODEL,  # Name of the LLM provider
        api_token=API_TOKEN,  # API token for authentication
        schema=output_format.model_json_schema(),  # JSON schema of the data model
        extraction_type="schema",  # Type of extraction to perform
        instruction=llm_instructions,  # Instructions for the LLM
        input_format="markdown",  # Format of the input content
        verbose=True,  # Enable verbose logging
    )


async def check_no_results(
    crawler: AsyncWebCrawler,
    url: str,
    session_id: str,
) -> bool:
    """
    Checks if the "No Results Found" message is present on the page.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        url (str): The URL to check.
        session_id (str): The session identifier.

    Returns:
        bool: True if the "No Results Found" message is found, False otherwise.
    """
    # Fetch the page without any CSS selector or extraction strategy
    result = await crawler.arun(
        url=url,
        config=CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
        ),
    )

    if result.success:
        if "No Results Found" in result.cleaned_html:
            return True
    else:
        print(
            f"Error fetching page for 'No Results Found' check: {result.error_message}"
        )

    return False


async def fetch_and_process_page(
    crawler: AsyncWebCrawler,
    page_number: int,
    base_url: str,
    css_selector: str,
    llm_strategy: LLMExtractionStrategy,
    session_id: str,
    seen_names: Set[str],
) -> Tuple[List[dict], bool]:
    """
    Fetches and processes a single page from yellowpages.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        page_number (int): The page number to fetch.
        base_url (str): The base URL of the website.
        css_selector (str): The CSS selector to target the content.
        llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
        session_id (str): The session identifier.
        seen_names (Set[str]): Set of business names that have already been seen.

    Returns:
        Tuple[List[dict], bool]:
            - List[dict]: A list of processed businesses from the page.
            - bool: A flag indicating if the "No Results Found" message was encountered.
""" url = base_url.format(page_number=page_number) print(f"Loading page {page_number}...") # Check if "No Results Found" message is present no_results = await check_no_results(crawler, url, session_id) if no_results: return [], True # No more results, signal to stop crawling # Fetch page content with the extraction strategy result = await crawler.arun( url=url, config=CrawlerRunConfig( cache_mode=CacheMode.BYPASS, # Do not use cached data extraction_strategy=llm_strategy, # Strategy for data extraction css_selector=css_selector, # Target specific content on the page session_id=session_id, # Unique session ID for the crawl ), ) print("----------------------------- Result-----------------------------") print(result.extracted_content) if not (result.success and result.extracted_content): print(f"Error fetching page {page_number}: {result.error_message}") return [], False # Parse extracted content extracted_data = json.loads(result.extracted_content) print("----------------------------Exracted Data----------------------------") print(extracted_data) if not extracted_data: print(f"No businesss found on page {page_number}.") return [], False # After parsing extracted content print("Extracted data:", extracted_data) # Process businesss all_businesses = [] for business in extracted_data: # Debugging: Print each business to understand its structure print("Processing business:", business) # Ignore the 'error' key if it's False if business.get("error") is False: business.pop("error", None) # Remove the 'error' key if it's False if is_duplicated(business["name"], seen_names): print(f"Duplicate business '{business['name']}' found. Skipping.") continue # Skip duplicate businesss # Add business to the list seen_names.add(business["name"]) all_businesses.append(business) if not all_businesses: print(f"No complete businesss found on page {page_number}.") return [], False print(f"Extracted {len(all_businesses)} businesss from page {page_number}.") return all_businesses, False # Continue crawling