import json
from pydantic import BaseModel
from typing import List, Set, Tuple, Type
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CacheMode,
CrawlerRunConfig,
LLMExtractionStrategy,
)
from utils import is_duplicated
from config import LLM_MODEL, API_TOKEN


def get_browser_config() -> BrowserConfig:
"""
Returns the browser configuration for the crawler.
Returns:
BrowserConfig: The configuration settings for the browser.
"""
# https://docs.crawl4ai.com/core/browser-crawler-config/
return BrowserConfig(
browser_type="chromium", # Type of browser to simulate
headless=True, # Whether to run in headless mode (no GUI)
verbose=True, # Enable verbose logging
)
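
# The returned BrowserConfig is typically passed straight to the crawler, e.g.
# `async with AsyncWebCrawler(config=get_browser_config()) as crawler: ...`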


def get_llm_strategy(
    llm_instructions: str, output_format: Type[BaseModel]
) -> LLMExtractionStrategy:
    """
    Returns the configuration for the language model extraction strategy.
    Args:
        llm_instructions (str): The instructions given to the LLM for extraction.
        output_format (Type[BaseModel]): The Pydantic model class that defines the output schema.
    Returns:
        LLMExtractionStrategy: The settings for how to extract data using LLM.
    """
# https://docs.crawl4ai.com/api/strategies/#llmextractionstrategy
return LLMExtractionStrategy(
provider=LLM_MODEL, # Name of the LLM provider
api_token=API_TOKEN, # API token for authentication
schema=output_format.model_json_schema(), # JSON schema of the data model
extraction_type="schema", # Type of extraction to perform
instruction=llm_instructions, # Instructions for the LLM
input_format="markdown", # Format of the input content
verbose=True, # Enable verbose logging
)
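

# Example (a minimal sketch): `Business` below is a hypothetical output model, not
# part of this module; any Pydantic model describing the fields to extract can be
# passed as `output_format`.
#
#     class Business(BaseModel):
#         name: str
#         phone: str
#         address: str
#
#     llm_strategy = get_llm_strategy(
#         llm_instructions="Extract the name, phone and address of every business listing.",
#         output_format=Business,
#     )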


async def check_no_results(
crawler: AsyncWebCrawler,
url: str,
session_id: str,
) -> bool:
"""
Checks if the "No Results Found" message is present on the page.
Args:
crawler (AsyncWebCrawler): The web crawler instance.
url (str): The URL to check.
session_id (str): The session identifier.
Returns:
bool: True if "No Results Found" message is found, False otherwise.
"""
# Fetch the page without any CSS selector or extraction strategy
result = await crawler.arun(
url=url,
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
session_id=session_id,
),
)
if result.success:
if "No Results Found" in result.cleaned_html:
return True
else:
print(
f"Error fetching page for 'No Results Found' check: {result.error_message}"
)
return False


async def fetch_and_process_page(
crawler: AsyncWebCrawler,
page_number: int,
base_url: str,
css_selector: str,
llm_strategy: LLMExtractionStrategy,
session_id: str,
seen_names: Set[str],
) -> Tuple[List[dict], bool]:
"""
Fetches and processes a single page from yellowpages.
Args:
crawler (AsyncWebCrawler): The web crawler instance.
page_number (int): The page number to fetch.
base_url (str): The base URL of the website.
css_selector (str): The CSS selector to target the content.
llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
session_id (str): The session identifier.
required_keys (List[str]): List of required keys in the business data.
seen_names (Set[str]): Set of business names that have already been seen.
Returns:
Tuple[List[dict], bool]:
- List[dict]: A list of processed businesss from the page.
- bool: A flag indicating if the "No Results Found" message was encountered.
"""
url = base_url.format(page_number=page_number)
print(f"Loading page {page_number}...")
# Check if "No Results Found" message is present
no_results = await check_no_results(crawler, url, session_id)
if no_results:
return [], True # No more results, signal to stop crawling
# Fetch page content with the extraction strategy
result = await crawler.arun(
url=url,
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, # Do not use cached data
extraction_strategy=llm_strategy, # Strategy for data extraction
css_selector=css_selector, # Target specific content on the page
session_id=session_id, # Unique session ID for the crawl
),
)
print("----------------------------- Result-----------------------------")
print(result.extracted_content)
if not (result.success and result.extracted_content):
print(f"Error fetching page {page_number}: {result.error_message}")
return [], False
# Parse extracted content
extracted_data = json.loads(result.extracted_content)
print("----------------------------Exracted Data----------------------------")
print(extracted_data)
if not extracted_data:
print(f"No businesss found on page {page_number}.")
return [], False
# After parsing extracted content
print("Extracted data:", extracted_data)
    # Process businesses
all_businesses = []
for business in extracted_data:
# Debugging: Print each business to understand its structure
print("Processing business:", business)
# Ignore the 'error' key if it's False
if business.get("error") is False:
business.pop("error", None) # Remove the 'error' key if it's False
if is_duplicated(business["name"], seen_names):
print(f"Duplicate business '{business['name']}' found. Skipping.")
            continue # Skip duplicate businesses
# Add business to the list
seen_names.add(business["name"])
all_businesses.append(business)
if not all_businesses:
print(f"No complete businesss found on page {page_number}.")
return [], False
print(f"Extracted {len(all_businesses)} businesss from page {page_number}.")
return all_businesses, False # Continue crawling
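

# Example driver (a minimal sketch, not part of this module): shows how the helpers
# above could be wired together in a paginated crawl. `Business`, BASE_URL and
# CSS_SELECTOR are hypothetical placeholders; the real entry point lives elsewhere.
#
#     import asyncio
#
#     async def crawl_all_pages() -> List[dict]:
#         llm_strategy = get_llm_strategy(
#             llm_instructions="Extract each business listing.",
#             output_format=Business,
#         )
#         seen_names: Set[str] = set()
#         businesses: List[dict] = []
#         async with AsyncWebCrawler(config=get_browser_config()) as crawler:
#             page_number = 1
#             while True:
#                 page_businesses, no_results = await fetch_and_process_page(
#                     crawler,
#                     page_number,
#                     BASE_URL,  # e.g. a search URL containing "{page_number}"
#                     CSS_SELECTOR,
#                     llm_strategy,
#                     session_id="yellowpages_crawl",
#                     seen_names=seen_names,
#                 )
#                 if no_results or not page_businesses:
#                     break
#                 businesses.extend(page_businesses)
#                 page_number += 1
#         return businesses
#
#     asyncio.run(crawl_all_pages())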