import json
from typing import List, Set, Tuple, Type

from pydantic import BaseModel

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    LLMExtractionStrategy,
)

from utils import is_duplicated
from config import LLM_MODEL, API_TOKEN


def get_browser_config() -> BrowserConfig:
    """
    Returns the browser configuration for the crawler.

    Returns:
        BrowserConfig: The configuration settings for the browser.
    """
    # https://docs.crawl4ai.com/core/browser-crawler-config/
    return BrowserConfig(
        browser_type="chromium",  # Type of browser to simulate
        headless=True,  # Whether to run in headless mode (no GUI)
        verbose=True,  # Enable verbose logging
    )
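
# Illustrative note (an assumption based on the crawl4ai docs linked above, not
# code from this module): the returned config is normally handed to the crawler,
# e.g.
#
#     async with AsyncWebCrawler(config=get_browser_config()) as crawler:
#         ...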

def get_llm_strategy(
    llm_instructions: str, output_format: Type[BaseModel]
) -> LLMExtractionStrategy:
    """
    Returns the configuration for the language model extraction strategy.

    Args:
        llm_instructions (str): Instructions telling the LLM what data to extract.
        output_format (Type[BaseModel]): Pydantic model class describing the
            structure of the data to extract.

    Returns:
        LLMExtractionStrategy: The settings for how to extract data using the LLM.
    """
    # https://docs.crawl4ai.com/api/strategies/#llmextractionstrategy
    return LLMExtractionStrategy(
        provider=LLM_MODEL,  # Name of the LLM provider
        api_token=API_TOKEN,  # API token for authentication
        schema=output_format.model_json_schema(),  # JSON schema of the data model
        extraction_type="schema",  # Type of extraction to perform
        instruction=llm_instructions,  # Instructions for the LLM
        input_format="markdown",  # Format of the input content
        verbose=True,  # Enable verbose logging
    )
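
# Illustrative sketch (an assumption, not part of the original module):
# `output_format` is expected to be a Pydantic model class, e.g.
#
#     class Business(BaseModel):
#         name: str
#         phone: str
#
# whose `Business.model_json_schema()` output becomes the `schema` passed to
# LLMExtractionStrategy above.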

async def check_no_results(
    crawler: AsyncWebCrawler,
    url: str,
    session_id: str,
) -> bool:
    """
    Checks if the "No Results Found" message is present on the page.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        url (str): The URL to check.
        session_id (str): The session identifier.

    Returns:
        bool: True if the "No Results Found" message is found, False otherwise.
    """
    # Fetch the page without any CSS selector or extraction strategy
    result = await crawler.arun(
        url=url,
        config=CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
        ),
    )

    if result.success:
        if "No Results Found" in result.cleaned_html:
            return True
    else:
        print(
            f"Error fetching page for 'No Results Found' check: {result.error_message}"
        )

    return False

async def fetch_and_process_page(
    crawler: AsyncWebCrawler,
    page_number: int,
    base_url: str,
    css_selector: str,
    llm_strategy: LLMExtractionStrategy,
    session_id: str,
    seen_names: Set[str],
) -> Tuple[List[dict], bool]:
    """
    Fetches and processes a single page of yellowpages results.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        page_number (int): The page number to fetch.
        base_url (str): The URL template of the results pages, containing a
            `{page_number}` placeholder.
        css_selector (str): The CSS selector to target the content.
        llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
        session_id (str): The session identifier.
        seen_names (Set[str]): Set of business names that have already been seen.

    Returns:
        Tuple[List[dict], bool]:
            - List[dict]: A list of processed businesses from the page.
            - bool: A flag indicating if the "No Results Found" message was encountered.
    """
    url = base_url.format(page_number=page_number)
    print(f"Loading page {page_number}...")

    # Check if the "No Results Found" message is present
    no_results = await check_no_results(crawler, url, session_id)
    if no_results:
        return [], True  # No more results, signal to stop crawling

    # Fetch page content with the extraction strategy
    result = await crawler.arun(
        url=url,
        config=CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,  # Do not use cached data
            extraction_strategy=llm_strategy,  # Strategy for data extraction
            css_selector=css_selector,  # Target specific content on the page
            session_id=session_id,  # Unique session ID for the crawl
        ),
    )

    print("----------------------------- Result -----------------------------")
    print(result.extracted_content)

    if not (result.success and result.extracted_content):
        print(f"Error fetching page {page_number}: {result.error_message}")
        return [], False

    # Parse extracted content
    extracted_data = json.loads(result.extracted_content)
    print("---------------------------- Extracted Data ----------------------------")
    print(extracted_data)
    if not extracted_data:
        print(f"No businesses found on page {page_number}.")
        return [], False

    # Process businesses
    all_businesses = []
    for business in extracted_data:
        # Debugging: Print each business to understand its structure
        print("Processing business:", business)

        # Remove the 'error' key when the LLM reports no extraction error
        if business.get("error") is False:
            business.pop("error", None)

        if is_duplicated(business["name"], seen_names):
            print(f"Duplicate business '{business['name']}' found. Skipping.")
            continue  # Skip duplicate businesses

        # Add business to the list
        seen_names.add(business["name"])
        all_businesses.append(business)

    if not all_businesses:
        print(f"No complete businesses found on page {page_number}.")
        return [], False

    print(f"Extracted {len(all_businesses)} businesses from page {page_number}.")
    return all_businesses, False  # Continue crawling
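

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: the URL template,
    # CSS selector, instruction text, and `DemoBusiness` model below are assumptions
    # chosen only to show how the helpers above could fit together.
    import asyncio

    class DemoBusiness(BaseModel):
        # Hypothetical output model; field names should match what the LLM is asked for.
        name: str
        phone_number: str
        address: str

    async def demo_crawl() -> None:
        # Assumed URL template with a {page_number} placeholder, as expected by
        # fetch_and_process_page.
        base_url = "https://www.yellowpages.com/search?search_terms=plumber&page={page_number}"
        css_selector = ".result"  # Assumed selector for result cards
        session_id = "demo_session"
        seen_names: Set[str] = set()
        all_results: List[dict] = []

        llm_strategy = get_llm_strategy(
            llm_instructions=(
                "Extract each business's name, phone_number, and address."
            ),
            output_format=DemoBusiness,
        )

        async with AsyncWebCrawler(config=get_browser_config()) as crawler:
            page_number = 1
            while True:
                businesses, no_results = await fetch_and_process_page(
                    crawler,
                    page_number,
                    base_url,
                    css_selector,
                    llm_strategy,
                    session_id,
                    seen_names,
                )
                # Stop when the site reports no results or a page yields nothing new
                if no_results or not businesses:
                    break
                all_results.extend(businesses)
                page_number += 1

        print(f"Collected {len(all_results)} businesses in total.")

    asyncio.run(demo_crawl())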