import os

# Specify the LLM model to use. You can choose any LLM supported by LiteLLM.
# Example options include "gpt-4o", "claude", "deepseek-chat", etc.
# For a full list of supported models, refer to:
# https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
# Other values kept for reference:
#   LLM_MODEL = "groq/llama3-8b-8192"
#   LLM_MODEL = "gemini-2.5-pro-exp-03-25"
LLM_MODEL = "openai/gpt-4o-mini"
# API token for authentication with the LLM provider.
# This is fetched from the environment variable "OPENAI_API_KEY"; the value is
# None when that variable is not set.
# If LLM_MODEL is switched to another provider, read that provider's variable
# instead, e.g. os.getenv("GEMINI_API_KEY").
# NOTE: never hard-code or commit a key in this file, not even inside a comment.
# A literal key that used to be written here must be treated as leaked and
# revoked with its provider.
API_TOKEN = os.getenv("OPENAI_API_KEY")
# Base URL of the website to scrape.
# Currently this points at gentledental.ai.
# You can modify the URL to change the site, the location or the type of business.
# Examples of paginated Yellow Pages URLs (note the {page_number} placeholder):
# - For plumbers in Vancouver: "https://www.yellowpages.ca/search/si/{page_number}/Plumbers/Vancouver+BC"
# - For restaurants in Montreal: "https://www.yellowpages.ca/search/si/{page_number}/Restaurants/Montreal+QC"
# NOTE(review): the current value has no {page_number} placeholder, unlike the
# examples above -- confirm how the crawler paginates this site.
BASE_URL = "https://gentledental.ai/"
# CSS selector to target the main HTML element containing the business information.
# A selector helps focus the scraper on relevant content instead of sending the
# entire HTML page to the LLM.
# It is currently left empty (presumably meaning "no narrowing" -- TODO confirm
# how the crawler treats an empty selector).
# Example that is specific to Yellow Pages listings:
#   CSS_SELECTOR = "[class^='listing_right_section']"
CSS_SELECTOR = ""
# Maximum number of pages to crawl. Adjust this value based on how much data
# you want to scrape.
MAX_PAGES = 3  # Example: set to 5 to scrape 5 pages.
# Instructions for the LLM on what information to extract from the scraped content.
# The prompt asks for the following details for each business:
# - Name
# - Address
# - Website
# - Phone number
# - A one-sentence description
# The two adjacent string literals are joined into a single prompt string.
SCRAPER_INSTRUCTIONS = (
    "Extract all business information: 'name', 'address', 'website'"
    ", 'phone number' and a one-sentence 'description' from the following content."
)