|
import asyncio |
|
from crawl4ai import AsyncWebCrawler |
|
from dotenv import load_dotenv |
|
from config import API_TOKEN, LLM_MODEL |
|
from config import BASE_URL, CSS_SELECTOR, MAX_PAGES, SCRAPER_INSTRUCTIONS |
|
from utils import save_data_to_csv |
|
from scraper import ( |
|
get_browser_config, |
|
get_llm_strategy, |
|
fetch_and_process_page |
|
) |
|
import os |
|
|
|
from langchain.agents import AgentExecutor, create_openai_tools_agent |
|
from langchain_groq import ChatGroq |
|
|
|
from langchain.tools import BaseTool, StructuredTool, tool |
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder |
|
from business import BusinessData |
|
from langchain_core.tools import tool |
|
from langchain_openai import ChatOpenAI |
|
from langchain_groq import ChatGroq |
|
from langchain.agents import AgentExecutor, create_openai_tools_agent |
|
from langchain_community.utilities import GoogleSerperAPIWrapper |
|
|
|
|
|
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()
|
@tool
def search(query):
    """Search the web via the Google Serper API and return up to 5 links.

    Collects URLs from the organic results, then from the sitelinks of the
    organic results, then from the "People also ask" section.

    Args:
        query: The search query string.

    Returns:
        A list of at most 5 result URLs (may be empty).
    """
    print("Searching for links...")
    # SECURITY: the Serper API key used to be hardcoded here, which leaks it
    # into source control — that key must be rotated. Read it from the
    # environment (loaded from .env) instead.
    serp_tool = GoogleSerperAPIWrapper(
        serper_api_key=os.getenv("SERPER_API_KEY"))
    data = serp_tool.results(query)
    links = _extract_links(data)
    print(links)
    return links[:5]


def _extract_links(data):
    """Pull result URLs out of a Serper API response dict.

    Preserves the original extraction order: all organic links first,
    then sitelinks of organic results, then "People also ask" links.

    Args:
        data: Parsed Serper API response (dict).

    Returns:
        A list of URL strings (unbounded; caller truncates).
    """
    links = []
    for result in data.get('organic', []):
        if 'link' in result:
            links.append(result['link'])
    # Second pass over organic results for their nested sitelinks, so
    # top-level links always come before any sitelink.
    for result in data.get('organic', []):
        for sitelink in result.get('sitelinks', []):
            if 'link' in sitelink:
                links.append(sitelink['link'])
    for item in data.get('peopleAlsoAsk', []):
        if 'link' in item:
            links.append(item['link'])
    return links
|
|
|
@tool
async def scrape(url):
    """
    Function to scrape data once search tool is called and url is available.

    Crawls the listing at *url* page by page with crawl4ai, using an LLM
    extraction strategy (SCRAPER_INSTRUCTIONS / BusinessData schema) to pull
    structured records. Stops when a page reports no results, yields no
    records, or MAX_PAGES is exceeded. All collected records are written to
    'businesses_data.csv' and returned.

    Args:
        url: Base URL of the listing to crawl.

    Returns:
        List of extracted records (possibly empty).
    """
    browser_config = get_browser_config()
    llm_strategy = get_llm_strategy(
        llm_instructions=SCRAPER_INSTRUCTIONS,
        output_format=BusinessData
    )
    # Single session id so all pages share one browser session state.
    session_id = "crawler_session"

    page_number = 1
    all_records = []
    # Used for cross-page de-duplication; presumably mutated by
    # fetch_and_process_page — confirm in scraper module.
    seen_names = set()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        while True:
            records, no_results_found = await fetch_and_process_page(
                crawler,
                page_number,
                url,
                CSS_SELECTOR,
                llm_strategy,
                session_id,
                seen_names,
            )

            # The site signalled there is nothing further to paginate.
            if no_results_found:
                print("No more records found. Ending crawl.")
                break

            # Page loaded but extraction produced nothing — stop crawling.
            if not records:
                print(f"No records extracted from page {page_number}.")
                break

            all_records.extend(records)
            page_number += 1

            # Respect the configured page cap.
            if page_number > MAX_PAGES:
                break

            # Brief pause between pages to avoid hammering the target site.
            await asyncio.sleep(2)

    if all_records:
        save_data_to_csv(
            records=all_records,
            data_struct=BusinessData,
            filename="businesses_data.csv"
        )
    else:
        print("No records were found during the crawl.")

    # Report LLM token usage accumulated during extraction.
    llm_strategy.show_usage()
    return all_records
|
|
|
|
|
async def main(input):
    """Entry point: run the scraping agent on a single user request.

    Builds an OpenAI-tools agent wired to the `search` and `scrape` tools and
    returns the agent's final answer (a markdown table).

    Args:
        input: The user's request text. (Name shadows the builtin `input`;
            kept unchanged for backward compatibility with existing callers.)

    Returns:
        The agent's output string.
    """
    # NOTE: the original system string opened with a stray extra quote
    # ('""""You are...') which leaked a '"' into the prompt — removed.
    # Prompt typos (infromation/scrapping/convered/specificed) also fixed.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", """You are a web scraping assistant.
            You have tasks to do:
            - If the user_input has sites mentioned then directly use the scraping tool to scrape those data
            - Once the search tool is called then only call scrape tool
            - If the user_input has no sites mentioned then use the search tool to get the sites and then use the scraping tool to scrape those data
            - Scrape the website and extract the business information.
            - Information such as plan name, price, inclusions, benefits and every minor details of the plan has to be included.
            - If the required data is not specified, create the query for search tool to find that data and again find the websites that might have information then use the scrape tool to get the required information.
            - For example if the price of health and life insurance is not specified, create the search query for such as user_input price and then search and scrape.
            - Continue till every information of the plan is covered. Such as plan name, it's price, inclusions, benefits and every minor details.
            - Focus on one plan get all the required details of that plan and then move to the next plan.
            - Information such as plans and all the details of the plans is to be fetched.
            - Return information in Table format only. Use markdowns and return the data in table format.
            """),
            MessagesPlaceholder("chat_history", optional=True),
            ("human", "User_input: {input}"),
            MessagesPlaceholder("agent_scratchpad"),
        ]
    )
    tools = [scrape, search]

    model = "gpt-4o-mini"
    api_key = os.getenv("OPENAI_API_KEY")
    llm = ChatOpenAI(api_key=api_key, model=model, temperature=0.0)

    agent = create_openai_tools_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    result = await agent_executor.ainvoke({"input": input})
    # The original had an unreachable print(result['output']) after this
    # return, plus a misplaced floating docstring mid-function — both removed.
    return result['output']
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
asyncio.run(main()) |
|
|