import asyncio
from crawl4ai import AsyncWebCrawler
from dotenv import load_dotenv
from config import API_TOKEN, LLM_MODEL
from config import BASE_URL, CSS_SELECTOR, MAX_PAGES, SCRAPER_INSTRUCTIONS
from utils import save_data_to_csv
from scraper import (
    get_browser_config,
    get_llm_strategy,
    fetch_and_process_page
)
import os

from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_groq import ChatGroq
from langchain_core.tools import tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain_community.utilities import GoogleSerperAPIWrapper

from business import BusinessData


load_dotenv()
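
# Note (assumption): `config`, `utils`, `scraper`, and `business` are local project
# modules. `BusinessData` is assumed to be the Pydantic model describing one
# scraped record, roughly along the lines of the sketch below; the real fields
# live in business.py and this sketch only mirrors the details the agent prompt asks for.
#
#   class BusinessData(BaseModel):
#       plan_name: str
#       price: str
#       inclusions: str
#       benefits: str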

@tool
def search(query: str):
    """Search for links using the Google Serper API."""
    print("Searching for links...")
    # The Serper API key is read from the environment (SERPER_API_KEY).
    serp_tool = GoogleSerperAPIWrapper(
        serper_api_key=os.getenv("SERPER_API_KEY"))
    data = serp_tool.results(query)
    links = []

    # Extract links from the 'organic' results
    if 'organic' in data:
        for result in data['organic']:
            if 'link' in result:
                links.append(result['link'])

    # Extract links from 'sitelinks' within organic results
    if 'organic' in data:
        for result in data['organic']:
            if 'sitelinks' in result:
                for sitelink in result['sitelinks']:
                    if 'link' in sitelink:
                        links.append(sitelink['link'])

    # Extract links from 'peopleAlsoAsk'
    if 'peopleAlsoAsk' in data:
        for item in data['peopleAlsoAsk']:
            if 'link' in item:
                links.append(item['link'])
    print(links)
    return links[:5]
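
# Illustrative only: a single-input @tool can also be exercised outside the agent
# with `.invoke`, e.g. search.invoke("family health insurance plan prices"),
# which returns up to five result URLs (the query here is just a placeholder).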

@tool
async def scrape(url: str):
    """
    Scrape structured business data from the given URL. Called after the search
    tool (or the user) has provided the URL.
    """
    
    # Initialize configurations
    browser_config = get_browser_config()
    llm_strategy = get_llm_strategy(
        llm_instructions=SCRAPER_INSTRUCTIONS,  # Instructions for the LLM
        output_format=BusinessData # Data output format
    )
    session_id = "crawler_session"

    # Initialize state variables
    page_number = 1
    all_records = []
    seen_names = set()

    # Start the web crawler context
    # https://docs.crawl4ai.com/api/async-webcrawler/#asyncwebcrawler
    async with AsyncWebCrawler(config=browser_config) as crawler:
        while True:
            # Fetch and process data from the current page
            records, no_results_found = await fetch_and_process_page(
                crawler,
                page_number,
                url,
                CSS_SELECTOR,
                llm_strategy,
                session_id,
                seen_names,
            )

            if no_results_found:
                print("No more records found. Ending crawl.")
                break  # Stop crawling when "No Results Found" message appears

            if not records:
                print(f"No records extracted from page {page_number}.")
                break  # Stop if no records are extracted

            # Add the records from this page to the total list
            all_records.extend(records)
            page_number += 1  # Move to the next page
            
            if page_number > MAX_PAGES:
                break

            # Pause between requests to avoid rate limits
            await asyncio.sleep(2)  # Adjust sleep time as needed

    # Save the collected records to a CSV file
    if all_records:
        save_data_to_csv(
            records=all_records, 
            data_struct=BusinessData,
            filename="businesses_data.csv"
        )
    else:
        print("No records were found during the crawl.")

    # Display usage statistics for the LLM strategy
    llm_strategy.show_usage()
    return all_records
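
# Illustrative only: `scrape` is an async tool, so in isolation it would be run
# with something like asyncio.run(scrape.ainvoke("https://example.com/plans"));
# inside the agent it is awaited by AgentExecutor.ainvoke (the URL is a placeholder).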


async def main(user_input: str):
    """
    Entry point of the script: build the scraping agent, run it on the user's
    request, and return the agent's final answer (a markdown table).
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", """You are a web scraping assistant.
            Your tasks:
            - If the user input mentions specific sites, use the scrape tool on them directly.
            - If the user input does not mention any sites, call the search tool to find relevant sites, then call the scrape tool on the results.
            - Only call the scrape tool after the search tool (or the user) has provided URLs.
            - Scrape each website and extract the business information.
            - Include the plan name, price, inclusions, benefits, and every other detail of each plan.
            - If a required piece of data is missing, build a new search query for it (for example, "<user_input> price" when the price of a health or life insurance plan is not specified), then search and scrape again.
            - Continue until every detail of each plan is covered: plan name, price, inclusions, benefits, and all other details.
            - Focus on one plan at a time: gather all of its details before moving on to the next plan.
            - Fetch the plans and all of their details.
            - Return the information in a markdown table only.
            """),
            MessagesPlaceholder("chat_history", optional=True),
            ("human", "User_input: {input}"),
            MessagesPlaceholder("agent_scratchpad"),
        ]
    )
    tools = [scrape, search]
    # model  = "llama3-8b-8192"
    # llm = ChatGroq(api_key=API_TOKEN, model=model)
    model = "gpt-4o-mini"
    api_key = os.getenv("OPENAI_API_KEY")
    llm = ChatOpenAI(api_key=api_key, model=model, temperature=0.0)

    # llm_tools = llm.bind_tools(tools)
    # print(llm_tools.invoke("Scrape https://example.com/ for me"))
    agent = create_openai_tools_agent(llm, tools, prompt)

    # Create an agent executor from the agent and its tools
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    result = await agent_executor.ainvoke({"input": user_input})
    return result['output']



if __name__ == "__main__":
    user_query = input("Enter your scraping request: ")
    output = asyncio.run(main(user_query))
    print(output)
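
# Example session (illustrative; filename, query, and output are assumptions):
#   $ python main.py
#   Enter your scraping request: health insurance plans with prices and benefits
# The agent searches for relevant sites, scrapes them with crawl4ai, saves the
# records to businesses_data.csv, and prints a markdown table of plan details.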