Changed selenium retrieval implementation

- search_agent.py +22 -2
- search_agent_ui.py +2 -1
- web_crawler.py +17 -22

search_agent.py CHANGED

```diff
@@ -41,6 +41,26 @@ import web_crawler as wc
 console = Console()
 dotenv.load_dotenv()
 
+def get_selenium_driver():
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.common.exceptions import TimeoutException
+
+    chrome_options = Options()
+    chrome_options.add_argument("headless")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--remote-debugging-port=9222")
+    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
+    chrome_options.add_argument("--window-size=1920,1080")
+
+    driver = webdriver.Chrome(options=chrome_options)
+    return driver
+
+
+
 callbacks = []
 if os.getenv("LANGCHAIN_API_KEY"):
     callbacks.append(
@@ -59,7 +79,7 @@ if __name__ == '__main__':
     query = arguments["SEARCH_QUERY"]
 
     chat = wr.get_chat_llm(provider, model, temperature)
-    console.log(f"Using {model} on {provider} with temperature {temperature}")
+    #console.log(f"Using {model} on {provider} with temperature {temperature}")
 
     with console.status(f"[bold green]Optimizing query for search: {query}"):
         optimize_search_query = wr.optimize_search_query(chat, query, callbacks=callbacks)
@@ -74,7 +94,7 @@ if __name__ == '__main__':
     with console.status(
         f"[bold green]Fetching content for {len(sources)} sources", spinner="growVertical"
     ):
-        contents = wc.get_links_contents(sources)
+        contents = wc.get_links_contents(sources, get_selenium_driver)
     console.log(f"Managed to extract content from {len(contents)} sources")
 
     with console.status(f"[bold green]Embeddubg {len(contents)} sources for content", spinner="growVertical"):
```
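Worth noting: `get_selenium_driver` is handed to `wc.get_links_contents` as a factory rather than a live driver, so headless Chrome only starts when some page actually needs it. Nothing in the commit calls `driver.quit()`, though. A minimal context-manager wrapper (hypothetical, not part of this change) would guarantee each browser process is released:

```python
# Hypothetical wrapper, not part of this commit: guarantees each Chrome
# instance created by the factory is shut down after use.
from contextlib import contextmanager

@contextmanager
def managed_driver(get_driver_func):
    driver = get_driver_func()  # start headless Chrome lazily
    try:
        yield driver
    finally:
        driver.quit()  # always release the browser process

# Usage sketch with the factory defined in search_agent.py:
# with managed_driver(get_selenium_driver) as driver:
#     html = fetch_with_selenium("https://example.com", driver)
```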
search_agent_ui.py CHANGED

```diff
@@ -15,6 +15,7 @@ ls_tracer = LangChainTracer(
     client=Client()
 )
 
+
 chat = wr.get_chat_llm(provider="cohere")
 
 st.title("🔍 Simple Search Agent 💬")
@@ -43,7 +44,7 @@ if prompt := st.chat_input():
 
 
     with st.spinner(f"Searching the web for: {optimize_search_query}"):
-        sources = wc.get_sources(optimize_search_query)
+        sources = wc.get_sources(optimize_search_query, max_pages=20)
 
     with st.spinner(f"I'm now retrieveing the {len(sources)} webpages and documents I found (be patient)"):
         contents = wc.get_links_contents(sources)
```
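Only `max_pages` changes here; the UI still calls `wc.get_links_contents(sources)` with no driver factory, so under the new signature in `web_crawler.py` below it skips the Selenium fallback entirely, and only the CLI opts in. If the UI were meant to use the fallback too, the wiring would look roughly like this (assumed, not in this commit):

```python
# Assumed wiring (not in this commit) if the UI also wanted the fallback:
# pass the driver factory from search_agent.py through to get_links_contents.
import web_crawler as wc
from search_agent import get_selenium_driver  # hypothetical import

sources = wc.get_sources("example query", max_pages=20)
contents = wc.get_links_contents(sources, get_selenium_driver)
```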
web_crawler.py CHANGED

```diff
@@ -52,19 +52,9 @@ def get_sources(query, max_pages=10, domain=None):
         print('Error fetching search results:', error)
         raise
 
-def fetch_with_selenium(url, timeout=8):
-    chrome_options = Options()
-    chrome_options.add_argument("headless")
-    chrome_options.add_argument("--disable-extensions")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--remote-debugging-port=9222")
-    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
-    chrome_options.add_argument("--window-size=1920,1080")
-
-    driver = webdriver.Chrome(options=chrome_options)
-
+
+
+def fetch_with_selenium(url, driver, timeout=8,):
     try:
         driver.set_page_load_timeout(timeout)
         driver.get(url)
@@ -118,28 +108,33 @@ def process_source(source):
         return {**source, 'page_content': source['snippet']}
     return {**source, 'page_content': None}
 
-def get_links_contents(sources):
+def get_links_contents(sources, get_driver_func=None):
     with ThreadPoolExecutor() as executor:
-        results = list(executor.map(process_source, sources))
+        results = list(executor.map(process_source, sources))
+
+    if get_driver_func is None:
+        return [result for result in results if result is not None]
+
     for result in results:
         if result['page_content'] is None:
             url = result['link']
             print(f"Fetching with selenium {url}")
-            html = fetch_with_selenium(url)
+            driver = get_driver_func()
+            html = fetch_with_selenium(url, driver)
             main_content = extract(html, output_format='txt', include_links=True)
             if main_content:
                 result['page_content'] = main_content
-
-    # Filter out None results
-    return [result for result in results if result is not None]
+    return results
 
 def vectorize(contents):
     documents = []
     for content in contents:
         try:
-            metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=content['page_content'], metadata=metadata)
-            documents.append(doc)
+            page_content = content['page_content']
+            if page_content: # Sometimes Selenium is not fetching properly
+                metadata = {'title': content['title'], 'source': content['link']}
+                doc = Document(page_content=content['page_content'], metadata=metadata)
+                documents.append(doc)
         except Exception as e:
             print(f"[gray]Error processing content for {content['link']}: {e}")
     semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
```
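A quick way to exercise the refactored `get_links_contents` contract end to end; the fake source dict and the local `make_driver` helper are illustrative stand-ins, not part of the commit:

```python
# Smoke test for the refactored get_links_contents; the 'title', 'link'
# and 'snippet' keys mirror what process_source reads in the diff.
import web_crawler as wc
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_driver():
    # Minimal stand-in for search_agent.get_selenium_driver.
    opts = Options()
    opts.add_argument("headless")
    return webdriver.Chrome(options=opts)

fake_sources = [
    {"title": "Example", "link": "https://example.com", "snippet": "stub text"},
]

# Without a factory: thread-pool fetch only, Selenium never starts
# (and None results are filtered out, as before).
fast = wc.get_links_contents(fake_sources)

# With a factory: each result whose page_content is still None gets a fresh
# Chrome instance and a second attempt via fetch_with_selenium.
full = wc.get_links_contents(fake_sources, make_driver)
```

One cost of this design is a new Chrome process per failed URL; reusing a single driver across the loop would be cheaper, at the price of one bad page potentially wedging the rest.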