Firoj112 committed on
Commit
e9ed5be
·
verified ·
1 Parent(s): d175522

Create scrape_text.py

Browse files
Files changed (1) hide show
  1. tools/scrape_text.py +52 -0
tools/scrape_text.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents.tools import Tool
2
+ from helium import S
3
+ from selenium.webdriver.common.by import By
4
+ import json
5
+
6
def _table_rows(table):
    """Return the non-empty rows of a table element as lists of stripped cell text."""
    extracted = []
    for row in table.find_elements(By.TAG_NAME, "tr"):
        # "th, td" collects header and data cells together, in document order,
        # so a row that mixes both kinds (e.g. a row header) keeps all its cells.
        cells = row.find_elements(By.CSS_SELECTOR, "th, td")
        # Empty cells are kept so that values stay in their original columns;
        # only rows with no text at all are dropped.
        values = [cell.text.strip() for cell in cells]
        if any(values):
            extracted.append(values)
    return extracted


def scrape_text(driver, selector="p", extract_table=False):
    """
    Scrape text or table data from elements matching a CSS selector on the current page.

    Args:
        driver: Selenium WebDriver instance
        selector (str): CSS selector to target elements (default: 'p' for paragraphs)
        extract_table (bool): If True, treat each match as a table and extract
            its rows as JSON (default: False)

    Returns:
        str: Always a string. In text mode, the stripped non-empty texts of the
        matched elements joined by newlines. In table mode, a JSON array with
        one entry per table, each a list of rows, each a list of cell strings.
        When nothing is found, or when any exception is raised while scraping,
        a human-readable message is returned instead of raising.
    """
    try:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        if extract_table:
            if not matches:
                return "No tables found for selector"
            # Tables that yield no rows with text are left out of the result.
            table_data = [rows for rows in map(_table_rows, matches) if rows]
            return json.dumps(table_data) if table_data else "No table data found"

        # Read and strip each element's text once, then drop the blanks.
        stripped = (element.text.strip() for element in matches)
        text_list = [text for text in stripped if text]
        return "\n".join(text_list) if text_list else "No text found for selector"
    except Exception as e:
        # Failures are reported as a message string rather than propagated.
        return f"Failed to scrape with selector {selector}: {e}"
41
+
42
# Expose scrape_text through a module-level smolagents Tool named `tool`.
# NOTE(review): `driver` is a parameter of scrape_text but is not listed in
# `inputs` -- confirm how the caller supplies it. Also confirm that the
# installed smolagents version accepts this constructor-call form and the
# "str"/"bool" type names.
tool = Tool(
    name="scrape_text",
    description="Scrapes text or table data from elements matching a CSS selector on the current page.",
    inputs={
        "selector": {
            "type": "str",
            "default": "p",
            "description": "CSS selector to target elements",
        },
        "extract_table": {
            "type": "bool",
            "default": False,
            "description": "If True, extract table data as JSON",
        },
    },
    output_type="str",
    function=scrape_text,
)