File size: 9,117 Bytes
108e7a1 87aa741 108e7a1 bffd09a 108e7a1 bffd09a b1939df 87aa741 b1939df 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 108e7a1 87aa741 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import os
import pandas as pd
from typing import Dict, List, Union, Tuple, Any, Optional
import numpy as np
from smolagents.tools import Tool
class SpreadsheetTool(Tool):
"""
Parses spreadsheet files (e.g., .xlsx) and extracts tabular data for analysis or allows querying.
Useful for reading, processing, and converting spreadsheet content to Python data structures.
"""
name = "spreadsheet_processor"
description = "Parses a spreadsheet file (e.g., .xlsx, .xls, .csv) and can perform queries. Returns extracted data or query results."
inputs = {
'file_path': {'type': 'string', 'description': 'Path to the spreadsheet file.'},
'query_instructions': {'type': 'string', 'description': 'Optional. Instructions for querying the data (e.g., "Sum column A"). If None, parses the whole sheet.', 'nullable': True}
}
outputs = {'result': {'type': 'object', 'description': 'A dictionary containing parsed sheet data, query results, or an error message.'}}
output_type = "object"
def __init__(self, *args, **kwargs):
"""Initialize the SpreadsheetTool."""
super().__init__(*args, **kwargs)
self.is_initialized = True
# Main entry point for the agent
def forward(self, file_path: str, query_instructions: Optional[str] = None) -> Dict[str, Any]:
if not os.path.exists(file_path):
return {"error": f"File not found: {file_path}"}
# Determine file type for appropriate parsing
_, file_extension = os.path.splitext(file_path)
file_extension = file_extension.lower()
parsed_data = None
if file_extension in ['.xlsx', '.xls']:
parsed_data = self._parse_excel(file_path)
elif file_extension == '.csv':
parsed_data = self._parse_csv(file_path)
else:
return {"error": f"Unsupported file type: {file_extension}. Supported types: .xlsx, .xls, .csv"}
if parsed_data.get("error"):
return parsed_data # Return error from parsing step
if query_instructions:
return self._query_data(parsed_data, query_instructions)
else:
# If no query, return the parsed data and summary
return {
"parsed_sheets": parsed_data.get("sheets"),
"summary": parsed_data.get("summary"),
"message": "Spreadsheet parsed successfully."
}
def _parse_excel(self, file_path: str) -> Dict[str, Any]:
"""Parse an Excel spreadsheet and extract useful information."""
try:
excel_file = pd.ExcelFile(file_path)
sheet_names = excel_file.sheet_names
sheets = {}
for sheet_name in sheet_names:
sheets[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
summary = self._create_summary(sheets)
return {"sheets": sheets, "sheet_names": sheet_names, "summary": summary, "error": None}
except Exception as e:
return {"error": f"Error parsing Excel spreadsheet: {str(e)}"}
def _parse_csv(self, file_path: str) -> Dict[str, Any]:
"""Parse a CSV file."""
try:
df = pd.read_csv(file_path)
# CSVs don't have multiple sheets, so we adapt the structure
sheet_name = os.path.splitext(os.path.basename(file_path))[0]
sheets = {sheet_name: df}
summary = self._create_summary(sheets)
return {"sheets": sheets, "sheet_names": [sheet_name], "summary": summary, "error": None}
except Exception as e:
return {"error": f"Error parsing CSV file: {str(e)}"}
def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
"""Create a summary of the spreadsheet contents."""
summary = {}
for sheet_name, df in sheets_dict.items():
summary[sheet_name] = {
"shape": df.shape,
"columns": df.columns.tolist(),
"numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
"text_columns": df.select_dtypes(include=['object']).columns.tolist(),
"has_nulls": df.isnull().any().any(),
"first_few_rows": df.head(3).to_dict('records')
}
return summary
# Renamed from query_data to _query_data and adjusted arguments
def _query_data(self, parsed_data_dict: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
"""
Execute a query on the spreadsheet data based on instructions.
This is a simplified placeholder. Real implementation would need robust query parsing.
"""
if parsed_data_dict.get("error"):
return {"error": parsed_data_dict["error"]}
sheets = parsed_data_dict.get("sheets")
if not sheets:
return {"error": "No sheets data available for querying."}
# Placeholder for actual query logic.
# This would involve parsing `query_instructions` (e.g., using regex, NLP, or a DSL)
# and applying pandas operations.
# For now, let's return a message indicating the query was received and basic info.
results = {}
explanation = f"Query instruction received: '{query_instructions}'. Advanced query execution is not fully implemented. " \
f"Returning summary of available sheets: {list(sheets.keys())}."
# Example: if query asks for sum, try to sum first numeric column of first sheet
if "sum" in query_instructions.lower():
first_sheet_name = next(iter(sheets))
df = sheets[first_sheet_name]
numeric_cols = df.select_dtypes(include=[np.number]).columns
if not numeric_cols.empty:
col_to_sum = numeric_cols[0]
try:
total_sum = df[col_to_sum].sum()
results[f'{first_sheet_name}_{col_to_sum}_sum'] = total_sum
explanation += f" Example sum of column '{col_to_sum}' in sheet '{first_sheet_name}': {total_sum}."
except Exception as e:
explanation += f" Could not perform example sum: {e}."
else:
explanation += " No numeric columns found for example sum."
return {"query_results": results, "explanation": explanation, "original_query": query_instructions}
# Example usage (for direct testing)
if __name__ == '__main__':
tool = SpreadsheetTool()
# Create dummy files for testing
dummy_excel_file = "dummy_test.xlsx"
dummy_csv_file = "dummy_test.csv"
# Create a dummy Excel file
df_excel = pd.DataFrame({
'colA': [1, 2, 3, 4, 5],
'colB': ['apple', 'banana', 'cherry', 'date', 'elderberry'],
'colC': [10.1, 20.2, 30.3, 40.4, 50.5]
})
with pd.ExcelWriter(dummy_excel_file) as writer:
df_excel.to_excel(writer, sheet_name='Sheet1', index=False)
df_excel.head(2).to_excel(writer, sheet_name='Sheet2', index=False)
# Create a dummy CSV file
df_csv = pd.DataFrame({
'id': [101, 102, 103],
'product': ['widget', 'gadget', 'gizmo'],
'price': [19.99, 29.50, 15.00]
})
df_csv.to_csv(dummy_csv_file, index=False)
print("--- Test 1: Parse Excel file (no query) ---")
result1 = tool.forward(file_path=dummy_excel_file)
print(result1)
assert "error" not in result1 or result1["error"] is None
assert "Sheet1" in result1["parsed_sheets"]
print("\n--- Test 2: Parse CSV file (no query) ---")
result2 = tool.forward(file_path=dummy_csv_file)
print(result2)
assert "error" not in result2 or result2["error"] is None
assert dummy_csv_file.split('.')[0] in result2["parsed_sheets"]
print("\n--- Test 3: Query Excel file (simple sum example) ---")
result3 = tool.forward(file_path=dummy_excel_file, query_instructions="sum colA from Sheet1")
print(result3)
assert "error" not in result3 or result3["error"] is None
assert "query_results" in result3
if result3.get("query_results"):
assert "Sheet1_colA_sum" in result3["query_results"]
assert result3["query_results"]["Sheet1_colA_sum"] == 15
print("\n--- Test 4: File not found ---")
result4 = tool.forward(file_path="non_existent_file.xlsx")
print(result4)
assert result4["error"] is not None
assert "File not found" in result4["error"]
print("\n--- Test 5: Unsupported file type ---")
dummy_txt_file = "dummy_test.txt"
with open(dummy_txt_file, "w") as f:
f.write("hello")
result5 = tool.forward(file_path=dummy_txt_file)
print(result5)
assert result5["error"] is not None
assert "Unsupported file type" in result5["error"]
os.remove(dummy_txt_file)
# Clean up dummy files
if os.path.exists(dummy_excel_file):
os.remove(dummy_excel_file)
if os.path.exists(dummy_csv_file):
os.remove(dummy_csv_file)
print("\nSpreadsheetTool tests completed.") |