File size: 9,117 Bytes
108e7a1
 
87aa741
108e7a1
bffd09a
108e7a1
bffd09a
b1939df
87aa741
b1939df
 
87aa741
 
 
 
 
 
 
 
108e7a1
87aa741
108e7a1
87aa741
 
108e7a1
87aa741
 
108e7a1
 
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108e7a1
 
 
 
 
 
 
87aa741
108e7a1
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
108e7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87aa741
 
108e7a1
 
87aa741
108e7a1
87aa741
 
108e7a1
87aa741
 
 
 
 
 
 
 
108e7a1
87aa741
 
 
108e7a1
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108e7a1
87aa741
108e7a1
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108e7a1
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import os
import pandas as pd
from typing import Dict, List, Union, Tuple, Any, Optional
import numpy as np
from smolagents.tools import Tool

class SpreadsheetTool(Tool):
    """
    Parses spreadsheet files (e.g., .xlsx) and extracts tabular data for analysis or allows querying.
    Useful for reading, processing, and converting spreadsheet content to Python data structures.
    """
    name = "spreadsheet_processor"
    description = "Parses a spreadsheet file (e.g., .xlsx, .xls, .csv) and can perform queries. Returns extracted data or query results."
    inputs = {
        'file_path': {'type': 'string', 'description': 'Path to the spreadsheet file.'},
        'query_instructions': {'type': 'string', 'description': 'Optional. Instructions for querying the data (e.g., "Sum column A"). If None, parses the whole sheet.', 'nullable': True}
    }
    outputs = {'result': {'type': 'object', 'description': 'A dictionary containing parsed sheet data, query results, or an error message.'}}
    output_type = "object"
    
    def __init__(self, *args, **kwargs):
        """Initialize the SpreadsheetTool."""
        super().__init__(*args, **kwargs)
        self.is_initialized = True
    
    # Main entry point for the agent
    def forward(self, file_path: str, query_instructions: Optional[str] = None) -> Dict[str, Any]:
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}"}

        # Determine file type for appropriate parsing
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        parsed_data = None
        if file_extension in ['.xlsx', '.xls']:
            parsed_data = self._parse_excel(file_path)
        elif file_extension == '.csv':
            parsed_data = self._parse_csv(file_path)
        else:
            return {"error": f"Unsupported file type: {file_extension}. Supported types: .xlsx, .xls, .csv"}

        if parsed_data.get("error"):
            return parsed_data # Return error from parsing step

        if query_instructions:
            return self._query_data(parsed_data, query_instructions)
        else:
            # If no query, return the parsed data and summary
            return {
                "parsed_sheets": parsed_data.get("sheets"), 
                "summary": parsed_data.get("summary"),
                "message": "Spreadsheet parsed successfully."
            }

    def _parse_excel(self, file_path: str) -> Dict[str, Any]:
        """Parse an Excel spreadsheet and extract useful information."""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheet_names = excel_file.sheet_names
            sheets = {}
            for sheet_name in sheet_names:
                sheets[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": sheet_names, "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing Excel spreadsheet: {str(e)}"}

    def _parse_csv(self, file_path: str) -> Dict[str, Any]:
        """Parse a CSV file."""
        try:
            df = pd.read_csv(file_path)
            # CSVs don't have multiple sheets, so we adapt the structure
            sheet_name = os.path.splitext(os.path.basename(file_path))[0]
            sheets = {sheet_name: df}
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": [sheet_name], "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing CSV file: {str(e)}"}

    def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
        """Create a summary of the spreadsheet contents."""
        summary = {}
        
        for sheet_name, df in sheets_dict.items():
            summary[sheet_name] = {
                "shape": df.shape,
                "columns": df.columns.tolist(),
                "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
                "text_columns": df.select_dtypes(include=['object']).columns.tolist(),
                "has_nulls": df.isnull().any().any(),
                "first_few_rows": df.head(3).to_dict('records')
            }
        
        return summary
    
    # Renamed from query_data to _query_data and adjusted arguments
    def _query_data(self, parsed_data_dict: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
        """
        Execute a query on the spreadsheet data based on instructions.
        This is a simplified placeholder. Real implementation would need robust query parsing.
        """
        if parsed_data_dict.get("error"):
            return {"error": parsed_data_dict["error"]}
        
        sheets = parsed_data_dict.get("sheets")
        if not sheets:
            return {"error": "No sheets data available for querying."}
            
        # Placeholder for actual query logic. 
        # This would involve parsing `query_instructions` (e.g., using regex, NLP, or a DSL)
        # and applying pandas operations.
        # For now, let's return a message indicating the query was received and basic info.
        
        results = {}
        explanation = f"Query instruction received: '{query_instructions}'. Advanced query execution is not fully implemented. " \
                      f"Returning summary of available sheets: {list(sheets.keys())}."
        
        # Example: if query asks for sum, try to sum first numeric column of first sheet
        if "sum" in query_instructions.lower():
            first_sheet_name = next(iter(sheets))
            df = sheets[first_sheet_name]
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if not numeric_cols.empty:
                col_to_sum = numeric_cols[0]
                try:
                    total_sum = df[col_to_sum].sum()
                    results[f'{first_sheet_name}_{col_to_sum}_sum'] = total_sum
                    explanation += f" Example sum of column '{col_to_sum}' in sheet '{first_sheet_name}': {total_sum}."
                except Exception as e:
                    explanation += f" Could not perform example sum: {e}."
            else:
                explanation += " No numeric columns found for example sum."

        return {"query_results": results, "explanation": explanation, "original_query": query_instructions}

# Example usage (for direct testing)
if __name__ == '__main__':
    tool = SpreadsheetTool()

    # Create dummy files for testing
    dummy_excel_file = "dummy_test.xlsx"
    dummy_csv_file = "dummy_test.csv"

    # Create a dummy Excel file
    df_excel = pd.DataFrame({
        'colA': [1, 2, 3, 4, 5],
        'colB': ['apple', 'banana', 'cherry', 'date', 'elderberry'],
        'colC': [10.1, 20.2, 30.3, 40.4, 50.5]
    })
    with pd.ExcelWriter(dummy_excel_file) as writer:
        df_excel.to_excel(writer, sheet_name='Sheet1', index=False)
        df_excel.head(2).to_excel(writer, sheet_name='Sheet2', index=False)

    # Create a dummy CSV file
    df_csv = pd.DataFrame({
        'id': [101, 102, 103],
        'product': ['widget', 'gadget', 'gizmo'],
        'price': [19.99, 29.50, 15.00]
    })
    df_csv.to_csv(dummy_csv_file, index=False)

    print("--- Test 1: Parse Excel file (no query) ---")
    result1 = tool.forward(file_path=dummy_excel_file)
    print(result1)
    assert "error" not in result1 or result1["error"] is None
    assert "Sheet1" in result1["parsed_sheets"]

    print("\n--- Test 2: Parse CSV file (no query) ---")
    result2 = tool.forward(file_path=dummy_csv_file)
    print(result2)
    assert "error" not in result2 or result2["error"] is None
    assert dummy_csv_file.split('.')[0] in result2["parsed_sheets"]

    print("\n--- Test 3: Query Excel file (simple sum example) ---")
    result3 = tool.forward(file_path=dummy_excel_file, query_instructions="sum colA from Sheet1")
    print(result3)
    assert "error" not in result3 or result3["error"] is None
    assert "query_results" in result3
    if result3.get("query_results"):
        assert "Sheet1_colA_sum" in result3["query_results"]
        assert result3["query_results"]["Sheet1_colA_sum"] == 15

    print("\n--- Test 4: File not found ---")
    result4 = tool.forward(file_path="non_existent_file.xlsx")
    print(result4)
    assert result4["error"] is not None
    assert "File not found" in result4["error"]
    
    print("\n--- Test 5: Unsupported file type ---")
    dummy_txt_file = "dummy_test.txt"
    with open(dummy_txt_file, "w") as f:
        f.write("hello")
    result5 = tool.forward(file_path=dummy_txt_file)
    print(result5)
    assert result5["error"] is not None
    assert "Unsupported file type" in result5["error"]
    os.remove(dummy_txt_file)

    # Clean up dummy files
    if os.path.exists(dummy_excel_file):
        os.remove(dummy_excel_file)
    if os.path.exists(dummy_csv_file):
        os.remove(dummy_csv_file)

    print("\nSpreadsheetTool tests completed.")