HF_Agents_Final_Project

Sleeping

File size: 9,117 Bytes

import os
import pandas as pd
from typing import Dict, List, Union, Tuple, Any, Optional
import numpy as np
from smolagents.tools import Tool

class SpreadsheetTool(Tool):
    """
    Parses spreadsheet files (e.g., .xlsx) and extracts tabular data for analysis or allows querying.
    Useful for reading, processing, and converting spreadsheet content to Python data structures.
    """
    name = "spreadsheet_processor"
    description = "Parses a spreadsheet file (e.g., .xlsx, .xls, .csv) and can perform queries. Returns extracted data or query results."
    inputs = {
        'file_path': {'type': 'string', 'description': 'Path to the spreadsheet file.'},
        'query_instructions': {'type': 'string', 'description': 'Optional. Instructions for querying the data (e.g., "Sum column A"). If None, parses the whole sheet.', 'nullable': True}
    }
    outputs = {'result': {'type': 'object', 'description': 'A dictionary containing parsed sheet data, query results, or an error message.'}}
    output_type = "object"
    
    def __init__(self, *args, **kwargs):
        """Initialize the SpreadsheetTool."""
        super().__init__(*args, **kwargs)
        self.is_initialized = True
    
    # Main entry point for the agent
    def forward(self, file_path: str, query_instructions: Optional[str] = None) -> Dict[str, Any]:
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}"}

        # Determine file type for appropriate parsing
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        parsed_data = None
        if file_extension in ['.xlsx', '.xls']:
            parsed_data = self._parse_excel(file_path)
        elif file_extension == '.csv':
            parsed_data = self._parse_csv(file_path)
        else:
            return {"error": f"Unsupported file type: {file_extension}. Supported types: .xlsx, .xls, .csv"}

        if parsed_data.get("error"):
            return parsed_data # Return error from parsing step

        if query_instructions:
            return self._query_data(parsed_data, query_instructions)
        else:
            # If no query, return the parsed data and summary
            return {
                "parsed_sheets": parsed_data.get("sheets"), 
                "summary": parsed_data.get("summary"),
                "message": "Spreadsheet parsed successfully."
            }

    def _parse_excel(self, file_path: str) -> Dict[str, Any]:
        """Parse an Excel spreadsheet and extract useful information."""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheet_names = excel_file.sheet_names
            sheets = {}
            for sheet_name in sheet_names:
                sheets[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": sheet_names, "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing Excel spreadsheet: {str(e)}"}

    def _parse_csv(self, file_path: str) -> Dict[str, Any]:
        """Parse a CSV file."""
        try:
            df = pd.read_csv(file_path)
            # CSVs don't have multiple sheets, so we adapt the structure
            sheet_name = os.path.splitext(os.path.basename(file_path))[0]
            sheets = {sheet_name: df}
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": [sheet_name], "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing CSV file: {str(e)}"}

    def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
        """Create a summary of the spreadsheet contents."""
        summary = {}
        
        for sheet_name, df in sheets_dict.items():
            summary[sheet_name] = {
                "shape": df.shape,
                "columns": df.columns.tolist(),
                "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
                "text_columns": df.select_dtypes(include=['object']).columns.tolist(),
                "has_nulls": df.isnull().any().any(),
                "first_few_rows": df.head(3).to_dict('records')
            }
        
        return summary
    
    # Renamed from query_data to _query_data and adjusted arguments
    def _query_data(self, parsed_data_dict: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
        """
        Execute a query on the spreadsheet data based on instructions.
        This is a simplified placeholder. Real implementation would need robust query parsing.
        """
        if parsed_data_dict.get("error"):
            return {"error": parsed_data_dict["error"]}
        
        sheets = parsed_data_dict.get("sheets")
        if not sheets:
            return {"error": "No sheets data available for querying."}
            
        # Placeholder for actual query logic. 
        # This would involve parsing `query_instructions` (e.g., using regex, NLP, or a DSL)
        # and applying pandas operations.
        # For now, let's return a message indicating the query was received and basic info.
        
        results = {}
        explanation = f"Query instruction received: '{query_instructions}'. Advanced query execution is not fully implemented. " \
                      f"Returning summary of available sheets: {list(sheets.keys())}."
        
        # Example: if query asks for sum, try to sum first numeric column of first sheet
        if "sum" in query_instructions.lower():
            first_sheet_name = next(iter(sheets))
            df = sheets[first_sheet_name]
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if not numeric_cols.empty:
                col_to_sum = numeric_cols[0]
                try:
                    total_sum = df[col_to_sum].sum()
                    results[f'{first_sheet_name}_{col_to_sum}_sum'] = total_sum
                    explanation += f" Example sum of column '{col_to_sum}' in sheet '{first_sheet_name}': {total_sum}."
                except Exception as e:
                    explanation += f" Could not perform example sum: {e}."
            else:
                explanation += " No numeric columns found for example sum."

        return {"query_results": results, "explanation": explanation, "original_query": query_instructions}

# Example usage (for direct testing)
if __name__ == '__main__':
    tool = SpreadsheetTool()

    # Create dummy files for testing
    dummy_excel_file = "dummy_test.xlsx"
    dummy_csv_file = "dummy_test.csv"

    # Create a dummy Excel file
    df_excel = pd.DataFrame({
        'colA': [1, 2, 3, 4, 5],
        'colB': ['apple', 'banana', 'cherry', 'date', 'elderberry'],
        'colC': [10.1, 20.2, 30.3, 40.4, 50.5]
    })
    with pd.ExcelWriter(dummy_excel_file) as writer:
        df_excel.to_excel(writer, sheet_name='Sheet1', index=False)
        df_excel.head(2).to_excel(writer, sheet_name='Sheet2', index=False)

    # Create a dummy CSV file
    df_csv = pd.DataFrame({
        'id': [101, 102, 103],
        'product': ['widget', 'gadget', 'gizmo'],
        'price': [19.99, 29.50, 15.00]
    })
    df_csv.to_csv(dummy_csv_file, index=False)

    print("--- Test 1: Parse Excel file (no query) ---")
    result1 = tool.forward(file_path=dummy_excel_file)
    print(result1)
    assert "error" not in result1 or result1["error"] is None
    assert "Sheet1" in result1["parsed_sheets"]

    print("\n--- Test 2: Parse CSV file (no query) ---")
    result2 = tool.forward(file_path=dummy_csv_file)
    print(result2)
    assert "error" not in result2 or result2["error"] is None
    assert dummy_csv_file.split('.')[0] in result2["parsed_sheets"]

    print("\n--- Test 3: Query Excel file (simple sum example) ---")
    result3 = tool.forward(file_path=dummy_excel_file, query_instructions="sum colA from Sheet1")
    print(result3)
    assert "error" not in result3 or result3["error"] is None
    assert "query_results" in result3
    if result3.get("query_results"):
        assert "Sheet1_colA_sum" in result3["query_results"]
        assert result3["query_results"]["Sheet1_colA_sum"] == 15

    print("\n--- Test 4: File not found ---")
    result4 = tool.forward(file_path="non_existent_file.xlsx")
    print(result4)
    assert result4["error"] is not None
    assert "File not found" in result4["error"]
    
    print("\n--- Test 5: Unsupported file type ---")
    dummy_txt_file = "dummy_test.txt"
    with open(dummy_txt_file, "w") as f:
        f.write("hello")
    result5 = tool.forward(file_path=dummy_txt_file)
    print(result5)
    assert result5["error"] is not None
    assert "Unsupported file type" in result5["error"]
    os.remove(dummy_txt_file)

    # Clean up dummy files
    if os.path.exists(dummy_excel_file):
        os.remove(dummy_excel_file)
    if os.path.exists(dummy_csv_file):
        os.remove(dummy_csv_file)

    print("\nSpreadsheetTool tests completed.")