Yago Bolivar
committed on
Commit
·
108e7a1
1
Parent(s):
2172594
feat: implement SpreadsheetTool for parsing and querying Excel files with detailed summaries
Browse files- src/spreadsheet_tool.py +243 -0
- tests/conftest.py +9 -0
- tests/test_spreadsheet_tool.py +90 -0
src/spreadsheet_tool.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
from typing import Dict, List, Union, Tuple, Any
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
class SpreadsheetTool:
    """Tool for parsing and extracting data from Excel (.xlsx) files.

    The tool is stateless: call ``parse_spreadsheet`` once, then feed the
    returned dictionary to ``query_data`` or ``extract_specific_data``.
    All methods report failures via an ``"error"`` key instead of raising.
    """

    def __init__(self) -> None:
        """Initialize the SpreadsheetTool (stateless; nothing to configure)."""

    def parse_spreadsheet(self, file_path: str) -> Dict[str, Any]:
        """
        Parse an Excel spreadsheet and extract useful information.

        Args:
            file_path: Path to the .xlsx file

        Returns:
            Dictionary containing:
            - sheets: Dictionary of sheet names and their DataFrames
            - sheet_names: List of sheet names
            - summary: Basic spreadsheet summary (see _create_summary)
            - error: Error message string, or None on success
        """
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}"}

        try:
            # Read every sheet in the workbook into its own DataFrame.
            excel_file = pd.ExcelFile(file_path)
            sheet_names = excel_file.sheet_names
            sheets = {
                name: pd.read_excel(excel_file, sheet_name=name)
                for name in sheet_names
            }
            return {
                "sheets": sheets,
                "sheet_names": sheet_names,
                "summary": self._create_summary(sheets),
                "error": None,
            }
        except Exception as e:
            # Corrupt/unsupported files surface as an error value, not a raise.
            return {"error": f"Error parsing spreadsheet: {str(e)}"}

    def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
        """Create a per-sheet summary: shape, column groups, and sample rows."""
        summary = {}

        for sheet_name, df in sheets_dict.items():
            summary[sheet_name] = {
                "shape": df.shape,
                "columns": df.columns.tolist(),
                "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
                "text_columns": df.select_dtypes(include=['object']).columns.tolist(),
                # Cast numpy.bool_ to a plain bool so the summary stays
                # JSON-serializable and `is True/False` checks behave.
                "has_nulls": bool(df.isnull().any().any()),
                "first_few_rows": df.head(3).to_dict('records')
            }

        return summary

    def query_data(self, data: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
        """
        Execute a query on the spreadsheet data based on instructions.

        Supported keywords (case-insensitive substring match): "sum",
        "average"/"mean", "count". A "data_structure" entry describing
        each sheet's columns and dtypes is always included.

        Args:
            data: The parsed spreadsheet data (from parse_spreadsheet)
            query_instructions: Instructions for querying the data (e.g., "Sum column A")

        Returns:
            Dictionary with query results and potential explanation
        """
        if data.get("error"):
            return {"error": data["error"]}

        try:
            # This is where you'd implement more sophisticated query logic.
            # For now, we implement some basic keyword-driven operations.
            sheets = data["sheets"]
            result = {}
            # Normalize once instead of lowercasing per comparison.
            query = query_instructions.lower()

            if "sum" in query:
                # Sum every numeric column of every sheet.
                for sheet_name, df in sheets.items():
                    numeric_cols = df.select_dtypes(include=[np.number]).columns
                    if not numeric_cols.empty:
                        result[f"{sheet_name}_sums"] = {
                            col: df[col].sum() for col in numeric_cols
                        }

            elif "average" in query or "mean" in query:
                for sheet_name, df in sheets.items():
                    numeric_cols = df.select_dtypes(include=[np.number]).columns
                    if not numeric_cols.empty:
                        result[f"{sheet_name}_averages"] = {
                            col: df[col].mean() for col in numeric_cols
                        }

            elif "count" in query:
                for sheet_name, df in sheets.items():
                    result[f"{sheet_name}_counts"] = {
                        "rows": len(df),
                        "non_null_counts": df.count().to_dict()
                    }

            # Add the raw data structure for more custom processing by the agent.
            result["data_structure"] = {
                sheet_name: {
                    "columns": df.columns.tolist(),
                    "dtypes": df.dtypes.astype(str).to_dict()
                } for sheet_name, df in sheets.items()
            }

            return result

        except Exception as e:
            return {"error": f"Error querying data: {str(e)}"}

    def extract_specific_data(self, data: Dict[str, Any],
                              sheet_name: Union[str, None] = None,
                              column_names: Union[List[str], None] = None,
                              row_indices: Union[List[int], None] = None) -> Dict[str, Any]:
        """
        Extract specific data from the spreadsheet.

        Args:
            data: The parsed spreadsheet data
            sheet_name: Name of the sheet to extract from (default: first sheet)
            column_names: List of column names to extract (default: all columns)
            row_indices: List of row indices to extract (default: all rows)

        Returns:
            Dictionary with extracted data ("data" as list of row dicts,
            "shape" as the extracted frame's (rows, cols)), or an "error" key.
        """
        if data.get("error"):
            return {"error": data["error"]}

        try:
            sheets = data["sheets"]

            # Default to the first sheet if not specified.
            if sheet_name is None:
                # Guard explicitly instead of letting [0] raise IndexError.
                if not data["sheet_names"]:
                    return {"error": "Spreadsheet contains no sheets"}
                sheet_name = data["sheet_names"][0]

            if sheet_name not in sheets:
                return {"error": f"Sheet '{sheet_name}' not found"}

            df = sheets[sheet_name]

            # Filter columns if specified.
            if column_names:
                # Check that all requested columns exist before slicing.
                missing_columns = [col for col in column_names if col not in df.columns]
                if missing_columns:
                    return {"error": f"Columns not found: {missing_columns}"}
                df = df[column_names]

            # Filter rows if specified.
            if row_indices:
                # Reject out-of-range (and negative) indices with a clear message.
                max_index = len(df) - 1
                invalid_indices = [i for i in row_indices if i < 0 or i > max_index]
                if invalid_indices:
                    return {"error": f"Row indices out of range: {invalid_indices}. Valid range: 0-{max_index}"}
                df = df.iloc[row_indices]

            return {
                "data": df.to_dict('records'),
                "shape": df.shape
            }

        except Exception as e:
            return {"error": f"Error extracting specific data: {str(e)}"}
182 |
+
|
183 |
+
|
184 |
+
# Example usage (if this script is run directly)
if __name__ == "__main__":
    # Build a small demo workbook on disk so the tool can be exercised.
    demo_dir = "spreadsheet_test"
    os.makedirs(demo_dir, exist_ok=True)

    sales_df = pd.DataFrame({
        'Product': ['Apple', 'Orange', 'Banana', 'Mango'],
        'Price': [1.2, 0.8, 0.5, 1.5],
        'Quantity': [100, 80, 200, 50],
        'Revenue': [120, 64, 100, 75],
    })
    demo_path = os.path.join(demo_dir, "test_spreadsheet.xlsx")

    # Write two sheets so multi-sheet parsing is covered.
    with pd.ExcelWriter(demo_path) as writer:
        sales_df.to_excel(writer, sheet_name='Sales', index=False)
        expenses_df = pd.DataFrame({
            'Month': ['Jan', 'Feb', 'Mar', 'Apr'],
            'Expenses': [50, 60, 55, 70],
        })
        expenses_df.to_excel(writer, sheet_name='Expenses', index=False)

    print(f"Created test spreadsheet at {demo_path}")

    tool = SpreadsheetTool()

    # Parse the workbook we just created.
    print("\nParsing spreadsheet...")
    parsed = tool.parse_spreadsheet(demo_path)

    if parsed.get("error"):
        print(f"Error: {parsed['error']}")
    else:
        print(f"Successfully parsed {len(parsed['sheet_names'])} sheets:")
        print(f"Sheet names: {parsed['sheet_names']}")

        # Peek at the first sheet's contents.
        first_name = parsed['sheet_names'][0]
        print(f"\nFirst few rows of '{first_name}':")
        print(parsed['sheets'][first_name].head())

        # Exercise the keyword-driven query path.
        print("\nQuerying data (sum operation)...")
        query_result = tool.query_data(parsed, "sum")
        print(f"Query result: {query_result}")

        # Exercise column-level extraction.
        print("\nExtracting specific data...")
        extract_result = tool.extract_specific_data(
            parsed,
            sheet_name='Sales',
            column_names=['Product', 'Revenue'],
        )
        print(f"Extracted data: {extract_result}")
tests/conftest.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration file for pytest.
|
3 |
+
This file modifies sys.path to allow imports from the parent directory.
|
4 |
+
"""
|
5 |
+
import sys
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Add the parent directory to sys.path
|
9 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
tests/test_spreadsheet_tool.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Testing the spreadsheet tool with a downloaded Excel file
|
3 |
+
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
|
7 |
+
# Add the parent directory to sys.path to find the src module
|
8 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
9 |
+
|
10 |
+
from src.spreadsheet_tool import SpreadsheetTool
|
11 |
+
|
12 |
+
def main():
    """Exercise SpreadsheetTool against a locally downloaded Excel file."""
    tool = SpreadsheetTool()

    # The workbook lives one level up from this test file, under downloaded_files/.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    excel_file_path = os.path.join(project_root, "downloaded_files", "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx")

    print(f"Testing SpreadsheetTool with file: {excel_file_path}")
    print(f"File exists: {os.path.exists(excel_file_path)}")

    # Step 1: parse the workbook.
    print("\n--- PARSING SPREADSHEET ---")
    parsed = tool.parse_spreadsheet(excel_file_path)

    if parsed.get("error"):
        print(f"Error: {parsed['error']}")
        return

    print(f"\nSpreadsheet contains {len(parsed['sheet_names'])} sheets:")
    print(f"Sheet names: {parsed['sheet_names']}")

    # Step 2: show each sheet's summary plus a small data sample.
    print("\n--- SHEET SUMMARIES ---")
    for name, info in parsed["summary"].items():
        print(f"\nSheet: {name}")
        print(f"  Dimensions: {info['shape'][0]} rows × {info['shape'][1]} columns")
        print(f"  Column names: {info['columns']}")
        print(f"  Numeric columns: {info['numeric_columns']}")
        print(f"  Text columns: {info['text_columns']}")
        print(f"  Contains null values: {info['has_nulls']}")

        print("\n  Sample data (first 3 rows):")
        for idx, row in enumerate(info['first_few_rows'], start=1):
            print(f"    Row {idx}: {row}")

    # Step 3: run each supported numeric query.
    print("\n--- TESTING QUERY OPERATIONS ---")
    for query in ("sum", "average", "count"):
        print(f"\nTesting '{query}' operation:")
        outcome = tool.query_data(parsed, query)

        if outcome.get("error"):
            print(f"  Error: {outcome['error']}")
        else:
            # Drop the verbose structural metadata to keep the output cleaner.
            outcome.pop("data_structure", None)
            print(f"  Result: {outcome}")

    # Step 4: extract the first two columns of the first sheet.
    print("\n--- TESTING DATA EXTRACTION ---")
    first_sheet = parsed["sheet_names"][0]
    all_columns = parsed["summary"][first_sheet]["columns"]

    if len(all_columns) >= 2:
        wanted = all_columns[:2]
        print(f"\nExtracting columns {wanted} from sheet '{first_sheet}':")
        extracted = tool.extract_specific_data(
            parsed,
            sheet_name=first_sheet,
            column_names=wanted,
        )

        if extracted.get("error"):
            print(f"  Error: {extracted['error']}")
        else:
            print(f"  Extracted data shape: {extracted['shape']}")
            print("  First few rows:")
            for idx, row in enumerate(extracted['data'][:3], start=1):
                print(f"    Row {idx}: {row}")

if __name__ == "__main__":
    main()