Yago Bolivar
Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.
87aa741
import os
import pandas as pd
from typing import Dict, List, Union, Tuple, Any, Optional
import numpy as np
from smolagents.tools import Tool


class SpreadsheetTool(Tool):
    """
    Parses spreadsheet files (e.g., .xlsx, .xls, .csv) and extracts tabular data for analysis or allows querying.
    Useful for reading, processing, and converting spreadsheet content to Python data structures.
    """
    name = "spreadsheet_processor"
    description = "Parses a spreadsheet file (e.g., .xlsx, .xls, .csv) and can perform queries. Returns extracted data or query results."
    inputs = {
        'file_path': {'type': 'string', 'description': 'Path to the spreadsheet file.'},
        'query_instructions': {'type': 'string', 'description': 'Optional. Instructions for querying the data (e.g., "Sum column A"). If None, parses the whole sheet.', 'nullable': True}
    }
    outputs = {'result': {'type': 'object', 'description': 'A dictionary containing parsed sheet data, query results, or an error message.'}}
    output_type = "object"

    def __init__(self, *args, **kwargs):
        """Initialize the SpreadsheetTool."""
        super().__init__(*args, **kwargs)
        self.is_initialized = True
    # Main entry point for the agent
    def forward(self, file_path: str, query_instructions: Optional[str] = None) -> Dict[str, Any]:
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}"}

        # Determine file type for appropriate parsing
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        parsed_data = None
        if file_extension in ['.xlsx', '.xls']:
            parsed_data = self._parse_excel(file_path)
        elif file_extension == '.csv':
            parsed_data = self._parse_csv(file_path)
        else:
            return {"error": f"Unsupported file type: {file_extension}. Supported types: .xlsx, .xls, .csv"}

        if parsed_data.get("error"):
            return parsed_data  # Return error from parsing step

        if query_instructions:
            return self._query_data(parsed_data, query_instructions)
        else:
            # If no query, return the parsed data and summary
            return {
                "parsed_sheets": parsed_data.get("sheets"),
                "summary": parsed_data.get("summary"),
                "message": "Spreadsheet parsed successfully."
            }
    def _parse_excel(self, file_path: str) -> Dict[str, Any]:
        """Parse an Excel spreadsheet and extract useful information."""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheet_names = excel_file.sheet_names
            sheets = {}
            for sheet_name in sheet_names:
                sheets[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": sheet_names, "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing Excel spreadsheet: {str(e)}"}
    def _parse_csv(self, file_path: str) -> Dict[str, Any]:
        """Parse a CSV file."""
        try:
            df = pd.read_csv(file_path)
            # CSVs don't have multiple sheets, so we adapt the structure
            sheet_name = os.path.splitext(os.path.basename(file_path))[0]
            sheets = {sheet_name: df}
            summary = self._create_summary(sheets)
            return {"sheets": sheets, "sheet_names": [sheet_name], "summary": summary, "error": None}
        except Exception as e:
            return {"error": f"Error parsing CSV file: {str(e)}"}
    def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
        """Create a summary of the spreadsheet contents."""
        summary = {}
        for sheet_name, df in sheets_dict.items():
            summary[sheet_name] = {
                "shape": df.shape,
                "columns": df.columns.tolist(),
                "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
                "text_columns": df.select_dtypes(include=['object']).columns.tolist(),
                "has_nulls": df.isnull().any().any(),
                "first_few_rows": df.head(3).to_dict('records')
            }
        return summary
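
    # Illustrative output (not executed): for the dummy Excel workbook built in the
    # __main__ block below, the summary entry for 'Sheet1' would look roughly like:
    #   {"Sheet1": {"shape": (5, 3),
    #               "columns": ["colA", "colB", "colC"],
    #               "numeric_columns": ["colA", "colC"],
    #               "text_columns": ["colB"],
    #               "has_nulls": False,
    #               "first_few_rows": [{"colA": 1, "colB": "apple", "colC": 10.1}, ...]}}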
    # Renamed from query_data to _query_data and adjusted arguments
    def _query_data(self, parsed_data_dict: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
        """
        Execute a query on the spreadsheet data based on instructions.
        This is a simplified placeholder. A real implementation would need robust query parsing.
        """
        if parsed_data_dict.get("error"):
            return {"error": parsed_data_dict["error"]}

        sheets = parsed_data_dict.get("sheets")
        if not sheets:
            return {"error": "No sheets data available for querying."}

        # Placeholder for actual query logic.
        # This would involve parsing `query_instructions` (e.g., using regex, NLP, or a DSL)
        # and applying pandas operations; a hedged sketch of one such approach follows
        # after this class definition.
        # For now, return a message indicating the query was received, plus basic info.
        results = {}
        explanation = f"Query instruction received: '{query_instructions}'. Advanced query execution is not fully implemented. " \
                      f"Returning summary of available sheets: {list(sheets.keys())}."

        # Example: if the query asks for a sum, try to sum the first numeric column of the first sheet.
        if "sum" in query_instructions.lower():
            first_sheet_name = next(iter(sheets))
            df = sheets[first_sheet_name]
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if not numeric_cols.empty:
                col_to_sum = numeric_cols[0]
                try:
                    total_sum = df[col_to_sum].sum()
                    results[f'{first_sheet_name}_{col_to_sum}_sum'] = total_sum
                    explanation += f" Example sum of column '{col_to_sum}' in sheet '{first_sheet_name}': {total_sum}."
                except Exception as e:
                    explanation += f" Could not perform example sum: {e}."
            else:
                explanation += " No numeric columns found for example sum."

        return {"query_results": results, "explanation": explanation, "original_query": query_instructions}
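
# --- Hypothetical extension (illustration only; not used by SpreadsheetTool) ---
# The placeholder in `_query_data` notes that a real implementation would parse
# `query_instructions` and map them onto pandas operations. The helper below is a
# minimal sketch of one possible approach, assuming queries shaped like
# "<aggregation> <column> [from <sheet>]" (e.g., "sum colA from Sheet1"). The
# function name and the supported grammar are assumptions, not part of the tool above.
def _sketch_parse_and_run_query(sheets: Dict[str, pd.DataFrame], query: str) -> Dict[str, Any]:
    """Run a simple '<agg> <column> [from <sheet>]' query against parsed sheets."""
    import re

    # Map query keywords onto pandas Series aggregation method names.
    supported = {"sum": "sum", "mean": "mean", "average": "mean",
                 "min": "min", "max": "max", "count": "count"}
    match = re.match(r"^\s*(\w+)\s+(\S+)(?:\s+from\s+(\S+))?\s*$", query, flags=re.IGNORECASE)
    if not match:
        return {"error": f"Could not parse query: '{query}'"}

    agg_word, column, sheet_name = match.groups()
    agg = supported.get(agg_word.lower())
    if agg is None:
        return {"error": f"Unsupported aggregation: '{agg_word}'"}

    # Default to the first sheet when the query does not name one.
    sheet_name = sheet_name or next(iter(sheets))
    if sheet_name not in sheets:
        return {"error": f"Unknown sheet: '{sheet_name}'"}
    df = sheets[sheet_name]
    if column not in df.columns:
        return {"error": f"Unknown column '{column}' in sheet '{sheet_name}'"}

    value = getattr(df[column], agg)()
    return {"query_results": {f"{sheet_name}_{column}_{agg}": value}, "error": None}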

# Example usage (for direct testing)
if __name__ == '__main__':
    tool = SpreadsheetTool()

    # Create dummy files for testing
    dummy_excel_file = "dummy_test.xlsx"
    dummy_csv_file = "dummy_test.csv"

    # Create a dummy Excel file
    df_excel = pd.DataFrame({
        'colA': [1, 2, 3, 4, 5],
        'colB': ['apple', 'banana', 'cherry', 'date', 'elderberry'],
        'colC': [10.1, 20.2, 30.3, 40.4, 50.5]
    })
    with pd.ExcelWriter(dummy_excel_file) as writer:
        df_excel.to_excel(writer, sheet_name='Sheet1', index=False)
        df_excel.head(2).to_excel(writer, sheet_name='Sheet2', index=False)

    # Create a dummy CSV file
    df_csv = pd.DataFrame({
        'id': [101, 102, 103],
        'product': ['widget', 'gadget', 'gizmo'],
        'price': [19.99, 29.50, 15.00]
    })
    df_csv.to_csv(dummy_csv_file, index=False)

    print("--- Test 1: Parse Excel file (no query) ---")
    result1 = tool.forward(file_path=dummy_excel_file)
    print(result1)
    assert "error" not in result1 or result1["error"] is None
    assert "Sheet1" in result1["parsed_sheets"]

    print("\n--- Test 2: Parse CSV file (no query) ---")
    result2 = tool.forward(file_path=dummy_csv_file)
    print(result2)
    assert "error" not in result2 or result2["error"] is None
    assert dummy_csv_file.split('.')[0] in result2["parsed_sheets"]

    print("\n--- Test 3: Query Excel file (simple sum example) ---")
    result3 = tool.forward(file_path=dummy_excel_file, query_instructions="sum colA from Sheet1")
    print(result3)
    assert "error" not in result3 or result3["error"] is None
    assert "query_results" in result3
    if result3.get("query_results"):
        assert "Sheet1_colA_sum" in result3["query_results"]
        assert result3["query_results"]["Sheet1_colA_sum"] == 15

    print("\n--- Test 4: File not found ---")
    result4 = tool.forward(file_path="non_existent_file.xlsx")
    print(result4)
    assert result4["error"] is not None
    assert "File not found" in result4["error"]

    print("\n--- Test 5: Unsupported file type ---")
    dummy_txt_file = "dummy_test.txt"
    with open(dummy_txt_file, "w") as f:
        f.write("hello")
    result5 = tool.forward(file_path=dummy_txt_file)
    print(result5)
    assert result5["error"] is not None
    assert "Unsupported file type" in result5["error"]
    os.remove(dummy_txt_file)

    # Clean up dummy files
    if os.path.exists(dummy_excel_file):
        os.remove(dummy_excel_file)
    if os.path.exists(dummy_csv_file):
        os.remove(dummy_csv_file)

    print("\nSpreadsheetTool tests completed.")
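
# Integration note: the Tool base class imported above comes from smolagents, so an
# instance of SpreadsheetTool would typically be handed to an agent's tool list,
# roughly along the lines of CodeAgent(tools=[SpreadsheetTool()], model=...). The
# exact class and argument names depend on the installed smolagents version, so
# treat that call shape as an assumption to verify, not a documented API.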