Spaces:
Sleeping
Sleeping
""" | |
Utility functions for working with the different file formats found in the resource directory (see RESOURCE_DIR).
""" | |
import os | |
import json | |
import pandas as pd | |
from typing import Dict, Any, Union, List, Optional | |
import logging | |
from PIL import Image | |
import base64 | |
from io import BytesIO | |
# Configure logging
# Executed at import time: requests INFO-level logging with a
# "timestamp - level - message" line format, then creates the module logger
# that every function in this file reports errors through.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Constants
# Absolute path of the folder named "resource" that sits next to this module
# file; the metadata helpers below look for "metadata.jsonl" inside it.
RESOURCE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource")
def list_resources() -> List[str]:
    """Return the names of the regular files directly inside RESOURCE_DIR.

    Entries that are not regular files (for example sub-directories) are
    left out. If the directory cannot be listed, the error is logged and an
    empty list is returned instead of raising.
    """
    file_names = []
    try:
        for entry in os.listdir(RESOURCE_DIR):
            candidate = os.path.join(RESOURCE_DIR, entry)
            if os.path.isfile(candidate):
                file_names.append(entry)
    except Exception as exc:
        logger.error(f"Error listing resources: {exc}")
        # Discard any partial listing: the result is all-or-nothing.
        return []
    return file_names
def load_excel(file_path: str) -> Union[pd.DataFrame, None]:
    """Read an Excel workbook into a DataFrame.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        Whatever ``pd.read_excel`` yields with its default options, or
        ``None`` when reading fails (the failure is logged, not raised).
    """
    try:
        frame = pd.read_excel(file_path)
    except Exception as exc:
        logger.error(f"Error reading Excel file {file_path}: {exc}")
        return None
    return frame
def load_csv(file_path: str) -> Union[pd.DataFrame, None]:
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        Whatever ``pd.read_csv`` yields with its default options, or
        ``None`` when reading fails (the failure is logged, not raised).
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as exc:
        logger.error(f"Error reading CSV file {file_path}: {exc}")
        return None
    return frame
def load_text(file_path: str) -> Union[str, None]:
    """Read an entire text file, decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full file content as one string, or ``None`` when the file
        cannot be opened or decoded (the failure is logged, not raised).
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as exc:
        logger.error(f"Error reading text file {file_path}: {exc}")
        return None
    return contents
def load_json(file_path: str) -> Union[Dict, List, None]:
    """Parse a UTF-8 encoded JSON file.

    Args:
        file_path: Path of the JSON document to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be read or
        is not valid JSON (the failure is logged, not raised).
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            raw_text = handle.read()
        parsed = json.loads(raw_text)
    except Exception as exc:
        logger.error(f"Error reading JSON file {file_path}: {exc}")
        return None
    return parsed
def load_image(file_path: str) -> Union[str, None]:
    """Encode an image file as a base64 ``data:`` URI.

    The image is opened with PIL and saved again, in the format PIL
    detected, into an in-memory buffer; the buffer is then base64-encoded.

    Args:
        file_path: Path of the image to read.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>`` where
        ``<format>`` is the lower-cased PIL format name, or ``None`` when
        the image cannot be opened or re-encoded (the failure is logged).
    """
    try:
        with Image.open(file_path) as picture:
            detected_format = picture.format
            buffer = BytesIO()
            picture.save(buffer, format=detected_format)
            payload = base64.b64encode(buffer.getvalue()).decode()
            return f"data:image/{detected_format.lower()};base64,{payload}"
    except Exception as exc:
        logger.error(f"Error reading image file {file_path}: {exc}")
        return None
def get_file_handler(file_path: str) -> Union[Any, None]:
    """Load a file with the loader that matches its extension.

    The extension is compared case-insensitively. Despite the name, the
    return value is the loaded content, not the loader function.

    Args:
        file_path: Path of the file to load.

    Returns:
        The result of the matching ``load_*`` function, or ``None`` when the
        path does not exist or the extension has no loader (both are logged).
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Extension -> loader dispatch table.
    loaders_by_suffix = {
        '.xlsx': load_excel,
        '.xls': load_excel,
        '.csv': load_csv,
        '.txt': load_text,
        '.md': load_text,
        '.py': load_text,
        '.json': load_json,
        '.jsonld': load_json,
        '.jpg': load_image,
        '.jpeg': load_image,
        '.png': load_image,
        '.gif': load_image,
    }

    loader = loaders_by_suffix.get(suffix)
    if loader is None:
        logger.warning(f"No handler for file type {suffix}")
        return None
    return loader(file_path)
def search_metadata_by_question(question: str) -> List[Dict]:
    """Search RESOURCE_DIR/metadata.jsonl for records related to a question.

    A record is returned when either of these holds:

    * Text match: the lower-cased query contains the record's ``Question``
      text or is contained in it. Blank text on either side never matches,
      because an empty string is a substring of everything and would
      otherwise select every record.
    * File match: the query mentions ``attached`` or ``spreadsheet`` and the
      record has a non-empty ``file_name``.

    Each record appears at most once, in file order, even when both rules
    apply. Blank lines are ignored; malformed or non-object JSON lines are
    skipped so that one bad line does not truncate the search.

    Args:
        question: Free-text question to look up.

    Returns:
        The matching metadata records (possibly empty). Errors opening or
        reading the file are logged, and whatever was collected so far is
        returned.
    """
    results: List[Dict] = []
    metadata_path = os.path.join(RESOURCE_DIR, "metadata.jsonl")

    # Loop-invariant values derived from the query only.
    query = (question or "").lower()
    has_query = bool(query.strip())
    wants_file = 'attached' in query or 'spreadsheet' in query

    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed metadata line {line_no}: {e}")
                    continue
                if not isinstance(data, dict):
                    continue  # only JSON objects carry the expected keys

                # `or ''` also covers an explicit JSON null.
                metadata_question = str(data.get('Question') or '').lower()
                text_match = (
                    has_query
                    and bool(metadata_question.strip())
                    and (query in metadata_question or metadata_question in query)
                )
                file_match = wants_file and bool(data.get('file_name'))

                # Single append: a record matching both rules is not duplicated.
                if text_match or file_match:
                    results.append(data)
    except Exception as e:
        logger.error(f"Error searching metadata: {e}")
    return results
def get_metadata_answer(task_id: str) -> str:
    """Look up the ``Final answer`` of a task in RESOURCE_DIR/metadata.jsonl.

    The file is scanned line by line and the first record whose ``task_id``
    equals the argument wins. Blank lines are ignored; malformed or
    non-object JSON lines are skipped so that a bad line ahead of the
    wanted record does not hide it.

    Args:
        task_id: Identifier to compare against each record's ``task_id``.

    Returns:
        The record's ``Final answer`` value, or ``""`` when the task is not
        found, the answer is missing or null, or the file cannot be read
        (read errors are logged, not raised).
    """
    metadata_path = os.path.join(RESOURCE_DIR, "metadata.jsonl")
    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed metadata line {line_no}: {e}")
                    continue
                if not isinstance(data, dict):
                    continue  # only JSON objects carry the expected keys
                if data.get('task_id') == task_id:
                    answer = data.get('Final answer', '')
                    # Keep the declared str contract when the value is null.
                    return '' if answer is None else answer
    except Exception as e:
        logger.error(f"Error getting metadata answer: {e}")
    return ""