Spaces:
Sleeping
Sleeping
File size: 4,759 Bytes
922f271 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
"""
Utility functions for loading files of different formats and for querying the metadata file in the resource directory
"""
import os
import json
import pandas as pd
from typing import Dict, Any, Union, List, Optional
import logging
from PIL import Image
import base64
from io import BytesIO
# Set up logging once at import time: INFO level, timestamped messages
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Directory named "resource" that sits next to this module
RESOURCE_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "resource",
)
def list_resources(resource_dir: Optional[str] = None) -> List[str]:
    """List the names of the regular files directly inside a directory.

    Args:
        resource_dir: Directory to scan. When omitted, the module-level
            ``RESOURCE_DIR`` is used, so existing zero-argument calls keep working.

    Returns:
        The file names (not full paths) in sorted order; sub-directories are left
        out. If the directory cannot be read, the error is logged and an empty
        list is returned.
    """
    try:
        target_dir = RESOURCE_DIR if resource_dir is None else resource_dir
        names = os.listdir(target_dir)
        # os.listdir gives no ordering guarantee; sort so callers get stable output
        return sorted(
            name for name in names
            if os.path.isfile(os.path.join(target_dir, name))
        )
    except Exception as e:
        logger.error(f"Error listing resources: {e}")
        return []
def load_excel(file_path: str) -> Union[pd.DataFrame, None]:
    """Read an Excel file into a DataFrame.

    Returns whatever ``pd.read_excel`` produces for ``file_path``. If reading
    raises, the error is logged and ``None`` is returned instead.
    """
    try:
        frame = pd.read_excel(file_path)
    except Exception as err:
        logger.error(f"Error reading Excel file {file_path}: {err}")
        return None
    return frame
def load_csv(file_path: str) -> Union[pd.DataFrame, None]:
    """Read a CSV file into a DataFrame.

    Returns whatever ``pd.read_csv`` produces for ``file_path``. If reading
    raises, the error is logged and ``None`` is returned instead.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        logger.error(f"Error reading CSV file {file_path}: {err}")
        return None
    return frame
def load_text(file_path: str) -> Union[str, None]:
    """Read a whole text file as a UTF-8 string.

    Returns the file's full contents. If opening or decoding raises, the error
    is logged and ``None`` is returned instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as err:
        logger.error(f"Error reading text file {file_path}: {err}")
        return None
    return contents
def load_json(file_path: str) -> Union[Dict, List, None]:
    """Parse a UTF-8 JSON file.

    Returns the decoded JSON document. If opening or parsing raises, the error
    is logged and ``None`` is returned instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        logger.error(f"Error reading JSON file {file_path}: {err}")
        return None
    return parsed
def load_image(file_path: str) -> Union[str, None]:
    """Load an image file and return it as a base64 ``data:`` URI.

    PIL is used to validate the file and detect its format, but the payload is
    built from the file's original bytes. Re-saving the image through PIL would
    re-compress lossy formats and keep only the first frame of an animated GIF.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>``, where
        ``<format>`` is PIL's detected format name in lower case. If the file
        cannot be opened, decoded or read, the error is logged and ``None`` is
        returned.
    """
    try:
        with Image.open(file_path) as img:
            # Image.open is lazy; force a decode so corrupt files are rejected here
            img.load()
            image_format = img.format
        with open(file_path, 'rb') as f:
            raw_bytes = f.read()
        img_str = base64.b64encode(raw_bytes).decode()
        return f"data:image/{image_format.lower()};base64,{img_str}"
    except Exception as e:
        logger.error(f"Error reading image file {file_path}: {e}")
        return None
def get_file_handler(file_path: str) -> Union[Any, None]:
    """Load a file with the loader that matches its extension.

    Despite the name, this returns the loaded content rather than a loader
    function. The extension is compared case-insensitively.

    Returns:
        The result of the matching ``load_*`` function, or ``None`` when the
        path does not exist (logged as an error) or no loader is registered
        for the extension (logged as a warning).
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None

    # Extension groups paired with the loader that handles them
    loaders_by_extension = (
        (('.xlsx', '.xls'), load_excel),
        (('.csv',), load_csv),
        (('.txt', '.md', '.py'), load_text),
        (('.json', '.jsonld'), load_json),
        (('.jpg', '.jpeg', '.png', '.gif'), load_image),
    )

    ext = os.path.splitext(file_path)[1].lower()
    for extensions, loader in loaders_by_extension:
        if ext in extensions:
            return loader(file_path)

    logger.warning(f"No handler for file type {ext}")
    return None
def search_metadata_by_question(question: str, metadata_path: Optional[str] = None) -> List[Dict]:
    """Search a JSON-lines metadata file for entries related to a question.

    An entry is returned when either of these holds:

    * its ``Question`` text and the query are both non-empty and one contains
      the other (case-insensitive, surrounding whitespace ignored), or
    * the query mentions "attached" or "spreadsheet" and the entry has a
      non-empty ``file_name``.

    Each entry appears at most once, in file order. Blank lines are skipped, and
    a line that is not valid JSON is logged and skipped rather than aborting the
    scan.

    Args:
        question: The question text to look for.
        metadata_path: Path of the metadata file. When omitted,
            ``metadata.jsonl`` inside ``RESOURCE_DIR`` is used.

    Returns:
        The matching entries as dicts. If the file cannot be read, the error is
        logged and whatever was collected so far (possibly nothing) is returned.
    """
    results: List[Dict] = []
    try:
        if metadata_path is None:
            metadata_path = os.path.join(RESOURCE_DIR, "metadata.jsonl")
        query = question.lower().strip()
        # A query about an attachment also pulls in every entry that has a file
        wants_file_entries = 'attached' in query or 'spreadsheet' in query
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed metadata line {line_number}: {e}")
                    continue
                if not isinstance(data, dict):
                    continue
                raw_question = data.get('Question')
                if isinstance(raw_question, str):
                    metadata_question = raw_question.lower().strip()
                else:
                    metadata_question = ''
                # '' is a substring of every string, so an empty side must never match
                text_match = bool(query) and bool(metadata_question) and (
                    query in metadata_question or metadata_question in query
                )
                file_match = wants_file_entries and bool(data.get('file_name'))
                if text_match or file_match:
                    results.append(data)
    except Exception as e:
        logger.error(f"Error searching metadata: {e}")
    return results
def get_metadata_answer(task_id: str, metadata_path: Optional[str] = None) -> str:
    """Get the final answer recorded for a task ID in a JSON-lines metadata file.

    The file is scanned line by line and the first entry whose ``task_id``
    equals ``task_id`` wins. Blank lines are skipped, and a line that is not
    valid JSON is logged and skipped, so one bad line does not hide later
    entries.

    Args:
        task_id: Identifier of the task to look up.
        metadata_path: Path of the metadata file. When omitted,
            ``metadata.jsonl`` inside ``RESOURCE_DIR`` is used.

    Returns:
        The entry's ``Final answer`` value, or an empty string when the task is
        not found, the entry has no answer (missing or null), or the file cannot
        be read (the error is logged).
    """
    try:
        if metadata_path is None:
            metadata_path = os.path.join(RESOURCE_DIR, "metadata.jsonl")
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed metadata line {line_number}: {e}")
                    continue
                if isinstance(data, dict) and data.get('task_id') == task_id:
                    answer = data.get('Final answer')
                    # A JSON null must not leak out of a function declared to return str
                    return '' if answer is None else answer
    except Exception as e:
        logger.error(f"Error getting metadata answer: {e}")
    return ""
|