# Uploaded by ppsingh ("Upload 6 files", commit 137c471, verified)
import configparser
import logging
import os
import ast
import re
from dotenv import load_dotenv
# Load environment variables from the local .env file
load_dotenv()
def getconfig(configfile_path: str):
    """
    Read the config file

    Params
    ----------------
    configfile_path: file path of .cfg file

    Returns
    ----------------
    The populated ``configparser.ConfigParser`` on success. If the file cannot
    be opened or parsed, a warning is logged and ``None`` is returned.
    """
    config = configparser.ConfigParser()
    try:
        # The context manager closes the handle even when parsing fails.
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except (configparser.Error, UnicodeDecodeError) as exc:
        # The file exists but its content is not a valid config file.
        logging.warning("config file could not be parsed: %s", exc)
        return None
    except (OSError, TypeError, ValueError):
        # Missing/unreadable file, or a path value that open() rejects.
        logging.warning("config file not found")
        return None
    return config
def get_auth(provider: str) -> dict:
    """
    Look up the authentication settings for a provider.

    The provider name is matched case-insensitively. The API key is read
    from the provider's environment variable; when it is unset or empty a
    warning is logged and the key is reported as ``None``.

    Params
    ----------------
    provider: provider name, one of "huggingface" or "qdrant"

    Returns
    ----------------
    dict with a single "api_key" entry (the key string, or ``None``)

    Raises
    ----------------
    ValueError: if the provider is not one of the supported names
    """
    # Environment variable holding the API key of each supported provider.
    env_var_by_provider = {
        "huggingface": "HF_TOKEN",
        "qdrant": "QDRANT_API_KEY",
    }

    provider = provider.lower()  # Normalize to lowercase
    env_var = env_var_by_provider.get(provider)
    if env_var is None:
        raise ValueError(f"Unsupported provider: {provider}")

    token = os.getenv(env_var)
    if token:
        return {"api_key": token}

    logging.warning(f"No API key found for provider '{provider}'. Please set the appropriate environment variable.")
    return {"api_key": None}
def _extract_text_from_nested(obj) -> str:
    """Flatten a parsed literal into a single space-joined string of its text."""
    if isinstance(obj, list):
        # Recurse into each element and keep only the non-blank results.
        parts = (_extract_text_from_nested(item) for item in obj)
        return ' '.join(part for part in parts if part and part.strip())
    if isinstance(obj, str):
        return obj.strip()
    if isinstance(obj, dict):
        # Only string values are kept; nested values inside dicts are ignored.
        return ' '.join(
            f"{key}: {value}"
            for key, value in obj.items()
            if isinstance(value, str) and value.strip()
        )
    # Numbers, tuples, None, ... carry no text.
    return ''


def process_content(content: str) -> str:
    """
    Process and clean malformed content that may contain stringified nested lists.
    The test DB on qdrant somehow got a bit malformed in the processing - but probably good to have this anyway

    Args:
        content: Raw content from vector store

    Returns:
        Cleaned, readable text content. Falsy input is returned unchanged.
        Content that parses as a list but holds no text is returned in its
        original form; anything else is whitespace-normalized.
    """
    if not content:
        return content

    # Check if content looks like a stringified list/nested structure
    content_stripped = content.strip()
    if content_stripped.startswith('[') and content_stripped.endswith(']'):
        try:
            # Parse as literal list structure (literals only, no code execution)
            parsed_content = ast.literal_eval(content_stripped)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError) as e:
            # literal_eval may raise any of these on malformed input (e.g.
            # TypeError for an unhashable dict key); treat it as plain text.
            logging.debug(f"Content looks like list but failed to parse: {e}")
        else:
            if isinstance(parsed_content, list):
                extracted_text = _extract_text_from_nested(parsed_content)
                if extracted_text.strip():
                    # Clean up extra whitespace and format nicely
                    cleaned_text = re.sub(r'\s+', ' ', extracted_text).strip()
                    logging.debug(f"Successfully processed nested list content: {len(cleaned_text)} chars")
                    return cleaned_text
                logging.warning("Parsed list content but no meaningful text found")
                return content  # Return original if no meaningful text extracted

    # For regular text content, just clean up whitespace
    return re.sub(r'\s+', ' ', content).strip()