File size: 4,014 Bytes
137c471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import configparser
import logging
import os
import ast
import re
from dotenv import load_dotenv

# Load settings from a local .env file at import time so that the os.getenv()
# lookups further down (e.g. HF_TOKEN, QDRANT_API_KEY) can pick them up.
# NOTE(review): presumably a no-op when no .env file exists - confirm against
# the python-dotenv documentation.
load_dotenv()

def getconfig(configfile_path: str):
    """
    Read the config file
    Params
    ----------------
    configfile_path: file path of .cfg file

    Returns
    ----------------
    The populated ``configparser.ConfigParser`` on success, or ``None`` when
    the file cannot be opened or parsed (a warning is logged in both cases).
    """
    config = configparser.ConfigParser()
    try:
        # The context manager guarantees the handle is closed even when
        # parsing fails part-way through the file.
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except (configparser.Error, UnicodeDecodeError) as err:
        # The file exists but is not a valid/decodable INI document; report
        # that explicitly instead of claiming the file is missing.
        logging.warning("config file could not be parsed: %s", err)
        return None
    except (OSError, TypeError, ValueError):
        # OSError covers missing/unreadable files; TypeError/ValueError cover
        # unusable path arguments (e.g. None), which were also tolerated by
        # the previous best-effort behaviour.
        logging.warning("config file not found")
        return None
    return config


def get_auth(provider: str) -> dict:
    """
    Build the authentication settings for a named provider.

    The provider name is matched case-insensitively. The API key is read from
    the provider's environment variable; when that variable is unset or empty
    a warning is logged and the key is reported as ``None``.

    Args:
        provider: Provider name ("huggingface" or "qdrant").

    Returns:
        A dict of the form ``{"api_key": <str or None>}``.

    Raises:
        ValueError: If the provider is not one of the supported names.
    """
    # Environment variable that holds the API key for each supported provider
    env_var_by_provider = {
        "huggingface": "HF_TOKEN",
        "qdrant": "QDRANT_API_KEY",
    }

    provider = provider.lower()  # case-insensitive lookup
    env_var = env_var_by_provider.get(provider)
    if env_var is None:
        raise ValueError(f"Unsupported provider: {provider}")

    token = os.getenv(env_var)
    if token:
        return {"api_key": token}

    # Unset and empty values are both treated as "no key"
    logging.warning(f"No API key found for provider '{provider}'. Please set the appropriate environment variable.")
    return {"api_key": None}


def _extract_text_from_nested(obj) -> str:
    """Flatten a parsed literal (nested lists, strings, dicts) into one string.

    Lists are walked recursively and their non-blank pieces joined with single
    spaces. Strings are stripped. Dicts contribute "key: value" for each
    non-blank string value. Any other type contributes nothing.
    """
    if isinstance(obj, list):
        pieces = (_extract_text_from_nested(item) for item in obj)
        return ' '.join(piece for piece in pieces if piece and piece.strip())
    if isinstance(obj, str):
        return obj.strip()
    if isinstance(obj, dict):
        return ' '.join(
            f"{key}: {value}"
            for key, value in obj.items()
            if isinstance(value, str) and value.strip()
        )
    return ''


def process_content(content: str) -> str:
    """
    Process and clean malformed content that may contain stringified nested lists.
    The test DB on qdrant somehow got a bit malformed in the processing - but probably good to have this anyway

    Args:
        content: Raw content from vector store

    Returns:
        Cleaned, readable text content. Falsy input is returned unchanged.
        If the content parses as a list but yields no text, the original
        content is returned untouched. Otherwise whitespace runs are collapsed
        to single spaces and the result is stripped.
    """
    if not content:
        return content

    # Check if content looks like a stringified list/nested structure
    content_stripped = content.strip()
    if content_stripped.startswith('[') and content_stripped.endswith(']'):
        try:
            # literal_eval never executes code, but per the stdlib docs it can
            # raise TypeError (e.g. an unhashable dict key), MemoryError or
            # RecursionError on malformed input, in addition to
            # ValueError/SyntaxError. All of them mean "not a usable literal".
            parsed_content = ast.literal_eval(content_stripped)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            logging.debug("Content looks like list but failed to parse: %s", e)
            # Fall through to the plain whitespace cleanup below
        else:
            if isinstance(parsed_content, list):
                extracted_text = _extract_text_from_nested(parsed_content)
                # Clean up extra whitespace and format nicely
                cleaned_text = re.sub(r'\s+', ' ', extracted_text).strip()
                if cleaned_text:
                    logging.debug(
                        "Successfully processed nested list content: %d chars",
                        len(cleaned_text),
                    )
                    return cleaned_text
                logging.warning("Parsed list content but no meaningful text found")
                return content  # Return original if no meaningful text extracted

    # For regular text content, just clean up whitespace
    return re.sub(r'\s+', ' ', content).strip()