import re
import json
from typing import Dict, Any, List, Optional

# Code points that only occur as (part of) an emoji or pictographic symbol.
# The blocks are listed individually on purpose: a single wide range such as
# U+24C2..U+1F251 also spans CJK, Kana, Hangul and many other scripts, which
# would make short non-Latin queries look like "emoji only".
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed chars, flags
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002934\U00002935"   # curved arrows
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0000200D"             # zero-width joiner used in emoji sequences
    "]+"
)


def is_valid_query(query: str) -> bool:
    """
    Validates if a search query is legitimate.

    All checks are applied to the query with surrounding whitespace removed,
    so padding never changes the verdict. A query is rejected when it is:

    * empty or whitespace only;
    * at most 5 characters long and made up solely of emoji;
    * nothing but a run of 5 or more digits;
    * written in ASCII letters only, with more than 10 of them and not a
      single vowel (a cheap gibberish heuristic).

    Args:
        query: The search query to validate

    Returns:
        Boolean indicating if the query is valid
    """
    # Reject empty queries
    if not query:
        return False
    text = query.strip()
    if not text:
        return False

    # Reject single emoji / very short emoji-only queries
    if len(text) <= 5 and not _EMOJI_PATTERN.sub("", text).strip():
        return False

    # Reject random numbers only (at least 5 digits with no context)
    if re.fullmatch(r'\d{5,}', text):
        return False

    # Reject gibberish: many Latin letters but no vowel at all. The heuristic
    # only knows Latin vowels, so it counts ASCII letters rather than raw
    # length (digits and punctuation are not gibberish) and is skipped when
    # the query contains letters from any other alphabet.
    ascii_letters = re.sub(r'[^A-Za-z]', '', text)
    has_other_letters = any(ord(ch) > 127 and ch.isalpha() for ch in text)
    if (
        len(ascii_letters) > 10
        and not has_other_letters
        and not re.search(r'[aeiouAEIOU]', ascii_letters)
    ):
        return False

    return True

def _relevance_score(analysis: Dict[str, Any]) -> float:
    """Return the numeric relevance score of one analysis entry, or 0.0 if missing or unparsable."""
    try:
        return float(analysis.get("relevance_score") or 0)
    except (TypeError, ValueError):
        return 0.0


def format_research_results(search_results: List[Dict[str, Any]], 
                           scraped_contents: Dict[str, str],
                           analyzed_contents: Dict[str, Dict[str, Any]],
                           min_relevance: float = 5) -> str:
    """
    Formats research results into a readable response with citations.

    Only analysis entries whose "relevance_score" is at least
    ``min_relevance`` and whose "filtered_content" is non-empty are used.
    Each of them becomes one paragraph followed by its citation marker, and
    markers are numbered consecutively in the iteration order of
    ``analyzed_contents``, so every listed source is cited in the text.

    Args:
        search_results: The list of search result items (not used by the
            formatting itself; kept for interface compatibility)
        scraped_contents: Dict mapping URLs to scraped content (not used by
            the formatting itself; kept for interface compatibility)
        analyzed_contents: Dict mapping URLs to analysis results
        min_relevance: Minimum relevance score an entry needs to be included

    Returns:
        Formatted response with citations, or a fixed "nothing found"
        message when no entry provides relevant content
    """
    response_parts: List[str] = []
    citations: List[str] = []

    for url, data in (analyzed_contents or {}).items():
        # Malformed entries cannot contribute content; skip instead of crashing.
        if not isinstance(data, dict):
            continue
        # Written as "not >=" so a NaN score is excluded as well.
        if not _relevance_score(data) >= min_relevance:
            continue

        filtered_content = str(data.get("filtered_content") or "").strip()
        if not filtered_content:
            # Nothing to quote: issuing a number here would leave a gap in
            # the inline markers and list a source the text never cites.
            continue

        number = len(citations) + 1
        citations.append(f"[{number}] {url}")
        response_parts.append(f"{filtered_content} [{number}]")

    # No relevant results (or none with any usable content)
    if not response_parts:
        return "I couldn't find relevant information for your query. Could you try rephrasing or providing more details?"

    # Combine everything
    response = "\n\n".join(response_parts)
    citation_text = "\n".join(citations)

    return f"{response}\n\nSources:\n{citation_text}"

def extract_citations(text: str, max_context: int = 100) -> List[Dict[str, str]]:
    """
    Extract citations from formatted text.

    Every ``[<digits>]`` marker yields one entry holding the marker number
    (as a string) and the text immediately preceding it. At most
    ``max_context`` characters before the marker are considered; they are
    whitespace-stripped, and "..." is prepended when earlier text was cut off.

    Args:
        text: Text with citation markers like [1], [2], etc.
        max_context: Maximum number of characters taken from before a marker

    Returns:
        List of citation objects with citation number and referenced text
    """
    if not text:
        return []

    window = max(0, max_context)
    citations = []

    for match in re.finditer(r'\[(\d+)\]', text):
        marker_pos = match.start()
        start_pos = max(0, marker_pos - window)
        cited_text = text[start_pos:marker_pos].strip()

        # Truncation is decided by where the window starts, not by the length
        # of the stripped snippet: stripping can shorten a truncated snippet,
        # and an untruncated one can be exactly ``window`` characters long.
        if start_pos > 0:
            cited_text = "..." + cited_text

        citations.append({
            "number": match.group(1),
            "text": cited_text
        })

    return citations