File size: 4,090 Bytes
d445f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import re
import json
from typing import Dict, Any, List, Optional

# Symbol/pictograph code points that do not count as "real text" in a query.
# The ranges are deliberately limited to symbol blocks: a single wide range
# such as U+24C2..U+1F251 would also swallow CJK, Kana, Hangul and most other
# non-Latin scripts, making short queries in those languages look "empty".
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U000024C2-\U000027BF"  # circled letters, box/shapes, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live in CJK blocks
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
    "]+"
)


def is_valid_query(query: str) -> bool:
    """Return whether a search query looks legitimate.

    A query is rejected when it is:

    * empty or whitespace-only;
    * at most 5 characters long and made up of nothing but emoji/symbols;
    * a bare run of 5 or more digits;
    * longer than 10 characters with no ASCII vowel, unless it contains
      letters from a non-Latin script (where the vowel heuristic is
      meaningless).

    Args:
        query: The search query to validate.

    Returns:
        True if the query passes every check, False otherwise.
    """
    # Reject empty queries
    if not query or not query.strip():
        return False

    trimmed = query.strip()

    # Reject single-emoji (or otherwise very short, symbol-only) queries
    without_emoji = _EMOJI_PATTERN.sub("", query).strip()
    if not without_emoji and len(query) <= 5:
        return False

    # Reject random numbers only (at least 5 digits with no context)
    if re.fullmatch(r"\d{5,}", trimmed):
        return False

    # Reject gibberish: a long string without any vowel. The heuristic only
    # makes sense for Latin-script text, so queries containing non-ASCII
    # letters (Chinese, Cyrillic, Arabic, ...) are exempt.
    if len(query) > 10 and not re.search(r"[aeiouAEIOU]", query):
        has_non_latin_letters = any(
            ch.isalpha() and not ch.isascii() for ch in query
        )
        if not has_non_latin_letters:
            return False

    return True

def format_research_results(search_results: List[Dict[str, Any]],
                           scraped_contents: Dict[str, str],
                           analyzed_contents: Dict[str, Dict[str, Any]]) -> str:
    """Format research results into a readable response with citations.

    Only entries of ``analyzed_contents`` whose ``"relevance_score"`` is at
    least 5 and whose ``"filtered_content"`` is non-blank are used. Each such
    entry contributes one paragraph ending in a ``[n]`` marker and one
    matching ``[n] url`` line under ``Sources:``; numbering follows the
    iteration order of ``analyzed_contents`` and is contiguous, so every
    listed source is actually cited in the body.

    Args:
        search_results: The list of search result items (not read by this
            function; kept for interface compatibility).
        scraped_contents: Dict mapping URLs to scraped content (not read by
            this function; kept for interface compatibility).
        analyzed_contents: Dict mapping URLs to analysis results.

    Returns:
        The formatted response followed by a ``Sources:`` list, or a
        "no relevant information" message when nothing can be cited.
    """
    no_results_message = (
        "I couldn't find relevant information for your query. "
        "Could you try rephrasing or providing more details?"
    )

    response_parts = []
    citations = []

    for url, data in (analyzed_contents or {}).items():
        # A missing, None or non-numeric score counts as "not relevant"
        # instead of raising on the comparison.
        try:
            score = float(data.get("relevance_score") or 0)
        except (TypeError, ValueError):
            score = 0.0
        # Written as "not >=" so that NaN is treated as not relevant.
        if not score >= 5:
            continue

        # Skip entries with nothing to quote so that no source is listed
        # without a corresponding citation in the body.
        filtered_content = data.get("filtered_content") or ""
        if not str(filtered_content).strip():
            continue

        number = len(citations) + 1
        citations.append(f"[{number}] {url}")
        response_parts.append(f"{filtered_content} [{number}]")

    # No relevant, citable results
    if not citations:
        return no_results_message

    # Combine everything
    response = "\n\n".join(response_parts)
    citation_text = "\n".join(citations)

    return f"{response}\n\nSources:\n{citation_text}"

def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citation markers and the text preceding each of them.

    For every marker of the form ``[<digits>]`` the up-to-100 characters
    immediately before it are captured (whitespace-trimmed). When more text
    than that precedes the marker, the snippet is prefixed with ``"..."`` to
    show it was cut.

    Args:
        text: Text with citation markers like [1], [2], etc.

    Returns:
        A list with one dict per marker, in order of appearance, with keys
        ``"number"`` (the digits as a string) and ``"text"`` (the preceding
        snippet).
    """
    max_context = 100  # characters of preceding text kept per citation
    citations = []

    for match in re.finditer(r'\[(\d+)\]', text):
        start_pos = max(0, match.start() - max_context)
        cited_text = text[start_pos:match.start()].strip()

        # The snippet is truncated exactly when the window does not reach the
        # beginning of the text. Checking the stripped length instead would
        # miss truncation whenever whitespace was trimmed, and would flag a
        # complete 100-character prefix as cut.
        if start_pos > 0:
            cited_text = "..." + cited_text

        citations.append({
            "number": match.group(1),
            "text": cited_text
        })

    return citations