Spaces:
Sleeping
Sleeping
File size: 4,090 Bytes
d445f2a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import re
import json
from typing import Dict, Any, List, Optional
def is_valid_query(query: str) -> bool:
    """
    Decide whether a search query looks legitimate.

    Args:
        query: The search query to validate.

    Returns:
        True when the query passes every heuristic, False otherwise.
    """
    # An empty or whitespace-only query is never valid.
    if not query or not query.strip():
        return False

    # Character class covering the common emoji / pictograph Unicode blocks.
    emoji_re = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+"
    )

    # A very short query that is nothing but emoji carries no search intent.
    without_emoji = emoji_re.sub('', query).strip()
    if not without_emoji and len(query) <= 5:
        return False

    # A bare run of five or more digits with no surrounding context.
    if re.fullmatch(r'\d{5,}', query.strip()):
        return False

    # A long string with no vowels at all is treated as keyboard mashing.
    if len(query) > 10 and re.search(r'[aeiouAEIOU]', query) is None:
        return False

    return True
def format_research_results(search_results: List[Dict[str, Any]],
                            scraped_contents: Dict[str, str],
                            analyzed_contents: Dict[str, Dict[str, Any]]) -> str:
    """
    Build a cited, human-readable answer from analysis results.

    Args:
        search_results: The list of search result items (kept for interface
            parity; not consulted here).
        scraped_contents: Dict mapping URLs to scraped content (also not
            consulted here).
        analyzed_contents: Dict mapping URLs to analysis dicts; each entry
            may carry a "relevance_score" and a "filtered_content".

    Returns:
        The relevant content with [n] citation markers followed by a
        "Sources:" list, or an apology when nothing scored high enough.
    """
    # Keep only URLs the analysis scored as sufficiently relevant.
    relevant = {
        url: analysis
        for url, analysis in analyzed_contents.items()
        if analysis.get("relevance_score", 0) >= 5
    }

    # Nothing cleared the relevance threshold.
    if not relevant:
        return ("I couldn't find relevant information for your query. "
                "Could you try rephrasing or providing more details?")

    body_chunks: List[str] = []
    source_lines: List[str] = []

    # Number the sources in iteration order, citing each content chunk.
    for idx, (url, analysis) in enumerate(relevant.items(), start=1):
        source_lines.append(f"[{idx}] {url}")
        content = analysis.get("filtered_content", "")
        if content:
            body_chunks.append(f"{content} [{idx}]")

    answer = "\n\n".join(body_chunks)
    sources = "\n".join(source_lines)
    return f"{answer}\n\nSources:\n{sources}"
def extract_citations(text: str) -> List[Dict[str, str]]:
    """
    Extract citations from formatted text.

    Args:
        text: Text with citation markers like [1], [2], etc.

    Returns:
        List of citation objects, each with the citation "number" and up to
        100 characters of the "text" preceding the marker (prefixed with
        "..." when that window was truncated).
    """
    citations: List[Dict[str, str]] = []
    citation_pattern = r'\[(\d+)\]'

    for match in re.finditer(citation_pattern, text):
        citation_num = match.group(1)

        # Take at most the 100 characters preceding the marker.
        start_pos = max(0, match.start() - 100)
        cited_text = text[start_pos:match.start()].strip()

        # BUG FIX: the old check compared len(cited_text) == 100, which
        # almost never fired because strip() usually shortens the slice
        # (e.g. the space right before the marker). Detect truncation from
        # whether the window was actually clipped instead.
        if match.start() > 100:
            cited_text = "..." + cited_text

        citations.append({
            "number": citation_num,
            "text": cited_text
        })

    return citations