import re
import json
from typing import Dict, Any, List, Optional
def is_valid_query(query: str) -> bool:
    """
    Validates whether a search query is legitimate.

    Args:
        query: The search query to validate

    Returns:
        Boolean indicating whether the query is valid
    """
    # Reject empty queries
    if not query or query.strip() == "":
        return False

    # Reject single-emoji queries
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # geometric shapes extended
        "\U0001F800-\U0001F8FF"  # supplemental arrows-C
        "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+"
    )
    stripped_query = emoji_pattern.sub("", query).strip()
    if not stripped_query and len(query) <= 5:  # Single emoji or very short
        return False

    # Reject digit-only queries (5+ digits with no other context)
    if re.match(r'^\d{5,}$', query.strip()):
        return False

    # Reject likely gibberish (a long string with no vowels)
    if len(query) > 10 and not re.search(r'[aeiouAEIOU]', query):
        return False

    return True
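
# Illustrative behavior of is_valid_query; the sample inputs below are
# assumptions for demonstration, not test cases from the original project:
#
#   is_valid_query("python asyncio tutorial")  -> True
#   is_valid_query("   ")                      -> False  (empty after strip)
#   is_valid_query("\U0001F600")               -> False  (single emoji)
#   is_valid_query("8675309123")               -> False  (bare digit run)
#   is_valid_query("qwrtpsdfghjkl")            -> False  (long and vowel-free)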

def format_research_results(search_results: List[Dict[str, Any]],
                            scraped_contents: Dict[str, str],
                            analyzed_contents: Dict[str, Dict[str, Any]]) -> str:
    """
    Formats research results into a readable response with citations.

    Args:
        search_results: The list of search result items
        scraped_contents: Dict mapping URLs to scraped content
        analyzed_contents: Dict mapping URLs to analysis results

    Returns:
        Formatted response with citations
    """
    response_parts = []
    citations = []

    # Filter to only include relevant content based on the analysis
    relevant_urls = {
        url: data
        for url, data in analyzed_contents.items()
        if data.get("relevance_score", 0) >= 5
    }

    # No relevant results
    if not relevant_urls:
        return ("I couldn't find relevant information for your query. "
                "Could you try rephrasing or providing more details?")

    # Compile the response, numbering each source as it is cited
    for i, (url, data) in enumerate(relevant_urls.items(), 1):
        citations.append(f"[{i}] {url}")
        filtered_content = data.get("filtered_content", "")

        # Add the content with its citation marker
        if filtered_content:
            response_parts.append(f"{filtered_content} [{i}]")

    # Combine everything
    response = "\n\n".join(response_parts)
    citation_text = "\n".join(citations)
    return f"{response}\n\nSources:\n{citation_text}"
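
# Sketch of the input shape format_research_results expects; the schema keys
# ("relevance_score", "filtered_content") are inferred from the .get() calls
# above, and the URL and text are made-up examples:
#
#   analyzed = {
#       "https://example.com/article": {
#           "relevance_score": 8,
#           "filtered_content": "Key finding summarized from the page.",
#       },
#   }
#   format_research_results([], {}, analyzed)
#   # -> "Key finding summarized from the page. [1]\n\nSources:\n[1] https://example.com/article"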

def extract_citations(text: str) -> List[Dict[str, str]]:
    """
    Extract citations from formatted text.

    Args:
        text: Text with citation markers like [1], [2], etc.

    Returns:
        List of citation objects with citation number and referenced text
    """
    citations = []
    citation_pattern = r'\[(\d+)\]'
    matches = re.finditer(citation_pattern, text)

    for match in matches:
        citation_num = match.group(1)

        # Grab the preceding text, capped at a reasonable window
        start_pos = max(0, match.start() - 100)
        cited_text = text[start_pos:match.start()].strip()
        if start_pos > 0:  # Window hit the cap, so the text is truncated
            cited_text = "..." + cited_text

        citations.append({
            "number": citation_num,
            "text": cited_text
        })

    return citations
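
if __name__ == "__main__":
    # Minimal end-to-end smoke test wiring the three helpers together.
    # The query, URLs, scores, and content are made-up demonstration values,
    # not data from the original project.
    assert is_valid_query("history of the transistor")
    assert not is_valid_query("12345")

    analyzed = {
        "https://example.com/transistor": {
            "relevance_score": 8,
            "filtered_content": "The first working transistor was demonstrated at Bell Labs in 1947.",
        },
        "https://example.com/offtopic": {
            "relevance_score": 2,  # below the threshold of 5, so excluded
            "filtered_content": "Unrelated content.",
        },
    }
    response = format_research_results([], {}, analyzed)
    print(response)

    # Each [n] marker yields one entry; markers in the Sources list match too.
    for citation in extract_citations(response):
        print(citation["number"], "->", citation["text"][:60])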