from __future__ import annotations

import re
from typing import Any, Dict, Iterable, List, Optional, Tuple

# We use private-use Unicode placeholders so they survive html.escape/markdown.
HIGHLIGHT_START = "\uE000"
HIGHLIGHT_END = "\uE001"

__all__ = [
    "extract_quoted_fragments",
    "find_exact_matches",
    "compute_best_ngram_window",
    "merge_intervals",
    "compute_highlight_spans",
    "insert_highlight_placeholders",
    "annotate_text_with_evidence_placeholders",
]

def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]:
    """Extract quoted fragments from evidence.

    Returns a dict with keys:
      - "quoted": list of quoted strings
      - "unquoted": list of unquoted fragments (may be empty)

    Evidence may be a string (possibly containing quotes) or a list of strings.
    Both double quotes (") and single quotes (') are recognized.
    """
    quoted: List[str] = []
    unquoted: List[str] = []

    def _from_str(s: str) -> None:
        # Capture the content inside matching quotes. Handles multiple quoted
        # segments and keeps the inner text only.
        q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s)
        if q:
            for g1, g2 in q:
                frag = (g1 or g2).strip()
                if frag:
                    quoted.append(frag)
            # Remove the quoted parts so any remaining unquoted text is detected.
            s_wo = re.sub(r'"[^"]+"|\'[^\']+\'', " ", s)
            residue = s_wo.strip()
            if residue:
                unquoted.append(residue)
        else:
            s = s.strip()
            if s:
                unquoted.append(s)

    if isinstance(evidence, list):
        for item in evidence:
            if isinstance(item, str):
                _from_str(item)
            # Non-string items are ignored; the caller decides how to handle them.
    elif isinstance(evidence, str):
        _from_str(evidence)
    # Unknown evidence types yield nothing to extract.

    return {"quoted": quoted, "unquoted": unquoted}
def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]:
    r"""Tokenize into lowercased word tokens with (start, end) character offsets.

    Runs of word characters (\w+) are treated as tokens; whitespace and
    punctuation are not tokens and are ignored for n-gram matching.
    """
    tokens: List[Tuple[str, int, int]] = []
    for m in re.finditer(r"\w+", text):
        tokens.append((m.group(0).lower(), m.start(), m.end()))
    return tokens
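
# Example (illustrative): offsets index into the original string.
#
#   >>> _tokenize_words_with_offsets("Hi, there")
#   [('hi', 0, 2), ('there', 4, 9)]
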
def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]:
    """Case-insensitive exact substring matches of `phrase` in `text`.

    Returns a list of (start, end) character indices, including overlapping
    occurrences.
    """
    if not phrase:
        return []
    hay = text.lower()
    needle = phrase.lower()
    matches: List[Tuple[int, int]] = []
    start = 0
    while True:
        idx = hay.find(needle, start)
        if idx == -1:
            break
        matches.append((idx, idx + len(phrase)))
        start = idx + 1
    return matches
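
# Example (illustrative): matching is case-insensitive and may overlap.
#
#   >>> find_exact_matches("The cat sat. The CAT ran.", "the cat")
#   [(0, 7), (13, 20)]
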
def compute_best_ngram_window(
    text: str,
    target: str,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> Optional[Tuple[int, int]]:
    r"""Find the window in `text` that maximizes n-gram overlap with `target`.

    - Tokenization is word-based (\w+) and case-insensitive.
    - If `target` has fewer than `n` tokens, falls back to n=1 (unigram overlap).
    - Returns the (start_char, end_char) of the best window if its overlap
      score is at least `overlap_threshold`, else None.
    """
    text_toks = _tokenize_words_with_offsets(text)
    target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)]
    if not text_toks or not target_toks:
        return None
    if n < 1:
        n = 1
    if len(target_toks) < n:
        n = 1

    def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]:
        if len(tokens) < k:
            return []
        return [tuple(tokens[i:i + k]) for i in range(len(tokens) - k + 1)]

    target_ngrams = set(_ngrams(target_toks, n))
    if not target_ngrams:
        # Still empty: fall back to the unigram set.
        target_ngrams = {(t,) for t in target_toks}
        n = 1

    best_score = 0.0
    best_span: Optional[Tuple[int, int]] = None
    # Slide a window over the text tokens, matching the target's token length.
    window_len = max(len(target_toks), n)  # ensure at least n
    for i in range(len(text_toks) - window_len + 1):
        window_tokens = [tok for tok, _, _ in text_toks[i:i + window_len]]
        window_ngrams = set(_ngrams(window_tokens, n)) or {(t,) for t in window_tokens}
        overlap = len(window_ngrams & target_ngrams)
        score = overlap / max(1, len(target_ngrams))
        if score > best_score:
            # Character span covering the whole window.
            best_score = score
            best_span = (text_toks[i][1], text_toks[i + window_len - 1][2])
    if best_span and best_score >= overlap_threshold:
        return best_span
    return None
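
# Example (illustrative): the target's trigram appears verbatim in the text,
# so that window scores 1.0 and its character span is returned.
#
#   >>> compute_best_ngram_window("the quick brown fox jumps", "quick brown fox")
#   (4, 19)
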
def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping or touching intervals."""
    s = sorted(spans)
    if not s:
        return []
    merged = [list(s[0])]
    for a, b in s[1:]:
        if a <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], b)
        else:
            merged.append([a, b])
    return [(a, b) for a, b in merged]
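
# Example (illustrative): overlapping and touching spans collapse into one.
#
#   >>> merge_intervals([(5, 9), (0, 3), (2, 6)])
#   [(0, 9)]
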
def compute_highlight_spans(
    text: str,
    evidence: Any,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> List[Tuple[int, int]]:
    """Compute character spans to highlight in `text` using `evidence`.

    Strategy:
    - For quoted fragments, do exact case-insensitive matching (all occurrences).
    - If no quoted fragment matched but there is unquoted evidence text, use
      n-gram overlap to find the best-matching window and highlight it if it
      clears the threshold.
    - If evidence is a list, each element is treated independently (quote
      detection is applied per element).
    """
    parts = extract_quoted_fragments(evidence)
    spans: List[Tuple[int, int]] = []
    # Exact matches for quoted fragments.
    for q in parts["quoted"]:
        spans.extend(find_exact_matches(text, q))
    # No quoted matches: try an n-gram window for each unquoted fragment.
    if not spans and parts["unquoted"]:
        for uq in parts["unquoted"]:
            win = compute_best_ngram_window(text, uq, n=n, overlap_threshold=overlap_threshold)
            if win:
                spans.append(win)
    return merge_intervals(spans)
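
# Example (illustrative): the quoted fragment matches exactly, so its span is
# returned and the unquoted residue ("see") is never consulted.
#
#   >>> compute_highlight_spans("alpha beta gamma delta", 'see "beta gamma"')
#   [(6, 16)]
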
def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str:
    """Insert placeholder markers into `text` for each (start, end) span.

    Assumes spans are non-overlapping and sorted; callers should merge first.
    """
    if not spans:
        return text
    parts: List[str] = []
    last = 0
    for a, b in spans:
        if a < last:
            # Overlapping span: skip it to avoid corrupting indices.
            continue
        parts.append(text[last:a])
        parts.append(HIGHLIGHT_START)
        parts.append(text[a:b])
        parts.append(HIGHLIGHT_END)
        last = b
    parts.append(text[last:])
    return "".join(parts)
def annotate_text_with_evidence_placeholders(
    text: str,
    evidence: Any,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> str:
    """Return `text` with highlight placeholders inserted based on `evidence`.

    This is the main API used by the renderer. After further processing
    (e.g. markdown), callers should post-process the HTML to replace the
    placeholders with <mark> tags.
    """
    spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold)
    if not spans:
        return text
    return insert_highlight_placeholders(text, spans)
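
# End-to-end sketch (illustrative; the <mark> replacement mimics the
# post-processing step described in the docstring and is not part of this module):
#
#   >>> out = annotate_text_with_evidence_placeholders(
#   ...     "alpha beta gamma delta", 'see "beta gamma"')
#   >>> out.replace(HIGHLIGHT_START, "<mark>").replace(HIGHLIGHT_END, "</mark>")
#   'alpha <mark>beta gamma</mark> delta'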