from __future__ import annotations from typing import List, Tuple, Iterable, Optional, Dict, Any import re # We use private-use unicode placeholders so they survive html.escape/markdown HIGHLIGHT_START = "\uE000" HIGHLIGHT_END = "\uE001" __all__ = [ "extract_quoted_fragments", "find_exact_matches", "compute_best_ngram_window", "merge_intervals", "compute_highlight_spans", "insert_highlight_placeholders", "annotate_text_with_evidence_placeholders", ] def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]: """Extract quoted fragments from evidence. Returns a dict with keys: - "quoted": list of quoted strings - "unquoted": list of unquoted fragments (may be empty) Evidence may be a string (possibly containing quotes) or a list of strings. We treat double quotes (") and single quotes ('). """ quoted: List[str] = [] unquoted: List[str] = [] def _from_str(s: str) -> None: # Capture content inside matching quotes # Handles multiple quoted segments, keeps inner text only q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s) if q: for g1, g2 in q: frag = g1 or g2 frag = frag.strip() if frag: quoted.append(frag) # Remove the quoted parts from the string to detect remaining unquoted s_wo = re.sub(r'"[^\"]+"|\'[^\']+\'', " ", s) residue = s_wo.strip() if residue: unquoted.append(residue) else: s = s.strip() if s: unquoted.append(s) if isinstance(evidence, list): for item in evidence: if isinstance(item, str): _from_str(item) else: # Non-string items are ignored; caller can decide how to handle continue elif isinstance(evidence, str): _from_str(evidence) else: # Unknown evidence type → nothing to extract pass return {"quoted": quoted, "unquoted": unquoted} def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]: """Tokenize into word tokens with their (start, end) character offsets. We treat word characters (\w) as tokens and ignore pure whitespace. Punctuation is not included as tokens for n-gram matching. """ tokens: List[Tuple[str, int, int]] = [] for m in re.finditer(r"\w+", text): tokens.append((m.group(0).lower(), m.start(), m.end())) return tokens def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]: """Case-insensitive exact substring matches of phrase in text. Returns a list of (start, end) character indices. """ if not phrase: return [] hay = text.lower() needle = phrase.lower() matches: List[Tuple[int, int]] = [] start = 0 while True: idx = hay.find(needle, start) if idx == -1: break matches.append((idx, idx + len(phrase))) start = idx + 1 return matches def compute_best_ngram_window(text: str, target: str, n: int = 3, overlap_threshold: float = 0.5) -> Optional[Tuple[int, int]]: """Find a window in `text` that maximizes n-gram overlap with `target`. - Tokenization is word-based (\w+). Case-insensitive. - If target has fewer than n tokens, fallback to n=1 (unigram overlap). - Returns (start_char, end_char) of best window if overlap >= threshold, else None. """ text_toks = _tokenize_words_with_offsets(text) target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)] if not text_toks or not target_toks: return None if n < 1: n = 1 if len(target_toks) < n: n = 1 def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]: return [tuple(tokens[i:i+k]) for i in range(0, len(tokens) - k + 1)] if len(tokens) >= k else [] target_ngrams = set(_ngrams(target_toks, n)) if not target_ngrams: # If still empty, fallback to unigram set target_ngrams = set((t,) for t in target_toks) n = 1 best_score = 0.0 best_span: Optional[Tuple[int, int]] = None # Sliding windows over the text tokens with the same token length as the target window_len = max(len(target_toks), n) # ensure at least n for i in range(0, len(text_toks) - window_len + 1): window_tokens = [tok for tok, _, _ in text_toks[i:i+window_len]] window_ngrams = set(_ngrams(window_tokens, n)) or set((t,) for t in window_tokens) overlap = len(window_ngrams & target_ngrams) denom = max(1, len(target_ngrams)) score = overlap / denom if score > best_score: # Character span across the window start_char = text_toks[i][1] end_char = text_toks[i+window_len-1][2] best_score = score best_span = (start_char, end_char) if best_span and best_score >= overlap_threshold: return best_span return None def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]: """Merge overlapping or touching intervals.""" s = sorted(spans) if not s: return [] merged = [list(s[0])] for a, b in s[1:]: if a <= merged[-1][1]: merged[-1][1] = max(merged[-1][1], b) else: merged.append([a, b]) return [(a, b) for a, b in merged] def compute_highlight_spans(text: str, evidence: Any, n: int = 3, overlap_threshold: float = 0.5) -> List[Tuple[int, int]]: """Compute character spans to highlight in `text` using `evidence`. Strategy: - For any quoted fragments, do exact case-insensitive matching (all occurrences). - If there are no quoted fragments but there is unquoted evidence text, use n-gram overlap to find the best-matching window and highlight it if above threshold. - If evidence is a list, treat each element independently (quoted detection applied per element). """ parts = extract_quoted_fragments(evidence) spans: List[Tuple[int, int]] = [] # Exact matches for quoted fragments for q in parts["quoted"]: spans.extend(find_exact_matches(text, q)) # If no quoted matches found and have unquoted evidence, try n-gram for each unquoted fragment if not spans and parts["unquoted"]: for uq in parts["unquoted"]: win = compute_best_ngram_window(text, uq, n=n, overlap_threshold=overlap_threshold) if win: spans.append(win) return merge_intervals(spans) def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str: """Insert placeholder markers into `text` for each (start, end) span. Assumes spans are non-overlapping and sorted; callers should merge first. """ if not spans: return text parts: List[str] = [] last = 0 for a, b in spans: if a < last: # Overlap – skip to avoid corrupting indices continue parts.append(text[last:a]) parts.append(HIGHLIGHT_START) parts.append(text[a:b]) parts.append(HIGHLIGHT_END) last = b parts.append(text[last:]) return "".join(parts) def annotate_text_with_evidence_placeholders(text: str, evidence: Any, *, n: int = 3, overlap_threshold: float = 0.5) -> str: """Return text with highlight placeholders inserted based on evidence. This is the main API used by the renderer. After further processing (markdown), callers should post-process HTML to replace placeholders with tags. """ spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold) if not spans: return text return insert_highlight_placeholders(text, spans)