from __future__ import annotations

from typing import List, Tuple, Iterable, Optional, Dict, Any
import re

# Private-use Unicode code points survive html.escape and markdown rendering,
# so we use them as highlight placeholders and swap in real tags afterwards.
HIGHLIGHT_START = "\uE000"
HIGHLIGHT_END = "\uE001"
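# For reference, a minimal sketch of the post-processing step the renderer is
# expected to perform (the renderer itself lives outside this module):
#
#     import html
#     escaped = html.escape(text_with_placeholders)
#     html_out = (escaped.replace(HIGHLIGHT_START, "<mark>")
#                        .replace(HIGHLIGHT_END, "</mark>"))
#
# html.escape only touches &, <, >, and quotes, leaving \uE000/\uE001 intact,
# which is why private-use code points are safe to thread through escaping.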
__all__ = [
    "extract_quoted_fragments",
    "find_exact_matches",
    "compute_best_ngram_window",
    "merge_intervals",
    "compute_highlight_spans",
    "insert_highlight_placeholders",
    "annotate_text_with_evidence_placeholders",
]
def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]:
    """Extract quoted fragments from evidence.

    Returns a dict with keys:
      - "quoted": list of quoted strings
      - "unquoted": list of unquoted fragments (may be empty)

    Evidence may be a string (possibly containing quotes) or a list of strings.
    Both double quotes (") and single quotes (') are recognized.
    """
    quoted: List[str] = []
    unquoted: List[str] = []

    def _from_str(s: str) -> None:
        # Capture the content inside matching quotes. Handles multiple quoted
        # segments and keeps the inner text only.
        q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s)
        if q:
            for g1, g2 in q:
                frag = (g1 or g2).strip()
                if frag:
                    quoted.append(frag)
            # Remove the quoted parts so any remaining unquoted text is kept.
            s_wo = re.sub(r'"[^"]+"|\'[^\']+\'', " ", s)
            residue = s_wo.strip()
            if residue:
                unquoted.append(residue)
        else:
            s = s.strip()
            if s:
                unquoted.append(s)

    if isinstance(evidence, list):
        for item in evidence:
            if isinstance(item, str):
                _from_str(item)
            # Non-string items are ignored; the caller decides how to handle them.
    elif isinstance(evidence, str):
        _from_str(evidence)
    # Unknown evidence types yield nothing to extract.
    return {"quoted": quoted, "unquoted": unquoted}
def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]:
    r"""Tokenize into lowercase word tokens with (start, end) character offsets.

    Word characters (\w+) form tokens; whitespace and punctuation are skipped,
    so punctuation never participates in n-gram matching.
    """
    tokens: List[Tuple[str, int, int]] = []
    for m in re.finditer(r"\w+", text):
        tokens.append((m.group(0).lower(), m.start(), m.end()))
    return tokens
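# For example (comments only, not executed):
#
#     >>> _tokenize_words_with_offsets("Dry, hot summer")
#     [('dry', 0, 3), ('hot', 5, 8), ('summer', 9, 15)]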
def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]:
    """Case-insensitive exact substring matches of `phrase` in `text`.

    Returns a list of (start, end) character indices. Overlapping occurrences
    are all reported, because the scan resumes one character past each hit.
    """
    if not phrase:
        return []
    hay = text.lower()
    needle = phrase.lower()
    matches: List[Tuple[int, int]] = []
    start = 0
    while True:
        idx = hay.find(needle, start)
        if idx == -1:
            break
        matches.append((idx, idx + len(needle)))
        start = idx + 1
    return matches
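# Illustrative behavior (comments only, not executed):
#
#     >>> find_exact_matches("Rain, rain, go away", "rain")
#     [(0, 4), (6, 10)]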
def compute_best_ngram_window(
    text: str,
    target: str,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> Optional[Tuple[int, int]]:
    r"""Find the window in `text` that maximizes n-gram overlap with `target`.

    - Tokenization is word-based (\w+) and case-insensitive.
    - If `target` has fewer than n tokens, fall back to n=1 (unigram overlap).
    - Returns the (start_char, end_char) of the best window if its overlap
      score reaches `overlap_threshold`, else None.
    """
    text_toks = _tokenize_words_with_offsets(text)
    target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)]
    if not text_toks or not target_toks:
        return None
    if n < 1:
        n = 1
    if len(target_toks) < n:
        n = 1

    def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]:
        if len(tokens) < k:
            return []
        return [tuple(tokens[i:i + k]) for i in range(len(tokens) - k + 1)]

    target_ngrams = set(_ngrams(target_toks, n))
    if not target_ngrams:
        # Defensive: if still empty, fall back to a unigram set.
        target_ngrams = {(t,) for t in target_toks}
        n = 1

    best_score = 0.0
    best_span: Optional[Tuple[int, int]] = None
    # Slide a window of the same token length as the target across the text.
    window_len = max(len(target_toks), n)  # ensure at least n tokens
    for i in range(len(text_toks) - window_len + 1):
        window_tokens = [tok for tok, _, _ in text_toks[i:i + window_len]]
        window_ngrams = set(_ngrams(window_tokens, n)) or {(t,) for t in window_tokens}
        overlap = len(window_ngrams & target_ngrams)
        score = overlap / max(1, len(target_ngrams))
        if score > best_score:
            best_score = score
            # Character span covering the whole window.
            best_span = (text_toks[i][1], text_toks[i + window_len - 1][2])
    if best_span and best_score >= overlap_threshold:
        return best_span
    return None
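# A worked toy example (comments only, not executed). With the target
# "dry hot summer" (3 tokens) and n=3 there is exactly one target trigram,
# so the window containing it scores 1.0 and clears the default threshold:
#
#     >>> compute_best_ngram_window("It was a dry hot summer that year",
#     ...                           "dry hot summer")
#     (9, 23)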
def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping or touching intervals."""
    s = sorted(spans)
    if not s:
        return []
    merged = [list(s[0])]
    for a, b in s[1:]:
        if a <= merged[-1][1]:
            # Overlaps or touches the previous interval: extend it.
            merged[-1][1] = max(merged[-1][1], b)
        else:
            merged.append([a, b])
    return [(a, b) for a, b in merged]
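# Illustrative behavior (comments only, not executed):
#
#     >>> merge_intervals([(5, 9), (0, 3), (3, 5)])
#     [(0, 9)]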
def compute_highlight_spans(
    text: str,
    evidence: Any,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> List[Tuple[int, int]]:
    """Compute character spans to highlight in `text` using `evidence`.

    Strategy:
      - Quoted fragments are matched exactly (case-insensitive, all occurrences).
      - If no quoted fragment matched and there is unquoted evidence text, use
        n-gram overlap to find the best-matching window for each unquoted
        fragment and highlight it if the score clears `overlap_threshold`.
      - List evidence is handled per element (quote detection applied to each).
    """
    parts = extract_quoted_fragments(evidence)
    spans: List[Tuple[int, int]] = []
    # Exact matches for quoted fragments.
    for q in parts["quoted"]:
        spans.extend(find_exact_matches(text, q))
    # Fall back to fuzzy n-gram windows only when exact matching found nothing.
    if not spans and parts["unquoted"]:
        for uq in parts["unquoted"]:
            win = compute_best_ngram_window(text, uq, n=n, overlap_threshold=overlap_threshold)
            if win:
                spans.append(win)
    return merge_intervals(spans)
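# End-to-end sketch of the span logic (comments only, not executed):
#
#     >>> compute_highlight_spans("Rainfall fell sharply in 2021.",
#     ...                         'The report said "fell sharply"')
#     [(9, 21)]
#
# "fell sharply" is quoted, so it is matched exactly; the unquoted residue
# ("The report said") is ignored because an exact match was found.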
def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str:
    """Insert placeholder markers into `text` for each (start, end) span.

    Assumes spans are non-overlapping and sorted; callers should merge first
    (see merge_intervals).
    """
    if not spans:
        return text
    parts: List[str] = []
    last = 0
    for a, b in spans:
        if a < last:
            # Overlapping span: skip it to avoid corrupting indices.
            continue
        parts.append(text[last:a])
        parts.append(HIGHLIGHT_START)
        parts.append(text[a:b])
        parts.append(HIGHLIGHT_END)
        last = b
    parts.append(text[last:])
    return "".join(parts)
def annotate_text_with_evidence_placeholders(
    text: str,
    evidence: Any,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> str:
    """Return `text` with highlight placeholders inserted based on `evidence`.

    This is the main API used by the renderer. After further processing
    (e.g. markdown), callers should post-process the HTML to replace the
    placeholders with <mark> tags.
    """
    spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold)
    if not spans:
        return text
    return insert_highlight_placeholders(text, spans)
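
# Minimal end-to-end demo, a sketch only: the real renderer and its markdown
# pipeline live outside this module. Running the file directly shows the
# placeholders converted to <mark> tags after HTML escaping.
if __name__ == "__main__":
    import html

    sample_text = "Rainfall fell sharply in 2021 & recovered in 2022."
    sample_evidence = 'The report said "fell sharply"'
    annotated = annotate_text_with_evidence_placeholders(sample_text, sample_evidence)
    escaped = html.escape(annotated)  # placeholders survive escaping
    rendered = escaped.replace(HIGHLIGHT_START, "<mark>").replace(HIGHLIGHT_END, "</mark>")
    print(rendered)
    # Prints: Rainfall <mark>fell sharply</mark> in 2021 &amp; recovered in 2022.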