from __future__ import annotations

from typing import List, Tuple, Iterable, Optional, Dict, Any
import re

# We use private-use Unicode placeholders so they survive html.escape/markdown
HIGHLIGHT_START = "\uE000"
HIGHLIGHT_END = "\uE001"

__all__ = [
    "extract_quoted_fragments",
    "find_exact_matches",
    "compute_best_ngram_window",
    "merge_intervals",
    "compute_highlight_spans",
    "insert_highlight_placeholders",
    "annotate_text_with_evidence_placeholders",
]


def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]:
    """Extract quoted fragments from evidence.

    Returns a dict with keys:
      - "quoted": list of quoted strings
      - "unquoted": list of unquoted fragments (may be empty)

    Evidence may be a string (possibly containing quotes) or a list of strings.
    We treat double quotes (") and single quotes (').
    """
    quoted: List[str] = []
    unquoted: List[str] = []

    def _from_str(s: str) -> None:
        # Capture content inside matching quotes
        # Handles multiple quoted segments, keeps inner text only
        q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s)
        if q:
            for g1, g2 in q:
                frag = g1 or g2
                frag = frag.strip()
                if frag:
                    quoted.append(frag)
            # Remove the quoted parts from the string to detect remaining unquoted
            s_wo = re.sub(r'"[^"]+"|\'[^\']+\'', " ", s)
            residue = s_wo.strip()
            if residue:
                unquoted.append(residue)
        else:
            s = s.strip()
            if s:
                unquoted.append(s)

    if isinstance(evidence, list):
        for item in evidence:
            if isinstance(item, str):
                _from_str(item)
            else:
                # Non-string items are ignored; caller can decide how to handle
                continue
    elif isinstance(evidence, str):
        _from_str(evidence)
    else:
        # Unknown evidence type → nothing to extract
        pass

    return {"quoted": quoted, "unquoted": unquoted}


def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]:
    """Tokenize into word tokens with their (start, end) character offsets.

    We treat word characters (\w) as tokens and ignore pure whitespace. Punctuation
    is not included as tokens for n-gram matching.
    """
    tokens: List[Tuple[str, int, int]] = []
    for m in re.finditer(r"\w+", text):
        tokens.append((m.group(0).lower(), m.start(), m.end()))
    return tokens


def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]:
    """Case-insensitive exact substring matches of phrase in text.

    Returns a list of (start, end) character indices.
    """
    if not phrase:
        return []
    hay = text.lower()
    needle = phrase.lower()
    matches: List[Tuple[int, int]] = []
    start = 0
    while True:
        idx = hay.find(needle, start)
        if idx == -1:
            break
        matches.append((idx, idx + len(phrase)))
        start = idx + 1
    return matches
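
# Doctest-style illustration (matching is case-insensitive):
#   >>> find_exact_matches("Foo bar foo", "foo")
#   [(0, 3), (8, 11)]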


def compute_best_ngram_window(text: str, target: str, n: int = 3, overlap_threshold: float = 0.5) -> Optional[Tuple[int, int]]:
    """Find a window in `text` that maximizes n-gram overlap with `target`.

    - Tokenization is word-based (\w+). Case-insensitive.
    - If target has fewer than n tokens, fallback to n=1 (unigram overlap).
    - Returns (start_char, end_char) of best window if overlap >= threshold, else None.
    """
    text_toks = _tokenize_words_with_offsets(text)
    target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)]

    if not text_toks or not target_toks:
        return None

    if n < 1:
        n = 1
    if len(target_toks) < n:
        n = 1

    def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]:
        return [tuple(tokens[i:i+k]) for i in range(0, len(tokens) - k + 1)] if len(tokens) >= k else []

    target_ngrams = set(_ngrams(target_toks, n))
    if not target_ngrams:
        # If still empty, fall back to a unigram set
        target_ngrams = set((t,) for t in target_toks)
        n = 1

    best_score = 0.0
    best_span: Optional[Tuple[int, int]] = None

    # Sliding windows over the text tokens with the same token length as the target
    window_len = max(len(target_toks), n)  # ensure at least n
    for i in range(0, len(text_toks) - window_len + 1):
        window_tokens = [tok for tok, _, _ in text_toks[i:i+window_len]]
        window_ngrams = set(_ngrams(window_tokens, n)) or set((t,) for t in window_tokens)
        overlap = len(window_ngrams & target_ngrams)
        denom = max(1, len(target_ngrams))
        score = overlap / denom
        if score > best_score:
            # Character span across the window
            start_char = text_toks[i][1]
            end_char = text_toks[i+window_len-1][2]
            best_score = score
            best_span = (start_char, end_char)

    if best_span and best_score >= overlap_threshold:
        return best_span
    return None
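
# Illustrative walk-through of compute_best_ngram_window (not executed):
# with n=3, text = "the quick brown fox jumps over the lazy dog" and
# target = "quick brown fox jumps", the target yields two trigrams; the
# four-token window starting at "quick" contains both of them, scoring
# 2/2 = 1.0, which clears the default 0.5 threshold, so the span (4, 25)
# covering "quick brown fox jumps" is returned.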


def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping or touching intervals."""
    s = sorted(spans)
    if not s:
        return []
    merged = [list(s[0])]
    for a, b in s[1:]:
        if a <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], b)
        else:
            merged.append([a, b])
    return [(a, b) for a, b in merged]
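
# Doctest-style illustration (touching intervals are coalesced):
#   >>> merge_intervals([(5, 9), (0, 3), (3, 4)])
#   [(0, 4), (5, 9)]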


def compute_highlight_spans(text: str, evidence: Any, n: int = 3, overlap_threshold: float = 0.5) -> List[Tuple[int, int]]:
    """Compute character spans to highlight in `text` using `evidence`.

    Strategy:
      - For any quoted fragments, do exact case-insensitive matching (all occurrences).
      - If there are no quoted fragments but there is unquoted evidence text, use n-gram overlap
        to find the best-matching window and highlight it if above threshold.
      - If evidence is a list, treat each element independently (quoted detection applied per element).
    """
    parts = extract_quoted_fragments(evidence)
    spans: List[Tuple[int, int]] = []

    # Exact matches for quoted fragments
    for q in parts["quoted"]:
        spans.extend(find_exact_matches(text, q))

    # If no quoted matches were found and there is unquoted evidence, try an n-gram window per fragment
    if not spans and parts["unquoted"]:
        for uq in parts["unquoted"]:
            win = compute_best_ngram_window(text, uq, n=n, overlap_threshold=overlap_threshold)
            if win:
                spans.append(win)

    return merge_intervals(spans)
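
# Illustrative: for text = 'He wrote "foo bar" twice: foo bar.' with
# evidence = '"foo bar"', both occurrences of "foo bar" match exactly,
# so two disjoint spans are returned and the n-gram fallback never runs.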


def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str:
    """Insert placeholder markers into `text` for each (start, end) span.

    Assumes spans are non-overlapping and sorted; callers should merge first.
    """
    if not spans:
        return text
    parts: List[str] = []
    last = 0
    for a, b in spans:
        if a < last:
            # Overlap – skip to avoid corrupting indices
            continue
        parts.append(text[last:a])
        parts.append(HIGHLIGHT_START)
        parts.append(text[a:b])
        parts.append(HIGHLIGHT_END)
        last = b
    parts.append(text[last:])
    return "".join(parts)


def annotate_text_with_evidence_placeholders(text: str, evidence: Any, *, n: int = 3, overlap_threshold: float = 0.5) -> str:
    """Return text with highlight placeholders inserted based on evidence.

    This is the main API used by the renderer. After downstream processing
    (e.g. markdown rendering and HTML escaping), callers should replace the
    placeholders with <mark> tags.
    """
    spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold)
    if not spans:
        return text
    return insert_highlight_placeholders(text, spans)
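

if __name__ == "__main__":
    # Minimal end-to-end sketch. The <mark> substitution below is an
    # assumption about the renderer's post-processing step; this module
    # itself only inserts the private-use placeholders.
    import html

    text = 'The reply cited "exact phrase" somewhere in its answer.'
    evidence = 'see "exact phrase"'
    annotated = annotate_text_with_evidence_placeholders(text, evidence)
    escaped = html.escape(annotated)  # placeholders survive escaping
    rendered = escaped.replace(HIGHLIGHT_START, "<mark>").replace(HIGHLIGHT_END, "</mark>")
    print(rendered)
    # -> The reply cited &quot;<mark>exact phrase</mark>&quot; somewhere in its answer.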