# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "charset-normalizer==3.4.2",
#     "great-tables==0.17.0",
#     "marimo",
#     "pandas==2.3.0",
# ]
# ///

import marimo

__generated_with = "0.14.6"
app = marimo.App(width="full", app_title="LLM Text Preprocessing Checker")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md(
        r"""
    # LLM Text Preprocessing Checker

    Checks two files and provides the diff output as well as metrics on
    deleted and inserted characters. Additionally, provides a breakdown by
    Unicode character class of deletions and insertions.

    Note that this uses a pure-Python Myers diff algorithm for the comparison
    and may not be performant for larger diffs.
    """
    )
    return


@app.cell
def _():
    import unicodedata
    from typing import List, Dict, Any
    from dataclasses import dataclass
    from enum import IntEnum
    import html as python_html

    from great_tables import GT, loc, style
    import pandas as pd

    class Operation(IntEnum):
        """Kind of a single atomic diff edit."""

        DELETE = 0
        INSERT = 1
        EQUAL = 2

    @dataclass(slots=True)
    class Edit:
        """One atomic edit produced by the Myers diff.

        ``old_start``/``old_end`` index into the original string and
        ``new_start``/``new_end`` into the processed string; ``old_text`` /
        ``new_text`` carry the characters affected by the operation.
        """

        operation: Operation
        old_start: int
        old_end: int
        new_start: int
        new_end: int
        old_text: str = ""
        new_text: str = ""

    # Inline CSS applied when rendering diffs as HTML.
    DEL_STYLE = "background-color:#ffcccc;color:#880000;text-decoration:line-through;"
    INS_STYLE = "background-color:#ccffcc;color:#008800;"
    EQUAL_STYLE = "color:#666666;"
    CONTAINER_STYLE = (
        "font-family: ui-monospace, monospace; "
        "white-space: pre-wrap; "
        "line-height: 1.6; "
        "padding: 20px; "
        "background-color: #f8f9fa; "
        "border-radius: 8px; "
        "border: 1px solid #dee2e6;"
    )

    # Unicode general category -> readable classification name.
    # Hoisted to a constant so the dict is built once, not per character.
    CATEGORY_MAP = {
        "Ll": "lowercase",
        "Lu": "uppercase",
        "Lt": "titlecase",
        "Lm": "modifier_letter",
        "Lo": "other_letter",
        "Nd": "decimal_digit",
        "Nl": "letter_number",
        "No": "other_number",
        "Pc": "connector_punctuation",
        "Pd": "dash_punctuation",
        "Ps": "open_punctuation",
        "Pe": "close_punctuation",
        "Pi": "initial_punctuation",
        "Pf": "final_punctuation",
        "Po": "other_punctuation",
        "Sm": "math_symbol",
        "Sc": "currency_symbol",
        "Sk": "modifier_symbol",
        "So": "other_symbol",
        "Zs": "space",
        "Zl": "line_separator",
        "Zp": "paragraph_separator",
        "Cc": "control",
        "Cf": "format",
        "Co": "private_use",
        "Cn": "unassigned",
    }

    def classify_char(char: str) -> str:
        """Classify a character using Unicode categories.

        Common CJK script ranges get dedicated names (they would otherwise
        all collapse into the generic "other_letter" category). Unknown
        categories fall through to the raw two-letter category code.
        """
        if not char:
            return "empty"
        # Special handling for CJK — checked before the category lookup,
        # matching the precedence of the category map below.
        if "\u4e00" <= char <= "\u9fff":
            return "cjk_ideograph"
        elif "\u3040" <= char <= "\u309f":
            return "hiragana"
        elif "\u30a0" <= char <= "\u30ff":
            return "katakana"
        elif "\uac00" <= char <= "\ud7af":
            return "hangul"
        category = unicodedata.category(char)
        return CATEGORY_MAP.get(category, category)

    def _myers_backtrack(trace: List[List[int]], a: str, b: str) -> List[Edit]:
        """Back-tracking helper to materialise the edit script.

        Walks the per-layer frontiers recorded by ``myers_diff`` backwards
        from (len(a), len(b)) to (0, 0), emitting one Edit per character.
        """
        edits: List[Edit] = []
        n, m = len(a), len(b)
        x, y = n, m
        offset = len(trace[0]) // 2

        # Walk the layers backwards
        for d in range(len(trace) - 1, 0, -1):
            v = trace[d]
            k = x - y
            idx = k + offset

            # Determine the predecessor k'
            if k == -d or (k != d and v[idx - 1] < v[idx + 1]):
                k_prev = k + 1  # came from below (insertion)
            else:
                k_prev = k - 1  # came from right (deletion)

            x_prev = trace[d - 1][k_prev + offset]
            y_prev = x_prev - k_prev

            # Emit the matching "snake"
            while x > x_prev and y > y_prev:
                x -= 1
                y -= 1
                edits.append(Edit(Operation.EQUAL, x, x + 1, y, y + 1, a[x], b[y]))

            # Emit the single edit (INSERT or DELETE) that led to the snake
            if x_prev == x:  # insertion
                y -= 1
                edits.append(Edit(Operation.INSERT, x, x, y, y + 1, "", b[y]))
            else:  # deletion
                x -= 1
                edits.append(Edit(Operation.DELETE, x, x + 1, y, y, a[x], ""))

        # Leading snake (d = 0) – everything matched at the start
        while x > 0 and y > 0:
            x -= 1
            y -= 1
            edits.append(Edit(Operation.EQUAL, x, x + 1, y, y + 1, a[x], b[y]))

        # Any remaining leading insertions / deletions
        while x > 0:
            x -= 1
            edits.append(Edit(Operation.DELETE, x, x + 1, y, y, a[x], ""))
        while y > 0:
            y -= 1
            edits.append(Edit(Operation.INSERT, x, x, y, y + 1, "", b[y]))

        edits.reverse()
        return edits

    def myers_diff(a: str, b: str) -> List[Edit]:
        """
        Very fast Myers diff (O((N+M)·D) time, O(N+M) memory).

        Returns a list of Edit objects (DELETE / INSERT / EQUAL).
        """
        n, m = len(a), len(b)
        # Trivial cases: one side empty means a single bulk edit (or nothing).
        if n == 0:
            return [Edit(Operation.INSERT, 0, 0, 0, m, "", b)] if m else []
        if m == 0:
            return [Edit(Operation.DELETE, 0, n, 0, 0, a, "")] if n else []

        max_d = n + m
        offset = max_d  # map k ∈ [-max_d .. +max_d] → index
        v = [0] * (2 * max_d + 1)  # current frontier
        trace = []  # keeps a copy of v for every d

        # Forward phase – build the "trace" that will be backtracked
        for d in range(max_d + 1):
            v_next = v[:]  # copy *once* per layer
            for k in range(-d, d + 1, 2):
                idx = k + offset
                # Choosing the predecessor (insertion vs deletion)
                if k == -d or (k != d and v[idx - 1] < v[idx + 1]):
                    x = v[idx + 1]  # insertion (move down)
                else:
                    x = v[idx - 1] + 1  # deletion (move right)
                y = x - k

                # Greedy snake – march diagonally while chars match
                while x < n and y < m and a[x] == b[y]:
                    x += 1
                    y += 1

                v_next[idx] = x

                # Reached the end – stop early
                if x >= n and y >= m:
                    trace.append(v_next)
                    return _myers_backtrack(trace, a, b)

            trace.append(v_next)
            v = v_next

        # Unreachable: d ≤ n + m always terminates the search above.
        raise RuntimeError("diff failed")

    def classify_text(text: str) -> Dict[str, int]:
        """Count characters by classification."""
        if not text:
            return {}
        classifications = {}
        for char in text:
            char_class = classify_char(char)
            classifications[char_class] = classifications.get(char_class, 0) + 1
        return classifications

    def classify_edits(edits: List[Edit]) -> Dict[Operation, Dict[str, int]]:
        """
        Classify edit operations by character class.

        Returns a nested dictionary: {operation: {char_class: count}}
        """
        # Filter out EQUAL operations to save memory
        change_edits = [e for e in edits if e.operation != Operation.EQUAL]

        # Group all edits by operation type (not consecutive grouping)
        edits_by_op = {}
        for edit in change_edits:
            if edit.operation not in edits_by_op:
                edits_by_op[edit.operation] = []
            edits_by_op[edit.operation].append(edit)

        result = {}
        for op, edit_list in edits_by_op.items():
            combined_text = ""
            if op == Operation.DELETE:
                combined_text = "".join(e.old_text for e in edit_list)
            elif op == Operation.INSERT:
                combined_text = "".join(e.new_text for e in edit_list)
            result[op] = classify_text(combined_text)
        return result

    def calculate_change_metrics(
        original: str,
        edits: List[Edit],
        classifications: Dict[Operation, Dict[str, int]],
    ) -> Dict[str, Any]:
        """Calculate detailed change metrics including percentages.

        All percentages are relative to the original text length (overall)
        or the original per-class character count (per class). A class that
        only appears via insertions reports ``inf`` insertion percentage,
        which downstream formatting renders as "new".
        """
        metrics = {
            "total_original_chars": len(original),
            "total_deleted_chars": 0,
            "total_inserted_chars": 0,
            "deletion_percentage": 0.0,
            "insertion_percentage": 0.0,
            "net_change_percentage": 0.0,
            "char_class_metrics": {},
        }

        # Calculate total changes
        for edit in edits:
            if edit.operation == Operation.DELETE:
                metrics["total_deleted_chars"] += len(edit.old_text)
            elif edit.operation == Operation.INSERT:
                metrics["total_inserted_chars"] += len(edit.new_text)

        # Calculate percentages (guard against empty original text)
        if metrics["total_original_chars"] > 0:
            metrics["deletion_percentage"] = (
                metrics["total_deleted_chars"] / metrics["total_original_chars"]
            ) * 100
            metrics["insertion_percentage"] = (
                metrics["total_inserted_chars"] / metrics["total_original_chars"]
            ) * 100
            net_change = (
                metrics["total_inserted_chars"] - metrics["total_deleted_chars"]
            )
            metrics["net_change_percentage"] = (
                net_change / metrics["total_original_chars"]
            ) * 100

        # Get character classification of original text
        original_classifications = classify_text(original)

        # Calculate per-character-class metrics
        all_char_classes = set()
        for op_classes in classifications.values():
            all_char_classes.update(op_classes.keys())
        all_char_classes.update(original_classifications.keys())

        for char_class in all_char_classes:
            original_count = original_classifications.get(char_class, 0)
            deleted_count = classifications.get(Operation.DELETE, {}).get(char_class, 0)
            inserted_count = classifications.get(Operation.INSERT, {}).get(
                char_class, 0
            )

            class_metrics = {
                "original_count": original_count,
                "deleted_count": deleted_count,
                "inserted_count": inserted_count,
                "deletion_percentage": 0.0,
                "insertion_percentage": 0.0,
            }

            if original_count > 0:
                class_metrics["deletion_percentage"] = (
                    deleted_count / original_count
                ) * 100
                # Insertion percentage relative to original count of this class
                class_metrics["insertion_percentage"] = (
                    inserted_count / original_count
                ) * 100
            elif inserted_count > 0:
                # If there were none originally, show as new
                class_metrics["insertion_percentage"] = float("inf")

            metrics["char_class_metrics"][char_class] = class_metrics

        return metrics

    def escape_html(text: str) -> str:
        """Escape HTML and make whitespace visible."""
        # First escape HTML
        text = python_html.escape(text)
        # Make whitespace visible: middle dot, arrow, pilcrow
        ws_trans = str.maketrans({" ": "·", "\t": "→ ", "\n": "¶\n"})
        return text.translate(ws_trans)

    def generate_html_diff(
        edits: List[Edit], show_equal: bool = True, max_equal_length: int = 100
    ) -> str:
        """Generate HTML visualization of the diff with performance optimizations."""
        # Pre-allocate list for better performance
        html_parts = []

        # Group consecutive edits of the same type to reduce HTML tags
        grouped_edits = []
        current_group = []
        current_op = None

        for edit in edits:
            if (
                edit.operation == current_op and len(current_group) < 100
            ):  # Batch up to 100
                current_group.append(edit)
            else:
                if current_group:
                    grouped_edits.append((current_op, current_group))
                current_group = [edit]
                current_op = edit.operation

        if current_group:
            grouped_edits.append((current_op, current_group))

        # Process grouped edits, wrapping each run in a single styled span
        for op, group in grouped_edits:
            if op == Operation.DELETE:
                combined_text = "".join(e.old_text for e in group)
                escaped = escape_html(combined_text)
                html_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
            elif op == Operation.INSERT:
                combined_text = "".join(e.new_text for e in group)
                escaped = escape_html(combined_text)
                html_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
            elif op == Operation.EQUAL and show_equal:
                combined_text = "".join(e.old_text for e in group)
                # Truncate very long equal sections
                if len(combined_text) > max_equal_length:
                    start = escape_html(combined_text[: max_equal_length // 2])
                    end = escape_html(combined_text[-max_equal_length // 2 :])
                    omitted = len(combined_text) - max_equal_length
                    html_parts.append(
                        f'<span style="{EQUAL_STYLE}">{start}</span>'
                        f'<span style="{EQUAL_STYLE}font-style:italic;">'
                        f"...{omitted} chars omitted..."
                        f"</span>"
                        f'<span style="{EQUAL_STYLE}">{end}</span>'
                    )
                else:
                    escaped = escape_html(combined_text)
                    html_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')

        return f'<div style="{CONTAINER_STYLE}">{"".join(html_parts)}</div>'

    def generate_side_by_side_html(edits: List[Edit]) -> str:
        """Generate side-by-side HTML diff view."""
        old_parts = []
        new_parts = []

        for edit in edits:
            if edit.operation == Operation.DELETE:
                escaped = escape_html(edit.old_text)
                old_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
            elif edit.operation == Operation.INSERT:
                escaped = escape_html(edit.new_text)
                new_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
            elif edit.operation == Operation.EQUAL:
                escaped = escape_html(edit.old_text)
                old_parts.append(f"{escaped}")
                new_parts.append(f"{escaped}")

        return f'''<div style="display:flex; gap:16px;">
<div style="flex:1; min-width:0;">
<b>Original</b>
<div style="{CONTAINER_STYLE}">{"".join(old_parts)}</div>
</div>
<div style="flex:1; min-width:0;">
<b>Processed</b>
<div style="{CONTAINER_STYLE}">{"".join(new_parts)}</div>
</div>
</div>'''

    def generate_html_diff_fast(edits: List[Edit], context_chars: int = 5) -> str:
        """
        Ultra-fast HTML diff generation showing only changes with context.
        """
        html_parts = []

        # Filter to only show changes and surrounding context
        change_indices = [
            i for i, e in enumerate(edits) if e.operation != Operation.EQUAL
        ]

        if not change_indices:
            return f'<div style="{CONTAINER_STYLE}">No changes found.</div>'

        # Build ranges to show (change + context)
        ranges_to_show = []
        start = max(0, change_indices[0] - context_chars)
        end = min(len(edits), change_indices[0] + context_chars + 1)

        for idx in change_indices[1:]:
            if idx - end <= context_chars * 2:
                # Extend current range
                end = min(len(edits), idx + context_chars + 1)
            else:
                # Save current range and start new one
                ranges_to_show.append((start, end))
                start = max(0, idx - context_chars)
                end = min(len(edits), idx + context_chars + 1)
        ranges_to_show.append((start, end))

        # Generate HTML for ranges, separated by an ellipsis marker
        for i, (start, end) in enumerate(ranges_to_show):
            if i > 0:
                html_parts.append(
                    '<div style="text-align:center;color:#999;">...</div>'
                )
            for j in range(start, end):
                edit = edits[j]
                if edit.operation == Operation.DELETE:
                    escaped = escape_html(edit.old_text)
                    html_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
                elif edit.operation == Operation.INSERT:
                    escaped = escape_html(edit.new_text)
                    html_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
                else:  # EQUAL
                    escaped = escape_html(edit.old_text)
                    html_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')

        return f'<div style="{CONTAINER_STYLE}">{"".join(html_parts)}</div>'

    def generate_side_by_side_html_fast(
        edits: List[Edit], context_chars: int = 5
    ) -> str:
        """
        Fast side-by-side HTML diff generation showing only changes with context.
        """
        # Filter to only show changes and surrounding context
        change_indices = [
            i for i, e in enumerate(edits) if e.operation != Operation.EQUAL
        ]

        if not change_indices:
            return f'''<div style="display:flex; gap:16px;">
<div style="flex:1; min-width:0;">
<b>Original</b>
<div style="{CONTAINER_STYLE}">No changes found.</div>
</div>
<div style="flex:1; min-width:0;">
<b>Processed</b>
<div style="{CONTAINER_STYLE}">No changes found.</div>
</div>
</div>'''

        # Build ranges to show (change + context)
        ranges_to_show = []
        start = max(0, change_indices[0] - context_chars)
        end = min(len(edits), change_indices[0] + context_chars + 1)

        for idx in change_indices[1:]:
            if idx - end <= context_chars * 2:
                # Extend current range
                end = min(len(edits), idx + context_chars + 1)
            else:
                # Save current range and start new one
                ranges_to_show.append((start, end))
                start = max(0, idx - context_chars)
                end = min(len(edits), idx + context_chars + 1)
        ranges_to_show.append((start, end))

        # Generate HTML for ranges; both columns share the same separator
        old_parts = []
        new_parts = []

        for i, (start, end) in enumerate(ranges_to_show):
            if i > 0:
                separator = '<div style="text-align:center;color:#999;">...</div>'
                old_parts.append(separator)
                new_parts.append(separator)
            for j in range(start, end):
                edit = edits[j]
                if edit.operation == Operation.DELETE:
                    escaped = escape_html(edit.old_text)
                    old_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
                elif edit.operation == Operation.INSERT:
                    escaped = escape_html(edit.new_text)
                    new_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
                else:  # EQUAL
                    escaped = escape_html(edit.old_text)
                    old_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')
                    new_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')

        return f'''<div style="display:flex; gap:16px;">
<div style="flex:1; min-width:0;">
<b>Original</b>
<div style="{CONTAINER_STYLE}">{"".join(old_parts)}</div>
</div>
<div style="flex:1; min-width:0;">
<b>Processed</b>
<div style="{CONTAINER_STYLE}">{"".join(new_parts)}</div>
</div>
</div>'''

    def operation_to_past(op: Operation) -> str:
        """Return the past-tense English name of an edit operation.

        Fixed: ``str(op) + "d"`` produced "0d"/"2d" on Python 3.12, where
        IntEnum.__str__ returns the numeric value, not the member name.
        """
        if op == Operation.INSERT:
            return "inserted"
        if op == Operation.DELETE:
            return "deleted"
        return "unchanged"

    def format_diff_summary(
        edits: List[Edit],
        classifications: Dict[Operation, Dict[str, int]],
        metrics: Dict[str, Any],
    ) -> str:
        """Create a human-readable summary of the diff."""
        lines = ["## Diff Summary\n"]

        # Overall statistics
        lines.append("### Overall Statistics")
        lines.append(
            f"- **Original text**: {metrics['total_original_chars']:,} characters"
        )

        # Format deletions
        del_pct = format_percentage(metrics["deletion_percentage"])
        lines.append(
            f"- **Deletions**: {metrics['total_deleted_chars']:,} characters ({del_pct})"
        )

        # Format insertions
        ins_pct = format_percentage(metrics["insertion_percentage"])
        lines.append(
            f"- **Insertions**: {metrics['total_inserted_chars']:,} characters ({ins_pct})"
        )

        # Format net change (extra precision for near-zero values)
        net_pct = metrics["net_change_percentage"]
        if abs(net_pct) < 0.01:
            net_pct_str = f"{net_pct:+.3f}%"
        else:
            net_pct_str = f"{net_pct:+.1f}%"
        lines.append(
            f"- **Net change**: {net_pct_str} "
            f"({'increase' if metrics['net_change_percentage'] > 0 else 'decrease' if metrics['net_change_percentage'] < 0 else 'no change'})"
        )

        # Character classifications
        if classifications:
            lines.append("\n### Character Classifications")

            # Show changes by character class
            for op in [Operation.DELETE, Operation.INSERT]:
                if op in classifications and classifications[op]:
                    lines.append(f"\n**{operation_to_past(op).title()} Characters:**")
                    for char_class, count in sorted(
                        classifications[op].items(), key=lambda x: -x[1]
                    ):
                        lines.append(
                            f"- {char_class.replace('_', ' ').title()}: {count}"
                        )

        # Show percentage changes by character class
        lines.append("\n### Change Percentages by Character Class")

        # Sort by most changed (highest deletion or insertion percentage);
        # "inf" (brand-new class) is excluded from the sort key
        sorted_classes = sorted(
            metrics["char_class_metrics"].items(),
            key=lambda x: max(
                x[1]["deletion_percentage"],
                0
                if x[1]["insertion_percentage"] == float("inf")
                else x[1]["insertion_percentage"],
            ),
            reverse=True,
        )

        for char_class, class_metrics in sorted_classes:
            if (
                class_metrics["deleted_count"] > 0
                or class_metrics["inserted_count"] > 0
            ):
                class_name = char_class.replace("_", " ").title()

                # Format the line
                line_parts = [f"- **{class_name}**:"]

                if class_metrics["original_count"] > 0:
                    line_parts.append(
                        f"Original: {class_metrics['original_count']}"
                    )

                if class_metrics["deleted_count"] > 0:
                    line_parts.append(
                        f"Deleted: {class_metrics['deleted_count']} "
                        f"({class_metrics['deletion_percentage']:.1f}%)"
                    )

                if class_metrics["inserted_count"] > 0:
                    if class_metrics["insertion_percentage"] == float("inf"):
                        line_parts.append(
                            f"Inserted: {class_metrics['inserted_count']} (new)"
                        )
                    else:
                        line_parts.append(
                            f"Inserted: {class_metrics['inserted_count']} "
                            f"({class_metrics['insertion_percentage']:.1f}%)"
                        )

                lines.append(" | ".join(line_parts))

        return "\n".join(lines)

    def format_percentage(value: float, min_decimals: int = 1) -> str:
        """Format percentage with adaptive decimal places."""
        if value == 0:
            return "0%"
        elif value < 0.01:
            return f"{value:.3f}%"  # Show 3 decimals for very small values
        elif value < 0.1:
            return f"{value:.2f}%"  # Show 2 decimals for small values
        elif value < 1:
            return f"{value:.1f}%"  # Show 1 decimal for values < 1%
        else:
            return f"{value:.0f}%"  # No decimals for values >= 1%

    def classify_edits_with_chars(
        edits: List[Edit],
    ) -> Dict[Operation, Dict[str, Dict[str, int]]]:
        """
        Classify edit operations by character class and track character
        frequencies.

        Returns: {operation: {char_class: {char: count}}}
        """
        from collections import defaultdict, Counter

        # Filter out EQUAL operations
        change_edits = [e for e in edits if e.operation != Operation.EQUAL]

        # Track characters by operation and classification
        result = defaultdict(lambda: defaultdict(Counter))
        for edit in change_edits:
            text = (
                edit.old_text if edit.operation == Operation.DELETE else edit.new_text
            )
            for char in text:
                char_class = classify_char(char)
                result[edit.operation][char_class][char] += 1

        return dict(result)

    def get_top_chars(char_counter: Dict[str, int], n: int = 5) -> str:
        """Get top n characters by frequency, formatted for display."""
        if not char_counter:
            return "-"

        # Sort by frequency and take top n
        top_chars = sorted(char_counter.items(), key=lambda x: -x[1])[:n]

        # Format characters for display
        formatted_chars = []
        for char, _ in top_chars:
            if char == " ":
                formatted_chars.append("·")  # Middle dot for space
            elif char == "\n":
                formatted_chars.append("¶")  # Pilcrow for newline
            elif char == "\t":
                formatted_chars.append("→")  # Arrow for tab
            elif ord(char) < 32 or ord(char) == 127:
                formatted_chars.append(f"\\x{ord(char):02x}")  # Hex for control chars
            else:
                formatted_chars.append(char)

        return " ".join(formatted_chars)

    def create_summary_tables(
        edits: List[Edit],
        classifications: Dict[Operation, Dict[str, int]],
        metrics: Dict[str, Any],
    ) -> Dict[str, GT]:
        """Create great_tables tables for the diff summary.

        Returns three tables keyed "overall", "char_class" (None when there
        are no per-class changes) and "compact".
        """
        # Get detailed character data
        detailed_classifications = classify_edits_with_chars(edits)

        # Table 1: Overall Statistics
        overall_data = pd.DataFrame(
            {
                "Metric": [
                    "Original Length",
                    "Characters Deleted",
                    "Characters Inserted",
                    "Net Change",
                ],
                "Count": [
                    metrics["total_original_chars"],
                    metrics["total_deleted_chars"],
                    metrics["total_inserted_chars"],
                    metrics["total_inserted_chars"] - metrics["total_deleted_chars"],
                ],
                "Percentage": [
                    "-",
                    format_percentage(metrics["deletion_percentage"]),
                    format_percentage(metrics["insertion_percentage"]),
                    f"{metrics['net_change_percentage']:+.3f}%"
                    if abs(metrics["net_change_percentage"]) < 0.01
                    else f"{metrics['net_change_percentage']:+.1f}%",
                ],
            }
        )

        overall_table = (
            GT(overall_data)
            .tab_header(
                title="Text Change Summary",
                subtitle=f"Total edits: {len([e for e in edits if e.operation != Operation.EQUAL])}",
            )
            .fmt_number(columns="Count", decimals=0, use_seps=True)
            .tab_style(
                style=[style.fill(color="#f0f0f0"), style.text(weight="bold")],
                locations=loc.body(rows=[3]),
            )
            .cols_align(align="center", columns=["Count", "Percentage"])
            .opt_stylize(style=1, color="blue")
        )

        # Table 2: Character Class Changes with top characters
        char_class_data = []

        # Get all character classes
        all_classes = set()
        for op_classes in classifications.values():
            all_classes.update(op_classes.keys())
        all_classes.update(metrics["char_class_metrics"].keys())

        # Build rows
        for char_class in sorted(all_classes):
            class_metrics = metrics["char_class_metrics"].get(char_class, {})

            # Get top characters for this class
            del_chars = detailed_classifications.get(Operation.DELETE, {}).get(
                char_class, {}
            )
            ins_chars = detailed_classifications.get(Operation.INSERT, {}).get(
                char_class, {}
            )

            row = {
                "Character Class": char_class.replace("_", " ").title(),
                "Original": class_metrics.get("original_count", 0),
                "Deleted": class_metrics.get("deleted_count", 0),
                "Top Deleted": get_top_chars(del_chars, 5),
                "Inserted": class_metrics.get("inserted_count", 0),
                "Top Inserted": get_top_chars(ins_chars, 5),
                "Del %": format_percentage(class_metrics.get("deletion_percentage", 0))
                if class_metrics.get("deletion_percentage", 0) > 0
                else "-",
                "Ins %": (
                    "new"
                    if class_metrics.get("insertion_percentage", 0) == float("inf")
                    else format_percentage(class_metrics.get("insertion_percentage", 0))
                    if class_metrics.get("insertion_percentage", 0) > 0
                    else "-"
                ),
            }

            # Only include rows with changes
            if row["Deleted"] > 0 or row["Inserted"] > 0:
                char_class_data.append(row)

        if char_class_data:
            char_class_df = pd.DataFrame(char_class_data)
            char_class_table = (
                GT(char_class_df)
                .tab_header(title="Changes by Character Classification")
                .fmt_number(
                    columns=["Original", "Deleted", "Inserted"],
                    decimals=0,
                    use_seps=True,
                )
                .tab_style(
                    style=style.fill(color="#ffcccc"),
                    locations=loc.body(columns=["Deleted", "Top Deleted"]),
                )
                .tab_style(
                    style=style.fill(color="#ccffcc"),
                    locations=loc.body(columns=["Inserted", "Top Inserted"]),
                )
                .tab_style(
                    style=style.text(font="monospace"),
                    locations=loc.body(columns=["Top Deleted", "Top Inserted"]),
                )
                .cols_align(
                    align="center",
                    columns=["Original", "Deleted", "Inserted", "Del %", "Ins %"],
                )
                .cols_align(align="left", columns=["Top Deleted", "Top Inserted"])
                .tab_spanner(
                    label="Counts", columns=["Original", "Deleted", "Inserted"]
                )
                .tab_spanner(
                    label="Characters", columns=["Top Deleted", "Top Inserted"]
                )
                .tab_spanner(label="Percentages", columns=["Del %", "Ins %"])
                .cols_width(
                    {
                        "Character Class": "20%",
                        "Original": "10%",
                        "Deleted": "10%",
                        "Top Deleted": "15%",
                        "Inserted": "10%",
                        "Top Inserted": "15%",
                        "Del %": "10%",
                        "Ins %": "10%",
                    }
                )
                .opt_stylize(style=1, color="blue")
            )
        else:
            char_class_table = None

        # Table 3: Compact Combined View
        compact_data = []

        # Add summary row
        compact_data.append(
            {
                "Type": "Total",
                "Deleted": metrics["total_deleted_chars"],
                "Inserted": metrics["total_inserted_chars"],
                "Net": metrics["total_inserted_chars"]
                - metrics["total_deleted_chars"],
                "Change": f"{metrics['net_change_percentage']:+.3f}%"
                if abs(metrics["net_change_percentage"]) < 0.01
                else f"{metrics['net_change_percentage']:+.0f}%",
            }
        )

        # Add top character classes (sorted by total change)
        class_changes = []
        for char_class, class_metrics in metrics["char_class_metrics"].items():
            if (
                class_metrics["deleted_count"] > 0
                or class_metrics["inserted_count"] > 0
            ):
                class_changes.append(
                    {
                        "Type": char_class.replace("_", " ").title(),
                        "Deleted": class_metrics["deleted_count"],
                        "Inserted": class_metrics["inserted_count"],
                        "Net": class_metrics["inserted_count"]
                        - class_metrics["deleted_count"],
                        # Temporary sort key; replaced with a display string below
                        "Change": class_metrics["deleted_count"]
                        + class_metrics["inserted_count"],
                    }
                )

        # Sort by total change and take top 5
        class_changes.sort(key=lambda x: x["Change"], reverse=True)
        for item in class_changes[:5]:
            item["Change"] = f"{item['Net']:+d}" if item["Net"] != 0 else "±0"
            compact_data.append(item)

        compact_df = pd.DataFrame(compact_data)

        compact_table = (
            GT(compact_df)
            .tab_header(title="Edit Summary - Compact View")
            .fmt_number(
                columns=["Deleted", "Inserted", "Net"], decimals=0, use_seps=True
            )
            .tab_style(
                style=[
                    style.fill(color="#e8e8e8"),
                    style.text(weight="bold"),
                    style.borders(sides=["top", "bottom"], color="#666", weight="2px"),
                ],
                locations=loc.body(rows=[0]),
            )
            .tab_style(
                style=style.text(color="#880000"),
                locations=loc.body(columns=["Deleted"]),
            )
            .tab_style(
                style=style.text(color="#008800"),
                locations=loc.body(columns=["Inserted"]),
            )
            .cols_align(
                align="center", columns=["Deleted", "Inserted", "Net", "Change"]
            )
            .cols_width(
                {
                    "Type": "40%",
                    "Deleted": "15%",
                    "Inserted": "15%",
                    "Net": "15%",
                    "Change": "15%",
                }
            )
            .opt_stylize(style=1, color="cyan")
        )

        return {
            "overall": overall_table,
            "char_class": char_class_table,
            "compact": compact_table,
        }

    def create_operation_matrix_table(
        edits: List[Edit], classifications: Dict[Operation, Dict[str, int]]
    ) -> GT:
        """Create a matrix view of operations by character class."""
        # Get all character classes
        all_classes = set()
        for op_classes in classifications.values():
            all_classes.update(op_classes.keys())

        # Build matrix data
        matrix_data = []
        for char_class in sorted(all_classes):
            row = {
                "Character Type": char_class.replace("_", " ").title(),
                "Deletions": classifications.get(Operation.DELETE, {}).get(
                    char_class, 0
                ),
                "Insertions": classifications.get(Operation.INSERT, {}).get(
                    char_class, 0
                ),
                "Balance": (
                    classifications.get(Operation.INSERT, {}).get(char_class, 0)
                    - classifications.get(Operation.DELETE, {}).get(char_class, 0)
                ),
            }
            matrix_data.append(row)

        # Sort by total changes
        matrix_data.sort(key=lambda x: x["Deletions"] + x["Insertions"], reverse=True)

        # Convert to DataFrame
        matrix_df = pd.DataFrame(matrix_data)

        # Calculate max values for color domains (default=1 avoids empty-seq errors)
        max_del = max((r["Deletions"] for r in matrix_data), default=1)
        max_ins = max((r["Insertions"] for r in matrix_data), default=1)
        max_balance = max((abs(r["Balance"]) for r in matrix_data), default=1)

        matrix_table = (
            GT(matrix_df)
            .tab_header(title="Operation Matrix by Character Type")
            .fmt_number(columns=["Deletions", "Insertions", "Balance"], decimals=0)
            .data_color(
                columns=["Deletions"],
                palette=["white", "#ffcccc"],
                domain=[0, max_del],
            )
            .data_color(
                columns=["Insertions"],
                palette=["white", "#ccffcc"],
                domain=[0, max_ins],
            )
            .data_color(
                columns=["Balance"],
                palette=["#ffcccc", "white", "#ccffcc"],
                domain=[-max_balance, max_balance],
            )
            .cols_align(align="center", columns=["Deletions", "Insertions", "Balance"])
            .opt_stylize(style=2, color="gray")
        )

        return matrix_table

    def is_long_diff(edits: List[Edit], original: str) -> bool:
        """Determine if a diff should use fast rendering."""
        return len(edits) > 1000 or len(original) > 10000

    def analyze_text_changes(
        original: str,
        processed: str,
    ) -> Dict[str, Any]:
        """
        Main function to analyze changes between two texts.

        Returns a dict with the raw edit script, per-class classifications,
        change metrics, a markdown summary and pre-built GT tables.
        """
        edits = myers_diff(original, processed)
        classifications = classify_edits(edits)
        metrics = calculate_change_metrics(original, edits, classifications)
        summary = format_diff_summary(edits, classifications, metrics)

        result = {
            "edits": edits,
            "classifications": classifications,
            "metrics": metrics,
            "summary": summary,
            "tables": create_summary_tables(edits, classifications, metrics),
            "matrix_table": create_operation_matrix_table(edits, classifications),
        }
        return result

    def render_html_diff(
        edits: List[Edit],
        original: str,
        context_chars: int = 5,
        side_by_side: bool = False,
        use_fast_html: bool | None = None,
    ) -> str:
        """
        Unified function to render HTML diffs with automatic optimization.

        Args:
            edits: List of Edit operations
            original: Original text (for length checking)
            context_chars: Number of context edits to show in fast mode
            side_by_side: Whether to use side-by-side view
            use_fast_html: Force fast mode (None for auto-detect)

        Returns:
            HTML string of the diff
        """
        if use_fast_html is None:
            use_fast_html = is_long_diff(edits, original)

        if use_fast_html:
            if side_by_side:
                return generate_side_by_side_html_fast(
                    edits, context_chars=context_chars
                )
            else:
                return generate_html_diff_fast(edits, context_chars=context_chars)
        else:
            if side_by_side:
                # For non-fast mode, still fall back to the fast renderer for
                # larger edit scripts. Fixed: the previous call passed a
                # non-existent ``max_length`` keyword, raising TypeError.
                if len(edits) > 500:
                    return generate_side_by_side_html_fast(edits)
                else:
                    return generate_side_by_side_html(edits)
            else:
                return generate_html_diff(edits, show_equal=True, max_equal_length=200)

    return analyze_text_changes, render_html_diff


@app.cell
def _(mo):
    o_file_upload = mo.ui.file(label="Original text", kind="area")
    p_file_upload = mo.ui.file(label="Preprocessed text", kind="area")
    file_stack = mo.hstack([o_file_upload, p_file_upload], widths="equal")
    return file_stack, o_file_upload, p_file_upload


@app.cell
def _(mo):
    o_textbox = mo.ui.text_area(label="Original text", full_width=True)
    p_textbox = mo.ui.text_area(label="Preprocessed text", full_width=True)
    text_stack = mo.hstack([o_textbox, p_textbox], widths="equal")
    return o_textbox, p_textbox, text_stack


@app.cell
def _(file_stack, mo, text_stack):
    mo.ui.tabs({"Text": text_stack, "File": file_stack})
    return


@app.function
def check_text_similarity(text1: str, text2: str, threshold: float = 0.1) -> bool:
    """Check if texts are similar enough based on length and character overlap."""
    if not text1 or not text2:
        return False
    return len(set(text1) & set(text2)) / len(
        set(text1) | set(text2)
    ) >= threshold and abs(len(text1) - len(text2)) / max(len(text1), len(text2)) <= (
        1 - threshold
    )


@app.cell
def _(mo, o_file_upload, o_textbox, p_file_upload, p_textbox):
    from charset_normalizer import detect

    def detect_encoding(b: bytes) -> str:
        result = detect(b)
        return result["encoding"]

    # Fallback sample texts used when neither a file nor textbox is provided.
    o_text, p_text = (
        "Example text will be used if none provided!",
        "Example Text will be used, if none provided.",
    )

    try:
        if o_file_upload.contents():
            encoding = detect_encoding(o_file_upload.contents())
            try:
                o_text = o_file_upload.contents().decode(encoding)
            except UnicodeDecodeError:
                o_text = o_file_upload.contents().decode("utf-8")
        elif o_textbox.value:
            o_text = o_textbox.value

        if p_file_upload.contents():
            # Fixed: encoding was previously detected from the *original*
            # upload (o_file_upload) when decoding the processed file.
            encoding = detect_encoding(p_file_upload.contents())
            try:
                p_text = p_file_upload.contents().decode(encoding)
            except UnicodeDecodeError:
                p_text = p_file_upload.contents().decode("utf-8")
        elif p_textbox.value:
            p_text = p_textbox.value
    except UnicodeDecodeError:
        mo.stop(
            True,
            mo.md("Error decoding files. Please try UTF-8.").callout(kind="danger"),
        )

    mo.stop(
        not check_text_similarity(o_text, p_text),
        mo.md(
            f"Texts are too dissimilar! Aborting comparison.\n\n{o_text[:50]}\n\n{p_text[:50]}"
        ).callout(kind="danger"),
    )
    return o_text, p_text


@app.cell
def _(analyze_text_changes, o_text, p_text):
    results = analyze_text_changes(o_text, p_text)
    return (results,)


@app.cell
def _(mo, results):
    results_tables = mo.vstack(
        [
            results["tables"]["overall"],
            results["tables"]["char_class"],
            results["tables"]["compact"],
        ]
    )
    return (results_tables,)


@app.cell
def _(mo, o_text, render_html_diff, results, results_tables):
    diff_view = mo.ui.tabs(
        {
            "Combined diff": mo.Html(
                render_html_diff(
                    results["edits"],
                    o_text,
                )
            ),
            "Side-by-side diff": mo.Html(
                render_html_diff(
                    results["edits"],
                    o_text,
                    side_by_side=True,
                )
            ),
        }
    )
    mo.md(f"""
    # Results

    {results_tables}

    {diff_view}
    """)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()