# Source: ttm-webapp-hf / pipeline / differential_viz.py
# Commit edd4b9d (daniel-wojahn): cleanup and expansion of structural analysis
"""
Differential visualization enhancements for Tibetan legal manuscript analysis.
Provides a Plotly summary table of structural changes per text pair and
HTML / JSON / Markdown change-detection reports.
"""
import plotly.graph_objects as go
from typing import Dict, List
import pandas as pd
from .structural_analysis import detect_structural_changes, generate_structural_alignment
def create_differential_heatmap(texts_dict: Dict[str, str],
                                chapter_key: str,
                                metric_results: pd.DataFrame,
                                highlight_threshold: float = 0.7) -> go.Figure:
    """
    Create a summary table of structural changes for every text pair.

    Despite the name, the returned figure holds a single ``go.Table`` (not a
    heatmap trace) with one row per pair found in ``metric_results``.

    Args:
        texts_dict: Dictionary mapping text names to their content. Names
            missing from the dictionary are analyzed as empty strings.
        chapter_key: Chapter identifier being analyzed (copied into each row).
        metric_results: DataFrame with similarity metrics; only its
            'Text Pair' column is read, and labels must look like "A vs B".
        highlight_threshold: Threshold for highlighting significant changes.
            Currently not used by this function; kept for API compatibility.

    Returns:
        A Plotly figure titled "Structural Analysis Summary".

    Raises:
        KeyError: If ``metric_results`` has no 'Text Pair' column.
    """
    columns = ['Text Pair', 'Chapter', 'Total Changes', 'Modifications',
               'Insertions/Deletions', 'Alignment Quality', 'Significant Differences']

    summary_table = []
    for pair in metric_results['Text Pair'].unique():
        # Skip NaN / non-string labels instead of failing on .split().
        if not isinstance(pair, str):
            continue
        names = pair.split(' vs ')
        if len(names) != 2:
            continue
        text1_name, text2_name = names

        text1_content = texts_dict.get(text1_name, '')
        text2_content = texts_dict.get(text2_name, '')

        # Structural analysis is delegated to the structural_analysis module.
        changes = detect_structural_changes(text1_content, text2_content)
        alignment = generate_structural_alignment(text1_content, text2_content)

        insertions = len(changes['insertions'])
        deletions = len(changes['deletions'])
        modifications = len(changes['modifications'])

        # Ratio of matches to the combined segment count; max(..., 1) guards
        # against division by zero when both texts yield no segments.
        segment_total = len(alignment['segments1']) + len(alignment['segments2'])
        alignment_quality = len(alignment['matches']) / max(segment_total, 1)

        # A modification counts as significant when its original is longer
        # than 10 characters.
        significant = sum(
            1 for mod in changes['modifications'] if len(mod['original']) > 10
        )

        summary_table.append({
            'Text Pair': pair,
            'Chapter': chapter_key,
            'Total Changes': insertions + deletions + modifications,
            'Modifications': modifications,
            'Insertions/Deletions': insertions + deletions,
            # ".1%" scales the 0-1 ratio by 100; the previous ".1f" + "%"
            # displayed e.g. 0.4 as "0.4%" instead of "40.0%".
            'Alignment Quality': f"{alignment_quality:.1%}",
            'Significant Differences': significant,
        })

    # Passing explicit columns keeps the lookups below valid when no pair
    # could be analyzed (an empty list would give a column-less frame).
    summary_df = pd.DataFrame(summary_table, columns=columns)

    row_colors = ['lightgrey' if i % 2 == 0 else 'white'
                  for i in range(len(summary_df))]

    fig = go.Figure(data=[go.Table(
        header=dict(
            values=columns,
            font=dict(size=12, color='white'),
            fill_color='darkblue',
            align='left'
        ),
        cells=dict(
            values=[summary_df[name] for name in columns],
            font=dict(size=11),
            align='left',
            # A flat list is interpreted as one color per COLUMN; nesting it
            # makes Plotly apply the list row by row to every column.
            fill_color=[row_colors] if row_colors else 'white'
        )
    )])
    fig.update_layout(
        title="Structural Analysis Summary",
        height=400,
        margin=dict(l=10, r=10, t=40, b=10)
    )
    return fig
def create_change_detection_report(texts_dict: Dict[str, str],
                                   chapter_key: str,
                                   output_format: str = 'html') -> str:
    """
    Build a change detection report covering every pair of texts in a chapter.

    Each text is compared with every text that follows it in the dictionary's
    key order, so each unordered pair is reported exactly once.

    Args:
        texts_dict: Dictionary mapping text names to content
        chapter_key: Chapter identifier
        output_format: Format for output ('html', 'json', 'markdown'); any
            value other than 'html' or 'json' produces markdown.

    Returns:
        The rendered report as a string.
    """
    from .structural_analysis import generate_differential_report

    names = list(texts_dict.keys())
    count = len(names)

    # Upper-triangle walk over the name list: (0,1), (0,2), ..., (1,2), ...
    pair_reports = [
        generate_differential_report(
            texts_dict[names[first]],
            texts_dict[names[second]],
            names[first],
            names[second],
        )
        for first in range(count)
        for second in range(first + 1, count)
    ]

    if output_format == 'html':
        return create_html_report(pair_reports, chapter_key)
    if output_format == 'json':
        import json
        return json.dumps(pair_reports, indent=2, ensure_ascii=False)
    return create_markdown_report(pair_reports, chapter_key)
def create_html_report(reports: List[Dict], chapter_key: str) -> str:
    """
    Create HTML report for structural analysis.

    Args:
        reports: One dictionary per compared text pair. Each must provide
            'file1', 'file2', 'scores' (with 'structural_similarity' and
            'alignment_score') and 'changes' (with 'insertions', 'deletions'
            and 'modifications' lists).
        chapter_key: Chapter identifier shown in the title and main heading.

    Returns:
        A complete HTML document as a string.

    Raises:
        KeyError: If a report lacks one of the keys listed above.
    """
    # Imported locally under its own name so it cannot clash with variables
    # called ``html``.
    from html import escape

    # File names and chapter keys come from user-supplied data; escape them
    # so characters such as "<" or "&" cannot break or inject markup.
    safe_chapter = escape(str(chapter_key))

    # The charset declaration keeps non-ASCII (e.g. Tibetan) text readable
    # when the report is saved to disk and opened directly in a browser.
    parts = [f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Structural Analysis Report - Chapter {safe_chapter}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.report {{ max-width: 1200px; margin: 0 auto; }}
.comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
.changes {{ display: flex; gap: 20px; }}
.change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
.insertion {{ background-color: #e8f5e8; }}
.deletion {{ background-color: #ffe8e8; }}
.modification {{ background-color: #fff3e0; }}
.highlight {{ background-color: yellow; padding: 2px 4px; }}
</style>
</head>
<body>
<div class="report">
<h1>Structural Analysis Report - Chapter {safe_chapter}</h1>
"""]

    for report in reports:
        scores = report['scores']
        changes = report['changes']
        insertions = changes['insertions']
        deletions = changes['deletions']
        modifications = changes['modifications']
        file1 = escape(str(report['file1']))
        file2 = escape(str(report['file2']))
        parts.append(f"""
<div class="comparison">
<h2>{file1} vs {file2}</h2>
<div class="scores">
<p><strong>Structural Similarity:</strong> {scores['structural_similarity']:.2f}</p>
<p><strong>Alignment Score:</strong> {scores['alignment_score']:.2f}</p>
</div>
<div class="changes">
<div class="change-type insertion">
<h3>Insertions ({len(insertions)})</h3>
{format_changes_html(insertions)}
</div>
<div class="change-type deletion">
<h3>Deletions ({len(deletions)})</h3>
{format_changes_html(deletions)}
</div>
<div class="change-type modification">
<h3>Modifications ({len(modifications)})</h3>
{format_changes_html(modifications, is_modification=True)}
</div>
</div>
</div>
""")

    parts.append("""
</div>
</body>
</html>
""")
    # Join once instead of growing a string with repeated "+=".
    return "".join(parts)
def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
    """
    Format changes for HTML display.

    Args:
        changes: Change dictionaries. Modifications are read through their
            'original' and 'replacement' keys, all other changes through
            'word'; missing keys render as empty text.
        is_modification: Render each entry as "original -> replacement"
            instead of a single highlighted word.

    Returns:
        An HTML fragment showing at most the first five changes, followed by
        a "... and N more" note when entries were left out. An empty input
        yields a "No changes detected." paragraph.
    """
    # Imported locally under its own name so it cannot clash with variables
    # called ``html``.
    from html import escape

    if not changes:
        return "<p>No changes detected.</p>"

    shown_limit = 5  # Limit to first 5 for brevity
    parts = []
    for change in changes[:shown_limit]:
        # Values originate from manuscript text; escape them so "<", ">" and
        # "&" are displayed literally rather than parsed as markup.
        if is_modification:
            original = escape(str(change.get('original', '')))
            replacement = escape(str(change.get('replacement', '')))
            # "&rarr;" renders a right arrow regardless of file encoding;
            # the previous literal had been corrupted into mojibake.
            parts.append(f"""
<div class="change">
<span class="highlight">{original}</span> &rarr;
<span class="highlight">{replacement}</span>
</div>
""")
        else:
            word = escape(str(change.get('word', '')))
            parts.append(f"""
<div class="change">
<span class="highlight">{word}</span>
</div>
""")

    hidden = len(changes) - shown_limit
    if hidden > 0:
        parts.append(f"<p>... and {hidden} more</p>")
    return "".join(parts)
def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
    """
    Create markdown report for structural analysis.

    Args:
        reports: One dictionary per compared text pair. Each must provide
            'file1', 'file2', 'scores' (with 'structural_similarity' and
            'alignment_score') and 'changes' (with 'insertions', 'deletions'
            and 'modifications' lists).
        chapter_key: Chapter identifier shown in the top-level heading.

    Returns:
        The markdown document as a string. For each pair it lists the scores
        and change counts, plus up to three modifications when any exist.

    Raises:
        KeyError: If a report lacks one of the keys listed above.
    """
    # Written as an escape so the arrow survives re-encoding of this file;
    # the previous literal had been corrupted into mojibake.
    arrow = "\u2192"

    lines = [f"# Structural Analysis Report - Chapter {chapter_key}\n\n"]
    for report in reports:
        scores = report['scores']
        changes = report['changes']
        modifications = changes['modifications']

        lines.append(f"## {report['file1']} vs {report['file2']}\n\n")
        lines.append(f"- **Structural Similarity**: {scores['structural_similarity']:.2f}\n")
        lines.append(f"- **Alignment Score**: {scores['alignment_score']:.2f}\n")
        lines.append(f"- **Insertions**: {len(changes['insertions'])}\n")
        lines.append(f"- **Deletions**: {len(changes['deletions'])}\n")
        lines.append(f"- **Modifications**: {len(modifications)}\n\n")

        if modifications:
            lines.append("### Significant Modifications:\n")
            lines.extend(
                f"- **{mod.get('original', '')}** {arrow} **{mod.get('replacement', '')}**\n"
                for mod in modifications[:3]
            )
            # Blank line so the next pair's heading does not abut this list.
            lines.append("\n")

    # Join once instead of growing a string with repeated "+=".
    return "".join(lines)