Spaces:
Sleeping
Sleeping
Commit
·
988e855
1
Parent(s):
bda2b5b
tfidf remove; css fixes
Browse files- Tibetan Text Metrics Report.html +553 -0
- app.py +18 -22
- pipeline/differential_viz.py +53 -70
- pipeline/metrics.py +2 -48
- pipeline/stopwords_bo.py +1 -1
- pipeline/stopwords_lite_bo.py +1 -1
Tibetan Text Metrics Report.html
ADDED
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
<!DOCTYPE html>
|
3 |
+
<html>
|
4 |
+
<head>
|
5 |
+
<title>Structural Analysis Report - Chapter all_chapters</title>
|
6 |
+
<style>
|
7 |
+
body { font-family: Arial, sans-serif; margin: 20px; }
|
8 |
+
.report { max-width: 1200px; margin: 0 auto; }
|
9 |
+
.comparison { border: 1px solid #ddd; margin: 20px 0; padding: 15px; }
|
10 |
+
.changes { display: flex; gap: 20px; }
|
11 |
+
.change-type { flex: 1; padding: 10px; border: 1px solid #eee; }
|
12 |
+
.insertion { background-color: #e8f5e8; }
|
13 |
+
.deletion { background-color: #ffe8e8; }
|
14 |
+
.modification { background-color: #fff3e0; }
|
15 |
+
.highlight { background-color: yellow; padding: 2px 4px; }
|
16 |
+
</style>
|
17 |
+
</head>
|
18 |
+
<body>
|
19 |
+
<div class="report">
|
20 |
+
<h1>Structural Analysis Report - Chapter all_chapters</h1>
|
21 |
+
|
22 |
+
<div class="comparison">
|
23 |
+
<h2>Bailey.txt vs Dolanji_16.txt</h2>
|
24 |
+
<div class="scores">
|
25 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
26 |
+
<p><strong>Alignment Score:</strong> 0.03</p>
|
27 |
+
</div>
|
28 |
+
|
29 |
+
<div class="changes">
|
30 |
+
<div class="change-type insertion">
|
31 |
+
<h3>Insertions (889)</h3>
|
32 |
+
|
33 |
+
<div class="change">
|
34 |
+
<span class="highlight">དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span>
|
35 |
+
</div>
|
36 |
+
|
37 |
+
<div class="change">
|
38 |
+
<span class="highlight">རྒྱལ་རིགས་བསྟན་བཅོས་དྲངས་པ་ལས།</span>
|
39 |
+
</div>
|
40 |
+
|
41 |
+
<div class="change">
|
42 |
+
<span class="highlight">འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span>
|
43 |
+
</div>
|
44 |
+
|
45 |
+
<div class="change">
|
46 |
+
<span class="highlight">ཐོག་མ་ནས་དྲག་འདུལ་མི་བརྩོམ་པར།</span>
|
47 |
+
</div>
|
48 |
+
|
49 |
+
<div class="change">
|
50 |
+
<span class="highlight">གོ་བ་བསྐོན།</span>
|
51 |
+
</div>
|
52 |
+
<p>... and 884 more</p>
|
53 |
+
</div>
|
54 |
+
<div class="change-type deletion">
|
55 |
+
<h3>Deletions (681)</h3>
|
56 |
+
|
57 |
+
<div class="change">
|
58 |
+
<span class="highlight">གཡུལ་ཕན་ཚུན་མཉམ་པའི་དགྲ་འམ།</span>
|
59 |
+
</div>
|
60 |
+
|
61 |
+
<div class="change">
|
62 |
+
<span class="highlight">གསེར་ཡིག་གི་འགྲུལ་བཙུགས་ཏེ།</span>
|
63 |
+
</div>
|
64 |
+
|
65 |
+
<div class="change">
|
66 |
+
<span class="highlight">ཞི་རྒྱས་དབང་དྲག་གང་འགྲོ་གང་ཟབ་བལྟ་དགོས་པ་བཞིན་མི་སྣ་གསེར་ཡིག་གིས་ཀྱང་།</span>
|
67 |
+
</div>
|
68 |
+
|
69 |
+
<div class="change">
|
70 |
+
<span class="highlight">རང་གི་དཔོན་ཁུང་གི་དོན་བསྒྲུབ་བྱ་གཙོ་བོར་གཟུང་།</span>
|
71 |
+
</div>
|
72 |
+
|
73 |
+
<div class="change">
|
74 |
+
<span class="highlight">དྲག་པོ་མཐའ་སྐྱེལ་དུ་སོང་ན་ལེགས་ཉེས་སྤྱི་མར་ཡོད་པའི།</span>
|
75 |
+
</div>
|
76 |
+
<p>... and 676 more</p>
|
77 |
+
</div>
|
78 |
+
<div class="change-type modification">
|
79 |
+
<h3>Modifications (203)</h3>
|
80 |
+
|
81 |
+
<div class="change">
|
82 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི་དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span> →
|
83 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི།</span>
|
84 |
+
</div>
|
85 |
+
|
86 |
+
<div class="change">
|
87 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་དྲངས་པར།</span> →
|
88 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ།</span>
|
89 |
+
</div>
|
90 |
+
|
91 |
+
<div class="change">
|
92 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམ་པ་དང་།</span> →
|
93 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞམ་པ་དང་།</span>
|
94 |
+
</div>
|
95 |
+
|
96 |
+
<div class="change">
|
97 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱེད།</span> →
|
98 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱ།</span>
|
99 |
+
</div>
|
100 |
+
|
101 |
+
<div class="change">
|
102 |
+
<span class="highlight">ཅེས་གསུང་པ་ལྟར།</span> →
|
103 |
+
<span class="highlight">ཅེས་གསུངས་པ་ལྟར།</span>
|
104 |
+
</div>
|
105 |
+
<p>... and 198 more</p>
|
106 |
+
</div>
|
107 |
+
</div>
|
108 |
+
</div>
|
109 |
+
|
110 |
+
<div class="comparison">
|
111 |
+
<h2>Bailey.txt vs Leiden_16.txt</h2>
|
112 |
+
<div class="scores">
|
113 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
114 |
+
<p><strong>Alignment Score:</strong> 0.04</p>
|
115 |
+
</div>
|
116 |
+
|
117 |
+
<div class="changes">
|
118 |
+
<div class="change-type insertion">
|
119 |
+
<h3>Insertions (807)</h3>
|
120 |
+
|
121 |
+
<div class="change">
|
122 |
+
<span class="highlight">དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span>
|
123 |
+
</div>
|
124 |
+
|
125 |
+
<div class="change">
|
126 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐའ་སྐྱེལ་ཡ་འུད་མི་བསྲེ་བར་འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span>
|
127 |
+
</div>
|
128 |
+
|
129 |
+
<div class="change">
|
130 |
+
<span class="highlight">བྱ་མ་འཕུར་བ་སྒོ་ང་ལོན་པའི་དཔེ་ལྟར།</span>
|
131 |
+
</div>
|
132 |
+
|
133 |
+
<div class="change">
|
134 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམ་པའི་དོན།</span>
|
135 |
+
</div>
|
136 |
+
|
137 |
+
<div class="change">
|
138 |
+
<span class="highlight">རྒྱལ་སྲིད་ལྟ་བུའི་སྟོབས་དང་གཡུལ་ཕན་ཚུན་སྙོམས་པའི་དགྲ་འམ།</span>
|
139 |
+
</div>
|
140 |
+
<p>... and 802 more</p>
|
141 |
+
</div>
|
142 |
+
<div class="change-type deletion">
|
143 |
+
<h3>Deletions (706)</h3>
|
144 |
+
|
145 |
+
<div class="change">
|
146 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐར་སྐྱེལ་ཡ་འུད་དང་མི་བསྲེ་བར་འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span>
|
147 |
+
</div>
|
148 |
+
|
149 |
+
<div class="change">
|
150 |
+
<span class="highlight">བྱ་མ་འཕུར་སྒོང་ང་ལོན་པའི་དཔེ་ལྟར།</span>
|
151 |
+
</div>
|
152 |
+
|
153 |
+
<div class="change">
|
154 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞལ་པའི་དོན།</span>
|
155 |
+
</div>
|
156 |
+
|
157 |
+
<div class="change">
|
158 |
+
<span class="highlight">རྒྱལ་སྲིད་ལྟ་བུའི་སྟོབས་དང་།</span>
|
159 |
+
</div>
|
160 |
+
|
161 |
+
<div class="change">
|
162 |
+
<span class="highlight">གཡུལ་ཕན་ཚུན་མཉམ་པའི་དགྲ་འམ།</span>
|
163 |
+
</div>
|
164 |
+
<p>... and 701 more</p>
|
165 |
+
</div>
|
166 |
+
<div class="change-type modification">
|
167 |
+
<h3>Modifications (170)</h3>
|
168 |
+
|
169 |
+
<div class="change">
|
170 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི་དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span> →
|
171 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི།</span>
|
172 |
+
</div>
|
173 |
+
|
174 |
+
<div class="change">
|
175 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་དྲངས་པར།</span> →
|
176 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་གྲངས་སར།</span>
|
177 |
+
</div>
|
178 |
+
|
179 |
+
<div class="change">
|
180 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམ་པ་དང་།</span> →
|
181 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམས་པ་དང་།</span>
|
182 |
+
</div>
|
183 |
+
|
184 |
+
<div class="change">
|
185 |
+
<span class="highlight">ཀུན་ཏུ་སྤྱད་དང་ལྔ་པར་ནི།</span> →
|
186 |
+
<span class="highlight">ཀུན་ཏུ་དཔྱད་དང་ལྔ་པར་ནི།</span>
|
187 |
+
</div>
|
188 |
+
|
189 |
+
<div class="change">
|
190 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱེད།</span> →
|
191 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་འགྲུབ་པར་བྱེད།</span>
|
192 |
+
</div>
|
193 |
+
<p>... and 165 more</p>
|
194 |
+
</div>
|
195 |
+
</div>
|
196 |
+
</div>
|
197 |
+
|
198 |
+
<div class="comparison">
|
199 |
+
<h2>Bailey.txt vs Ngari 8.txt</h2>
|
200 |
+
<div class="scores">
|
201 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
202 |
+
<p><strong>Alignment Score:</strong> 0.00</p>
|
203 |
+
</div>
|
204 |
+
|
205 |
+
<div class="changes">
|
206 |
+
<div class="change-type insertion">
|
207 |
+
<h3>Insertions (275)</h3>
|
208 |
+
|
209 |
+
<div class="change">
|
210 |
+
<span class="highlight">གཉིས་པ།</span>
|
211 |
+
</div>
|
212 |
+
|
213 |
+
<div class="change">
|
214 |
+
<span class="highlight">གསུམ་པ།</span>
|
215 |
+
</div>
|
216 |
+
|
217 |
+
<div class="change">
|
218 |
+
<span class="highlight">བཞི་པ།</span>
|
219 |
+
</div>
|
220 |
+
|
221 |
+
<div class="change">
|
222 |
+
<span class="highlight">བཀའ་ཁྲིམས་རིན་ཆེན་འབྱུང་གནས་ལས་ཐོག་པའི།</span>
|
223 |
+
</div>
|
224 |
+
|
225 |
+
<div class="change">
|
226 |
+
<span class="highlight">བཞུ་ཤེས་བདེན་མཛུག་བྱེད་པའི་རི་བོང་འཛིན།</span>
|
227 |
+
</div>
|
228 |
+
<p>... and 270 more</p>
|
229 |
+
</div>
|
230 |
+
<div class="change-type deletion">
|
231 |
+
<h3>Deletions (827)</h3>
|
232 |
+
|
233 |
+
<div class="change">
|
234 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་དྲངས་པར།</span>
|
235 |
+
</div>
|
236 |
+
|
237 |
+
<div class="change">
|
238 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམ་པ་དང་།</span>
|
239 |
+
</div>
|
240 |
+
|
241 |
+
<div class="change">
|
242 |
+
<span class="highlight">རབ་ཏུ་སྦྱིན་དང་བསླུ་བ་དང་།</span>
|
243 |
+
</div>
|
244 |
+
|
245 |
+
<div class="change">
|
246 |
+
<span class="highlight">ཀུན་ཏུ་སྤྱད་དང་ལྔ་པར་ནི།</span>
|
247 |
+
</div>
|
248 |
+
|
249 |
+
<div class="change">
|
250 |
+
<span class="highlight">ཅེས་གསུང་པ་ལྟར།</span>
|
251 |
+
</div>
|
252 |
+
<p>... and 822 more</p>
|
253 |
+
</div>
|
254 |
+
<div class="change-type modification">
|
255 |
+
<h3>Modifications (90)</h3>
|
256 |
+
|
257 |
+
<div class="change">
|
258 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི་དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span> →
|
259 |
+
<span class="highlight">དང་པོ།</span>
|
260 |
+
</div>
|
261 |
+
|
262 |
+
<div class="change">
|
263 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱེད།</span> →
|
264 |
+
<span class="highlight">ཞེས་པ་ལྟར།</span>
|
265 |
+
</div>
|
266 |
+
|
267 |
+
<div class="change">
|
268 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐར་སྐྱེལ་ཡ་འུད་དང་མི་བསྲེ་བར་འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span> →
|
269 |
+
<span class="highlight">དོད་མི་མཉམ་ན་གདོང་ཤེར་མི་བྱེད་པ།</span>
|
270 |
+
</div>
|
271 |
+
|
272 |
+
<div class="change">
|
273 |
+
<span class="highlight">ཁེ་ཉེན་སྒྲགས་གོ་བ་དཀོན[*བཀོན]།</span> →
|
274 |
+
<span class="highlight">□□ཆད་པ་ཞུ་མི་རྣམ་ལ་ཐལ་ཆ་མི་བྱེད།</span>
|
275 |
+
</div>
|
276 |
+
|
277 |
+
<div class="change">
|
278 |
+
<span class="highlight">བསམ་པ་ཆེ་བར་བཟུང་།</span> →
|
279 |
+
<span class="highlight">ལྔ་པ།</span>
|
280 |
+
</div>
|
281 |
+
<p>... and 85 more</p>
|
282 |
+
</div>
|
283 |
+
</div>
|
284 |
+
</div>
|
285 |
+
|
286 |
+
<div class="comparison">
|
287 |
+
<h2>Dolanji_16.txt vs Leiden_16.txt</h2>
|
288 |
+
<div class="scores">
|
289 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
290 |
+
<p><strong>Alignment Score:</strong> 0.06</p>
|
291 |
+
</div>
|
292 |
+
|
293 |
+
<div class="changes">
|
294 |
+
<div class="change-type insertion">
|
295 |
+
<h3>Insertions (772)</h3>
|
296 |
+
|
297 |
+
<div class="change">
|
298 |
+
<span class="highlight">དགྲ་ལ་ཕ་རོལ་གྱི་འདུལ་ཐབས་ལ།</span>
|
299 |
+
</div>
|
300 |
+
|
301 |
+
<div class="change">
|
302 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐའ་སྐྱེལ་ཡ་འུད་མི་བསྲེ་བར་འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span>
|
303 |
+
</div>
|
304 |
+
|
305 |
+
<div class="change">
|
306 |
+
<span class="highlight">ནང་ཅོག་ག་གྲོ་མོའི་ཚང་མ་ཐོར་བ་དགོས་ཟེར་བ་ལྟར་དང་།</span>
|
307 |
+
</div>
|
308 |
+
|
309 |
+
<div class="change">
|
310 |
+
<span class="highlight">བྱ་མ་འཕུར་བ་སྒོ་ང་ལོན་པའི་དཔེ་ལྟར།</span>
|
311 |
+
</div>
|
312 |
+
|
313 |
+
<div class="change">
|
314 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམ་པའི་དོན།</span>
|
315 |
+
</div>
|
316 |
+
<p>... and 767 more</p>
|
317 |
+
</div>
|
318 |
+
<div class="change-type deletion">
|
319 |
+
<h3>Deletions (879)</h3>
|
320 |
+
|
321 |
+
<div class="change">
|
322 |
+
<span class="highlight">རྒྱལ་རིགས་བསྟན་བཅོས་དྲངས་པ་ལས།</span>
|
323 |
+
</div>
|
324 |
+
|
325 |
+
<div class="change">
|
326 |
+
<span class="highlight">དགྲ་ཕ་རོལ་པོའི་འདུལ་ཐབས་ལ།</span>
|
327 |
+
</div>
|
328 |
+
|
329 |
+
<div class="change">
|
330 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐར་སྐྱེལ་ཡ་བུད་དང་མི་བསྲེ་བར།</span>
|
331 |
+
</div>
|
332 |
+
|
333 |
+
<div class="change">
|
334 |
+
<span class="highlight">ནང་ཅོ་ག་བྲོ་མོའི་ཚང་མ་འཐོར་བ་དགོས་ཟེར་བ་ལྟར་དང་།</span>
|
335 |
+
</div>
|
336 |
+
|
337 |
+
<div class="change">
|
338 |
+
<span class="highlight">བྱ་མ་ཕུར་སྒོང་ལོན་པའི་དཔེ་ལྟར།</span>
|
339 |
+
</div>
|
340 |
+
<p>... and 874 more</p>
|
341 |
+
</div>
|
342 |
+
<div class="change-type modification">
|
343 |
+
<h3>Modifications (178)</h3>
|
344 |
+
|
345 |
+
<div class="change">
|
346 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ།</span> →
|
347 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་གྲངས་སར།</span>
|
348 |
+
</div>
|
349 |
+
|
350 |
+
<div class="change">
|
351 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞམ་པ་དང་།</span> →
|
352 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམས་པ་དང་།</span>
|
353 |
+
</div>
|
354 |
+
|
355 |
+
<div class="change">
|
356 |
+
<span class="highlight">ཀུན་ཏུ་སྤྱད་དང་ལྔ་པར་ནི།</span> →
|
357 |
+
<span class="highlight">ཀུན་ཏུ་དཔྱད་དང་ལྔ་པར་ནི།</span>
|
358 |
+
</div>
|
359 |
+
|
360 |
+
<div class="change">
|
361 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱ།</span> →
|
362 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་འགྲུབ་པར་བྱེད།</span>
|
363 |
+
</div>
|
364 |
+
|
365 |
+
<div class="change">
|
366 |
+
<span class="highlight">ཕྱི་པདྨ་དཀར་པོའི་རྒྱ་མ་ཞིག</span> →
|
367 |
+
<span class="highlight">ཕྱི་པདྨ་དཀར་པོའི་རྒྱ་མ་ཞིག་པ།</span>
|
368 |
+
</div>
|
369 |
+
<p>... and 173 more</p>
|
370 |
+
</div>
|
371 |
+
</div>
|
372 |
+
</div>
|
373 |
+
|
374 |
+
<div class="comparison">
|
375 |
+
<h2>Dolanji_16.txt vs Ngari 8.txt</h2>
|
376 |
+
<div class="scores">
|
377 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
378 |
+
<p><strong>Alignment Score:</strong> 0.00</p>
|
379 |
+
</div>
|
380 |
+
|
381 |
+
<div class="changes">
|
382 |
+
<div class="change-type insertion">
|
383 |
+
<h3>Insertions (296)</h3>
|
384 |
+
|
385 |
+
<div class="change">
|
386 |
+
<span class="highlight">གཉིས་པ།</span>
|
387 |
+
</div>
|
388 |
+
|
389 |
+
<div class="change">
|
390 |
+
<span class="highlight">གསུམ་པ།</span>
|
391 |
+
</div>
|
392 |
+
|
393 |
+
<div class="change">
|
394 |
+
<span class="highlight">བཀའ་ཁྲིམས་རིན་ཆེན་འབྱུང་གནས་ལས་ཐོག་པའི།</span>
|
395 |
+
</div>
|
396 |
+
|
397 |
+
<div class="change">
|
398 |
+
<span class="highlight">དྲང་པོའི་བདུད་རྩི་དཀར་སྤྲོ་བ་ཡི།</span>
|
399 |
+
</div>
|
400 |
+
|
401 |
+
<div class="change">
|
402 |
+
<span class="highlight">མརྫུན་པའི་སྣང་ཡོད་རབ་ཏུ་གཞོམ་པར་བགྱི།</span>
|
403 |
+
</div>
|
404 |
+
<p>... and 291 more</p>
|
405 |
+
</div>
|
406 |
+
<div class="change-type deletion">
|
407 |
+
<h3>Deletions (1056)</h3>
|
408 |
+
|
409 |
+
<div class="change">
|
410 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ།</span>
|
411 |
+
</div>
|
412 |
+
|
413 |
+
<div class="change">
|
414 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞམ་པ་དང་།</span>
|
415 |
+
</div>
|
416 |
+
|
417 |
+
<div class="change">
|
418 |
+
<span class="highlight">རབ་ཏུ་སྦྱིན་དང་བསླུ་བ་དང་།</span>
|
419 |
+
</div>
|
420 |
+
|
421 |
+
<div class="change">
|
422 |
+
<span class="highlight">ཀུན་ཏུ་སྤྱད་དང་ལྔ་པར་ནི།</span>
|
423 |
+
</div>
|
424 |
+
|
425 |
+
<div class="change">
|
426 |
+
<span class="highlight">ཅེས་གསུངས་པ་ལྟར།</span>
|
427 |
+
</div>
|
428 |
+
<p>... and 1051 more</p>
|
429 |
+
</div>
|
430 |
+
<div class="change-type modification">
|
431 |
+
<h3>Modifications (72)</h3>
|
432 |
+
|
433 |
+
<div class="change">
|
434 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི།</span> →
|
435 |
+
<span class="highlight">དང་པོ།</span>
|
436 |
+
</div>
|
437 |
+
|
438 |
+
<div class="change">
|
439 |
+
<span class="highlight">དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span> →
|
440 |
+
<span class="highlight">བཞི་པ།</span>
|
441 |
+
</div>
|
442 |
+
|
443 |
+
<div class="change">
|
444 |
+
<span class="highlight">རྒྱལ་རིགས་བསྟན་བཅོས་དྲངས་པ་ལས།</span> →
|
445 |
+
<span class="highlight">བཞུ་ཤེས་བདེན་མཛུག་བྱེད་པའི་རི་བོང་འཛིན།</span>
|
446 |
+
</div>
|
447 |
+
|
448 |
+
<div class="change">
|
449 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་བསྒྲུབ་པར་བྱ།</span> →
|
450 |
+
<span class="highlight">ཞེས་པ་ལྟར།</span>
|
451 |
+
</div>
|
452 |
+
|
453 |
+
<div class="change">
|
454 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐར་སྐྱེལ་ཡ་བུད་དང་མི་བསྲེ་བར།</span> →
|
455 |
+
<span class="highlight">དོད་མི་མཉམ་ན་གདོང་ཤེར་མི་བྱེད་པ།</span>
|
456 |
+
</div>
|
457 |
+
<p>... and 67 more</p>
|
458 |
+
</div>
|
459 |
+
</div>
|
460 |
+
</div>
|
461 |
+
|
462 |
+
<div class="comparison">
|
463 |
+
<h2>Leiden_16.txt vs Ngari 8.txt</h2>
|
464 |
+
<div class="scores">
|
465 |
+
<p><strong>Structural Similarity:</strong> 0.00</p>
|
466 |
+
<p><strong>Alignment Score:</strong> 0.00</p>
|
467 |
+
</div>
|
468 |
+
|
469 |
+
<div class="changes">
|
470 |
+
<div class="change-type insertion">
|
471 |
+
<h3>Insertions (292)</h3>
|
472 |
+
|
473 |
+
<div class="change">
|
474 |
+
<span class="highlight">གཉིས་པ།</span>
|
475 |
+
</div>
|
476 |
+
|
477 |
+
<div class="change">
|
478 |
+
<span class="highlight">གསུམ་པ།</span>
|
479 |
+
</div>
|
480 |
+
|
481 |
+
<div class="change">
|
482 |
+
<span class="highlight">བཀའ་ཁྲིམས་རིན་ཆེན་འབྱུང་གནས་ལས་ཐོག་པའི།</span>
|
483 |
+
</div>
|
484 |
+
|
485 |
+
<div class="change">
|
486 |
+
<span class="highlight">བཞུ་ཤེས་བདེན་མཛུག་བྱེད་པའི་རི་བོང་འཛིན།</span>
|
487 |
+
</div>
|
488 |
+
|
489 |
+
<div class="change">
|
490 |
+
<span class="highlight">དྲང་པོའི་བདུད་རྩི་དཀར་སྤྲོ་བ་ཡི།</span>
|
491 |
+
</div>
|
492 |
+
<p>... and 287 more</p>
|
493 |
+
</div>
|
494 |
+
<div class="change-type deletion">
|
495 |
+
<h3>Deletions (945)</h3>
|
496 |
+
|
497 |
+
<div class="change">
|
498 |
+
<span class="highlight">དེ་ཡང་འདུལ་བ་ལུང་དུ་རྒྱལ་རིགས་ཀྱི་བསྟན་བཅོས་གྲངས་སར།</span>
|
499 |
+
</div>
|
500 |
+
|
501 |
+
<div class="change">
|
502 |
+
<span class="highlight">མཁས་པ་རྣམས་ནི་གཞོམས་པ་དང་།</span>
|
503 |
+
</div>
|
504 |
+
|
505 |
+
<div class="change">
|
506 |
+
<span class="highlight">རབ་ཏུ་སྦྱིན་དང་བསླུ་བ་དང་།</span>
|
507 |
+
</div>
|
508 |
+
|
509 |
+
<div class="change">
|
510 |
+
<span class="highlight">ཀུན་ཏུ་དཔྱད་དང་ལྔ་པར་ནི།</span>
|
511 |
+
</div>
|
512 |
+
|
513 |
+
<div class="change">
|
514 |
+
<span class="highlight">ཅེས་གསུངས་པ་ལྟར།</span>
|
515 |
+
</div>
|
516 |
+
<p>... and 940 more</p>
|
517 |
+
</div>
|
518 |
+
<div class="change-type modification">
|
519 |
+
<h3>Modifications (76)</h3>
|
520 |
+
|
521 |
+
<div class="change">
|
522 |
+
<span class="highlight">དང་པོ་དཔའ་བོ་སྟག་གི་ཞལ་ལྕེ་ནི།</span> →
|
523 |
+
<span class="highlight">དང་པོ།</span>
|
524 |
+
</div>
|
525 |
+
|
526 |
+
<div class="change">
|
527 |
+
<span class="highlight">དགྲ་འདུལ་བ་ལ་དགོས་ཏེ།</span> →
|
528 |
+
<span class="highlight">བཞི་པ།</span>
|
529 |
+
</div>
|
530 |
+
|
531 |
+
<div class="change">
|
532 |
+
<span class="highlight">དཔུང་གི་དོན་རྣམས་འགྲུབ་པར་བྱེད།</span> →
|
533 |
+
<span class="highlight">ཞེས་པ་ལྟར།</span>
|
534 |
+
</div>
|
535 |
+
|
536 |
+
<div class="change">
|
537 |
+
<span class="highlight">ཐོག་མར་དྲག་པོ་མཐའ་སྐྱེལ་ཡ་འུད་མི་བསྲེ་བར་འཇིག་རྟེན་གྱི་ཁ་དཔེར།</span> →
|
538 |
+
<span class="highlight">དོད་མི་མཉམ་ན་གདོང་ཤེར་མི་བྱེད་པ།</span>
|
539 |
+
</div>
|
540 |
+
|
541 |
+
<div class="change">
|
542 |
+
<span class="highlight">འདྲ་བ་མཉམ་པོར་གྲུ་བཞིར་དྲོས་ཟེར་བ་ལྟར།</span> →
|
543 |
+
<span class="highlight">དགེ་ལོགས་ལོ་ཐོག་རྒྱས་པ་ཉིད་དུབགྱི་ཞེས།</span>
|
544 |
+
</div>
|
545 |
+
<p>... and 71 more</p>
|
546 |
+
</div>
|
547 |
+
</div>
|
548 |
+
</div>
|
549 |
+
|
550 |
+
</div>
|
551 |
+
</body>
|
552 |
+
</html>
|
553 |
+
|
app.py
CHANGED
@@ -148,8 +148,7 @@ def main_interface():
|
|
148 |
"Jaccard Similarity (%)": "Jaccard Similarity (%): Higher scores (darker) mean more shared unique words.",
|
149 |
"Normalized LCS": "Normalized LCS: Higher scores (darker) mean longer shared sequences of words.",
|
150 |
"Semantic Similarity": "Semantic Similarity (using word embeddings/experimental): Higher scores (darker) mean more similar meanings.",
|
151 |
-
"
|
152 |
-
"Word Counts": "Word Counts: Shows the number of words in each segment after tokenization."
|
153 |
}
|
154 |
|
155 |
metric_tooltips = {
|
@@ -209,28 +208,28 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
|
|
209 |
with gr.Tabs(elem_id="heatmap-tab-group"):
|
210 |
# Structural Analysis Tab
|
211 |
with gr.Tab("Structural Analysis"):
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
|
225 |
-
""")
|
226 |
|
227 |
# Structural analysis outputs
|
228 |
-
structural_heatmap = gr.Plot(label="Structural Changes
|
229 |
structural_report = gr.HTML(label="Differential Analysis Report")
|
230 |
structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
|
231 |
|
232 |
-
# Process
|
233 |
-
for
|
|
|
|
|
234 |
with gr.Tab(metric_key):
|
235 |
# Set CSS class based on metric type
|
236 |
if metric_key == "Jaccard Similarity (%)":
|
@@ -242,9 +241,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
|
|
242 |
elif metric_key == "Semantic Similarity":
|
243 |
css_class = "metric-info-accordion semantic-info"
|
244 |
accordion_title = "Understanding Meaning Similarity"
|
245 |
-
elif metric_key == "TF-IDF Cosine Sim":
|
246 |
-
css_class = "metric-info-accordion tfidf-info"
|
247 |
-
accordion_title = "Understanding Term Importance"
|
248 |
elif metric_key == "Word Counts":
|
249 |
css_class = "metric-info-accordion wordcount-info"
|
250 |
accordion_title = "Understanding Text Length"
|
|
|
148 |
"Jaccard Similarity (%)": "Jaccard Similarity (%): Higher scores (darker) mean more shared unique words.",
|
149 |
"Normalized LCS": "Normalized LCS: Higher scores (darker) mean longer shared sequences of words.",
|
150 |
"Semantic Similarity": "Semantic Similarity (using word embeddings/experimental): Higher scores (darker) mean more similar meanings.",
|
151 |
+
"Word Counts": "Word Counts: Bar chart showing the number of words in each segment after tokenization.",
|
|
|
152 |
}
|
153 |
|
154 |
metric_tooltips = {
|
|
|
208 |
with gr.Tabs(elem_id="heatmap-tab-group"):
|
209 |
# Structural Analysis Tab
|
210 |
with gr.Tab("Structural Analysis"):
|
211 |
+
gr.Markdown("""
|
212 |
+
### Structural Analysis for Tibetan Legal Manuscripts
|
213 |
+
|
214 |
+
This tab provides detailed chapter-level structural analysis for Tibetan legal manuscript comparison.
|
215 |
+
|
216 |
+
**Features:**
|
217 |
+
- **Differential Highlighting**: Highlights significant textual variations
|
218 |
+
- **Per-Chapter Analysis**: Detailed comparison for each chapter pair
|
219 |
+
|
220 |
+
**Usage:**
|
221 |
+
Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
|
222 |
+
""")
|
|
|
|
|
223 |
|
224 |
# Structural analysis outputs
|
225 |
+
structural_heatmap = gr.Plot(label="Structural Changes Summary", show_label=False, elem_classes="structural-heatmap")
|
226 |
structural_report = gr.HTML(label="Differential Analysis Report")
|
227 |
structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
|
228 |
|
229 |
+
# Process metrics excluding TF-IDF
|
230 |
+
metrics_to_display = {k: v for k, v in heatmap_titles.items() if k != "TF-IDF Cosine Sim"}
|
231 |
+
|
232 |
+
for metric_key, descriptive_title in metrics_to_display.items():
|
233 |
with gr.Tab(metric_key):
|
234 |
# Set CSS class based on metric type
|
235 |
if metric_key == "Jaccard Similarity (%)":
|
|
|
241 |
elif metric_key == "Semantic Similarity":
|
242 |
css_class = "metric-info-accordion semantic-info"
|
243 |
accordion_title = "Understanding Meaning Similarity"
|
|
|
|
|
|
|
244 |
elif metric_key == "Word Counts":
|
245 |
css_class = "metric-info-accordion wordcount-info"
|
246 |
accordion_title = "Understanding Text Length"
|
pipeline/differential_viz.py
CHANGED
@@ -6,7 +6,6 @@ Provides enhanced heatmaps with structural change highlighting.
|
|
6 |
import plotly.graph_objects as go
|
7 |
from typing import Dict, List
|
8 |
import pandas as pd
|
9 |
-
from plotly.subplots import make_subplots
|
10 |
from .structural_analysis import detect_structural_changes, generate_structural_alignment
|
11 |
|
12 |
|
@@ -59,81 +58,65 @@ def create_differential_heatmap(texts_dict: Dict[str, str],
|
|
59 |
|
60 |
enhanced_df = pd.DataFrame(enhanced_data)
|
61 |
|
62 |
-
# Create
|
63 |
-
|
64 |
-
rows=2, cols=2,
|
65 |
-
subplot_titles=('Structural Changes', 'Modifications', 'Insertions/Deletions', 'Alignment Quality'),
|
66 |
-
specs=[[{"secondary_y": True}, {"secondary_y": True}],
|
67 |
-
[{"secondary_y": True}, {"secondary_y": True}]]
|
68 |
-
)
|
69 |
-
|
70 |
-
# Structural changes heatmap
|
71 |
-
pivot_changes = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='structural_changes')
|
72 |
-
fig.add_trace(
|
73 |
-
go.Heatmap(
|
74 |
-
z=pivot_changes.values,
|
75 |
-
x=pivot_changes.columns,
|
76 |
-
y=pivot_changes.index,
|
77 |
-
colorscale='Reds',
|
78 |
-
name='Structural Changes',
|
79 |
-
showscale=True,
|
80 |
-
colorbar=dict(title="Changes", x=0.45)
|
81 |
-
),
|
82 |
-
row=1, col=1
|
83 |
-
)
|
84 |
-
|
85 |
-
# Modifications heatmap
|
86 |
-
pivot_mods = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='modification_score')
|
87 |
-
fig.add_trace(
|
88 |
-
go.Heatmap(
|
89 |
-
z=pivot_mods.values,
|
90 |
-
x=pivot_mods.columns,
|
91 |
-
y=pivot_mods.index,
|
92 |
-
colorscale='Blues',
|
93 |
-
name='Modifications',
|
94 |
-
showscale=True,
|
95 |
-
colorbar=dict(title="Mods", x=1.0)
|
96 |
-
),
|
97 |
-
row=1, col=2
|
98 |
-
)
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
z=combined.values,
|
108 |
-
x=combined.columns,
|
109 |
-
y=combined.index,
|
110 |
-
colorscale='Greens',
|
111 |
-
name='Insertions+Deletions',
|
112 |
-
showscale=True,
|
113 |
-
colorbar=dict(title="Ins+Del", x=0.45)
|
114 |
-
),
|
115 |
-
row=2, col=1
|
116 |
-
)
|
117 |
|
118 |
-
#
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
name='Alignment Quality',
|
127 |
-
showscale=True,
|
128 |
-
colorbar=dict(title="Quality", x=1.0)
|
129 |
),
|
130 |
-
|
131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
fig.update_layout(
|
134 |
-
title=
|
135 |
-
height=
|
136 |
-
|
137 |
)
|
138 |
|
139 |
return fig
|
|
|
6 |
import plotly.graph_objects as go
|
7 |
from typing import Dict, List
|
8 |
import pandas as pd
|
|
|
9 |
from .structural_analysis import detect_structural_changes, generate_structural_alignment
|
10 |
|
11 |
|
|
|
58 |
|
59 |
enhanced_df = pd.DataFrame(enhanced_data)
|
60 |
|
61 |
+
# Create a clean table with numbers and percentages
|
62 |
+
summary_table = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
+
for row in enhanced_data:
|
65 |
+
text_pair = row['Text Pair']
|
66 |
+
chapter = row['Chapter']
|
67 |
+
|
68 |
+
# Calculate percentages
|
69 |
+
total_changes = row['structural_changes']
|
70 |
+
modifications = row['modification_score']
|
71 |
+
insertions_deletions = row['insertion_score'] + row['deletion_score']
|
72 |
+
alignment_quality = row['alignment_quality']
|
73 |
+
|
74 |
+
# Create summary row
|
75 |
+
summary_row = {
|
76 |
+
'Text Pair': text_pair,
|
77 |
+
'Chapter': chapter,
|
78 |
+
'Total Changes': total_changes,
|
79 |
+
'Modifications': modifications,
|
80 |
+
'Insertions/Deletions': insertions_deletions,
|
81 |
+
'Alignment Quality': f"{alignment_quality:.1f}%",
|
82 |
+
'Significant Differences': row['significant_differences']
|
83 |
+
}
|
84 |
+
|
85 |
+
summary_table.append(summary_row)
|
86 |
|
87 |
+
# Create DataFrame for table display
|
88 |
+
summary_df = pd.DataFrame(summary_table)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
+
# Create a simple table with styling
|
91 |
+
fig = go.Figure(data=[go.Table(
|
92 |
+
header=dict(
|
93 |
+
values=['Text Pair', 'Chapter', 'Total Changes', 'Modifications',
|
94 |
+
'Insertions/Deletions', 'Alignment Quality', 'Significant Differences'],
|
95 |
+
font=dict(size=12, color='white'),
|
96 |
+
fill_color='darkblue',
|
97 |
+
align='left'
|
|
|
|
|
|
|
98 |
),
|
99 |
+
cells=dict(
|
100 |
+
values=[
|
101 |
+
summary_df['Text Pair'],
|
102 |
+
summary_df['Chapter'],
|
103 |
+
summary_df['Total Changes'],
|
104 |
+
summary_df['Modifications'],
|
105 |
+
summary_df['Insertions/Deletions'],
|
106 |
+
summary_df['Alignment Quality'],
|
107 |
+
summary_df['Significant Differences']
|
108 |
+
],
|
109 |
+
font=dict(size=11),
|
110 |
+
align='left',
|
111 |
+
fill_color=['lightgrey' if i % 2 == 0 else 'white'
|
112 |
+
for i in range(len(summary_df))]
|
113 |
+
)
|
114 |
+
)])
|
115 |
|
116 |
fig.update_layout(
|
117 |
+
title="Structural Analysis Summary",
|
118 |
+
height=400,
|
119 |
+
margin=dict(l=10, r=10, t=40, b=10)
|
120 |
)
|
121 |
|
122 |
return fig
|
pipeline/metrics.py
CHANGED
@@ -8,9 +8,7 @@ from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddi
|
|
8 |
from .hf_embedding import generate_embeddings as generate_hf_embeddings
|
9 |
|
10 |
import logging
|
11 |
-
|
12 |
-
from .stopwords_bo import TIBETAN_STOPWORDS
|
13 |
-
from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE
|
14 |
|
15 |
# Attempt to import the Cython-compiled fast_lcs module
|
16 |
try:
|
@@ -255,42 +253,6 @@ def compute_all_metrics(
|
|
255 |
logger.info(f"Built FastText corpus term frequency map with {len(term_freq_corpus_for_fasttext)} unique tokens.")
|
256 |
logger.info(f"Built FastText document frequency map with {len(document_frequency_map_for_fasttext)} unique tokens across {total_num_documents_for_fasttext} documents.")
|
257 |
|
258 |
-
# TF-IDF Vectorization and Cosine Similarity Calculation
|
259 |
-
if corpus_for_sklearn_tfidf:
|
260 |
-
try:
|
261 |
-
# Using a dummy tokenizer and preprocessor as input is already tokenized (as space-separated strings)
|
262 |
-
# and we don't want further case changes or token modifications for Tibetan.
|
263 |
-
|
264 |
-
# Select appropriate stopwords list based on user preference
|
265 |
-
if use_stopwords:
|
266 |
-
# Choose between regular and lite stopwords list
|
267 |
-
if use_lite_stopwords:
|
268 |
-
stopwords_to_use = TIBETAN_STOPWORDS_LITE
|
269 |
-
else:
|
270 |
-
stopwords_to_use = TIBETAN_STOPWORDS
|
271 |
-
else:
|
272 |
-
# If stopwords are disabled, use an empty list
|
273 |
-
stopwords_to_use = []
|
274 |
-
|
275 |
-
vectorizer = TfidfVectorizer(
|
276 |
-
tokenizer=lambda x: x.split(),
|
277 |
-
preprocessor=lambda x: x,
|
278 |
-
token_pattern=None,
|
279 |
-
stop_words=stopwords_to_use
|
280 |
-
)
|
281 |
-
tfidf_matrix = vectorizer.fit_transform(corpus_for_sklearn_tfidf)
|
282 |
-
# Calculate pairwise cosine similarity on the TF-IDF matrix
|
283 |
-
# This gives a square matrix where cosine_sim_matrix[i, j] is the similarity between doc i and doc j
|
284 |
-
cosine_sim_matrix = cosine_similarity(tfidf_matrix)
|
285 |
-
except ValueError as e:
|
286 |
-
if "empty vocabulary" in str(e):
|
287 |
-
# If vocabulary is empty after stopword removal, create a zero matrix
|
288 |
-
n = len(corpus_for_sklearn_tfidf)
|
289 |
-
cosine_sim_matrix = np.zeros((n, n))
|
290 |
-
else:
|
291 |
-
# Re-raise other ValueError
|
292 |
-
raise
|
293 |
-
else:
|
294 |
# Handle case with no texts or all empty texts
|
295 |
n = len(files) if files else 0
|
296 |
cosine_sim_matrix = np.zeros((n, n))
|
@@ -347,15 +309,7 @@ def compute_all_metrics(
|
|
347 |
"Jaccard Similarity (%)": jaccard_percent,
|
348 |
"Normalized LCS": norm_lcs,
|
349 |
# Pass tokens1 and tokens2 to compute_semantic_similarity
|
350 |
-
"Semantic Similarity": semantic_sim
|
351 |
-
"TF-IDF Cosine Sim": (
|
352 |
-
0.0 if both_only_stopwords else
|
353 |
-
cosine_sim_matrix[i, j]
|
354 |
-
if cosine_sim_matrix.size > 0
|
355 |
-
and i < cosine_sim_matrix.shape[0]
|
356 |
-
and j < cosine_sim_matrix.shape[1]
|
357 |
-
else np.nan
|
358 |
-
),
|
359 |
}
|
360 |
)
|
361 |
return pd.DataFrame(results)
|
|
|
8 |
from .hf_embedding import generate_embeddings as generate_hf_embeddings
|
9 |
|
10 |
import logging
|
11 |
+
|
|
|
|
|
12 |
|
13 |
# Attempt to import the Cython-compiled fast_lcs module
|
14 |
try:
|
|
|
253 |
logger.info(f"Built FastText corpus term frequency map with {len(term_freq_corpus_for_fasttext)} unique tokens.")
|
254 |
logger.info(f"Built FastText document frequency map with {len(document_frequency_map_for_fasttext)} unique tokens across {total_num_documents_for_fasttext} documents.")
|
255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
# Handle case with no texts or all empty texts
|
257 |
n = len(files) if files else 0
|
258 |
cosine_sim_matrix = np.zeros((n, n))
|
|
|
309 |
"Jaccard Similarity (%)": jaccard_percent,
|
310 |
"Normalized LCS": norm_lcs,
|
311 |
# Pass tokens1 and tokens2 to compute_semantic_similarity
|
312 |
+
"Semantic Similarity": semantic_sim
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
}
|
314 |
)
|
315 |
return pd.DataFrame(results)
|
pipeline/stopwords_bo.py
CHANGED
@@ -64,7 +64,7 @@ _ALL_STOPWORDS_CATEGORIZED = (
|
|
64 |
INTERJECTIONS_EXCLAMATIONS
|
65 |
)
|
66 |
|
67 |
-
# Final flat list of unique stopwords
|
68 |
TIBETAN_STOPWORDS = list(set(_ALL_STOPWORDS_CATEGORIZED))
|
69 |
|
70 |
# Final set of unique stopwords for efficient Jaccard/LCS filtering (as a set)
|
|
|
64 |
INTERJECTIONS_EXCLAMATIONS
|
65 |
)
|
66 |
|
67 |
+
# Final flat list of unique stopwords
|
68 |
TIBETAN_STOPWORDS = list(set(_ALL_STOPWORDS_CATEGORIZED))
|
69 |
|
70 |
# Final set of unique stopwords for efficient Jaccard/LCS filtering (as a set)
|
pipeline/stopwords_lite_bo.py
CHANGED
@@ -27,7 +27,7 @@ _ALL_STOPWORDS_CATEGORIZED_LITE = (
|
|
27 |
MORE_PARTICLES_SUFFIXES_LITE
|
28 |
)
|
29 |
|
30 |
-
# Final flat list of unique stopwords
|
31 |
TIBETAN_STOPWORDS_LITE = list(set(_ALL_STOPWORDS_CATEGORIZED_LITE))
|
32 |
|
33 |
# Final set of unique stopwords for efficient Jaccard/LCS filtering (as a set)
|
|
|
27 |
MORE_PARTICLES_SUFFIXES_LITE
|
28 |
)
|
29 |
|
30 |
+
# Final flat list of unique stopwords
|
31 |
TIBETAN_STOPWORDS_LITE = list(set(_ALL_STOPWORDS_CATEGORIZED_LITE))
|
32 |
|
33 |
# Final set of unique stopwords for efficient Jaccard/LCS filtering (as a set)
|