mshamrai committed on
Commit
b719825
·
1 Parent(s): 661c42a

fix: families

Browse files
Files changed (2) hide show
  1. constants.py +294 -121
  2. utils.py +4 -5
constants.py CHANGED
@@ -1,124 +1,168 @@
1
- language_subfamilies = {
 
2
  "Afrikaans": "West Germanic",
3
- "Albanian": "Albanian",
4
- "Arabic": "Semitic",
5
- "Egyptian Arabic": "Semitic",
6
- "Aragonese": "Romance",
7
- "Armenian": "Armenian",
8
- "Asturian": "Romance",
9
- "Azerbaijani": "Oghuz",
10
- "Bashkir": "Kypchak",
11
- "Basque": "Language Isolate",
12
  "Bavarian": "Austro-Bavarian",
13
- "Belarusian": "East Slavic",
14
- "Bengali": "Eastern Indo-Aryan",
15
- "Bishnupriya Manipuri": "Eastern Indo-Aryan",
16
- "Bosnian": "South Slavic",
17
- "Breton": "Brythonic",
18
- "Bulgarian": "South Slavic",
19
- "Burmese": "Burmish",
20
- "Catalan": "Romance",
21
- "Cebuano": "Central Philippine",
22
- "Chechen": "Nakh-Daghestanian",
23
- "Chinese (Simplified)": "Sinitic",
24
- "Chinese (Traditional)": "Sinitic",
25
- "Min Nan Chinese": "Sinitic",
26
- "Chuvash": "Oghur",
27
- "Croatian": "South Slavic",
28
- "Czech": "West Slavic",
29
  "Danish": "North Germanic",
30
  "Dutch": "West Germanic",
31
  "English": "West Germanic",
32
- "Estonian": "Finnic",
33
- "Finnish": "Finnic",
34
- "French": "Gallo-Romance",
35
- "Galician": "Gallo-Romance",
36
- "Georgian": "Kartvelian",
37
  "German": "West Germanic",
38
- "Greek": "Hellenic",
39
- "Gujarati": "Gujarati",
40
- "Haitian": "French-based Creole",
41
- "Hebrew": "Semitic",
42
- "Hindi": "Central Indo-Aryan",
43
- "Hungarian": "Ugric",
44
  "Icelandic": "North Germanic",
45
- "Ido": "Constructed",
46
- "Indonesian": "Malayic",
47
- "Irish": "Goidelic",
48
- "Italian": "Italo-Dalmatian",
49
- "Japanese": "Japonic",
50
- "Javanese": "Javanic",
51
- "Kannada": "Southern Dravidian",
52
- "Kazakh": "Kypchak",
53
- "Kirghiz": "Kypchak",
54
- "Korean": "Koreanic",
55
- "Latin": "Italic",
56
- "Latvian": "Baltic",
57
- "Lithuanian": "Baltic",
58
- "Lombard": "Gallo-Italic",
59
  "Low Saxon": "West Germanic",
60
  "Luxembourgish": "West Germanic",
61
- "Macedonian": "South Slavic",
62
- "Malagasy": "Malayic",
63
- "Malay": "Malayic",
64
- "Malayalam": "Southern Dravidian",
65
- "Marathi": "Central Indo-Aryan",
66
- "Minangkabau": "Malayic",
67
- "Nepali": "Eastern Indo-Aryan",
68
- "Newar": "Newaric",
69
  "Norwegian (Bokmal)": "North Germanic",
70
  "Norwegian (Nynorsk)": "North Germanic",
71
- "Occitan": "Gallo-Romance",
72
- "Persian (Farsi)": "Iranian",
73
- "Piedmontese": "Gallo-Italic",
74
- "Polish": "West Slavic",
75
- "Portuguese": "Iberian Romance",
76
- "Punjabi": "Punjabi",
77
- "Romanian": "Eastern Romance",
78
- "Russian": "East Slavic",
79
  "Scots": "West Germanic",
80
- "Serbian": "South Slavic",
81
- "Serbo-Croatian": "South Slavic",
82
- "Sicilian": "Italo-Dalmatian",
83
- "Slovak": "West Slavic",
84
- "Slovenian": "South Slavic",
85
- "South Azerbaijani": "Oghuz",
86
- "Spanish": "Iberian Romance",
87
- "Sundanese": "Sundic",
88
- "Swahili": "Bantu",
89
  "Swedish": "North Germanic",
90
- "Tagalog": "Central Philippine",
91
- "Tajik": "Iranian",
92
- "Tamil": "Southern Dravidian",
93
- "Tatar": "Kypchak",
94
- "Telugu": "Southern Dravidian",
95
- "Turkish": "Oghuz",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  "Ukrainian": "East Slavic",
97
- "Urdu": "Central Indo-Aryan",
98
- "Uzbek": "Karluk",
99
- "Vietnamese": "Vietic",
100
- "Volapük": "Constructed",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  "Waray-Waray": "Central Philippine",
102
- "Welsh": "Brythonic",
103
- "West Frisian": "West Germanic",
104
- "Western Punjabi": "Punjabi",
105
- "Yoruba": "Yoruboid",
106
- "Esperanto": "Constructed",
107
- "Crimean Tatar": "Kypchak",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  }
109
 
110
- language_families = {
111
- "Afrikaans": "Germanic",
 
 
112
  "Albanian": "Indo-European",
113
  "Arabic": "Afroasiatic",
114
  "Egyptian Arabic": "Afroasiatic",
115
- "Aragonese": "Romance",
116
  "Armenian": "Indo-European",
117
- "Asturian": "Romance",
118
  "Azerbaijani": "Turkic",
119
  "Bashkir": "Turkic",
120
  "Basque": "Language Isolate",
121
- "Bavarian": "Germanic",
122
  "Belarusian": "Indo-European",
123
  "Bengali": "Indo-European",
124
  "Bishnupriya Manipuri": "Indo-European",
@@ -126,35 +170,35 @@ language_families = {
126
  "Breton": "Indo-European",
127
  "Bulgarian": "Indo-European",
128
  "Burmese": "Sino-Tibetan",
129
- "Catalan": "Romance",
130
  "Cebuano": "Austronesian",
131
- "Chechen": "Northeast Caucasian",
132
  "Chinese (Simplified)": "Sino-Tibetan",
133
  "Chinese (Traditional)": "Sino-Tibetan",
134
  "Min Nan Chinese": "Sino-Tibetan",
135
  "Chuvash": "Turkic",
136
  "Croatian": "Indo-European",
137
  "Czech": "Indo-European",
138
- "Danish": "Germanic",
139
- "Dutch": "Germanic",
140
- "English": "Germanic",
141
  "Estonian": "Uralic",
142
  "Finnish": "Uralic",
143
- "French": "Romance",
144
- "Galician": "Romance",
145
  "Georgian": "Kartvelian",
146
- "German": "Germanic",
147
  "Greek": "Indo-European",
148
  "Gujarati": "Indo-European",
149
  "Haitian": "Creole",
150
  "Hebrew": "Afroasiatic",
151
  "Hindi": "Indo-European",
152
  "Hungarian": "Uralic",
153
- "Icelandic": "Germanic",
154
  "Ido": "Constructed",
155
  "Indonesian": "Austronesian",
156
  "Irish": "Indo-European",
157
- "Italian": "Romance",
158
  "Japanese": "Japonic",
159
  "Javanese": "Austronesian",
160
  "Kannada": "Dravidian",
@@ -164,9 +208,9 @@ language_families = {
164
  "Latin": "Indo-European",
165
  "Latvian": "Indo-European",
166
  "Lithuanian": "Indo-European",
167
- "Lombard": "Romance",
168
- "Low Saxon": "Germanic",
169
- "Luxembourgish": "Germanic",
170
  "Macedonian": "Indo-European",
171
  "Malagasy": "Austronesian",
172
  "Malay": "Austronesian",
@@ -175,27 +219,27 @@ language_families = {
175
  "Minangkabau": "Austronesian",
176
  "Nepali": "Indo-European",
177
  "Newar": "Sino-Tibetan",
178
- "Norwegian (Bokmal)": "Germanic",
179
- "Norwegian (Nynorsk)": "Germanic",
180
- "Occitan": "Romance",
181
  "Persian (Farsi)": "Indo-European",
182
- "Piedmontese": "Romance",
183
  "Polish": "Indo-European",
184
- "Portuguese": "Romance",
185
  "Punjabi": "Indo-European",
186
- "Romanian": "Romance",
187
  "Russian": "Indo-European",
188
- "Scots": "Germanic",
189
  "Serbian": "Indo-European",
190
  "Serbo-Croatian": "Indo-European",
191
- "Sicilian": "Romance",
192
  "Slovak": "Indo-European",
193
  "Slovenian": "Indo-European",
194
  "South Azerbaijani": "Turkic",
195
- "Spanish": "Romance",
196
  "Sundanese": "Austronesian",
197
  "Swahili": "Niger-Congo",
198
- "Swedish": "Germanic",
199
  "Tagalog": "Austronesian",
200
  "Tajik": "Indo-European",
201
  "Tamil": "Dravidian",
@@ -209,9 +253,138 @@ language_families = {
209
  "Volapük": "Constructed",
210
  "Waray-Waray": "Austronesian",
211
  "Welsh": "Indo-European",
212
- "West Frisian": "Germanic",
213
  "Western Punjabi": "Indo-European",
214
  "Yoruba": "Niger-Congo",
215
  "Esperanto": "Constructed",
216
  "Crimean Tatar": "Turkic",
217
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ low_level_subfamilies = {
2
+ # Germanic
3
  "Afrikaans": "West Germanic",
 
 
 
 
 
 
 
 
 
4
  "Bavarian": "Austro-Bavarian",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "Danish": "North Germanic",
6
  "Dutch": "West Germanic",
7
  "English": "West Germanic",
 
 
 
 
 
8
  "German": "West Germanic",
 
 
 
 
 
 
9
  "Icelandic": "North Germanic",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "Low Saxon": "West Germanic",
11
  "Luxembourgish": "West Germanic",
 
 
 
 
 
 
 
 
12
  "Norwegian (Bokmal)": "North Germanic",
13
  "Norwegian (Nynorsk)": "North Germanic",
 
 
 
 
 
 
 
 
14
  "Scots": "West Germanic",
 
 
 
 
 
 
 
 
 
15
  "Swedish": "North Germanic",
16
+ "West Frisian": "West Germanic",
17
+
18
+ # Romance
19
+ "Aragonese": "Ibero-Romance",
20
+ "Asturian": "Ibero-Romance",
21
+ "Catalan": "Occitano-Romance",
22
+ "French": "Gallo-Romance",
23
+ "Galician": "Ibero-Romance",
24
+ "Italian": "Italo-Dalmatian",
25
+ "Lombard": "Gallo-Italic",
26
+ "Occitan": "Occitano-Romance",
27
+ "Piedmontese":"Gallo-Italic",
28
+ "Portuguese":"Ibero-Romance",
29
+ "Romanian": "Eastern Romance",
30
+ "Sicilian": "Italo-Dalmatian",
31
+ "Spanish": "Ibero-Romance",
32
+
33
+ # Slavic
34
+ "Belarusian":"East Slavic",
35
+ "Bosnian": "South Slavic",
36
+ "Bulgarian": "South Slavic",
37
+ "Croatian": "South Slavic",
38
+ "Czech": "West Slavic",
39
+ "Macedonian":"South Slavic",
40
+ "Polish": "West Slavic",
41
+ "Russian": "East Slavic",
42
+ "Serbian": "South Slavic",
43
+ "Serbo-Croatian":"South Slavic",
44
+ "Slovak": "West Slavic",
45
+ "Slovenian": "South Slavic",
46
  "Ukrainian": "East Slavic",
47
+
48
+ # Celtic
49
+ "Breton": "Brythonic",
50
+ "Welsh": "Brythonic",
51
+ "Irish": "Goidelic",
52
+
53
+ # Baltic
54
+ "Latvian": "Baltic",
55
+ "Lithuanian": "Baltic",
56
+
57
+ # Indo-Aryan (Indic)
58
+ "Bengali": "Eastern Indo-Aryan",
59
+ "Bishnupriya Manipuri": "Eastern Indo-Aryan",
60
+ "Gujarati": "Western Indo-Aryan",
61
+ "Hindi": "Central Indo-Aryan",
62
+ "Marathi": "Southern Indo-Aryan",
63
+ "Nepali": "Northern Indo-Aryan",
64
+ "Punjabi": "North-western Indo-Aryan",
65
+ "Urdu": "Central Indo-Aryan",
66
+ "Western Punjabi": "North-western Indo-Aryan",
67
+
68
+ # Iranian
69
+ "Persian (Farsi)": "South-western Iranian",
70
+ "Tajik": "South-western Iranian",
71
+
72
+ # Turkic
73
+ "Azerbaijani": "Oghuz",
74
+ "South Azerbaijani": "Oghuz",
75
+ "Turkish": "Oghuz",
76
+ "Bashkir": "Kipchak",
77
+ "Kazakh": "Kipchak",
78
+ "Kirghiz": "Kipchak",
79
+ "Tatar": "Kipchak",
80
+ "Crimean Tatar":"Kipchak",
81
+ "Chuvash": "Oghur",
82
+ "Uzbek": "Karluk",
83
+
84
+ # Uralic
85
+ "Estonian": "Finnic",
86
+ "Finnish": "Finnic",
87
+ "Hungarian":"Ugric",
88
+
89
+ # Dravidian
90
+ "Kannada": "Southern Dravidian",
91
+ "Malayalam":"Southern Dravidian",
92
+ "Tamil": "Southern Dravidian",
93
+ "Telugu": "South-Central Dravidian",
94
+
95
+ # Sinitic (Chinese)
96
+ "Chinese (Simplified)": "Mandarin",
97
+ "Chinese (Traditional)": "Mandarin",
98
+ "Min Nan Chinese": "Southern Min",
99
+
100
+ # Other Sino-Tibetan
101
+ "Burmese": "Burmish",
102
+ "Newar": "Newaric",
103
+
104
+ # Japonic / Koreanic
105
+ "Japanese": "Japonic",
106
+ "Korean": "Koreanic",
107
+
108
+ # Caucasian & Kartvelian
109
+ "Chechen": "Nakh",
110
+ "Georgian": "Kartvelian",
111
+
112
+ # Austronesian
113
+ "Cebuano": "Central Philippine",
114
+ "Tagalog": "Central Philippine",
115
  "Waray-Waray": "Central Philippine",
116
+ "Indonesian": "Malayic",
117
+ "Malay": "Malayic",
118
+ "Minangkabau": "Malayic",
119
+ "Javanese": "Javanese",
120
+ "Sundanese": "Sundic",
121
+ "Malagasy": "East Barito",
122
+
123
+ # Philippine & Oceanic already covered above
124
+
125
+ # Vietic & MSEA
126
+ "Vietnamese": "Vietic",
127
+
128
+ # Altaic hypotheses excluded; Kipchak/Oghuz etc already above
129
+
130
+ # Afro-Asiatic
131
+ "Arabic": "Central Semitic",
132
+ "Egyptian Arabic": "Central Semitic",
133
+ "Hebrew": "North-west Semitic",
134
+
135
+ # Niger-Congo
136
+ "Swahili": "Sabaki",
137
+ "Yoruba": "Yoruboid",
138
+
139
+ # Isolates & special groups
140
+ "Albanian": "Albanian",
141
+ "Armenian": "Armenian",
142
+ "Basque": "Language Isolate",
143
+ "Greek": "Hellenic",
144
+ "Latin": "Latino-Faliscan",
145
+ "Japanese": "Japonic",
146
+ "Esperanto":"Constructed",
147
+ "Ido": "Constructed",
148
+ "Volapük": "Constructed",
149
+ "Haitian": "French-based Creole",
150
  }
151
 
152
+
153
+ # ── 1. high-level genealogical families ─────────────────────────────────────────
154
+ high_level_families = {
155
+ "Afrikaans": "Indo-European",
156
  "Albanian": "Indo-European",
157
  "Arabic": "Afroasiatic",
158
  "Egyptian Arabic": "Afroasiatic",
159
+ "Aragonese": "Indo-European",
160
  "Armenian": "Indo-European",
161
+ "Asturian": "Indo-European",
162
  "Azerbaijani": "Turkic",
163
  "Bashkir": "Turkic",
164
  "Basque": "Language Isolate",
165
+ "Bavarian": "Indo-European",
166
  "Belarusian": "Indo-European",
167
  "Bengali": "Indo-European",
168
  "Bishnupriya Manipuri": "Indo-European",
 
170
  "Breton": "Indo-European",
171
  "Bulgarian": "Indo-European",
172
  "Burmese": "Sino-Tibetan",
173
+ "Catalan": "Indo-European",
174
  "Cebuano": "Austronesian",
175
+ "Chechen": "Northeast Caucasian",
176
  "Chinese (Simplified)": "Sino-Tibetan",
177
  "Chinese (Traditional)": "Sino-Tibetan",
178
  "Min Nan Chinese": "Sino-Tibetan",
179
  "Chuvash": "Turkic",
180
  "Croatian": "Indo-European",
181
  "Czech": "Indo-European",
182
+ "Danish": "Indo-European",
183
+ "Dutch": "Indo-European",
184
+ "English": "Indo-European",
185
  "Estonian": "Uralic",
186
  "Finnish": "Uralic",
187
+ "French": "Indo-European",
188
+ "Galician": "Indo-European",
189
  "Georgian": "Kartvelian",
190
+ "German": "Indo-European",
191
  "Greek": "Indo-European",
192
  "Gujarati": "Indo-European",
193
  "Haitian": "Creole",
194
  "Hebrew": "Afroasiatic",
195
  "Hindi": "Indo-European",
196
  "Hungarian": "Uralic",
197
+ "Icelandic": "Indo-European",
198
  "Ido": "Constructed",
199
  "Indonesian": "Austronesian",
200
  "Irish": "Indo-European",
201
+ "Italian": "Indo-European",
202
  "Japanese": "Japonic",
203
  "Javanese": "Austronesian",
204
  "Kannada": "Dravidian",
 
208
  "Latin": "Indo-European",
209
  "Latvian": "Indo-European",
210
  "Lithuanian": "Indo-European",
211
+ "Lombard": "Indo-European",
212
+ "Low Saxon": "Indo-European",
213
+ "Luxembourgish": "Indo-European",
214
  "Macedonian": "Indo-European",
215
  "Malagasy": "Austronesian",
216
  "Malay": "Austronesian",
 
219
  "Minangkabau": "Austronesian",
220
  "Nepali": "Indo-European",
221
  "Newar": "Sino-Tibetan",
222
+ "Norwegian (Bokmal)": "Indo-European",
223
+ "Norwegian (Nynorsk)": "Indo-European",
224
+ "Occitan": "Indo-European",
225
  "Persian (Farsi)": "Indo-European",
226
+ "Piedmontese": "Indo-European",
227
  "Polish": "Indo-European",
228
+ "Portuguese": "Indo-European",
229
  "Punjabi": "Indo-European",
230
+ "Romanian": "Indo-European",
231
  "Russian": "Indo-European",
232
+ "Scots": "Indo-European",
233
  "Serbian": "Indo-European",
234
  "Serbo-Croatian": "Indo-European",
235
+ "Sicilian": "Indo-European",
236
  "Slovak": "Indo-European",
237
  "Slovenian": "Indo-European",
238
  "South Azerbaijani": "Turkic",
239
+ "Spanish": "Indo-European",
240
  "Sundanese": "Austronesian",
241
  "Swahili": "Niger-Congo",
242
+ "Swedish": "Indo-European",
243
  "Tagalog": "Austronesian",
244
  "Tajik": "Indo-European",
245
  "Tamil": "Dravidian",
 
253
  "Volapük": "Constructed",
254
  "Waray-Waray": "Austronesian",
255
  "Welsh": "Indo-European",
256
+ "West Frisian": "Indo-European",
257
  "Western Punjabi": "Indo-European",
258
  "Yoruba": "Niger-Congo",
259
  "Esperanto": "Constructed",
260
  "Crimean Tatar": "Turkic",
261
  }
262
+
263
+ # ── 2. primary branches (first subdivision inside each family) ────────────────
264
+ primary_families_branches = {
265
+ # Indo-European
266
+ "Afrikaans": "Germanic",
267
+ "Albanian": "Albanian",
268
+ "Aragonese": "Romance",
269
+ "Armenian": "Armenian",
270
+ "Asturian": "Romance",
271
+ "Bavarian": "Germanic",
272
+ "Belarusian": "Slavic",
273
+ "Bengali": "Indo-Aryan",
274
+ "Bishnupriya Manipuri": "Indo-Aryan",
275
+ "Bosnian": "Slavic",
276
+ "Breton": "Celtic",
277
+ "Bulgarian": "Slavic",
278
+ "Catalan": "Romance",
279
+ "Croatian": "Slavic",
280
+ "Czech": "Slavic",
281
+ "Danish": "Germanic",
282
+ "Dutch": "Germanic",
283
+ "English": "Germanic",
284
+ "French": "Romance",
285
+ "Galician": "Romance",
286
+ "German": "Germanic",
287
+ "Greek": "Hellenic",
288
+ "Gujarati": "Indo-Aryan",
289
+ "Hindi": "Indo-Aryan",
290
+ "Icelandic": "Germanic",
291
+ "Irish": "Celtic",
292
+ "Italian": "Romance",
293
+ "Latin": "Italic",
294
+ "Latvian": "Baltic",
295
+ "Lithuanian": "Baltic",
296
+ "Lombard": "Romance",
297
+ "Low Saxon": "Germanic",
298
+ "Luxembourgish": "Germanic",
299
+ "Macedonian": "Slavic",
300
+ "Marathi": "Indo-Aryan",
301
+ "Nepali": "Indo-Aryan",
302
+ "Norwegian (Bokmal)": "Germanic",
303
+ "Norwegian (Nynorsk)": "Germanic",
304
+ "Occitan": "Romance",
305
+ "Persian (Farsi)": "Iranian",
306
+ "Piedmontese": "Romance",
307
+ "Polish": "Slavic",
308
+ "Portuguese": "Romance",
309
+ "Punjabi": "Indo-Aryan",
310
+ "Romanian": "Romance",
311
+ "Russian": "Slavic",
312
+ "Scots": "Germanic",
313
+ "Serbian": "Slavic",
314
+ "Serbo-Croatian": "Slavic",
315
+ "Sicilian": "Romance",
316
+ "Slovak": "Slavic",
317
+ "Slovenian": "Slavic",
318
+ "Spanish": "Romance",
319
+ "Swedish": "Germanic",
320
+ "Tajik": "Iranian",
321
+ "Ukrainian": "Slavic",
322
+ "Urdu": "Indo-Aryan",
323
+ "West Frisian": "Germanic",
324
+ "Western Punjabi": "Indo-Aryan",
325
+ "Welsh": "Celtic",
326
+
327
+ # Afroasiatic
328
+ "Arabic": "Semitic",
329
+ "Egyptian Arabic": "Semitic",
330
+ "Hebrew": "Semitic",
331
+
332
+ # Turkic
333
+ "Azerbaijani": "Oghuz",
334
+ "South Azerbaijani": "Oghuz",
335
+ "Turkish": "Oghuz",
336
+ "Bashkir": "Kipchak",
337
+ "Kazakh": "Kipchak",
338
+ "Kirghiz": "Kipchak",
339
+ "Tatar": "Kipchak",
340
+ "Crimean Tatar": "Kipchak",
341
+ "Chuvash": "Oghur",
342
+ "Uzbek": "Karluk",
343
+
344
+ # Uralic
345
+ "Estonian": "Finnic",
346
+ "Finnish": "Finnic",
347
+ "Hungarian": "Ugric",
348
+
349
+ # Sino-Tibetan
350
+ "Chinese (Simplified)": "Sinitic",
351
+ "Chinese (Traditional)": "Sinitic",
352
+ "Min Nan Chinese": "Sinitic",
353
+ "Burmese": "Tibeto-Burman",
354
+ "Newar": "Tibeto-Burman",
355
+
356
+ # Austronesian
357
+ "Cebuano": "Malayo-Polynesian",
358
+ "Indonesian": "Malayo-Polynesian",
359
+ "Javanese": "Malayo-Polynesian",
360
+ "Malagasy": "Malayo-Polynesian",
361
+ "Malay": "Malayo-Polynesian",
362
+ "Minangkabau": "Malayo-Polynesian",
363
+ "Sundanese": "Malayo-Polynesian",
364
+ "Tagalog": "Philippine",
365
+ "Waray-Waray": "Philippine",
366
+
367
+ # Dravidian
368
+ "Kannada": "South Dravidian",
369
+ "Malayalam": "South Dravidian",
370
+ "Tamil": "South Dravidian",
371
+ "Telugu": "South-Central Dravidian",
372
+
373
+ # Niger-Congo
374
+ "Swahili": "Atlantic-Congo",
375
+ "Yoruba": "Atlantic-Congo",
376
+
377
+ # Misc. single-branch families
378
+ "Basque": "Language Isolate",
379
+ "Chechen": "Nakh",
380
+ "Georgian": "Kartvelian",
381
+ "Japanese": "Japonic",
382
+ "Korean": "Koreanic",
383
+ "Vietnamese": "Vietic",
384
+
385
+ # Creole & Constructed
386
+ "Haitian": "French-based Creole",
387
+ "Esperanto": "Constructed",
388
+ "Ido": "Constructed",
389
+ "Volapük": "Constructed",
390
+ }
utils.py CHANGED
@@ -5,9 +5,8 @@ import numpy as np
5
  from sklearn.manifold import TSNE
6
  import umap
7
  from sklearn.cluster import KMeans
8
- from scipy.spatial import KDTree
9
  from adjustText import adjust_text
10
- from constants import language_families, language_subfamilies
11
 
12
 
13
  def filter_languages_by_families(matrix, languages, families):
@@ -24,7 +23,7 @@ def filter_languages_by_families(matrix, languages, families):
24
  filtered_languages = [
25
  (i, lang)
26
  for i, lang in enumerate(languages)
27
- if language_families[lang] in families
28
  ]
29
  filtered_indices = [i for i, lang in filtered_languages]
30
  filtered_languages = [lang for i, lang in filtered_languages]
@@ -48,7 +47,7 @@ def get_dynamic_color_map(n_colors):
48
 
49
 
50
  def cluster_languages_by_families(languages):
51
- lang_families = [language_families[lang] for lang in languages]
52
  legend = sorted(set(lang_families))
53
  clusters = [legend.index(family) for family in lang_families]
54
  return clusters, legend
@@ -56,7 +55,7 @@ def cluster_languages_by_families(languages):
56
 
57
  def cluster_languages_by_subfamilies(languages):
58
  labels = [
59
- language_families[lang] + f" ({language_subfamilies[lang]})"
60
  for lang in languages
61
  ]
62
  legend = sorted(set(labels))
 
5
  from sklearn.manifold import TSNE
6
  import umap
7
  from sklearn.cluster import KMeans
 
8
  from adjustText import adjust_text
9
+ from constants import high_level_families, primary_families_branches
10
 
11
 
12
  def filter_languages_by_families(matrix, languages, families):
 
23
  filtered_languages = [
24
  (i, lang)
25
  for i, lang in enumerate(languages)
26
+ if high_level_families[lang] in families
27
  ]
28
  filtered_indices = [i for i, lang in filtered_languages]
29
  filtered_languages = [lang for i, lang in filtered_languages]
 
47
 
48
 
49
  def cluster_languages_by_families(languages):
50
+ lang_families = [high_level_families[lang] for lang in languages]
51
  legend = sorted(set(lang_families))
52
  clusters = [legend.index(family) for family in lang_families]
53
  return clusters, legend
 
55
 
56
  def cluster_languages_by_subfamilies(languages):
57
  labels = [
58
+ high_level_families[lang] + f" ({primary_families_branches[lang]})"
59
  for lang in languages
60
  ]
61
  legend = sorted(set(labels))