# language-metric-analysis / constants.py
# Latest commit: "fix: families" (b719825) -- mshamrai
# Fine-grained genealogical subgroup for every language, keyed by the language
# display name.  Each key must appear exactly once: a repeated key in a dict
# literal is silently resolved to the last value, which hides editing mistakes.
low_level_subfamilies = {
    # Germanic
    "Afrikaans": "West Germanic",
    # NOTE(review): this label is finer-grained than the "West Germanic" label
    # used for the neighbouring entries -- confirm this is intended.
    "Bavarian": "Austro-Bavarian",
    "Danish": "North Germanic",
    "Dutch": "West Germanic",
    "English": "West Germanic",
    "German": "West Germanic",
    "Icelandic": "North Germanic",
    "Low Saxon": "West Germanic",
    "Luxembourgish": "West Germanic",
    "Norwegian (Bokmal)": "North Germanic",
    "Norwegian (Nynorsk)": "North Germanic",
    "Scots": "West Germanic",
    "Swedish": "North Germanic",
    "West Frisian": "West Germanic",
    # Romance
    "Aragonese": "Ibero-Romance",
    "Asturian": "Ibero-Romance",
    "Catalan": "Occitano-Romance",
    "French": "Gallo-Romance",
    "Galician": "Ibero-Romance",
    "Italian": "Italo-Dalmatian",
    "Lombard": "Gallo-Italic",
    "Occitan": "Occitano-Romance",
    "Piedmontese": "Gallo-Italic",
    "Portuguese": "Ibero-Romance",
    "Romanian": "Eastern Romance",
    "Sicilian": "Italo-Dalmatian",
    "Spanish": "Ibero-Romance",
    # Slavic
    "Belarusian": "East Slavic",
    "Bosnian": "South Slavic",
    "Bulgarian": "South Slavic",
    "Croatian": "South Slavic",
    "Czech": "West Slavic",
    "Macedonian": "South Slavic",
    "Polish": "West Slavic",
    "Russian": "East Slavic",
    "Serbian": "South Slavic",
    "Serbo-Croatian": "South Slavic",
    "Slovak": "West Slavic",
    "Slovenian": "South Slavic",
    "Ukrainian": "East Slavic",
    # Celtic
    "Breton": "Brythonic",
    "Welsh": "Brythonic",
    "Irish": "Goidelic",
    # Baltic
    "Latvian": "Baltic",
    "Lithuanian": "Baltic",
    # Indo-Aryan (Indic)
    "Bengali": "Eastern Indo-Aryan",
    "Bishnupriya Manipuri": "Eastern Indo-Aryan",
    "Gujarati": "Western Indo-Aryan",
    "Hindi": "Central Indo-Aryan",
    "Marathi": "Southern Indo-Aryan",
    "Nepali": "Northern Indo-Aryan",
    "Punjabi": "North-western Indo-Aryan",
    "Urdu": "Central Indo-Aryan",
    "Western Punjabi": "North-western Indo-Aryan",
    # Iranian
    "Persian (Farsi)": "South-western Iranian",
    "Tajik": "South-western Iranian",
    # Turkic (no "Altaic" grouping is used; Turkic branches are listed directly)
    "Azerbaijani": "Oghuz",
    "South Azerbaijani": "Oghuz",
    "Turkish": "Oghuz",
    "Bashkir": "Kipchak",
    "Kazakh": "Kipchak",
    "Kirghiz": "Kipchak",
    "Tatar": "Kipchak",
    "Crimean Tatar": "Kipchak",
    "Chuvash": "Oghur",
    "Uzbek": "Karluk",
    # Uralic
    "Estonian": "Finnic",
    "Finnish": "Finnic",
    "Hungarian": "Ugric",
    # Dravidian
    "Kannada": "Southern Dravidian",
    "Malayalam": "Southern Dravidian",
    "Tamil": "Southern Dravidian",
    "Telugu": "South-Central Dravidian",
    # Sinitic (Chinese)
    "Chinese (Simplified)": "Mandarin",
    "Chinese (Traditional)": "Mandarin",
    "Min Nan Chinese": "Southern Min",
    # Other Sino-Tibetan
    "Burmese": "Burmish",
    "Newar": "Newaric",
    # Japonic / Koreanic
    "Japanese": "Japonic",
    "Korean": "Koreanic",
    # Caucasian & Kartvelian
    "Chechen": "Nakh",
    "Georgian": "Kartvelian",
    # Austronesian
    "Cebuano": "Central Philippine",
    "Tagalog": "Central Philippine",
    "Waray-Waray": "Central Philippine",
    "Indonesian": "Malayic",
    "Malay": "Malayic",
    "Minangkabau": "Malayic",
    "Javanese": "Javanese",
    "Sundanese": "Sundic",
    "Malagasy": "East Barito",
    # Austroasiatic (Vietic)
    "Vietnamese": "Vietic",
    # Afro-Asiatic
    "Arabic": "Central Semitic",
    "Egyptian Arabic": "Central Semitic",
    "Hebrew": "North-west Semitic",
    # Niger-Congo
    "Swahili": "Sabaki",
    "Yoruba": "Yoruboid",
    # Isolates & special groups
    # ("Japanese" is defined once, under Japonic / Koreanic above.)
    "Albanian": "Albanian",
    "Armenian": "Armenian",
    "Basque": "Language Isolate",
    "Greek": "Hellenic",
    "Latin": "Latino-Faliscan",
    "Esperanto": "Constructed",
    "Ido": "Constructed",
    "Volapük": "Constructed",
    "Haitian": "French-based Creole",
}
# ── 1. high-level genealogical families ─────────────────────────────────────────
# Top-level family label for every language, keyed by the language display
# name.  Isolates, creoles and constructed languages carry a pseudo-family
# label ("Language Isolate", "Creole", "Constructed") instead of a family name.
high_level_families = {
    "Afrikaans": "Indo-European",
    "Albanian": "Indo-European",
    "Arabic": "Afroasiatic",
    "Egyptian Arabic": "Afroasiatic",
    "Aragonese": "Indo-European",
    "Armenian": "Indo-European",
    "Asturian": "Indo-European",
    "Azerbaijani": "Turkic",
    "Bashkir": "Turkic",
    "Basque": "Language Isolate",
    "Bavarian": "Indo-European",
    "Belarusian": "Indo-European",
    "Bengali": "Indo-European",
    "Bishnupriya Manipuri": "Indo-European",
    "Bosnian": "Indo-European",
    "Breton": "Indo-European",
    "Bulgarian": "Indo-European",
    "Burmese": "Sino-Tibetan",
    "Catalan": "Indo-European",
    "Cebuano": "Austronesian",
    "Chechen": "Northeast Caucasian",
    "Chinese (Simplified)": "Sino-Tibetan",
    "Chinese (Traditional)": "Sino-Tibetan",
    "Min Nan Chinese": "Sino-Tibetan",
    "Chuvash": "Turkic",
    "Croatian": "Indo-European",
    "Czech": "Indo-European",
    "Danish": "Indo-European",
    "Dutch": "Indo-European",
    "English": "Indo-European",
    "Estonian": "Uralic",
    "Finnish": "Uralic",
    "French": "Indo-European",
    "Galician": "Indo-European",
    "Georgian": "Kartvelian",
    "German": "Indo-European",
    "Greek": "Indo-European",
    "Gujarati": "Indo-European",
    "Haitian": "Creole",
    "Hebrew": "Afroasiatic",
    "Hindi": "Indo-European",
    "Hungarian": "Uralic",
    "Icelandic": "Indo-European",
    "Ido": "Constructed",
    "Indonesian": "Austronesian",
    "Irish": "Indo-European",
    "Italian": "Indo-European",
    "Japanese": "Japonic",
    "Javanese": "Austronesian",
    "Kannada": "Dravidian",
    "Kazakh": "Turkic",
    "Kirghiz": "Turkic",
    "Korean": "Koreanic",
    "Latin": "Indo-European",
    "Latvian": "Indo-European",
    "Lithuanian": "Indo-European",
    "Lombard": "Indo-European",
    "Low Saxon": "Indo-European",
    "Luxembourgish": "Indo-European",
    "Macedonian": "Indo-European",
    "Malagasy": "Austronesian",
    "Malay": "Austronesian",
    "Malayalam": "Dravidian",
    "Marathi": "Indo-European",
    "Minangkabau": "Austronesian",
    "Nepali": "Indo-European",
    "Newar": "Sino-Tibetan",
    "Norwegian (Bokmal)": "Indo-European",
    "Norwegian (Nynorsk)": "Indo-European",
    "Occitan": "Indo-European",
    "Persian (Farsi)": "Indo-European",
    "Piedmontese": "Indo-European",
    "Polish": "Indo-European",
    "Portuguese": "Indo-European",
    "Punjabi": "Indo-European",
    "Romanian": "Indo-European",
    "Russian": "Indo-European",
    "Scots": "Indo-European",
    "Serbian": "Indo-European",
    "Serbo-Croatian": "Indo-European",
    "Sicilian": "Indo-European",
    "Slovak": "Indo-European",
    "Slovenian": "Indo-European",
    "South Azerbaijani": "Turkic",
    "Spanish": "Indo-European",
    "Sundanese": "Austronesian",
    "Swahili": "Niger-Congo",
    "Swedish": "Indo-European",
    "Tagalog": "Austronesian",
    "Tajik": "Indo-European",
    "Tamil": "Dravidian",
    "Tatar": "Turkic",
    "Telugu": "Dravidian",
    "Turkish": "Turkic",
    "Ukrainian": "Indo-European",
    "Urdu": "Indo-European",
    "Uzbek": "Turkic",
    "Vietnamese": "Austroasiatic",
    "Volapük": "Constructed",
    "Waray-Waray": "Austronesian",
    "Welsh": "Indo-European",
    "West Frisian": "Indo-European",
    "Western Punjabi": "Indo-European",
    "Yoruba": "Niger-Congo",
    # Late additions, kept at the end so the insertion order is unchanged.
    "Esperanto": "Constructed",
    "Crimean Tatar": "Turkic",
}
# ── 2. primary branches (first subdivision inside each family) ────────────────
# Branch label for every language, keyed by the language display name.
# Languages whose family has no further subdivision here (isolates, creoles,
# constructed languages, single-branch families) repeat a family-level label.
primary_families_branches = {
    # Indo-European
    "Afrikaans": "Germanic",
    "Albanian": "Albanian",
    "Aragonese": "Romance",
    "Armenian": "Armenian",
    "Asturian": "Romance",
    "Bavarian": "Germanic",
    "Belarusian": "Slavic",
    "Bengali": "Indo-Aryan",
    "Bishnupriya Manipuri": "Indo-Aryan",
    "Bosnian": "Slavic",
    "Breton": "Celtic",
    "Bulgarian": "Slavic",
    "Catalan": "Romance",
    "Croatian": "Slavic",
    "Czech": "Slavic",
    "Danish": "Germanic",
    "Dutch": "Germanic",
    "English": "Germanic",
    "French": "Romance",
    "Galician": "Romance",
    "German": "Germanic",
    "Greek": "Hellenic",
    "Gujarati": "Indo-Aryan",
    "Hindi": "Indo-Aryan",
    "Icelandic": "Germanic",
    "Irish": "Celtic",
    "Italian": "Romance",
    "Latin": "Italic",
    "Latvian": "Baltic",
    "Lithuanian": "Baltic",
    "Lombard": "Romance",
    "Low Saxon": "Germanic",
    "Luxembourgish": "Germanic",
    "Macedonian": "Slavic",
    "Marathi": "Indo-Aryan",
    "Nepali": "Indo-Aryan",
    "Norwegian (Bokmal)": "Germanic",
    "Norwegian (Nynorsk)": "Germanic",
    "Occitan": "Romance",
    "Persian (Farsi)": "Iranian",
    "Piedmontese": "Romance",
    "Polish": "Slavic",
    "Portuguese": "Romance",
    "Punjabi": "Indo-Aryan",
    "Romanian": "Romance",
    "Russian": "Slavic",
    "Scots": "Germanic",
    "Serbian": "Slavic",
    "Serbo-Croatian": "Slavic",
    "Sicilian": "Romance",
    "Slovak": "Slavic",
    "Slovenian": "Slavic",
    "Spanish": "Romance",
    "Swedish": "Germanic",
    "Tajik": "Iranian",
    "Ukrainian": "Slavic",
    "Urdu": "Indo-Aryan",
    "West Frisian": "Germanic",
    "Western Punjabi": "Indo-Aryan",
    "Welsh": "Celtic",
    # Afroasiatic
    "Arabic": "Semitic",
    "Egyptian Arabic": "Semitic",
    "Hebrew": "Semitic",
    # Turkic
    "Azerbaijani": "Oghuz",
    "South Azerbaijani": "Oghuz",
    "Turkish": "Oghuz",
    "Bashkir": "Kipchak",
    "Kazakh": "Kipchak",
    "Kirghiz": "Kipchak",
    "Tatar": "Kipchak",
    "Crimean Tatar": "Kipchak",
    "Chuvash": "Oghur",
    "Uzbek": "Karluk",
    # Uralic
    "Estonian": "Finnic",
    "Finnish": "Finnic",
    "Hungarian": "Ugric",
    # Sino-Tibetan
    "Chinese (Simplified)": "Sinitic",
    "Chinese (Traditional)": "Sinitic",
    "Min Nan Chinese": "Sinitic",
    "Burmese": "Tibeto-Burman",
    "Newar": "Tibeto-Burman",
    # Austronesian
    # Every Austronesian language in this table uses the same first-level
    # branch label, so grouping by this table never separates Tagalog and
    # Waray-Waray from Cebuano.
    "Cebuano": "Malayo-Polynesian",
    "Indonesian": "Malayo-Polynesian",
    "Javanese": "Malayo-Polynesian",
    "Malagasy": "Malayo-Polynesian",
    "Malay": "Malayo-Polynesian",
    "Minangkabau": "Malayo-Polynesian",
    "Sundanese": "Malayo-Polynesian",
    "Tagalog": "Malayo-Polynesian",
    "Waray-Waray": "Malayo-Polynesian",
    # Dravidian
    "Kannada": "South Dravidian",
    "Malayalam": "South Dravidian",
    "Tamil": "South Dravidian",
    "Telugu": "South-Central Dravidian",
    # Niger-Congo
    "Swahili": "Atlantic-Congo",
    "Yoruba": "Atlantic-Congo",
    # Misc. single-branch families
    "Basque": "Language Isolate",
    "Chechen": "Nakh",
    "Georgian": "Kartvelian",
    "Japanese": "Japonic",
    "Korean": "Koreanic",
    "Vietnamese": "Vietic",
    # Creole & Constructed
    "Haitian": "French-based Creole",
    "Esperanto": "Constructed",
    "Ido": "Constructed",
    "Volapük": "Constructed",
}