Spaces:
Sleeping
Sleeping
File size: 2,841 Bytes
ac901c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program to transliterate Latin-script acronyms into Indic languages
#
# @author Anoop Kunchukuttan
#
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import random
class LatinToIndicAcronymTransliterator:
    """Transliterate Latin-script acronyms into Indic scripts, letter by letter.

    Each lowercase Latin letter is first replaced by a fixed Devanagari
    string (see ``LATIN_TO_DEVANAGARI_TRANSTABLE``); the Devanagari result is
    then handed to ``UnicodeIndicTransliterator`` to reach the requested
    target language. All members are static, so the class is used as a
    namespace and never needs to be instantiated.
    """

    # Translation table (code point -> replacement string) usable with
    # ``str.translate``. Keys are the 26 lowercase Latin letters; the values
    # appear to be the Devanagari spellings of the spoken letter names.
    LATIN_TO_DEVANAGARI_TRANSTABLE = str.maketrans(
        {
            "a": "ए",
            "b": "बी",
            "c": "सी",
            "d": "डी",
            "e": "ई",
            "f": "एफ",
            "g": "जी",
            "h": "एच",
            "i": "आई",
            "j": "जे",
            "k": "के",
            "l": "एल",
            "m": "एम",
            "n": "एन",
            "o": "ओ",
            "p": "पी",
            "q": "क्यू",
            "r": "आर",
            "s": "एस",
            "t": "टी",
            "u": "यू",
            "v": "वी",
            "w": "डब्ल्यू",
            "x": "एक्स",
            "y": "वाय",
            "z": "जेड",
        }
    )

    # The 26 lowercase Latin letters, kept as a list of one-character strings
    # because it is the population sampled by ``generate_latin_acronyms``.
    LATIN_ALPHABET = list("abcdefghijklmnopqrstuvwxyz")

    @staticmethod
    def get_transtable():
        """Return the Latin-to-Devanagari table for use with ``str.translate``."""
        return LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE

    @staticmethod
    def transliterate(w, lang):
        """Transliterate the acronym ``w`` into the script of ``lang``.

        Args:
            w: Acronym in Latin script. It is lowercased before lookup, so
                case does not matter. Characters that are not lowercase
                Latin letters after lowercasing are passed through
                unchanged by ``str.translate``.
            lang: Target language code forwarded to
                ``UnicodeIndicTransliterator.transliterate`` (presumably an
                Indic NLP language code such as ``"ta"`` -- confirm against
                that library).

        Returns:
            Whatever ``UnicodeIndicTransliterator.transliterate`` returns for
            the Devanagari intermediate, with ``"hi"`` as the source language.
        """
        devanagari = w.lower().translate(
            LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE
        )
        # The table produces Devanagari text, hence the fixed "hi" source.
        return UnicodeIndicTransliterator.transliterate(devanagari, "hi", lang)

    @staticmethod
    def generate_latin_acronyms(num_acronyms, min_len=2, max_len=6, strategy="random"):
        """Generate synthetic Latin acronyms in lower case.

        Args:
            num_acronyms: Number of acronyms to generate.
            min_len: Minimum acronym length (inclusive).
            max_len: Maximum acronym length (inclusive).
            strategy: Sampling strategy. Only ``"random"`` is supported: the
                length is drawn uniformly from ``[min_len, max_len]`` and the
                letters are drawn independently, with replacement, from
                ``LATIN_ALPHABET``.

        Returns:
            A list of ``num_acronyms`` lowercase strings.

        Raises:
            ValueError: If ``strategy`` is not a supported strategy. An
                unknown strategy previously produced a list of ``None``
                values without any error.
        """
        if strategy != "random":
            raise ValueError(
                "Unsupported acronym generation strategy: {!r}".format(strategy)
            )

        alphabet = LatinToIndicAcronymTransliterator.LATIN_ALPHABET
        # ``randint`` is evaluated before ``choices`` runs for each acronym,
        # so a seeded ``random`` yields a reproducible sequence.
        return [
            "".join(random.choices(alphabet, k=random.randint(min_len, max_len)))
            for _ in range(num_acronyms)
        ]
|