File size: 2,841 Bytes
ac901c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#

# Program to transliterate acronyms written in the Latin script into Indic languages
#
# @author Anoop Kunchukuttan
#

from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import random


class LatinToIndicAcronymTransliterator(object):
    """Spell out Latin-script acronyms letter by letter in an Indic script.

    Each lowercase Latin letter is first replaced by a Devanagari rendering
    of the letter (see ``LATIN_TO_DEVANAGARI_TRANSTABLE``); the resulting
    Devanagari text is then handed to ``UnicodeIndicTransliterator`` with
    ``"hi"`` as the source language and the requested language as target.
    """

    # Translation table (for ``str.translate``) from each lowercase Latin
    # letter to the Devanagari string substituted for it.
    LATIN_TO_DEVANAGARI_TRANSTABLE = str.maketrans(
        {
            "a": "ए",
            "b": "बी",
            "c": "सी",
            "d": "डी",
            "e": "ई",
            "f": "एफ",
            "g": "जी",
            "h": "एच",
            "i": "आई",
            "j": "जे",
            "k": "के",
            "l": "एल",
            "m": "एम",
            "n": "एन",
            "o": "ओ",
            "p": "पी",
            "q": "क्यू",
            "r": "आर",
            "s": "एस",
            "t": "टी",
            "u": "यू",
            "v": "वी",
            "w": "डब्ल्यू",
            "x": "एक्स",
            "y": "वाय",
            "z": "जेड",
        }
    )

    # The 26 lowercase Latin letters, in order, used when sampling acronyms.
    LATIN_ALPHABET = list("abcdefghijklmnopqrstuvwxyz")

    @staticmethod
    def get_transtable():
        """Return the Latin-letter to Devanagari translation table."""
        return LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE

    @staticmethod
    def transliterate(w, lang):
        """Transliterate the Latin acronym ``w`` into language ``lang``.

        ``w`` is lowercased and every letter found in the translation table
        is replaced by its Devanagari string; characters that are not in the
        table are left unchanged by ``str.translate``. The Devanagari text
        is then converted from ``"hi"`` to ``lang`` by
        ``UnicodeIndicTransliterator.transliterate`` and that result is
        returned.
        """
        devanagari = w.lower().translate(
            LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE
        )
        return UnicodeIndicTransliterator.transliterate(devanagari, "hi", lang)

    @staticmethod
    def generate_latin_acronyms(num_acronyms, min_len=2, max_len=6, strategy="random"):
        """Generate Latin acronyms in lower case.

        Args:
            num_acronyms: number of acronyms to generate.
            min_len: minimum acronym length (inclusive).
            max_len: maximum acronym length (inclusive).
            strategy: sampling strategy. Only ``"random"`` is supported: the
                length is drawn with ``random.randint(min_len, max_len)`` and
                the letters are drawn with replacement from
                ``LATIN_ALPHABET``.

        Returns:
            A list of ``num_acronyms`` lowercase strings.

        Raises:
            ValueError: if ``strategy`` is not supported. ``random.randint``
                also raises ``ValueError`` when ``min_len > max_len``.
        """
        # Fail loudly on an unknown strategy instead of silently producing a
        # list of ``None`` values.
        if strategy != "random":
            raise ValueError(
                "Unsupported acronym generation strategy: {!r}".format(strategy)
            )

        alphabet = LatinToIndicAcronymTransliterator.LATIN_ALPHABET
        return [
            "".join(random.choices(alphabet, k=random.randint(min_len, max_len)))
            for _ in range(num_acronyms)
        ]