Hamza-cpp committed on
Commit
e69a4b4
·
1 Parent(s): bd2c536

i hope it will work

Browse files
Files changed (4) hide show
  1. Dockerfile +14 -0
  2. app.py +146 -0
  3. flores200_codes.py +211 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ WORKDIR /app
4
+
5
+ COPY ./requirements.txt .
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
8
+
9
+ COPY . .
10
+
11
+ EXPOSE 5000
12
+
13
+ CMD ["python","app.py"]
14
+
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import torch
3
+ # import gradio as gr
4
+ # import time
5
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ # from flores200_codes import flores_codes
7
+
8
+
9
+ # def load_models():
10
+ # # build model and tokenizer
11
+ # model_name_dict = {'nllb-distilled-600M': 'facebook/nllb-200-distilled-600M',
12
+ # #'nllb-1.3B': 'facebook/nllb-200-1.3B',
13
+ # #'nllb-distilled-1.3B': 'facebook/nllb-200-distilled-1.3B',
14
+ # #'nllb-3.3B': 'facebook/nllb-200-3.3B',
15
+ # }
16
+
17
+ # model_dict = {}
18
+
19
+ # for call_name, real_name in model_name_dict.items():
20
+ # print('\tLoading model: %s' % call_name)
21
+ # model = AutoModelForSeq2SeqLM.from_pretrained(real_name)
22
+ # tokenizer = AutoTokenizer.from_pretrained(real_name)
23
+ # model_dict[call_name+'_model'] = model
24
+ # model_dict[call_name+'_tokenizer'] = tokenizer
25
+
26
+ # return model_dict
27
+
28
+
29
+ # def translation(source, target, text):
30
+ # if len(model_dict) == 2:
31
+ # model_name = 'nllb-distilled-600M'
32
+
33
+ # start_time = time.time()
34
+ # source = flores_codes[source]
35
+ # target = flores_codes[target]
36
+
37
+ # model = model_dict[model_name + '_model']
38
+ # tokenizer = model_dict[model_name + '_tokenizer']
39
+
40
+ # translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=source, tgt_lang=target)
41
+ # output = translator(text, max_length=400)
42
+
43
+ # end_time = time.time()
44
+
45
+ # output = output[0]['translation_text']
46
+ # result = {'inference_time': end_time - start_time,
47
+ # 'source': source,
48
+ # 'target': target,
49
+ # 'result': output}
50
+ # return result
51
+
52
+
53
+ # if __name__ == '__main__':
54
+ # print('\tinit models')
55
+
56
+ # global model_dict
57
+
58
+ # model_dict = load_models()
59
+
60
+ # # define gradio demo
61
+ # lang_codes = list(flores_codes.keys())
62
+ # #inputs = [gr.inputs.Radio(['nllb-distilled-600M', 'nllb-1.3B', 'nllb-distilled-1.3B'], label='NLLB Model'),
63
+ # inputs = [gr.inputs.Dropdown(lang_codes, default='English', label='Source'),
64
+ # gr.inputs.Dropdown(lang_codes, default='Korean', label='Target'),
65
+ # gr.inputs.Textbox(lines=5, label="Input text"),
66
+ # ]
67
+
68
+ # outputs = gr.outputs.JSON()
69
+
70
+ # title = "NLLB distilled 600M demo"
71
+
72
+ # demo_status = "Demo is running on CPU"
73
+ # description = f"Details: https://github.com/facebookresearch/fairseq/tree/nllb. {demo_status}"
74
+ # examples = [
75
+ # ['English', 'Korean', 'Hi. nice to meet you']
76
+ # ]
77
+
78
+ # gr.Interface(translation,
79
+ # inputs,
80
+ # outputs,
81
+ # title=title,
82
+ # description=description,
83
+ # ).launch()
84
+
85
+
86
+ import os
87
+ import time
88
+ from flask import Flask, request, jsonify
89
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
90
+ from flores200_codes import flores_codes
91
+
92
+ app = Flask(__name__)
93
+
94
+ def load_models():
95
+ model_name_dict = {'nllb-distilled-600M': 'facebook/nllb-200-distilled-600M'}
96
+ model_dict = {}
97
+
98
+ for call_name, real_name in model_name_dict.items():
99
+ print(f'\tLoading model: {call_name}')
100
+ model = AutoModelForSeq2SeqLM.from_pretrained(real_name)
101
+ tokenizer = AutoTokenizer.from_pretrained(real_name)
102
+ model_dict[call_name + '_model'] = model
103
+ model_dict[call_name + '_tokenizer'] = tokenizer
104
+
105
+ return model_dict
106
+
107
+ global model_dict
108
+ model_dict = load_models()
109
+
110
+ @app.route('/api/translate', methods=['POST'])
111
+ def translate_text():
112
+ data = request.json
113
+ source_lang = data.get('source')
114
+ target_lang = data.get('target')
115
+ input_text = data.get('text')
116
+
117
+ if not source_lang or not target_lang or not input_text:
118
+ return jsonify({"error": "source, target, and text fields are required"}), 400
119
+
120
+ model_name = 'nllb-distilled-600M'
121
+ start_time = time.time()
122
+ source = flores_codes.get(source_lang)
123
+ target = flores_codes.get(target_lang)
124
+
125
+ if not source or not target:
126
+ return jsonify({"error": "Invalid source or target language code"}), 400
127
+
128
+ model = model_dict[model_name + '_model']
129
+ tokenizer = model_dict[model_name + '_tokenizer']
130
+
131
+ translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=source, tgt_lang=target)
132
+ output = translator(input_text, max_length=400)
133
+
134
+ end_time = time.time()
135
+ output_text = output[0]['translation_text']
136
+
137
+ result = {
138
+ 'inference_time': end_time - start_time,
139
+ 'source': source_lang,
140
+ 'target': target_lang,
141
+ 'result': output_text
142
+ }
143
+ return jsonify(result)
144
+
145
+ if __name__ == '__main__':
146
+ app.run(debug=True)
flores200_codes.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ codes_as_string = '''Acehnese (Arabic scrip) ace_Arab
2
+ Acehnese (Latin script) ace_Latn
3
+ Mesopotamian Arabic acm_Arab
4
+ Ta’izzi-Adeni Arabic acq_Arab
5
+ Tunisian Arabic aeb_Arab
6
+ Afrikaans afr_Latn
7
+ South Levantine Arabic ajp_Arab
8
+ Akan aka_Latn
9
+ Amharic amh_Ethi
10
+ North Levantine Arabic apc_Arab
11
+ Modern Standard Arabic arb_Arab
12
+ Modern Standard Arabic (Romanized) arb_Latn
13
+ Najdi Arabic ars_Arab
14
+ Moroccan Arabic ary_Arab
15
+ Egyptian Arabic arz_Arab
16
+ Assamese asm_Beng
17
+ Asturian ast_Latn
18
+ Awadhi awa_Deva
19
+ Central Aymara ayr_Latn
20
+ South Azerbaijani azb_Arab
21
+ North Azerbaijani azj_Latn
22
+ Bashkir bak_Cyrl
23
+ Bambara bam_Latn
24
+ Balinese ban_Latn
25
+ Belarusian bel_Cyrl
26
+ Bemba bem_Latn
27
+ Bengali ben_Beng
28
+ Bhojpuri bho_Deva
29
+ Banjar (Arabic script) bjn_Arab
30
+ Banjar (Latin script) bjn_Latn
31
+ Standard Tibetan bod_Tibt
32
+ Bosnian bos_Latn
33
+ Buginese bug_Latn
34
+ Bulgarian bul_Cyrl
35
+ Catalan cat_Latn
36
+ Cebuano ceb_Latn
37
+ Czech ces_Latn
38
+ Chokwe cjk_Latn
39
+ Central Kurdish ckb_Arab
40
+ Crimean Tatar crh_Latn
41
+ Welsh cym_Latn
42
+ Danish dan_Latn
43
+ German deu_Latn
44
+ Southwestern Dinka dik_Latn
45
+ Dyula dyu_Latn
46
+ Dzongkha dzo_Tibt
47
+ Greek ell_Grek
48
+ English eng_Latn
49
+ Esperanto epo_Latn
50
+ Estonian est_Latn
51
+ Basque eus_Latn
52
+ Ewe ewe_Latn
53
+ Faroese fao_Latn
54
+ Fijian fij_Latn
55
+ Finnish fin_Latn
56
+ Fon fon_Latn
57
+ French fra_Latn
58
+ Friulian fur_Latn
59
+ Nigerian Fulfulde fuv_Latn
60
+ Scottish Gaelic gla_Latn
61
+ Irish gle_Latn
62
+ Galician glg_Latn
63
+ Guarani grn_Latn
64
+ Gujarati guj_Gujr
65
+ Haitian Creole hat_Latn
66
+ Hausa hau_Latn
67
+ Hebrew heb_Hebr
68
+ Hindi hin_Deva
69
+ Chhattisgarhi hne_Deva
70
+ Croatian hrv_Latn
71
+ Hungarian hun_Latn
72
+ Armenian hye_Armn
73
+ Igbo ibo_Latn
74
+ Ilocano ilo_Latn
75
+ Indonesian ind_Latn
76
+ Icelandic isl_Latn
77
+ Italian ita_Latn
78
+ Javanese jav_Latn
79
+ Japanese jpn_Jpan
80
+ Kabyle kab_Latn
81
+ Jingpho kac_Latn
82
+ Kamba kam_Latn
83
+ Kannada kan_Knda
84
+ Kashmiri (Arabic script) kas_Arab
85
+ Kashmiri (Devanagari script) kas_Deva
86
+ Georgian kat_Geor
87
+ Central Kanuri (Arabic script) knc_Arab
88
+ Central Kanuri (Latin script) knc_Latn
89
+ Kazakh kaz_Cyrl
90
+ Kabiyè kbp_Latn
91
+ Kabuverdianu kea_Latn
92
+ Khmer khm_Khmr
93
+ Kikuyu kik_Latn
94
+ Kinyarwanda kin_Latn
95
+ Kyrgyz kir_Cyrl
96
+ Kimbundu kmb_Latn
97
+ Northern Kurdish kmr_Latn
98
+ Kikongo kon_Latn
99
+ Korean kor_Hang
100
+ Lao lao_Laoo
101
+ Ligurian lij_Latn
102
+ Limburgish lim_Latn
103
+ Lingala lin_Latn
104
+ Lithuanian lit_Latn
105
+ Lombard lmo_Latn
106
+ Latgalian ltg_Latn
107
+ Luxembourgish ltz_Latn
108
+ Luba-Kasai lua_Latn
109
+ Ganda lug_Latn
110
+ Luo luo_Latn
111
+ Mizo lus_Latn
112
+ Standard Latvian lvs_Latn
113
+ Magahi mag_Deva
114
+ Maithili mai_Deva
115
+ Malayalam mal_Mlym
116
+ Marathi mar_Deva
117
+ Minangkabau (Arabic script) min_Arab
118
+ Minangkabau (Latin script) min_Latn
119
+ Macedonian mkd_Cyrl
120
+ Plateau Malagasy plt_Latn
121
+ Maltese mlt_Latn
122
+ Meitei (Bengali script) mni_Beng
123
+ Halh Mongolian khk_Cyrl
124
+ Mossi mos_Latn
125
+ Maori mri_Latn
126
+ Burmese mya_Mymr
127
+ Dutch nld_Latn
128
+ Norwegian Nynorsk nno_Latn
129
+ Norwegian Bokmål nob_Latn
130
+ Nepali npi_Deva
131
+ Northern Sotho nso_Latn
132
+ Nuer nus_Latn
133
+ Nyanja nya_Latn
134
+ Occitan oci_Latn
135
+ West Central Oromo gaz_Latn
136
+ Odia ory_Orya
137
+ Pangasinan pag_Latn
138
+ Eastern Panjabi pan_Guru
139
+ Papiamento pap_Latn
140
+ Western Persian pes_Arab
141
+ Polish pol_Latn
142
+ Portuguese por_Latn
143
+ Dari prs_Arab
144
+ Southern Pashto pbt_Arab
145
+ Ayacucho Quechua quy_Latn
146
+ Romanian ron_Latn
147
+ Rundi run_Latn
148
+ Russian rus_Cyrl
149
+ Sango sag_Latn
150
+ Sanskrit san_Deva
151
+ Santali sat_Olck
152
+ Sicilian scn_Latn
153
+ Shan shn_Mymr
154
+ Sinhala sin_Sinh
155
+ Slovak slk_Latn
156
+ Slovenian slv_Latn
157
+ Samoan smo_Latn
158
+ Shona sna_Latn
159
+ Sindhi snd_Arab
160
+ Somali som_Latn
161
+ Southern Sotho sot_Latn
162
+ Spanish spa_Latn
163
+ Tosk Albanian als_Latn
164
+ Sardinian srd_Latn
165
+ Serbian srp_Cyrl
166
+ Swati ssw_Latn
167
+ Sundanese sun_Latn
168
+ Swedish swe_Latn
169
+ Swahili swh_Latn
170
+ Silesian szl_Latn
171
+ Tamil tam_Taml
172
+ Tatar tat_Cyrl
173
+ Telugu tel_Telu
174
+ Tajik tgk_Cyrl
175
+ Tagalog tgl_Latn
176
+ Thai tha_Thai
177
+ Tigrinya tir_Ethi
178
+ Tamasheq (Latin script) taq_Latn
179
+ Tamasheq (Tifinagh script) taq_Tfng
180
+ Tok Pisin tpi_Latn
181
+ Tswana tsn_Latn
182
+ Tsonga tso_Latn
183
+ Turkmen tuk_Latn
184
+ Tumbuka tum_Latn
185
+ Turkish tur_Latn
186
+ Twi twi_Latn
187
+ Central Atlas Tamazight tzm_Tfng
188
+ Uyghur uig_Arab
189
+ Ukrainian ukr_Cyrl
190
+ Umbundu umb_Latn
191
+ Urdu urd_Arab
192
+ Northern Uzbek uzn_Latn
193
+ Venetian vec_Latn
194
+ Vietnamese vie_Latn
195
+ Waray war_Latn
196
+ Wolof wol_Latn
197
+ Xhosa xho_Latn
198
+ Eastern Yiddish ydd_Hebr
199
+ Yoruba yor_Latn
200
+ Yue Chinese yue_Hant
201
+ Chinese (Simplified) zho_Hans
202
+ Chinese (Traditional) zho_Hant
203
+ Standard Malay zsm_Latn
204
+ Zulu zul_Latn'''
205
+
206
+ codes_as_string = codes_as_string.split('\n')
207
+
208
+ flores_codes = {}
209
+ for code in codes_as_string:
210
+ lang, lang_code = code.split('\t')
211
+ flores_codes[lang] = lang_code
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ httpx==0.24.1
2
+ flask
3
+ transformers
4
+ torch