Yaron Koresh committed on
Commit
89fc06b
·
verified ·
1 Parent(s): 39f7b3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +311 -58
app.py CHANGED
@@ -1,4 +1,8 @@
1
-
 
 
 
 
2
  from langdetect import detect as get_language
3
  from collections import namedtuple
4
  from inspect import signature
@@ -38,7 +42,6 @@ from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_upscaler i
38
  UpscalerCheckpoints,
39
  )
40
  from datetime import datetime
41
- from translate import Translator
42
 
43
  model = T5ForConditionalGeneration.from_pretrained("t5-large")
44
  tokenizer = T5Tokenizer.from_pretrained("t5-large")
@@ -657,63 +660,310 @@ def all_pipes(pos,neg,artist,song):
657
  return imgs
658
 
659
  language_codes = {
660
- "af": "Afrikaans",
661
- "ar": "Arabic",
662
- "bg": "Bulgarian",
663
- "bn": "Bengali",
664
- "ca": "Catalan",
665
- "cs": "Czech",
666
- "cy": "Welsh",
667
- "da": "Danish",
668
- "de": "German",
669
- "el": "Greek",
670
- "en": "English",
671
- "es": "Spanish",
672
- "et": "Estonian",
673
- "fa": "Persian (Farsi)",
674
- "fi": "Finnish",
675
- "fr": "French",
676
- "gu": "Gujarati",
677
- "he": "Hebrew",
678
- "hi": "Hindi",
679
- "hr": "Croatian",
680
- "hu": "Hungarian",
681
- "id": "Indonesian",
682
- "it": "Italian",
683
- "ja": "Japanese",
684
- "kn": "Kannada",
685
- "ko": "Korean",
686
- "lt": "Lithuanian",
687
- "lv": "Latvian",
688
- "mk": "Macedonian",
689
- "ml": "Malayalam",
690
- "mr": "Marathi",
691
- "ne": "Nepali",
692
- "nl": "Dutch",
693
- "no": "Norwegian",
694
- "pa": "Punjabi",
695
- "pl": "Polish",
696
- "pt": "Portuguese",
697
- "ro": "Romanian",
698
- "ru": "Russian",
699
- "sk": "Slovak",
700
- "sl": "Slovenian",
701
- "so": "Somali",
702
- "sq": "Albanian",
703
- "sv": "Swedish",
704
- "sw": "Swahili",
705
- "ta": "Tamil",
706
- "te": "Telugu",
707
- "th": "Thai",
708
- "tl": "Tagalog (Filipino)",
709
- "tr": "Turkish",
710
- "uk": "Ukrainian",
711
- "ur": "Urdu",
712
- "vi": "Vietnamese",
713
- "zh-cn": "Chinese (Simplified)",
714
- "zh-tw": "Chinese (Traditional)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  }
716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  def translate(txt,to_lang="en",from_lang=False):
718
  log(f'CALL translate')
719
  if not from_lang:
@@ -721,7 +971,10 @@ def translate(txt,to_lang="en",from_lang=False):
721
  if(from_lang == to_lang):
722
  log(f'RET translate with txt as {txt}')
723
  return txt
724
-
 
 
 
725
  translator = Translator(from_lang=from_lang,to_lang=to_lang)
726
  translation = ""
727
  if len(txt) > 490:
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import List, Optional, Union
6
  from langdetect import detect as get_language
7
  from collections import namedtuple
8
  from inspect import signature
 
42
  UpscalerCheckpoints,
43
  )
44
  from datetime import datetime
 
45
 
46
  model = T5ForConditionalGeneration.from_pretrained("t5-large")
47
  tokenizer = T5Tokenizer.from_pretrained("t5-large")
 
660
  return imgs
661
 
662
# Lowercase language name -> language code lookup. It is the default
# ``languages`` table of BaseTranslator below.
language_codes = {
    "afrikaans": "af",
    "albanian": "sq",
    "amharic": "am",
    "arabic": "ar",
    "armenian": "hy",
    "assamese": "as",
    "aymara": "ay",
    "azerbaijani": "az",
    "bambara": "bm",
    "basque": "eu",
    "belarusian": "be",
    "bengali": "bn",
    "bhojpuri": "bho",
    "bosnian": "bs",
    "bulgarian": "bg",
    "catalan": "ca",
    "cebuano": "ceb",
    "chichewa": "ny",
    "chinese (simplified)": "zh-CN",
    "chinese (traditional)": "zh-TW",
    "corsican": "co",
    "croatian": "hr",
    "czech": "cs",
    "danish": "da",
    "dhivehi": "dv",
    "dogri": "doi",
    "dutch": "nl",
    "english": "en",
    "esperanto": "eo",
    "estonian": "et",
    "ewe": "ee",
    "filipino": "tl",
    "finnish": "fi",
    "french": "fr",
    "frisian": "fy",
    "galician": "gl",
    "georgian": "ka",
    "german": "de",
    "greek": "el",
    "guarani": "gn",
    "gujarati": "gu",
    "haitian creole": "ht",
    "hausa": "ha",
    "hawaiian": "haw",
    "hebrew": "iw",
    "hindi": "hi",
    "hmong": "hmn",
    "hungarian": "hu",
    "icelandic": "is",
    "igbo": "ig",
    "ilocano": "ilo",
    "indonesian": "id",
    "irish": "ga",
    "italian": "it",
    "japanese": "ja",
    "javanese": "jw",
    "kannada": "kn",
    "kazakh": "kk",
    "khmer": "km",
    "kinyarwanda": "rw",
    "konkani": "gom",
    "korean": "ko",
    "krio": "kri",
    "kurdish (kurmanji)": "ku",
    "kurdish (sorani)": "ckb",
    "kyrgyz": "ky",
    "lao": "lo",
    "latin": "la",
    "latvian": "lv",
    "lingala": "ln",
    "lithuanian": "lt",
    "luganda": "lg",
    "luxembourgish": "lb",
    "macedonian": "mk",
    "maithili": "mai",
    "malagasy": "mg",
    "malay": "ms",
    "malayalam": "ml",
    "maltese": "mt",
    "maori": "mi",
    "marathi": "mr",
    "meiteilon (manipuri)": "mni-Mtei",
    "mizo": "lus",
    "mongolian": "mn",
    "myanmar": "my",
    "nepali": "ne",
    "norwegian": "no",
    "odia (oriya)": "or",
    "oromo": "om",
    "pashto": "ps",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "punjabi": "pa",
    "quechua": "qu",
    "romanian": "ro",
    "russian": "ru",
    "samoan": "sm",
    "sanskrit": "sa",
    "scots gaelic": "gd",
    "sepedi": "nso",
    "serbian": "sr",
    "sesotho": "st",
    "shona": "sn",
    "sindhi": "sd",
    "sinhala": "si",
    "slovak": "sk",
    "slovenian": "sl",
    "somali": "so",
    "spanish": "es",
    "sundanese": "su",
    "swahili": "sw",
    "swedish": "sv",
    "tajik": "tg",
    "tamil": "ta",
    "tatar": "tt",
    "telugu": "te",
    "thai": "th",
    "tigrinya": "ti",
    "tsonga": "ts",
    "turkish": "tr",
    "turkmen": "tk",
    "twi": "ak",
    "ukrainian": "uk",
    "urdu": "ur",
    "uyghur": "ug",
    "uzbek": "uz",
    "vietnamese": "vi",
    "welsh": "cy",
    "xhosa": "xh",
    "yiddish": "yi",
    "yoruba": "yo",
    "zulu": "zu",
}


class BaseTranslator(ABC):
    """
    Abstract class that serves as a base translator for other translators.

    Subclasses must implement ``translate``; the file and batch helpers in
    this class delegate to it.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        languages: dict = language_codes,
        source: str = "auto",
        target: str = "en",
        payload_key: Optional[str] = None,
        element_tag: Optional[str] = None,
        element_query: Optional[dict] = None,
        **url_params,
    ):
        """
        @param base_url: stored as-is on the instance for subclasses to use
        @param languages: mapping of language name -> language code
        @param source: source language to translate from (name, code or "auto")
        @param target: target language to translate to (name, code or "auto")
        @param payload_key: stored as-is on the instance
        @param element_tag: stored as-is on the instance
        @param element_query: stored as-is on the instance
        @param url_params: extra keyword arguments, stored as a dict
        """
        self._base_url = base_url
        self._languages = languages
        self._supported_languages = list(self._languages)
        # NOTE(review): InvalidSourceOrTargetLanguage and
        # LanguageNotSupportedException are not defined in this block; they
        # are presumably provided elsewhere in the module - confirm.
        if not source:
            raise InvalidSourceOrTargetLanguage(source)
        if not target:
            raise InvalidSourceOrTargetLanguage(target)

        # _map_language_to_code is a generator; unpacking consumes it here, so
        # an unsupported language raises during construction.
        self._source, self._target = self._map_language_to_code(source, target)
        self._url_params = url_params
        self._element_tag = element_tag
        self._element_query = element_query
        self.payload_key = payload_key
        super().__init__()

    @property
    def source(self):
        """Source language as stored after mapping (a code or "auto")."""
        return self._source

    @source.setter
    def source(self, lang):
        # Stored verbatim: no name -> code mapping is applied on assignment.
        self._source = lang

    @property
    def target(self):
        """Target language as stored after mapping (a code or "auto")."""
        return self._target

    @target.setter
    def target(self, lang):
        # Stored verbatim: no name -> code mapping is applied on assignment.
        self._target = lang

    def _type(self):
        """Return the name of the concrete translator class."""
        return self.__class__.__name__

    def _map_language_to_code(self, *languages):
        """
        map language to its corresponding code (abbreviation) if the language was
        passed by its full name by the user
        @param languages: list of languages
        @return: generator yielding the mapped value of each language; raises
        an exception if a language is not supported
        """
        for language in languages:
            # Values that are already a known code, or "auto", pass through.
            if language in self._languages.values() or language == "auto":
                yield language
            elif language in self._languages:
                yield self._languages[language]
            else:
                raise LanguageNotSupportedException(
                    language,
                    message="No support for the provided language.\n"
                    "Please select one of the supported languages:\n"
                    f"{self._languages}",
                )

    def _same_source_target(self) -> bool:
        """Return True when source and target are the same language."""
        return self._source == self._target

    def get_supported_languages(
        self, as_dict: bool = False, **kwargs
    ) -> Union[list, dict]:
        """
        return the languages supported by this translator
        @param as_dict: if True, the languages will be returned as a dictionary
        mapping languages to their abbreviations
        @return: list or dict
        """
        return self._languages if as_dict else self._supported_languages

    def is_language_supported(self, language: str, **kwargs) -> bool:
        """
        check if the language is supported by the translator
        @param language: a string for 1 language (name, code or "auto")
        @return: bool
        """
        return (
            language == "auto"
            or language in self._languages
            or language in self._languages.values()
        )

    @abstractmethod
    def translate(self, text: str, **kwargs) -> str:
        """
        translate a text using a translator under the hood and return
        the translated text
        @param text: text to translate
        @param kwargs: additional arguments
        @return: str
        """
        # NotImplemented is a non-callable constant; raising the exception is
        # the correct way to signal a missing override.
        raise NotImplementedError("You need to implement the translate method!")

    def _read_docx(self, f: str):
        """Return the text that docx2txt extracts from the .docx file at *f*."""
        # Imported lazily so the dependency is only needed for .docx input.
        import docx2txt

        return docx2txt.process(f)

    def _read_pdf(self, f: str):
        """Return the extracted text of the first page of the PDF at *f*."""
        # Imported lazily so the dependency is only needed for .pdf input.
        import pypdf

        reader = pypdf.PdfReader(f)
        # Only the first page is read; later pages are not translated.
        page = reader.pages[0]
        return page.extract_text()

    def _translate_file(self, path: Union[str, Path], **kwargs) -> str:
        """
        translate directly from file
        @param path: path to the target file
        @type path: str or Path
        @param kwargs: additional args
        @return: str
        @raise FileNotFoundError: if the path does not exist
        """
        path = Path(path)

        if not path.exists():
            # Raise instead of exiting: terminating the interpreter from a
            # helper would take the whole hosting application down.
            raise FileNotFoundError(f"Path to the file is wrong: {path}")

        ext = path.suffix

        if ext == ".docx":
            text = self._read_docx(f=str(path))
        elif ext == ".pdf":
            text = self._read_pdf(f=str(path))
        else:
            # Anything else is treated as UTF-8 plain text.
            text = path.read_text(encoding="utf-8").strip()

        return self.translate(text)

    def _translate_batch(self, batch: List[str], **kwargs) -> List[str]:
        """
        translate a list of texts
        @param batch: list of texts you want to translate
        @param kwargs: forwarded to every translate call
        @return: list of translations, in input order
        """
        if not batch:
            raise Exception("Enter your text list that you want to translate")
        return [self.translate(text, **kwargs) for text in batch]
966
+
967
  def translate(txt,to_lang="en",from_lang=False):
968
  log(f'CALL translate')
969
  if not from_lang:
 
971
  if(from_lang == to_lang):
972
  log(f'RET translate with txt as {txt}')
973
  return txt
974
+
975
+
976
+ translator_endpoint = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0"
977
+
978
  translator = Translator(from_lang=from_lang,to_lang=to_lang)
979
  translation = ""
980
  if len(txt) > 490: