Spaces:
Running
Running
Yaron Koresh
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,6 @@ from bs4 import BeautifulSoup
|
|
3 |
from abc import ABC, abstractmethod
|
4 |
from pathlib import Path
|
5 |
from typing import List, Optional, Union
|
6 |
-
from langdetect import detect as get_language
|
7 |
from collections import namedtuple
|
8 |
from inspect import signature
|
9 |
import os
|
@@ -659,6 +658,7 @@ def all_pipes(pos,neg,artist,song):
|
|
659 |
|
660 |
return imgs
|
661 |
|
|
|
662 |
language_codes = {
|
663 |
"afrikaans": "af",
|
664 |
"albanian": "sq",
|
@@ -963,28 +963,121 @@ class BaseTranslator(ABC):
|
|
963 |
translated = self.translate(text, **kwargs)
|
964 |
arr.append(translated)
|
965 |
return arr
|
966 |
-
|
967 |
-
def translate(txt,to_lang="en",from_lang=False):
|
968 |
-
log(f'CALL translate')
|
969 |
-
if not from_lang:
|
970 |
-
from_lang = get_language(txt)
|
971 |
-
if(from_lang == to_lang):
|
972 |
-
log(f'RET translate with txt as {txt}')
|
973 |
-
return txt
|
974 |
|
|
|
|
|
|
|
|
|
975 |
|
976 |
-
|
977 |
-
|
978 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
979 |
translation = ""
|
980 |
-
if len(txt) >
|
981 |
words = txt.split()
|
982 |
while len(words) > 0:
|
983 |
chunk = ""
|
984 |
-
while len(words) > 0 and len(chunk) <
|
985 |
chunk = chunk + " " + words[0]
|
986 |
words = words[1:]
|
987 |
-
if len(chunk) >
|
988 |
_words = chunk.split()
|
989 |
words = [_words[-1], *words]
|
990 |
chunk = " ".join(_words[:-1])
|
|
|
3 |
from abc import ABC, abstractmethod
|
4 |
from pathlib import Path
|
5 |
from typing import List, Optional, Union
|
|
|
6 |
from collections import namedtuple
|
7 |
from inspect import signature
|
8 |
import os
|
|
|
658 |
|
659 |
return imgs
|
660 |
|
661 |
+
google_translate_endpoint = "https://translate.google.com/m"
|
662 |
language_codes = {
|
663 |
"afrikaans": "af",
|
664 |
"albanian": "sq",
|
|
|
963 |
translated = self.translate(text, **kwargs)
|
964 |
arr.append(translated)
|
965 |
return arr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
966 |
|
967 |
+
class GoogleTranslator(BaseTranslator):
    """
    class that wraps functions, which use Google Translate under the hood to translate text(s)
    """

    def __init__(
        self,
        source: str = "auto",
        target: str = "en",
        proxies: Optional[dict] = None,
        **kwargs
    ):
        """
        @param source: source language to translate from
        @param target: target language to translate to
        @param proxies: optional proxies mapping passed to requests.get on every request
        @param kwargs: extra keyword arguments forwarded unchanged to BaseTranslator.__init__
        """
        self.proxies = proxies
        super().__init__(
            base_url=google_translate_endpoint,
            source=source,
            target=target,
            element_tag="div",
            element_query={"class": "t0"},
            payload_key="q",  # key of text in the url
            **kwargs
        )

        # Fallback selector tried when no element matches the primary query.
        self._alt_element_query = {"class": "result-container"}

    def translate(self, text: str, **kwargs) -> str:
        """
        function to translate a text
        @param text: desired text to translate
        @param kwargs: additional args, forwarded to the retry call
        @return: str: translated text
        @raise TooManyRequests: when the endpoint answers with HTTP 429
        @raise RequestError: when request_failed() reports a failing status code
        @raise TranslationNotFound: when neither result selector matches the page
        """
        # NOTE(review): if is_input_valid() returns a falsy value instead of
        # raising, this method falls through and returns None - confirm
        # against the helper's definition.
        if is_input_valid(text, max_chars=1000):
            text = text.strip()
            if self._same_source_target() or is_empty(text):
                return text
            self._url_params["tl"] = self._target
            self._url_params["sl"] = self._source

            if self.payload_key:
                self._url_params[self.payload_key] = text

            # NOTE(review): no timeout is passed, so this call may block
            # indefinitely on an unresponsive endpoint - consider adding one.
            response = requests.get(
                self._base_url, params=self._url_params, proxies=self.proxies
            )
            try:
                if response.status_code == 429:
                    raise TooManyRequests()

                if request_failed(status_code=response.status_code):
                    raise RequestError()

                soup = BeautifulSoup(response.text, "html.parser")
            finally:
                # Close the response on the error paths as well as on success.
                response.close()

            element = soup.find(self._element_tag, self._element_query)
            if not element:
                element = soup.find(self._element_tag, self._alt_element_query)
                if not element:
                    raise TranslationNotFound(text)

            translated = element.get_text(strip=True)
            # `text` is already stripped, so it can be compared directly.
            if translated != text:
                return translated

            # The page echoed the input back unchanged. If the text contains
            # alphanumeric characters and an "hl" parameter is set, retry once
            # without it (the key is deleted, so the retry cannot loop).
            if any(ch.isalnum() for ch in text) and "hl" in self._url_params:
                del self._url_params["hl"]
                return self.translate(text, **kwargs)

            # Nothing left to retry: return the echoed text. This also covers
            # input without alphanumeric characters, which previously fell
            # through and returned None.
            return text

    def translate_file(self, path: str, **kwargs) -> str:
        """
        translate directly from file
        @param path: path to the target file
        @type path: str
        @param kwargs: additional args
        @return: str
        """
        return self._translate_file(path, **kwargs)

    def translate_batch(self, batch: List[str], **kwargs) -> List[str]:
        """
        translate a list of texts
        @param batch: list of texts you want to translate
        @return: list of translations
        """
        return self._translate_batch(batch, **kwargs)
|
1068 |
+
|
1069 |
+
def translate(txt,to_lang="en",from_lang="auto"):
|
1070 |
+
log(f'CALL translate')
|
1071 |
+
translator = GoogleTranslator(from_lang=from_lang,to_lang=to_lang)
|
1072 |
translation = ""
|
1073 |
+
if len(txt) > 1000:
|
1074 |
words = txt.split()
|
1075 |
while len(words) > 0:
|
1076 |
chunk = ""
|
1077 |
+
while len(words) > 0 and len(chunk) < 1000:
|
1078 |
chunk = chunk + " " + words[0]
|
1079 |
words = words[1:]
|
1080 |
+
if len(chunk) > 1000:
|
1081 |
_words = chunk.split()
|
1082 |
words = [_words[-1], *words]
|
1083 |
chunk = " ".join(_words[:-1])
|