Spaces:
Paused
Paused
| #!/usr/bin/python | |
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| # | |
| # LASER Language-Agnostic SEntence Representations | |
| # is a toolkit to calculate multilingual sentence embeddings | |
| # and to use them for document classification, bitext filtering | |
| # and mining | |
| # | |
| # -------------------------------------------------------- | |
| # | |
| # Romanize and lower case text | |
| import os | |
| import sys | |
| import argparse | |
| from transliterate import translit, get_available_language_codes | |
# Command-line driver: read text line by line, optionally transliterate each
# line into Roman characters, lower-case it unless --preserve-case is given,
# and write the result to the output stream.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Romanize and lower case text")
parser.add_argument(
    '--input', '-i', type=argparse.FileType('r', encoding='UTF-8'),
    default=sys.stdin,
    metavar='PATH',
    help="Input text file (default: standard input).")
parser.add_argument(
    '--output', '-o', type=argparse.FileType('w', encoding='UTF-8'),
    default=sys.stdout,
    metavar='PATH',
    help="Output text file (default: standard output).")
parser.add_argument(
    '--language', '-l', type=str,
    metavar='STR', default="none",
    help="perform transliteration into Roman characters"
         " from the specified language (default none)")
parser.add_argument(
    '--preserve-case', '-C', action='store_true',
    help="Preserve case of input texts (default is all lower case)")
args = parser.parse_args()

# "none" is the sentinel value that disables transliteration entirely.
romanize = args.language != "none"

# Reject an unknown language code before any input is consumed, instead of
# failing with a traceback from translit() on the first line.
if romanize:
    known_codes = get_available_language_codes()
    if args.language not in known_codes:
        parser.error(
            "unknown transliteration language '{}' (available: {})".format(
                args.language, ", ".join(sorted(known_codes))))

for line in args.input:
    if romanize:
        # reversed=True converts from the language's script to Roman letters.
        line = translit(line, args.language, reversed=True)
    if not args.preserve_case:
        line = line.lower()
    # Lines keep their own trailing newline, so write them back verbatim.
    args.output.write(line)