Spaces:
Paused
Paused
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| # | |
| # LASER Language-Agnostic SEntence Representations | |
| # is a toolkit to calculate multilingual sentence embeddings | |
| # and to use them for document classification, bitext filtering | |
| # and mining | |
| # | |
| # -------------------------------------------------------- | |
| # | |
| # bash script to calculate sentence embeddings for arbitrary | |
| # text file | |
| ############################# | |
| # BEGIN PARAMETERS TO SET | |
| ############################# | |
| # location of models (e.g. /path/to/models); no trailing slash | |
| model_dir="laser" | |
| # version number for LASER3 models | |
| version=1 | |
| ############################# | |
| # END PARAMETERS TO SET | |
| ############################# | |
| if [ -z ${model_dir} ]; then | |
| echo "Please set model directory within script" | |
| exit 1 | |
| elif [ ! -d ${model_dir} ]; then | |
| echo "Can't find model directory: $model_dir" | |
| exit 1 | |
| fi | |
| if [ -z ${LASER} ] ; then | |
| echo "Please set the environment variable 'LASER'" | |
| exit 1 | |
| fi | |
| if [ $# -lt 2 ] ; then | |
| echo "usage: embed.sh input-file output-file [language]" | |
| exit 1 | |
| fi | |
| infile=$1 | |
| outfile=$2 | |
| language=$3 | |
| # default to laser2 | |
| model_file=${model_dir}/laser2.pt | |
| spm=${model_dir}/laser2.spm | |
| if [ ! -z ${language} ]; then | |
| model_file=${model_dir}/laser3-$language.v$version.pt | |
| lang_specific_spm=${model_dir}/laser3-$language.v$version.spm | |
| if [[ -s $lang_specific_spm ]]; then | |
| spm=$lang_specific_spm | |
| fi | |
| fi | |
| if [[ ! -s $model_file ]]; then | |
| echo "couldn't find model file: $model_file" | |
| exit 1 | |
| fi | |
| if [[ ! -s $spm ]]; then | |
| echo "couldn't find spm: $spm" | |
| exit 1 | |
| fi | |
| python3 ${LASER}/source/embed.py \ | |
| --input ${infile} \ | |
| --encoder ${model_file} \ | |
| --spm-model $spm \ | |
| --output ${outfile} \ | |
| --verbose | |