Spaces:
Runtime error
Runtime error
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| # set -x -e | |
# Refuse to run without a working directory root: every path below is
# derived from it. The expansion is quoted so that an empty value or a value
# containing whitespace is tested correctly, and the script exits non-zero so
# callers can detect the failure.
if [ -z "${WORKDIR_ROOT:-}" ]; then
    echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
    exit 1
fi

# put intermediate files
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
# output {train,valid,test} files to dest
DEST=${WORKDIR_ROOT}/ML50/raw
ROOT=${WORKDIR_ROOT}
UTILS=$PWD/utils
TMX2CORPUS="${UTILS}/tmx2corpus"
# Command string (interpreter + script); it is expanded unquoted at the call
# site so that it splits into separate words.
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"

# Quoted so that a WORKDIR_ROOT / PWD containing spaces still works.
mkdir -p "$TMP_DIR"
mkdir -p "$DEST"
mkdir -p "$UTILS"
# Download one OPUS TMX archive and convert it into a pair of plain-text
# bitext files placed in the current directory.
#
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - subset name (used as the file-name prefix)
#   $4 - URL of the .tmx.gz archive; when empty, the caller's global $url is
#        used instead (kept for backward compatibility with callers that
#        relied on that global)
# Reads:   TMX_TOOL (converter command; expected to write bitext.<lang> files)
# Outputs: <subset>.<src>-<tgt>.<src> and <subset>.<src>-<tgt>.<tgt>
# Returns: 0 on success (or when the archive is already present),
#          1 when no URL is known or any step fails.
function download_opus(){
    local src=$1
    local tgt=$2
    local subset=$3
    # The argument is expanded before `local` creates the variable, so the
    # fallback still sees the caller's $url.
    local link=${4:-${url:-}}
    local stem="$subset.$src-$tgt"
    local workdir="extract_$stem"
    local status=0

    if [ -z "$link" ]; then
        echo "download_opus: no URL given for $stem" >&2
        return 1
    fi

    # -p keeps reruns from failing on an existing directory.
    mkdir -p "$workdir"
    # Bail out before doing any work if we cannot enter the directory;
    # otherwise the popd below would pop the caller's directory instead.
    pushd "$workdir" || return 1
    if [ ! -f "$stem.tmx.gz" ]; then
        if wget "$link" -O "$stem.tmx.gz"; then
            # $TMX_TOOL is intentionally unquoted: it holds "python <script>".
            gzip -d "$stem.tmx.gz" \
                && $TMX_TOOL "$stem.tmx" \
                && mv "bitext.$src" "../$stem.$src" \
                && mv "bitext.$tgt" "../$stem.$tgt" \
                || status=1
        else
            # wget -O leaves an empty/partial file behind on failure, which
            # would make the next run skip this subset.
            rm -f "$stem.tmx.gz"
            status=1
        fi
    fi
    popd
    return $status
}
# Concatenate the per-subset bitext files of one language pair into
# raw_train.<src>-<tgt>.<src> and raw_train.<src>-<tgt>.<tgt> in the current
# directory. Both outputs are truncated first.
#
# Arguments:
#   $1    - source language code
#   $2    - target language code
#   $3... - subset names, given either as separate arguments or as a single
#           whitespace-separated string (both forms may be mixed)
# Returns: 1 when fewer than two arguments are given.
function concat_subsets(){
    if [ "$#" -lt 2 ]; then
        echo "usage: concat_subsets SRC TGT [SUBSET...]" >&2
        return 1
    fi
    local src=$1
    local tgt=$2
    shift 2
    # Join every remaining argument into one string; the loop below splits it
    # on whitespace again. Reading only $3 here would silently drop all but
    # the first subset when the caller passes an expanded array.
    local subsets="$*"
    local src_train="raw_train.$src-$tgt.$src"
    local tgt_train="raw_train.$src-$tgt.$tgt"
    local subset

    # Start from empty files so a rerun does not append to old output.
    : > "$src_train"
    : > "$tgt_train"
    for subset in $subsets; do
        cat "$subset.$src-$tgt.$src" >> "$src_train"
        cat "$subset.$src-$tgt.$tgt" >> "$tgt_train"
    done
}
| function get_seeded_random() | |
| { | |
| seed="$1" | |
| openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ | |
| </dev/zero 2>/dev/null | |
| } | |
# Shuffle the raw training bitext of one language pair and split it into a
# validation set (first 1500 shuffled lines) and a training set (the rest).
#
# Both sides are shuffled with the same seeded random source, which keeps the
# sentence pairs aligned only when both files have the same number of lines;
# that is therefore checked first.
#
# Arguments:
#   $1 - source language code
#   $2 - target language code
# Reads:   raw_train.<src>-<tgt>.<src|tgt> in the current directory
# Writes:  shuffled.*, valid.* and train.* files for the pair
# Calls:   get_seeded_random
# Returns: 1 when an input cannot be read or the line counts differ.
function split_train_valid(){
    local src=$1
    local tgt=$2
    local pair="$src-$tgt"
    local raw_src_train="raw_train.$pair.$src"
    local raw_tgt_train="raw_train.$pair.$tgt"
    local n_src n_tgt

    n_src=$(wc -l < "$raw_src_train") || return 1
    n_tgt=$(wc -l < "$raw_tgt_train") || return 1
    if [ "$n_src" -ne "$n_tgt" ]; then
        echo "split_train_valid: $raw_src_train and $raw_tgt_train differ in line count ($n_src vs $n_tgt)" >&2
        return 1
    fi

    shuf --random-source=<(get_seeded_random 43) "$raw_src_train" > "shuffled.$pair.$src"
    shuf --random-source=<(get_seeded_random 43) "$raw_tgt_train" > "shuffled.$pair.$tgt"

    head -n 1500 "shuffled.$pair.$src" > "valid.$pair.$src"
    head -n 1500 "shuffled.$pair.$tgt" > "valid.$pair.$tgt"
    # "-n +1501" (start at line 1501) is the portable spelling; the bare
    # "+1501" form is obsolete and may be taken for a file name.
    tail -n +1501 "shuffled.$pair.$src" > "train.$pair.$src"
    tail -n +1501 "shuffled.$pair.$tgt" > "train.$pair.$tgt"
}
# Copy the valid/train files of one language pair from the current directory
# into $DEST, renaming them from the short language codes to the long codes.
#
# Arguments:
#   $1 - long source code; its first two characters are the short code used
#        in the local file names (e.g. xh_ZA -> xh)
#   $2 - long target code, treated the same way (e.g. en_XX -> en)
# Reads:   DEST (destination directory)
# Returns: 0 when all four copies succeed, 1 otherwise (every copy is still
#          attempted).
function copy2dst(){
    local lsrc=$1
    local ltgt=$2
    local src=${lsrc:0:2}
    local tgt=${ltgt:0:2}
    local split
    local status=0

    for split in valid train; do
        # Quoted so that a DEST containing whitespace is one argument.
        cp "$split.$src-$tgt.$src" "$DEST/$split.$lsrc-$ltgt.$lsrc" || status=1
        cp "$split.$src-$tgt.$tgt" "$DEST/$split.$lsrc-$ltgt.$ltgt" || status=1
    done
    return $status
}
#for xh-en
# Subset name -> URL of the OPUS TMX archive for the en-xh pair.
declare -A xh_en_urls
xh_en_urls=(
    [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
    [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
    [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
    [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
    [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
    [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
    [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
    [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)

# -p: do not complain when the directory is left over from an earlier run.
mkdir -p "$TMP_DIR/xh-en"
# Stop instead of downloading into whatever directory we happen to be in.
pushd "$TMP_DIR/xh-en" || exit 1
for k in "${!xh_en_urls[@]}"
do
    name=$k
    url=${xh_en_urls[$k]}
    echo "$name: $url"
    # Pass the URL explicitly (the variable is "url", not "ulr").
    download_opus xh en "$name" "$url"
done
# [*] joins all subset names into ONE space-separated argument, so the whole
# list arrives in concat_subsets' third parameter rather than only its first
# element.
concat_subsets xh en "${!xh_en_urls[*]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd
| ## | |
#for af-en
# Subset name -> URL of the OPUS TMX archive for the af-en pair.
declare -A af_en_urls
af_en_urls=(
    [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
    [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
    [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
    [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
    [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
    [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
    [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
    [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)

# -p: do not complain when the directory is left over from an earlier run.
mkdir -p "$TMP_DIR/af-en"
# Stop instead of downloading into whatever directory we happen to be in.
pushd "$TMP_DIR/af-en" || exit 1
for k in "${!af_en_urls[@]}"
do
    name=$k
    url=${af_en_urls[$k]}
    echo "$name: $url"
    # Pass the URL explicitly (the variable is "url", not "ulr").
    download_opus af en "$name" "$url"
done
# [*] joins all subset names into ONE space-separated argument, so the whole
# list arrives in concat_subsets' third parameter rather than only its first
# element.
concat_subsets af en "${!af_en_urls[*]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd