"""Class Declaration of Transformer's Input layers.""" |
|
|
|
import chainer |
|
|
|
import chainer.functions as F |
|
import chainer.links as L |
|
|
|
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding |
|
|
|
import logging |
|
import numpy as np |
|
|
|
|
|
class Conv2dSubsampling(chainer.Chain):
    """Convolutional 2D subsampling (to 1/4 length).

    Two stride-2 convolutions reduce the time axis to roughly 1/4 of its
    original length; the result is flattened, projected to the attention
    dimension, and passed through positional encoding.

    :param int channels: number of channels of the convolution layers
    :param int idim: input dimension of the output linear layer
        (i.e., the flattened convolution output)
    :param int dims: output (attention) dimension
    :param float dropout: dropout rate
    :param initialW: initializer factory for the weights
    :param initial_bias: initializer factory for the biases

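    Example (an illustrative sketch, not fixed by this module: the
    ``LeCunUniform``/``Uniform`` initializers, the 80-dimensional input
    features, and the derived ``idim`` are assumptions)::

        import numpy as np
        import chainer.initializers as I

        sub = Conv2dSubsampling(
            channels=64,
            idim=64 * 20,  # channels * ceil(ceil(80 / 2) / 2)
            dims=256,
            dropout=0.1,
            initialW=I.LeCunUniform,
            initial_bias=I.Uniform,
        )
        feats = np.zeros((4, 100, 80), dtype=np.float32)  # (batch, time, freq)
        ys, olens = sub(feats, [100] * 4)  # ys: (4, 25, 256), olens: 25 each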

    """

    def __init__(
        self, channels, idim, dims, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize Conv2dSubsampling."""
        super(Conv2dSubsampling, self).__init__()
        self.dropout = dropout
        with self.init_scope():
            # Scale the initializers by 1 / sqrt(n), where n is the fan-in
            # of the first 3x3 convolution (1 input channel).
            n = 1 * 3 * 3
            stvd = 1.0 / np.sqrt(n)
            self.conv1 = L.Convolution2D(
                1,
                channels,
                3,
                stride=2,
                pad=1,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            # Fan-in of the second 3x3 convolution (`channels` input channels).
            n = channels * 3 * 3
            stvd = 1.0 / np.sqrt(n)
            self.conv2 = L.Convolution2D(
                channels,
                channels,
                3,
                stride=2,
                pad=1,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            # Projection from the flattened convolution features to `dims`.
            stvd = 1.0 / np.sqrt(dims)
            self.out = L.Linear(
                idim,
                dims,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.pe = PositionalEncoding(dims, dropout)

    def forward(self, xs, ilens):
        """Subsample xs.

        :param ndarray xs: input acoustic features (batch, time, freq)
        :param ilens: batch of lengths of the input sequences
        :return: subsampled xs and subsampled ilens

        """
        # Add a channel axis and move the data to the chain's device.
        xs = self.xp.array(xs[:, None])
        xs = F.relu(self.conv1(xs))
        xs = F.relu(self.conv2(xs))
        batch, _, length, _ = xs.shape
        # Flatten the (channel, freq) axes and project to the attention dim.
        xs = self.out(F.swapaxes(xs, 1, 2).reshape(batch * length, -1))
        xs = self.pe(xs.reshape(batch, length, -1))
        # Each stride-2 convolution halves the sequence lengths.
        ilens = np.ceil(np.array(ilens, dtype=np.float32) / 2).astype(np.int64)
        ilens = np.ceil(np.array(ilens, dtype=np.float32) / 2).astype(np.int64)
        return xs, ilens


class LinearSampling(chainer.Chain):
    """Linear 1D subsampling.

    A single linear projection to the attention dimension followed by
    positional encoding; the sequence lengths are left unchanged.

    :param int idim: input dimension
    :param int dims: output (attention) dimension
    :param float dropout: dropout rate
    :param initialW: initializer factory for the weights
    :param initial_bias: initializer factory for the biases

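    Example (an illustrative sketch; the ``LeCunUniform``/``Uniform``
    initializers and the 80-dimensional input are assumptions)::

        import numpy as np
        import chainer.initializers as I

        lin = LinearSampling(
            80, 256, dropout=0.1, initialW=I.LeCunUniform, initial_bias=I.Uniform
        )
        feats = np.zeros((4, 100, 80), dtype=np.float32)
        ys, olens = lin(feats, [100] * 4)  # ys: (4, 100, 256); olens unchanged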

    """

    def __init__(self, idim, dims, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize LinearSampling."""
        super(LinearSampling, self).__init__()
        stvd = 1.0 / np.sqrt(dims)
        self.dropout = dropout
        with self.init_scope():
            self.linear = L.Linear(
                idim,
                dims,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.pe = PositionalEncoding(dims, dropout)

    def forward(self, xs, ilens):
        """Project xs to the attention dimension.

        :param ndarray xs: input acoustic features (batch, time, freq)
        :param ilens: batch of lengths of the input sequences
        :return: projected xs and unchanged ilens

        """
        logging.info(xs.shape)
        xs = self.linear(xs, n_batch_axes=2)
        logging.info(xs.shape)
        xs = self.pe(xs)
        return xs, ilens