Spaces:
Paused
Paused
| # cython: language_level=3 | |
| """ | |
| Copyright 2019 Brian Thompson | |
| Licensed under the Apache License, Version 2.0 (the "License"); | |
| you may not use this file except in compliance with the License. | |
| You may obtain a copy of the License at | |
| https://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software | |
| distributed under the License is distributed on an "AS IS" BASIS, | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| See the License for the specific language governing permissions and | |
| limitations under the License. | |
| """ | |
| import numpy as np | |
| cimport numpy as np | |
| cimport cython | |
| def make_x_y_offsets(alignment_types): | |
| # alignment types for which we will precompute costs | |
| # deletion/insertion is added later | |
| for x, y in alignment_types: | |
| assert (x > 0) | |
| assert (y > 0) | |
| x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0) | |
| y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0) | |
| return x_offsets, y_offsets | |
| def make_dense_costs(np.ndarray[float, ndim=3] vecs0, # itput | |
| np.ndarray[float, ndim=3] vecs1, # input | |
| np.ndarray[float, ndim=2] norm0, # input | |
| np.ndarray[float, ndim=2] norm1, # input | |
| int offset0 = 0, # index into vecs0/norms0 | |
| int offset1 = 0, # index into vecs1/norms1 | |
| ): | |
| """ | |
| Make a full N*M feature matrix. By default, makes 1-1 alignments, | |
| can build others by specifying offset0, offset1 to index into | |
| vecs0, norms0 and vecs1, norms1 respectivly. | |
| """ | |
| assert vecs0.shape[0] > offset0 | |
| assert vecs1.shape[0] > offset1 | |
| assert norm0.shape[0] > offset0 | |
| assert norm1.shape[0] > offset1 | |
| cdef int size0 = np.shape(vecs0)[1] | |
| assert norm0.shape[1] == size0 | |
| cdef int size1 = np.shape(vecs1)[1] | |
| assert norm1.shape[1] == size1 | |
| cdef int vecsize = np.shape(vecs0)[2] | |
| assert vecs1.shape[2] == vecsize | |
| cdef int xi, yi | |
| cdef float sumx | |
| cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32) | |
| for xi in range(size0): | |
| for yi in range(size1): | |
| sumx = 0.0 | |
| for jj in range(vecsize): | |
| sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj] | |
| costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi]) | |
| # normalize by alignment type | |
| costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1) | |
| return costs | |
| def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen): | |
| """ | |
| Compute cost matrix (csum) and backpointers (bp) | |
| from full 2-D 1-1 alignment costs matrix (alignment_cost) | |
| """ | |
| size0 = alignment_cost.shape[0] | |
| size1 = alignment_cost.shape[1] | |
| # csum and traceback matrix are both on nodes | |
| # so they are +1 in each dimension compared to the jump costs matrix | |
| # For anything being used in accumulation, use float64 | |
| cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64) | |
| cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32) | |
| # bp and csum are nodes, | |
| # while alignment_cost is the cost of going between the nodes | |
| # Size of nodes should be one larger than alignment costs | |
| b0, b1 = np.shape(bp) | |
| c0, c1 = np.shape(csum) | |
| j0, j1 = np.shape(alignment_cost) | |
| assert (b0 == c0 == j0 + 1) | |
| assert (b1 == c1 == j1 + 1) | |
| cdef int cmax = np.shape(csum)[1] | |
| cdef int rmax = np.shape(csum)[0] | |
| cdef int c, r | |
| cdef double cost0, cost1, cost2 | |
| # initialize the all c-direction deletion path | |
| for c in range(cmax): | |
| csum[0, c] = c * pen | |
| bp[0, c] = 1 | |
| # initialize the all r-direction deletion path | |
| for r in range(rmax): | |
| csum[r, 0] = r * pen | |
| bp[r, 0] = 2 | |
| # Initial cost is 0.0 | |
| csum[0, 0] = 0.0 # noop | |
| bp[0, 0] = 4 # should not matter | |
| # Calculate the rest recursively | |
| for c in range(1, cmax): | |
| for r in range(1, rmax): | |
| # alignment_cost indexes are off by 1 wrt | |
| # csum/bp, since csum/bp are nodes | |
| cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1] | |
| cost1 = csum[r, c - 1] + pen | |
| cost2 = csum[r - 1, c] + pen | |
| csum[r, c] = cost0 | |
| bp[r, c] = 0 | |
| if cost1 < csum[r, c]: | |
| csum[r, c] = cost1 | |
| bp[r, c] = 1 | |
| if cost2 < csum[r, c]: | |
| csum[r, c] = cost2 | |
| bp[r, c] = 2 | |
| return csum, bp | |
| def score_path(np.ndarray[int, ndim=1] xx, | |
| np.ndarray[int, ndim=1] yy, | |
| np.ndarray[float, ndim=1] norm1, | |
| np.ndarray[float, ndim=1] norm2, | |
| np.ndarray[float, ndim=2] vecs1, | |
| np.ndarray[float, ndim=2] vecs2, | |
| np.ndarray[float, ndim=1] out): | |
| cdef int xi, yi, ii, jj | |
| cdef float outx | |
| cdef int lenxy = xx.shape[0] | |
| cdef int vecsize = vecs1.shape[1] | |
| for ii in range(lenxy): | |
| xi = xx[ii] | |
| yi = yy[ii] | |
| outx = 0.0 | |
| for jj in range(vecsize): | |
| outx += vecs1[xi, jj] * vecs2[yi, jj] | |
| out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi]) | |
| # Bounds checking and wraparound slow things down by about 2x | |
| # Division by 0 checking has minimal speed impact | |
| # turn off bounds-checking for entire function | |
| # turn off negative index wrapping for entire function | |
| # use c-style division (no division-by-zero check) | |
| def make_sparse_costs(np.ndarray[float, ndim=3] vecs0, # intput: num aligns X num sents X dim | |
| np.ndarray[float, ndim=3] vecs1, # input | |
| np.ndarray[float, ndim=2] norms0, # intput: num aligns X num sents | |
| np.ndarray[float, ndim=2] norms1, # input | |
| x_y_path, | |
| alignment_types, | |
| int width_over2): | |
| """ | |
| Make features for DP, *for lines running across approximate path*, *for each alignment type* | |
| x_offsets, y_offsets should not include (0,1), (1,0) | |
| Basically, we take the feature matrix, rotate it 45 degress, | |
| and compute a "wavy" matrix for the features. | |
| It's like the diagonal but it moves around to hopefully always include the true path. | |
| """ | |
| cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32) | |
| assert (vecs0.shape[0] == norms0.shape[0]) | |
| assert (vecs1.shape[0] == norms1.shape[0]) | |
| assert (vecs0.shape[1] == norms0.shape[1]) | |
| assert (vecs1.shape[1] == norms1.shape[1]) | |
| # check how many overlaps vectors were passed in | |
| num_overlaps_in_vecs0 = vecs0.shape[0] | |
| num_overlaps_in_vecs1 = vecs1.shape[0] | |
| # check how many overlaps were requested | |
| # edge case: alignment_types could be empty | |
| # In that case, we should just return insertions/deletions | |
| # and max_x_overlap == max_y_overlap == 0 | |
| max_x_overlap = max([0] + [x for x, y in alignment_types]) # add [0] in case alignment_types is empty | |
| max_y_overlap = max([0] + [y for x, y in alignment_types]) # add [0] in case alignment_types is empty | |
| # note: alignment types are specified 1-based, but vectors are stored 0-based | |
| if max_x_overlap > num_overlaps_in_vecs0: | |
| raise Exception('%d x overlaps requrested (via alignment_types), but vecs0 only has %d' % ( | |
| max_x_overlap, num_overlaps_in_vecs0)) | |
| if max_y_overlap > num_overlaps_in_vecs1: | |
| raise Exception('%d y overlaps requrested (via alignment_types), but vecs1 only has %d' % ( | |
| max_y_overlap, num_overlaps_in_vecs1)) | |
| # number of sentences in each document | |
| cdef int xsize = vecs0.shape[1] | |
| cdef int ysize = vecs1.shape[1] | |
| # vector diminsions should match | |
| assert (vecs0.shape[2] == vecs1.shape[2]) | |
| cdef np.ndarray[int, ndim=1] x_offsets, y_offsets | |
| x_offsets, y_offsets = make_x_y_offsets(alignment_types) | |
| # reserve outputs | |
| a_len = x_y_path_.shape[0] | |
| b_len = 2 * width_over2 | |
| cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32) | |
| cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32) | |
| cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx | |
| cdef int vecsize = vecs0.shape[2] | |
| cdef int num_alignments = x_offsets.shape[0] | |
| cdef float sumx, feat | |
| cdef float inf = np.inf | |
| for ii in range(x_y_path_.shape[0]): | |
| x = x_y_path_[ii, 0] | |
| y = x_y_path_[ii, 1] | |
| # convert xy to ab cords | |
| aa = x + y | |
| bb = y | |
| a_idx = aa | |
| b_offset[aa] = bb - width_over2 | |
| for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)): | |
| # convert ab to xy cords | |
| xx = aa - bb2 | |
| yy = bb2 | |
| for ii_align in range(num_alignments): | |
| x_offset = x_offsets[ii_align] | |
| x_offset_idx = x_offset - 1 # overlaps start at 1, vectors stored 0-based | |
| y_offset = y_offsets[ii_align] | |
| y_offset_idx = y_offset - 1 | |
| if 0 <= xx < xsize and 0 <= yy < ysize: | |
| sumx = 0.0 | |
| for jj in range(vecsize): | |
| sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj] | |
| feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / ( | |
| 1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy]) | |
| else: | |
| feat = inf | |
| a_b_feats[ii_align, a_idx, b_idx] = feat | |
| return a_b_feats, b_offset | |
| def sparse_dp(np.ndarray[float, ndim=3] a_b_costs, | |
| np.ndarray[int, ndim=1] b_offset_in, | |
| alignment_types, | |
| double del_penalty, | |
| int x_in_size, | |
| int y_in_size): | |
| """ | |
| Do DP along a path, using features saved off along path. | |
| x_offsets, y_offsets should not include (0,1), (1,0) | |
| xsize, ysize refer to the costs a_b_csum, but in x/y space | |
| As in the simpler full-DP case, | |
| we compute cumulative costs and backpointers on notes, | |
| and there are COSTS associated with moving between them. | |
| This means the size of the notes +1,+1 larger (in x,y) than the COSTS. | |
| So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs | |
| In order to save memory (and time, vs a sparse matrix with hashes to look up values), let: | |
| a = x + y | |
| b = x - y | |
| b_offsets tells us how far from the left edge the features are computed for. | |
| basically it's like we are computing along the diagonal, | |
| but we shift the diagonal around based on our belief | |
| about where the alignments are. | |
| b_offsets is used for both costs AND csum, backpointers, so it needs to be | |
| +2 longer (it is in the a-direction) than the costs (in the a direction) | |
| """ | |
| cdef np.ndarray[int, ndim=1] x_offsets, y_offsets | |
| x_offsets, y_offsets = make_x_y_offsets(alignment_types) | |
| # make x/y offsets, including (0,1), (1,), i.e. including deletion and insertion | |
| x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)]) | |
| y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)]) | |
| cdef int a_in_size = a_b_costs.shape[1] | |
| cdef int b_in_size = a_b_costs.shape[2] | |
| cdef int a_out_size = a_in_size + 2 | |
| cdef int b_out_size = b_in_size | |
| cdef int x_out_size = x_in_size + 1 | |
| cdef int y_out_size = y_in_size + 1 | |
| # costs are the costs of going between nodes. | |
| # in x,y for the nodes, we basically add a buffer | |
| # at x=0 and y=0, and shift the cost by (x=+1,y=+1) | |
| # In a,b space, this means adding two points (for the buffer) | |
| # at the beginning, and shifting by (a=+0,b=+1) since | |
| # a=x+y and b=y | |
| # for the first two points, we can simply replicate the | |
| # original b_offset, since it should be -width_over2 | |
| # i.e. b_offset_in[0] == -width_over2 | |
| extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32) | |
| cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1]) | |
| # outputs | |
| # For anything being used in accumulation, use float64 | |
| cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size), | |
| dtype=np.float64) + np.inf # error cumulative sum | |
| cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for x | |
| cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for y | |
| cdef int num_alignments = x_offsets.shape[0] | |
| cdef double inf = np.inf | |
| cdef int xx_out, yy_out, ii_align, x_offset, y_offset | |
| cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev | |
| cdef double alignment_cost, total_cost, prev_cost | |
| # increasing in a is the same as going along diagonals in x/y, so DP order works | |
| # (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y) | |
| for aa_out in range(a_in_size + 2): | |
| for bb_out in range(b_in_size): | |
| #xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out) | |
| yy_out = bb_out + b_offset_out[aa_out] | |
| xx_out = aa_out - yy_out | |
| # edge case: all deletions in y-direction | |
| if xx_out == 0 and 0 <= yy_out < y_out_size: | |
| a_b_csum[aa_out, bb_out] = del_penalty * yy_out | |
| a_b_xp[aa_out, bb_out] = 0 | |
| a_b_yp[aa_out, bb_out] = 1 | |
| # edge case: all deletions in x-direction | |
| elif yy_out == 0 and 0 <= xx_out < x_out_size: | |
| a_b_csum[aa_out, bb_out] = del_penalty * xx_out | |
| a_b_xp[aa_out, bb_out] = 1 | |
| a_b_yp[aa_out, bb_out] = 0 | |
| else: | |
| # initialize output to inf | |
| a_b_csum[aa_out, bb_out] = inf | |
| a_b_xp[aa_out, bb_out] = -42 | |
| a_b_yp[aa_out, bb_out] = -42 | |
| for ii_align in range(num_alignments): | |
| x_offset = x_offsets[ii_align] | |
| y_offset = y_offsets[ii_align] | |
| # coords of location of alignment cost, in input x/y space | |
| xx_in_cost = xx_out - 1 # features were front padded, | |
| yy_in_cost = yy_out - 1 # so offset is always 1 | |
| # the coords of location of previous cumsum cost, in input x/y space | |
| xx_out_prev = xx_out - x_offset | |
| yy_out_prev = yy_out - y_offset | |
| if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size: | |
| # convert x,y to a,b | |
| aa_in_cost = xx_in_cost + yy_in_cost | |
| bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost] | |
| aa_out_prev = xx_out_prev + yy_out_prev | |
| bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev] | |
| if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size: | |
| if x_offset == 0 or y_offset == 0: | |
| alignment_cost = del_penalty | |
| else: | |
| alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost] | |
| prev_cost = a_b_csum[aa_out_prev, bb_out_prev] | |
| total_cost = prev_cost + alignment_cost | |
| if total_cost < a_b_csum[aa_out, bb_out]: | |
| a_b_csum[aa_out, bb_out] = total_cost | |
| a_b_xp[aa_out, bb_out] = x_offset | |
| a_b_yp[aa_out, bb_out] = y_offset | |
| return a_b_csum, a_b_xp, a_b_yp, b_offset_out | |