Spaces:
Sleeping
Sleeping
File size: 2,132 Bytes
2aebc50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pretokenizer_for_training.h"
#include <string>
#include "third_party/absl/strings/str_replace.h"
namespace sentencepiece {
namespace pretokenizer {
namespace {
// UTF-8 byte sequence of U+2581 (LOWER ONE EIGHTH BLOCK). Within this file it
// is swapped with the ASCII space character before and after tokenization.
// TODO(taku): This is also defined in trainer_interface.h, but it is
// redefined here explicitly to avoid a dependency on trainer_interface.
// Currently, we have no separate build rules.
constexpr char kWSStr[] = "\xe2\x96\x81";
}  // namespace
// Runs the full pipeline on `text`: Preprocess(), then Tokenize(), then
// Postprocess(), and returns whatever Postprocess() produces.
std::vector<std::string> PretokenizerForTrainingInterface::PreTokenize(
    absl::string_view text) const {
  // Step 1: escape the input so that it can be handed to the tokenizer.
  const auto escaped = Preprocess(text);
  // Step 2: tokenize the escaped input.
  const auto &tokenized = Tokenize(escaped);
  // Step 3: turn the tokenizer output into the final list of strings.
  return Postprocess(tokenized);
}
// static
// Returns a copy of `text` in which every occurrence of kWSStr is replaced
// with a single ASCII space, since the pre-tokenizer may not process kWSStr.
std::string PretokenizerForTrainingInterface::Preprocess(
    absl::string_view text) {
  std::string escaped = absl::StrReplaceAll(text, {{kWSStr, " "}});
  return escaped;
}
// static
// Converts the pieces of `spt` into a list of strings.
//
// Pieces are visited in order while tracking where the previous piece ended:
//  - A piece that begins exactly where the previous one ended (and not at
//    offset 0) closes the string built so far and starts a new one.
//  - Otherwise the gap between the previous end and this piece's begin is
//    filled with spaces and the piece's surface is appended to the current
//    string.
// After all pieces are consumed, every space in every resulting string is
// replaced with kWSStr.
//
// Returns the resulting strings; empty when `spt` has no pieces.
std::vector<std::string> PretokenizerForTrainingInterface::Postprocess(
    const SentencePieceText &spt) {
  std::vector<std::string> result;
  std::string output;
  // Offsets are kept in a wide signed type so that the gap computation below
  // can neither wrap around nor be implicitly converted to a huge count.
  long long prev = 0;
  for (const auto &piece : spt.pieces()) {
    const long long begin = piece.begin();
    if (prev == begin && begin != 0) {
      // Adjacent to the previous piece: emit what has been accumulated and
      // start a fresh string.
      result.push_back(output);
      output.clear();
    } else if (begin > prev) {
      // Fill the gap between the two pieces with spaces. A piece that starts
      // before the previous one ended (overlapping or out-of-order spans)
      // gets no padding instead of a wrapped-around, enormous append count.
      output.append(static_cast<std::string::size_type>(begin - prev), ' ');
    }
    output += piece.surface();
    prev = piece.end();
  }
  if (!output.empty()) result.push_back(output);
  // Restore the whitespace marker in place of plain spaces.
  for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}});
  return result;
}
} // namespace pretokenizer
} // namespace sentencepiece
|