// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pretokenizer_for_training.h"
#include "testharness.h"
#include "third_party/absl/strings/str_cat.h"
#include "third_party/absl/strings/str_join.h"
#include "third_party/absl/strings/str_split.h"
#include "trainer_interface.h"
namespace sentencepiece {
namespace pretokenizer {
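
// Mock pretokenizer that ignores the input text and returns whatever
// SentencePieceText was previously set via SetOutput().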
class MockPretokenizer : public PretokenizerForTrainingInterface {
 public:
  MockPretokenizer() {}
  ~MockPretokenizer() {}

  SentencePieceText Tokenize(absl::string_view text) const override {
    return spt_;
  }

  util::Status status() const override { return util::OkStatus(); }

  void SetOutput(const SentencePieceText &spt) { spt_ = spt; }

 private:
  SentencePieceText spt_;
};
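
// The test below expects PreTokenize() to keep whitespace-separated pieces in
// one segment (with each space rewritten as TrainerInterface::kWSStr) and to
// start a new segment where adjacent pieces touch without whitespace, as with
// "sentence" / "piece". The "||||" string is only the delimiter used to join
// the returned segments for comparison.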
TEST(PretokenizerForTrainingTest, BaseTest) {
  MockPretokenizer mock;

  {
    SentencePieceText spt;
    spt.set_text("I love sentencepiece");
    auto *p1 = spt.add_pieces();
    p1->set_surface("I");
    p1->set_begin(0);
    p1->set_end(1);
    auto *p2 = spt.add_pieces();
    p2->set_surface("love");
    p2->set_begin(2);
    p2->set_end(6);
    auto *p3 = spt.add_pieces();
    p3->set_surface("sentence");
    p3->set_begin(7);
    p3->set_end(15);
    auto *p4 = spt.add_pieces();
    p4->set_surface("piece");
    p4->set_begin(15);
    p4->set_end(20);
    mock.SetOutput(spt);

    const auto expected =
        absl::StrCat("I", TrainerInterface::kWSStr, "love",
                     TrainerInterface::kWSStr, "sentence||||piece");
    EXPECT_EQ(expected,
              absl::StrJoin(mock.PreTokenize("I love sentencepiece"), "||||"));
  }
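
  // Japanese input with no whitespace between pieces: every piece boundary is
  // expected to yield its own segment. The begin/end offsets are UTF-8 byte
  // positions (each character here is 3 bytes).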
  {
    SentencePieceText spt;
    spt.set_text("これはペンです");
    auto *p1 = spt.add_pieces();
    p1->set_surface("これ");
    p1->set_begin(0);
    p1->set_end(6);
    auto *p2 = spt.add_pieces();
    p2->set_surface("は");
    p2->set_begin(6);
    p2->set_end(9);
    auto *p3 = spt.add_pieces();
    p3->set_surface("ペン");
    p3->set_begin(9);
    p3->set_end(15);
    auto *p4 = spt.add_pieces();
    p4->set_surface("です");
    p4->set_begin(15);
    p4->set_end(21);
    mock.SetOutput(spt);

    const auto expected = "これ||||は||||ペン||||です";
    EXPECT_EQ(expected,
              absl::StrJoin(mock.PreTokenize("これはペンです"), "||||"));
  }
}
} // namespace pretokenizer
} // namespace sentencepiece