haoranzhao419 committed on
Commit
c2af953
·
1 Parent(s): 48d1611

Upload tokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ subtoken_reference.json filter=lfs diff=lfs merge=lfs -text
37
+ merges.json filter=lfs diff=lfs merge=lfs -text
chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ באדגוהחזיטכךםלןמסנףעץפקצשרתꙀꙁꙂꙃꙄꙅꙆꙇꙈꙉꙊꙋꙌꙍꙎꙏꙐꙑꙒꙓꙔꙕꙖꙗꙘꙙꙚꙛꙜꙝꙞꙟꙠꙡꙢꙣꙤꙥꙦꙧꙨꙩꙪꙫꙬꙭꙮ꙯꙰꙱꙲꙳ꙴꙵꙶꙷꙸꙹꙺꙻ꙼꙽꙾ꙿꚀꚁꚂꚃꚄꚅꚆꚇꚈꚉꚊꚋꚌꚍꚎꚏꚐꚑꚒꚓꚔꚕꚖꚗꚟЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяѐёђѓєѕіїјљњћќѝўџѠѡѢѣѤѥѦѧѨѩѪѫѬѭѮѯѰѱѲѳѴѵѶѷѸѹѺѻѼѽѾѿҀҁ҂҈҉ҊҋҌҍҎҏҐґҒғҔҕҖҗҘҙҚқҜҝҞҟҠҡҢңҤҥҦҧҨҩҪҫҬҭҮүҰұҲҳҴҵҶҷҸҹҺһҼҽҾҿӀӁӂӃӄӅӆӇӈӉӊӋӌӍӎӏӐӑӒӓӔӕӖӗӘәӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹӺӻӼӽӾӿԀԁԂԃԄԅԆԇԈԉԊԋԌԍԎԏԐԑԒԓԔԕԖԗԘԙԚԛԜԝԞԟԠԡԢԣԤԥԦԧἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎἏἐἑἒἓἔἕἘἙἚἛἜἝἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾἿὀὁὂὃὄὅὈὉὊὋὌὍὐὑὒὓὔὕὖὗὙὛὝὟὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯὰάὲέὴήὶίὸόὺύὼώᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍᾎᾏᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝᾞᾟᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯᾰᾱᾲᾳᾴᾶᾷᾸᾹᾺΆᾼ᾽ι᾿῀῁ῂῃῄῆῇῈΈῊΉῌ῍῎῏ῐῑῒΐῖῗῘῙῚΊ῝῞῟ῠῡῢΰῤῥῦῧῨῩῪΎῬ῭΅`ῲῳῴῶῷῸΌῺΏῼ´῾ϼϺϻϷϸ϶ϵϳϲϹϱϰϮϯϬϭϪϫϨϩϦϧϤϥϢϣϠϡϞϟϜϝϚϛϘϙϗϖϕϔϓϒϑϐΏώΎύΌόΫϋΪϊΩωΨψΧχΦφΥυΤτσΣςΡρΠπΟοΞξΝνΜμΛλΚκΙιΘθϴΗηΖζΕεΔδΓγΒβΑαΰΊίΉήΈέΆάΐͽϿͼϾͻϽȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȴȵȶȘșȚțǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǶǷǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰDZDzdzǴǵǸǹǺǻǼǽǾǿƀƂƃƄƅƇƈƋƌƍƑƒƕƘƙƚƛƞƠơƢƣƤƥƧƨƪƫƬƭƯưƳƴƵƶƸƹƺƻƼƽƾƿƎƁƆƉƊƏƐƓƔƗƖƜƝƟƦƩƮƱƲƷɃȽȠȜȝȞȟȡȢȣȤȥȦȧȨȩȷȸȹȻȼȿɀɁɂɆɇɈɉɊɋɌɍɎɏɄɅȺȾȪȫȬȭȮȯȰȱȲȳDŽDždžLJLjljNJNjnjḂḃḊḋḞḟṀṁṖṗṠṡṪṫẀẁẂẃẄẅẛỲỳabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿچ‌گژءأآؤائکبتحجدخرذسزصشطضعظغفقملهنوًيیَُِّٔثپőű征安伎†ả箭ľ@ł介Œ四ởủ藩ɨžāąčḏđęĩĭ°ĽŁك則ō£őřɟ͡ũ火楽ー́粉上Č規リ₂井©±½ễńドňŌốộờ胡Šựế号żăʂ玄ćẒěğģḥīオ中²ịœ乙ś慎şţŽūıůŻćčČďĎęěĚĹĽľłŇňřŘšŠťŤůŮŽžăğıİşŞğĞݵćĆčČ𚊎žăĂćčČęěµńőŕřšŠŽž
merges.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f1b317e2025231b7f3c1a8a1baaa1f60f1abcf41000e71cadb55b435cff62ad
3
+ size 155103798
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
subtoken_reference.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ebe60b7d10310f37f08504ad458f718fda624eee21a09234f9b9222c6899084
3
+ size 74854818
tokenization_saffu.py ADDED
@@ -0,0 +1,1038 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC
2
+ import json, os, re, torch
3
+ from abc import abstractmethod
4
+ from transformers.utils import logging
5
+ from collections import Counter, defaultdict
6
+ from typing import TYPE_CHECKING, List, Optional, Tuple
7
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
8
+
9
+ logger = logging.get_logger(__name__)
10
+
11
# File names (relative to a model directory) for each resource the tokenizer loads.
VOCAB_FILES_NAMES = {
    "subtoken_reference_file": "subtoken_reference.json",
    "vocab_file": "vocab.json",
    "merges_file": "merges.json",
    "chars_file": "chars.txt"
}

# Download locations of each resource, keyed first by resource and then by model name.
# Every entry is a {model name: url} dict, and every url ends in the file name declared
# in VOCAB_FILES_NAMES (the character inventory is stored as chars.txt).
PRETRAINED_VOCAB_FILES_MAP = {
    "subtoken_reference_file": {
        "saffu-BBLM10M": "https://huggingface.co/saffu-BBLM10M/resolve/main/subtoken_reference.json",
        "saffu-BBLM100M": "https://huggingface.co/saffu-BBLM100M/resolve/main/subtoken_reference.json",
    },
    "chars_file": {
        "saffu-BBLM10M": "https://huggingface.co/saffu-BBLM10M/resolve/main/chars.txt",
        "saffu-BBLM100M": "https://huggingface.co/saffu-BBLM100M/resolve/main/chars.txt",
    },
    "vocab_file": {
        "saffu-BBLM10M": "https://huggingface.co/saffu-BBLM10M/resolve/main/vocab.json",
        "saffu-BBLM100M": "https://huggingface.co/saffu-BBLM100M/resolve/main/vocab.json",
    },
    "merges_file": {
        "saffu-BBLM10M": "https://huggingface.co/saffu-BBLM10M/resolve/main/merges.json",
        "saffu-BBLM100M": "https://huggingface.co/saffu-BBLM100M/resolve/main/merges.json",
    },
}
36
+
37
class SAFFUTokenizer(PreTrainedTokenizer):
    """
    Construct a SAFFU tokenizer. Based on rule-based pre-tokenization followed by Byte-Pair sub-word chunking.

    Args:
        vocab_file: path to a JSON file mapping token strings to integer ids.
        subtoken_reference_file: path to a JSON file mapping a pre-token to its already
            computed list of sub-tokens (used as a lookup cache by `bpe_tokenize`).
        merges_file: path to the JSON file holding the trained merge/split model
            (parsed by `load_td`).
        chars_file: path to a text file listing the characters treated as word characters.
        r: number of preceding ids returned as context for each position in `preprocess`.
        block_size: number of document ids placed in each block by `preprocess`.
        heads: number of most-recent ids returned as separate columns by `preprocess`.
        space: when False, whitespace is attached to neighbouring tokens via `stick_spaces`.
        pad, oov, sod, eod, frg: the special token strings; each must be a key of the vocabulary
            wherever it is looked up directly (`pad`, `sod`, `eod`, `frg`).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids"]

    def __init__(
        self,
        vocab_file,
        subtoken_reference_file,
        merges_file,
        chars_file,
        r = 2,
        block_size = 100,
        heads = 2,
        space = False,
        pad = "<pad>",
        oov = "<oov>",
        sod = "<sod>",
        eod = "<eod>",
        frg = "<frg>",
        **kwargs,
    ):
        # NOTE(review): the base initializer runs before any vocabulary is loaded and this class
        # defines no get_vocab(); some transformers releases may need both during __init__ -- confirm
        # against the installed version.
        super().__init__(
            **kwargs,
        )

        self._r = r
        self._space = space
        self._heads = heads
        self._block_size = block_size
        # All resources are read as UTF-8 (the encoding save_vocabulary writes) and every
        # handle is closed as soon as its content has been parsed.
        with open(merges_file, encoding="utf-8") as f:
            self._raw_td = json.load(f)
        # Build the lookup structures from the data parsed above rather than re-reading the file.
        self._td = load_td(data = self._raw_td)
        with open(chars_file, encoding="utf-8") as f:
            self._wordchars = re.sub(" ", "", f.read().strip())
        with open(vocab_file, encoding="utf-8") as f:
            self._vocabulary = json.load(f)
        self._index = {self._vocabulary[t]: t for t in self._vocabulary}
        with open(subtoken_reference_file, encoding="utf-8") as f:
            self._subtoken_reference = json.load(f)
        # NOTE(review): `_pad` may shadow a same-named helper on the transformers base class -- confirm.
        self._pad = pad
        self._oov = oov
        self._sod = sod
        self._eod = eod
        self._frg = frg
        # Pre-built runs of the pad id, sliced by preprocess() to left-pad / right-fill its outputs.
        self._padding = [self._vocabulary[self._pad]]*self._r
        self._masking = [self._vocabulary[self._pad]]*self._block_size
        self._heads_padding = [self._vocabulary[self._pad]]*self._heads

    def save_vocabulary(self, save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary, merge model, sub-token reference and character inventory
        into `save_directory`, optionally prefixing each file name with `filename_prefix-`.

        Returns the four written paths, or None (after logging an error) when
        `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
        merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])
        subtoken_reference_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["subtoken_reference_file"])
        chars_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["chars_file"])

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self._vocabulary, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
        with open(merge_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self._raw_td, indent=2, sort_keys=True, ensure_ascii=False))
        with open(subtoken_reference_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self._subtoken_reference, indent=2, sort_keys=True, ensure_ascii=False))
        with open(chars_file, "w", encoding="utf-8") as f:
            f.write(self._wordchars)

        return vocab_file, merge_file, subtoken_reference_file, chars_file

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self._vocabulary)

    @staticmethod
    def word_tokenize(text, wordchars = "a-zA-Z0-9-'"):
        """
        Split `text` into alternating runs of word characters and non-word characters.
        `wordchars` is spliced into a regex character class; empty pieces are dropped.
        """
        return [token for token in re.split("(["+wordchars+"'-]+)", text) if token]

    @staticmethod
    def stick_spaces(stream):
        """
        Re-segment `stream` so that a single space is carried at the front of the token
        that follows it instead of standing alone; returns the new list of tokens.
        """
        tokens = []
        for wi, w in enumerate(stream):
            if not tokens:
                tokens.append(w)
            elif w == ' ':
                # a space opens a new token unless the previous token already ends in one
                # or this is the final element, in which case it is glued to the previous token
                if (tokens[-1][-1] != ' ') and (wi != len(stream)-1):
                    tokens.append(w)
                else:
                    tokens[-1] = tokens[-1] + w
            else:
                if tokens[-1][-1] == ' ':
                    if tokens[-1] == ' ':
                        # previous token is a lone space: prefix it onto this token
                        tokens[-1] = tokens[-1] + w
                    else:
                        # move the trailing space of the previous token onto this one
                        tokens[-1] = tokens[-1][:-1]
                        tokens.append(' ' + w)
                else:
                    tokens.append(w)
        return tokens

    @staticmethod
    def sentence_tokenize(text, wordchars = "a-zA-Z0-9-'", puncts = ".?!;:\n|"):
        """
        Split `text` at whitespace that follows one of `puncts` plus one non-word character,
        then re-attach the separators to the neighbouring pieces; returns a list of strings.
        """
        # raw string pieces keep the regex escapes (\s) intact without invalid-escape warnings
        splitter = r"(\s+(?<=[" + puncts + "][^" + wordchars + r"'-])\s*)"
        has_wordchar = "[" + wordchars + "'-]"
        sentences = []
        for sentence in re.split(splitter, text):
            if not sentence: continue
            if not re.search(has_wordchar, sentence):
                # a piece with no word characters (e.g. the separator captured by the split)
                if len(sentences):
                    if sentence[-1] == " ":
                        if len(sentence) > 1:
                            # keep everything but the last space with the previous piece;
                            # the last space starts the next piece
                            sentences[-1] = sentences[-1] + sentence[:-1]
                            sentences.append(sentence[-1])
                        else:
                            sentences.append(sentence)
                    else:
                        sentences[-1] = sentences[-1] + sentence
                else:
                    sentences.append(sentence)
            else:
                if len(sentences):
                    if len(sentences[-1]) == 1 and sentences[-1] == " ":
                        # a pending lone space becomes the prefix of this piece
                        sentences[-1] = sentences[-1] + sentence
                    else:
                        sentences.append(sentence)
                else:
                    sentences.append(sentence)
        return sentences

    def bpe_tokenize(self, text): # , td = {}, reference = {}, space = False
        """
        Return the sub-tokens of one pre-token as a list, taken from the sub-token reference
        when present and computed with `list_tokenize` otherwise.
        """
        # look the cache up first so the merge/split model only runs on a miss
        if text in self._subtoken_reference:
            stream = self._subtoken_reference[text]
        else:
            stream = list_tokenize(text, td = self._td)
        return (list(stream if self._space else self.stick_spaces(stream)))

    def _tokenize(self, text):
        """Tokenize a string: sentences, then words, then sub-tokens, flattened into one list."""
        return [sub for s in self.sentence_tokenize(text, wordchars = self._wordchars)
                for t in (self.word_tokenize(s, wordchars = self._wordchars) if self._space else
                          self.stick_spaces(self.word_tokenize(s, wordchars = self._wordchars)))
                for sub in self.bpe_tokenize(t)]

    def preprocess(self, input_ids = []):
        """
        Cut a document of token ids into blocks of per-position model inputs.

        The eod and pad ids are appended to `input_ids` (the argument itself is not modified), the
        result is cut into consecutive slices of `block_size` ids, and each slice is prefixed with
        the sod id (first block) or the frg id (later blocks). Each returned block is a list
        `[ids, contexts, prefixes, heads]` with one entry per position `m` of that prefixed slice:
          - ids[m]: the id at position m
          - contexts[m]: the `r` ids before m, left-padded with the pad id
          - prefixes[m]: the ids before m, right-filled with the pad id to `block_size`
          - heads[k][m]: the (k+1)-th most recent id before m, or the pad id when there is none
        """
        document = input_ids + [self._vocabulary[self._eod], self._vocabulary[self._pad]]
        blocks = []; docsize = len(document)
        for bi in range(docsize // self._block_size + 1):
            start = bi*self._block_size
            # NOTE(review): start == docsize passes this guard and yields a block holding only
            # the frg/sod id -- confirm that is intended.
            if start > docsize: continue
            end = min([(bi+1)*self._block_size, docsize])
            data = [self._vocabulary[self._frg if bi else self._sod]] + document[start:end]
            rows = []
            for m, t in enumerate(data):
                context = (self._padding[:self._r - m] + data[:m]) if m < self._r else data[m - self._r:m]
                prefix = ((data[:m] + self._masking[:self._block_size - m]) if m < self._block_size
                          else data[:self._block_size])
                recent = ((self._heads_padding[:self._heads - m] + data[:m]) if m < self._heads
                          else data[m - self._heads:m])
                rows.append((t, context, prefix, *recent[::-1])) # last heads tokens, most recent first
            # transpose rows into columns, then nest the per-head columns into one list
            columns = list(map(list, zip(*rows)))
            blocks.append(columns[:3] + [columns[3:]])
        return blocks

    def _convert_token_to_id(self, t):
        """Converts a token (str) in an id using the vocab, falling back to the oov id."""
        return self._vocabulary.get(t, self._vocabulary.get(self._oov))

    def _convert_id_to_token(self, i):
        """Converts an index (integer) in a token (str) using the vocab, falling back to the oov token."""
        return self._index.get(i, self._oov)
214
+
215
# purpose: build the lookup structures needed to apply a trained merge/split model
# arguments:
# - data: (optional) dict, an already-parsed model with at least 'tok2ind' and 'action_trace'
# - path: (optional) str, location of a saved JSON model; only read when data is None
# output: dict with the index ('tok2ind', 'ind2tok'), the actions ('action_trace') and
#         their lookups ('tok2acts', 'pair2merge', 'tok2splits'), 'maxtoklen', plus the
#         BPE / HRBPE fields when they are present in the model
def load_td(data = None, path = ''):
    if data is None:
        if not path:
            raise ValueError("load_td needs either parsed `data` or a `path` to a saved model")
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    td = {}
    # from Tokenizer
    td['tok2ind'] = data['tok2ind']
    td['ind2tok'] = {v: k for k, v in td['tok2ind'].items()}
    # each stored action is [pair, is_merge, count, score]
    td['action_trace'] = [{'pair': tuple(a[0]), 'type': 'merge' if a[1] else 'split',
                           'count': a[2], 'score': a[3]} for a in data['action_trace']]
    td['tok2acts'] = defaultdict(list)
    td['pair2merge'] = dict()
    td['tok2splits'] = defaultdict(list)
    for aix, a in enumerate(td['action_trace']):
        if a['type'] == 'split':
            # splits are keyed by the whole token that gets broken apart
            whole = "".join(a['pair'])
            td['tok2acts'][whole].append(aix)
            td['tok2splits'][whole].append(aix)
        else:
            # merges are keyed by the pair, and indexed under both constituents
            td['pair2merge'][tuple(a['pair'])] = aix
            td['tok2acts'][a['pair'][0]].append(aix)
            td['tok2acts'][a['pair'][1]].append(aix)
    # an empty index has maximum token length 0 instead of raising
    td['maxtoklen'] = max((len(t) for t in td['tok2ind']), default=0)
    # from BPE
    if 'unigraph' in data:
        td['unigraph'] = Counter(data['unigraph'])
        td['digraph'] = Counter({(l, r): v for l, r, v in data['digraph']})
        td['doc_unigraph'] = defaultdict(Counter)
        for k, v in data['doc_unigraph'].items():
            td['doc_unigraph'][k] = Counter(v)
        td['init_method'] = data['init_method']
    # from HRBPE
    if 'param_method' in data:
        td['param_method'] = data['param_method']
        td['reg_model'] = data['reg_model']
        td['early_stop'] = data['early_stop']
    return td
250
+
251
# purpose: apply a trained list of merge/split actions to a text (functional form of the BPE operation)
# arguments:
# - text: str, the text to be tokenized
# - td: dict, lookup structures as produced by load_td; must carry a non-empty 'action_trace',
#       an 'init_method', and the 'tok2splits' / 'pair2merge' lookups
# prereqs: a trained model
# output: tuple of strs (tokens) ordered by position in the text; empty when no tokens result
def list_tokenize(text, td = None):
    td = {} if td is None else td
    assert td.get('action_trace'), "Can't tokenize, no trained model!"
    mock = BPE()
    mock.init([text], method=td['init_method'], apply=True) # method='char'
    tok2splits = td['tok2splits']; pair2merge = td['pair2merge']
    prev_aix = -1; available_action_indices = []; observed = set()
    while True:
        # actions unlocked by tokens/pairs that have appeared since the last pass; .get() is used
        # so that looking a token up never inserts an entry into the shared tok2splits table
        unlocked = [aix for tok in mock._unigraph if tok not in observed
                    for aix in tok2splits.get(tok, ())]
        unlocked += [pair2merge[pair] for pair in mock._digraph
                     if pair not in observed and pair in pair2merge]
        # actions are only ever applied in increasing trace order
        available_action_indices = sorted(
            [aix for aix in available_action_indices if aix > prev_aix] +
            [aix for aix in unlocked if aix > prev_aix])
        observed = observed.union(mock._unigraph.keys(), mock._digraph.keys())
        if not available_action_indices:
            break
        prev_aix = available_action_indices[0]
        action = td['action_trace'][prev_aix]
        if action['type'] == 'merge':
            mock.merge(action['pair'])
        else:
            mock.split(action['pair'])
    # read the tokens back out in order of their starting positions
    placed = sorted(((ix, t) for t, idxs in mock._tok_idx.items() for ix in idxs),
                    key=lambda pos_tok: pos_tok[0])
    return tuple(t for _, t in placed)
282
+
283
# purpose: represent a single merge or split action
# arguments: see __init__()
# prereqs: none
# use methods: none
# - __init__:
# use attributes:
# - pair: tuple, form of the action
# - type: str, 'merge' or 'split' describing the type of action
# - count: int, indicating the number of times an action has been observed (for ranking)
class Action:
    # purpose: set up a merge or split action object
    # arguments:
    # - pair: tuple, form of the action
    # - type: str, 'merge' or 'split' describing the type of action
    # - count: value convertible by int(), the number of times the action has been observed (for ranking)
    # prereqs: NA
    # output: an initialized Action object for merge-split augmented BPE
    def __init__(self, pair, type='merge', count=-1):
        # the count is always stored as an int, whatever numeric form it arrived in
        self.pair, self.type, self.count = pair, type, int(count)
304
+
305
# purpose: cast a tokenizer
# arguments: see __init__()
# prereqs:
# use methods:
# - __init__:
# - save: save a model for later use
# - load: load a saved model
# - init: initialize a model
# - fit: fit a model
# - tokenize: apply the model's trained tokenizer to a given text
# - display: plot the rank-frequency distribution of the current segmentation of the system's ingested data, as well as a given frequency model
# properties: the tokenizer's length is the size of its index
class Tokenizer(ABC):
    # purpose: initialize a tokenizer
    # arguments:
    # - tok2ind: (optional) dict, used by .load to set the index
    # prereqs: NA
    # output: a model shell ready for training or trained parameters
    def __init__(self, tok2ind=None):
        if tok2ind is None:
            self._tok2ind = {}
        else:
            self._tok2ind = tok2ind
        self._ind2tok = {v: k for k, v in self._tok2ind.items()}
        self._action_trace = []

    # the tokenizer's length is the size of its index
    def __len__(self):
        return len(self._tok2ind)

    # purpose: add a token to the index
    # arguments: tok: str, the token to add to the index
    def add_type(self, tok):
        if tok not in self._tok2ind:
            self._tok2ind[tok] = len(self._tok2ind)
            self._ind2tok[self._tok2ind[tok]] = tok

    # purpose: delete a type from the index, keeping the remaining indices contiguous
    # arguments: tok: str, the token to delete from the index
    def del_type(self, tok):
        if tok in self._tok2ind:
            idx = self._tok2ind[tok]
            # delete the token from the index and the lookup
            del self._ind2tok[idx]
            del self._tok2ind[tok]
            # shift down every type that has a larger index than the one just removed
            i = idx + 1
            while i in self._ind2tok:
                t = self._ind2tok.pop(i)
                self._tok2ind[t] = i - 1
                self._ind2tok[i - 1] = t
                i += 1 # advance, so that all following types are shifted rather than just the first

    # purpose: save a model for later use
    # arguments:
    # - path: str, file location where the model is to be saved
    # - data: dict, extended with fields 'tok2ind' and 'action_trace', which key a dictionary index mapping the vocabulary, and a list of ranked actions to apply as the tokenizer's parameters.
    # output: saved model parameters in the location defined by path
    def save(self, path, data=None):
        if data is None:
            data = {}
        data['tok2ind'] = self._tok2ind
        # each action is stored as [pair, is_merge, count, score]
        data['action_trace'] = [[a.pair, 1 if a.type == 'merge' else 0, a.count, a.score if hasattr(a, "score") else None]
                                for a in self._action_trace]
        with open(path, 'w') as f:
            json.dump(data, f)

    # purpose: load a saved model
    # arguments: path: str, file location from which the model will be loaded
    # prereqs: a saved model
    # output: the parsed model data; the index, action trace and action lookups are set on self
    def load(self, path):
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        self._tok2ind = data['tok2ind']
        self._ind2tok = {v: k for k, v in self._tok2ind.items()}
        self._action_trace = [ScoredAction(tuple(a[0]), count=a[2], score=a[3],
                                           type='merge' if a[1] else 'split')
                              for a in data['action_trace']]
        self._tok2acts = defaultdict(list)
        self._pair2merge = dict()
        self._tok2splits = defaultdict(list)
        for aix, a in enumerate(self._action_trace):
            if a.type == 'split':
                # splits are keyed by the whole token that gets broken apart
                whole = "".join(a.pair)
                self._tok2acts[whole].append(aix)
                self._tok2splits[whole].append(aix)
            else:
                # merges are keyed by the pair, and indexed under both constituents
                self._pair2merge[tuple(a.pair)] = aix
                self._tok2acts[a.pair[0]].append(aix)
                self._tok2acts[a.pair[1]].append(aix)
        # an empty index has maximum token length 0 instead of raising
        self._maxtoklen = max((len(t) for t in self._tok2ind), default=0)

        return data

    # purpose: initialize a model
    # arguments:
    # - docs: list (corpus) of strs (documents) to be used for training
    # prereqs: a selected BPE variant, which is what will inherit Tokenizer
    @abstractmethod
    def init(self, docs, seed=None):
        raise NotImplementedError

    # purpose: fit a model
    # arguments:
    # - num_batches: int, indicating the number of batches over which to rank and apply actions
    # - batch_size: int, indicating the number of actions to rank for each batch
    # - seed: int, indicating the seed of randomization
    # prereqs: a selected BPE variant, which is what will inherit Tokenizer
    # output: a trained model
    @abstractmethod
    def fit(self, num_batches, batch_size=1, seed=None):
        raise NotImplementedError

    # purpose: tokenize a text and map its tokens to vocabulary indices
    # arguments: text: str, to be tokenized
    # prereqs: a trained tokenizer
    # output: a list of token indices, mapped according to the model's vocabulary
    def encode(self, text):
        return self.tokens_to_indices(self.tokenize(text))

    # purpose: apply the model's trained tokenizer to a given text
    # arguments:
    # - text: str, target text to be tokenized
    # - start: not used by this method
    # prereqs: a trained tokenizer with an _action_trace, recording the ranked list of tokenization actions
    # output: a tuple of strs (tokens), which if joined together form the text
    def tokenize(self, text, start=-1):
        assert self._action_trace, "Can't tokenize, no trained model!"
        return self.apply_action_trace(text)

    # purpose: update the index of actions according to those which are applicable to the data
    # arguments:
    # - available_action_indices: list (actions list) of ints (indices) indicating those that are currently available for non-null operation
    # - model: a model object exposing the _unigraph and _digraph of the current segmentation
    # - prev_aix: int, indicating the previous action index that was applied
    # - observed: set, indicating the tokens and pairs already seen within the tokenization of the data (None means none yet)
    # output: the sorted list of still-applicable action indices, and the updated observed set
    def update_action_indices(self, available_action_indices, model, prev_aix = -1, observed = None):
        if observed is None:
            observed = set()
        # actions unlocked by tokens/pairs that have appeared since the last pass; .get() is used
        # so that looking a token up never inserts an entry into the _tok2splits table
        unlocked = [aix for tok in model._unigraph if tok not in observed
                    for aix in self._tok2splits.get(tok, ())]
        unlocked += [self._pair2merge[pair] for pair in model._digraph
                     if pair not in observed and pair in self._pair2merge]
        # actions are only ever applied in increasing trace order
        available_action_indices = sorted(
            [aix for aix in available_action_indices if aix > prev_aix] +
            [aix for aix in unlocked if aix > prev_aix])
        observed = observed.union(model._unigraph.keys(), model._digraph.keys())
        return available_action_indices, observed

    # purpose: apply a trained model's list of actions to a given text (operate BPE)
    # arguments:
    # - text: str, document to be tokenized
    # prereqs: a trained model
    # output: tuple (document) of strings (tokens)
    def apply_action_trace(self, text):
        mock = BPE()
        mock.init([text], method=self._init_method, apply=True) # method='char'
        prev_aix = -1; available_action_indices = []; observed = set()
        while True:
            available_action_indices, observed = self.update_action_indices(available_action_indices, mock,
                                                                            prev_aix = prev_aix, observed = observed)
            if not available_action_indices:
                break
            prev_aix = available_action_indices[0]
            action = self._action_trace[prev_aix]
            if action.type == 'merge':
                mock.merge(action.pair)
            else:
                mock.split(action.pair)
        return self._ordered_tokens(mock._tok_idx)

    # purpose: read out the current segmentation held in _tok_idx
    # output: tuple of strs (tokens) ordered by position; empty when there are no tokens
    def return_tokenization(self):
        return self._ordered_tokens(self._tok_idx)

    # purpose: order the tokens of a {token: positions} mapping by position
    # arguments: tok_idx: dict from str (token) to a collection of ints (starting positions)
    # output: tuple of strs (tokens), one per position, in increasing position order
    @staticmethod
    def _ordered_tokens(tok_idx):
        placed = sorted(((ix, t) for t, idxs in tok_idx.items() for ix in idxs),
                        key=lambda pos_tok: pos_tok[0])
        return tuple(t for _, t in placed)

    # purpose: convert a list of token indices to a str object
    # arguments: indices: list (tokenized document) of ints (indices) to be mapped to strings and joined
    # prereqs: a trained model
    # output: str, representing the underlying form of the list of tokens
    def decode(self, indices):
        return ''.join(self.indices_to_tokens(indices))

    # purpose: convert str tokens to a list of int token indices
    # arguments:
    # - toks: list (document) of strs (tokens) to be mapped through the model's index to indices
    # prereqs: a trained model
    # output: a list of ints (indices) representing the model's tokenized representation of the document
    def tokens_to_indices(self, toks):
        return [self._tok2ind[t] for t in toks]

    # purpose: convert a list of token indices to a list of str objects
    # arguments: indices: list (tokenized document) of ints (indices) to be mapped to strings
    # prereqs: a trained model
    # output: list of strs, representing the tokenized form of the document's tokens
    def indices_to_tokens(self, indices):
        return [self._ind2tok[i] for i in indices]
511
+
512
+ # purpose: cast a BPE-based tokenizer object
513
+ # arguments: see __init__()
514
+ # use methods:
515
+ # - __init__: intialize a BPE-based tokenizer object
516
+ # - save: (see Tokenizer)
517
+ # - init: (see Tokenizer)
518
+ # - load: (see Tokenizer)
519
+ # use attributes: none
520
+ class BPE(Tokenizer):
521
+ # purpose: initialize a BPE-based tokenizer object
522
+ # arguments:
523
+ # - tok2ind: dict (see Tokenizer)
524
+ # - covering_vocab: set, indicating a collection of strs that the tokenizer should consider as bounds for the result of all possible actions
525
+ def __init__(self, tok2ind=None, covering_vocab = set()):
526
+ # defining a covering vocabulary restricts merge/split-able pathways
527
+ self._covering_vocab = covering_vocab
528
+ self._covered = {}
529
+ self._covering = {}
530
+ if self._covering_vocab:
531
+ if tok2ind:
532
+ tok2ind = {t: i for i, t in enumerate(set(list(tok2ind.keys())+list(self._covering_vocab)))}
533
+ else:
534
+ tok2ind = {t: i for i, t in enumerate(self._covering_vocab)}
535
+ # initialize token index, now that the cover is included
536
+ super().__init__(tok2ind=tok2ind)
537
+ # starting and ending points for each token (as a set for constant lookup)
538
+ self._lefts = {}
539
+ self._rights = {}
540
+ # frequency-based information
541
+ self._unigraph = Counter()
542
+ self._doc_unigraph = defaultdict(Counter)
543
+ self._digraph = Counter()
544
+ # mapping to and from indices
545
+ self._tok_idx = defaultdict(set)
546
+ self._pair_idx = defaultdict(set)
547
+ self._char2docidx = {}
548
+
549
+ # purpose: save a model
550
+ # arguments:
551
+ # - path: (see Tokenizer)
552
+ # - data: (see Tokenizer)
553
+ def save(self, path, data=None):
554
+ if data is None:
555
+ data = {}
556
+ data['unigraph'] = dict(self._unigraph)
557
+ data['digraph'] = [[k[0], k[1], v] for k, v in self._digraph.items()]
558
+ data['doc_unigraph'] = {k: dict(v) for k, v in self._doc_unigraph.items()}
559
+ data['init_method'] = self._init_method
560
+ super(BPE, self).save(path, data=data)
561
+ # json.dump({'docs': self._training_data, 'covering': self._training_covering},
562
+ # open(re.sub('.json', '-docs.json', path), 'w+'))
563
+
564
+ # purpose: load a model
565
+ # arguments:
566
+ # - path: (see Tokenizer)
567
+ def load(self, path):
568
+ data = super(BPE, self).load(path)
569
+ self._unigraph = Counter(data['unigraph'])
570
+ self._digraph = Counter({(l, r): v for l, r, v in data['digraph']})
571
+ self._doc_unigraph = defaultdict(Counter)
572
+ for k, v in data['doc_unigraph'].items():
573
+ self._doc_unigraph[k] = Counter(v)
574
+ self._init_method = data['init_method']
575
+ return data
576
+
577
+ # purpose: intialize a BPE-based model
578
+ # arguments:
579
+ # - docs: list (corpus) of strs (document), containing the data on which the model will be trained
580
+ # - seed: int, indicating the seed of randomization
581
+ # - method: str, one from: 'char' (start from characters), 'warm' (start from a space-based segmentation), or 'rand' (start from a random segmentation)
582
+ # - apply: whether or not this will be an application of the action trace post-training, which just shuts off a progress bar
583
+ # - covering: list (corpus) of lists (documents) of strs (tokens), representing a collection of token boundaries that must be observed during learning, i.e., restricting the learnable rules.
584
+ # - action_protect: list of strs, indicating regular expressions of that cannot be included in actions, protecting the model from, e.g., learning to merge known unwanted tokens
585
+ # prereqs: a corpus of document to either tokenize or initialize for training
586
+ # output: none, data are ingested and structured for learning or application of a model
587
+ def init(self, docs, seed=None, method='char', apply=False, covering = [], action_protect = ''):
588
+ # self._training_data = docs; self._training_covering = covering
589
+ ##
590
+ self._doc_counts = Counter(); ds = []; cs = []; doc_index = {}
591
+ for di, doc in enumerate(docs): # tqdm(list(), desc = "Indexing and counting multiplicity of documents"):
592
+ if doc not in doc_index:
593
+ doc_index[doc] = len(ds)
594
+ ds.append(doc)
595
+ if covering:
596
+ cs.append(covering[di])
597
+ self._doc_counts[doc_index[doc]] += docs[doc] if type(docs) == Counter else 1
598
+ docs = ds; covering = cs
599
+ ##
600
+ self._init_method = method
601
+ self._action_protect = action_protect
602
+ ## guarentees a covering
603
+ self._covering = {}
604
+ self._hascover = bool(covering)
605
+ ix = 0
606
+ if covering:
607
+ # assert(len(docs) == len(covering))
608
+ for doc, segmentation in zip(docs, covering):
609
+ for s_ix, s in enumerate(segmentation):
610
+ for ch in s:
611
+ self._covering[ix] = s_ix
612
+ ix += 1
613
+ ix += 1
614
+ d_ix = 0
615
+ s_ix = max(self._covering.values()) if self._covering else -1
616
+ for doc in docs:
617
+ if d_ix + len(doc) > ix:
618
+ s_ix += 1
619
+ for ch in doc:
620
+ if d_ix > ix:
621
+ self._covering[d_ix] = s_ix
622
+ d_ix += 1
623
+ d_ix += 1
624
+ if seed:
625
+ np.random.seed(seed=seed)
626
+ offset = 0
627
+ for doc_idx, doc in enumerate(docs if apply else tqdm(docs, desc=f'Initializing')):
628
+ stream = self._init_doc(doc, method=method)
629
+ assert (sum(map(len, stream)) == len(doc))
630
+ for ix, tok in enumerate(stream):
631
+ self._unigraph[tok] += self._doc_counts[doc_idx] # 1
632
+ self._tok_idx[tok].add(offset)
633
+ for char_idx in range(offset, offset + len(tok)):
634
+ self._char2docidx[char_idx] = doc_idx #####################################
635
+ self._doc_unigraph[doc_idx][tok] += self._doc_counts[doc_idx] # 1
636
+ tok_pair = (stream[ix - 1], tok) if ix else ('', tok)
637
+ self._lefts[(offset - len(stream[ix - 1])) if ix else (offset - 1)] = tok_pair
638
+ self._rights[offset] = tok_pair
639
+ if ix:
640
+ self._digraph[tok_pair] += self._doc_counts[doc_idx] # 1
641
+ self._pair_idx[tok_pair].add(offset - len(stream[ix - 1]))
642
+ offset += len(tok)
643
+ tok_pair = (tok, '')
644
+ self._lefts[offset - len(tok)] = tok_pair
645
+ self._rights[offset] = tok_pair
646
+ offset += 1
647
+
648
+ # purpose: initialize a document for training or tokenization
649
+ # arguments:
650
+ # - d: str, representing the underlying data to be used for training, or which will be tokenized
651
+ # - method: str, indicating method for initialization. one of: 'char' (start from characters), 'warm' (start from a space-based segmentation), or 'rand' (start from a random segmentation)
652
+ # output: a list of strings, which will serve as starting points for tokenization by operation of actions
653
+ @staticmethod
654
+ def _init_doc(d, method='char'):
655
+ if method == 'char':
656
+ return d
657
+ elif method == 'warm':
658
+ return [token for token in re.split("([a-zA-Z0-9-']+)", d) if token] # tokenize(d)
659
+ elif method == 'rand':
660
+ topidx = sorted(set(
661
+ [0] + sorted(np.random.choice(np.arange(1, len(d)), size=int(len(d) / 2), replace=False)) + [len(d)]))
662
+ return [d[topidx[idx - 1]:topidx[idx]] for idx in range(1, len(topidx))]
663
+ else:
664
+ raise ValueError(f'Unrecognized document pre-processing method: {method}')
665
+
666
# purpose: decide whether merging a pair is compatible with the span-level cover
# arguments:
# - pair: tuple of strs, two token types that occur adjacent to one another
# output: bool; False as soon as one occurrence of the merged token would start and end in different
#         cover segments, or would have exactly one of its two ends inside the cover; True otherwise
def under_cover(self, pair):  # for span-level covering
    span = len("".join(pair))
    skipping = False
    for start in sorted(self._pair_idx[pair]):
        if skipping:  # overlapping occurrence inside a run of repeated tokens
            skipping = False
            continue
        # for a pair like (a, a) followed by another a, the next recorded location overlaps this one
        skipping = pair[0] == pair[1] and pair[1] == self._lefts[start + len(pair[0])][1]
        end = start + span - 1
        start_covered = start in self._covering
        end_covered = end in self._covering
        if start_covered and end_covered:
            if self._covering[start] != self._covering[end]:
                return False
        elif start_covered or end_covered:
            return False
    # no occurrence violated the cover
    return True
685
+
686
# purpose: decide whether splitting a token into wpair is compatible with the span-level cover
# arguments:
# - wpair: tuple of strs, the two pieces the joined token would be split into
# output: bool; False as soon as, at some occurrence of the joined token, either piece would start
#         and end in different cover segments; True otherwise
def split_under_cover(self, wpair):  # for span-level covering
    whole = "".join(wpair)
    left_len = len(wpair[0])
    right_len = len(wpair[1])
    for start in sorted(self._tok_idx[whole]):
        mid = start + left_len  # first character of the right-hand piece
        if self._covering[start] != self._covering[mid - 1]:
            return False
        if self._covering[mid] != self._covering[mid + right_len - 1]:
            return False
    return True
699
+
700
# purpose: report whether a string is a substring of at least one token in the covering vocabulary
# arguments: newtok: str, the candidate token
# output: bool; the answer is memoized in self._covered so each string is scanned at most once
def is_covered(self, newtok):  # for vocab-level covering
    if newtok not in self._covered:
        self._covered[newtok] = any(newtok in cover_token for cover_token in self._covering_vocab)
    return self._covered[newtok]
714
+
715
# purpose: report whether a string contains at least one token of the covering vocabulary
# arguments: newtok: str, the candidate token
# output: bool; the answer is memoized in self._covering under the string key
# NOTE(review): self._covering is also indexed by integer offsets in under_cover/split_under_cover,
# so this cache shares a dict with the span-level cover map - confirm that this is intended
def is_covering(self, newtok):  # for vocab-level covering
    if newtok not in self._covering:
        self._covering[newtok] = any(cover_token in newtok for cover_token in self._covering_vocab)
    return self._covering[newtok]
729
+
730
# purpose: trains an hr-bpe model over the system's current state of ingested data
# arguments:
# - num_batches: int, the maximum number of batches of ranked actions to process
# - batch_size: int, the number of candidate actions to rank per batch (merge and split, each)
# - actions_per_batch: int, the number of actions to sample and test per batch; defaults to, and is capped at, batch_size
# - seed: int or None, seed for numpy's global random state; None leaves the state untouched
# output: NA, the method applies the accepted actions, records them in self._action_trace, registers every
#         remaining type through self.add_type, and rebuilds self._tok2acts, self._pair2merge, self._tok2splits
#         and self._maxtoklen
def fit(self, num_batches, batch_size=1, actions_per_batch=None, seed=None):
    # compare against None rather than truthiness, so that seed=0 is honoured
    if seed is not None:
        np.random.seed(seed=seed)

    # never sample more actions than are ranked per batch
    if actions_per_batch is None or actions_per_batch > batch_size:
        actions_per_batch = batch_size

    # the protection pattern does not change while fitting, so assemble it once
    protect = ("(" + "|".join(self._action_protect) + ")") if self._action_protect else None

    def outside_vocab_cover(tok):
        # True when tok is neither a substring nor a superstring of any covering-vocabulary token
        return (not self.is_covered(tok)) and (not self.is_covering(tok))

    pbar = tqdm(total=self._early_stop, desc = 'Fitting')
    try:
        for batch in range(num_batches):
            actions = self.rank_actions(self.get_actions(batch_size, actions_per_batch))
            for action in actions:
                vsize = len(self._unigraph)
                if action.type == 'merge':
                    ## Different criteria for avoiding merges
                    newtok = "".join(action.pair)
                    if protect and re.search(protect, newtok):
                        continue
                    if self._hascover and not self.under_cover(action.pair):
                        continue
                    if self._covering_vocab and outside_vocab_cover(newtok):
                        continue
                    self.merge(action.pair)
                else:
                    ## Different criteria for avoiding splits
                    if protect and (re.search(protect, action.pair[0]) or
                                    re.search(protect, action.pair[1])):
                        continue
                    if self._hascover and not self.split_under_cover(action.pair):
                        continue
                    if self._covering_vocab and (outside_vocab_cover(action.pair[0]) or
                                                 outside_vocab_cover(action.pair[1])):
                        continue
                    self.split(action.pair)

                # only actions that were actually applied reach this point
                self._action_trace.append(action)
                pbar.update(len(self._unigraph) - vsize)
                if self.do_break_early():
                    break
            # stop on the stopping criterion, or when the batch offered no actions at all
            if self.do_break_early() or not actions:
                break
    finally:
        # release the progress bar even if an action raises
        pbar.close()

    # register the surviving types, most frequent first
    for k, _ in sorted(self._unigraph.items(), key=lambda kv: kv[1], reverse=True):
        self.add_type(k)

    # index the learned actions by the tokens they consume
    self._tok2acts = defaultdict(list)
    self._pair2merge = dict()
    self._tok2splits = defaultdict(list)
    for aix, a in enumerate(self._action_trace):
        if a.type =='split':
            self._tok2acts["".join(a.pair)].append(aix)
            self._tok2splits["".join(a.pair)].append(aix)
        else:
            self._pair2merge[tuple(a.pair)] = aix
            self._tok2acts[a.pair[0]].append(aix)
            self._tok2acts[a.pair[1]].append(aix)
    # default=0 keeps an empty vocabulary from raising ValueError
    self._maxtoklen = max((len(t) for t in self._tok2ind), default=0)

    print(f'Built a vocabulary of {len(self)} types')
803
+
804
+ # purpose: concatenate (merge) every adjacent occurrence of a given pair of token types into one token
805
+ # arguments:
806
+ # - pair: tuple of strs, indicating the types of two adjacent tokens
807
+ # output: NA (no return value); updates self._lefts, self._rights, self._digraph, self._pair_idx, self._unigraph, self._doc_unigraph and self._tok_idx in place
808
+ def merge(self, pair):
809
+ newtok = "".join(pair)
810
+
811
+ skip_next = False
812
+ locations = list(self._pair_idx[pair]) # snapshot, because self._pair_idx[pair] is mutated inside the loop
813
+ for i in sorted(locations): # i is the offset at which pair[0] starts
814
+ if skip_next: # handle odd numbers of repeated tokens
815
+ skip_next = False
816
+ continue
817
+
818
+ # gather the instance's neighbors ('' marks a document boundary)
819
+ lneighbor = self._rights[i][0] # self._rights is keyed by the offset of a pair's second token
820
+ rneighbor = self._lefts[i + len(pair[0])][1] # self._lefts is keyed by the offset of a pair's first token
821
+ skip_next = True if pair[0] == pair[1] and pair[1] == rneighbor else False # the next recorded location would overlap this merge
822
+
823
+ # delete the entries for this pair in both indices
824
+ del (self._lefts[i])
825
+ del (self._rights[i + len(pair[0])])
826
+
827
+ # gather the old left and right adjacent pairs
828
+ lpair = (lneighbor, pair[0])
829
+ rpair = (pair[1], rneighbor)
830
+
831
+ # construct new left and right adjacent pairs
832
+ newlpair = (lneighbor, newtok)
833
+ newrpair = (newtok, rneighbor)
834
+
835
+ # delete the old left and right pair from both left/right indexings
836
+ del (self._lefts[i - len(lneighbor) if lneighbor else i - 1]) # lpair; an empty left neighbor is keyed at i - 1
837
+ del (self._rights[i]) # lpair
838
+ del (self._lefts[i + len(pair[0])]) # rpair
839
+ del (self._rights[i + len(newtok)]) # rpair
840
+
841
+ # update both left and right indexings with the new left and right pairs
842
+ self._lefts[i - len(lneighbor) if lneighbor else i - 1] = newlpair
843
+ self._rights[i] = newlpair
844
+ self._lefts[i] = newrpair
845
+ self._rights[i + len(newtok)] = newrpair
846
+
847
+ texti = self._char2docidx[i] # index of the document containing offset i; its count weights every update below
848
+ # weight = self._doc_counts[texti]
849
+ # only update left co-occurrences if lneighbor is non-empty
850
+ if lneighbor: # including deleting the lpair instance from codata
851
+ self._digraph[newlpair] += self._doc_counts[texti] # 1
852
+ self._digraph[lpair] -= self._doc_counts[texti] # 1
853
+ self._pair_idx[newlpair].add(i - len(lneighbor))
854
+ self._pair_idx[lpair].remove(i - len(lneighbor))
855
+ if not self._digraph[lpair]: # drop entries whose count has reached zero
856
+ del (self._digraph[lpair])
857
+ if not self._pair_idx[lpair]: # drop entries with no remaining locations
858
+ del (self._pair_idx[lpair])
859
+
860
+ # only update right co-occurrences if rneighbor is non-empty
861
+ if rneighbor: # including deleting rpair the instance from codata
862
+ self._digraph[newrpair] += self._doc_counts[texti] # 1
863
+ self._digraph[rpair] -= self._doc_counts[texti] # 1
864
+ self._pair_idx[newrpair].add(i)
865
+ self._pair_idx[rpair].remove(i + len(pair[0]))
866
+ if not self._digraph[rpair]:
867
+ del (self._digraph[rpair])
868
+ if not self._pair_idx[rpair]:
869
+ del (self._pair_idx[rpair])
870
+
871
+ # update unigram frequencies
872
+ self._unigraph[newtok] += self._doc_counts[texti] # 1
873
+ self._unigraph[pair[0]] -= self._doc_counts[texti] # 1
874
+ self._unigraph[pair[1]] -= self._doc_counts[texti] # 1
875
+ if not self._unigraph[pair[0]]:
876
+ del (self._unigraph[pair[0]])
877
+ if not self._unigraph[pair[1]]:
878
+ del (self._unigraph[pair[1]])
879
+
880
+ # update the per-document unigram frequencies in the same way
881
+ self._doc_unigraph[texti][newtok] += self._doc_counts[texti] # 1
882
+ self._doc_unigraph[texti][pair[0]] -= self._doc_counts[texti] # 1
883
+ if not self._doc_unigraph[texti][pair[0]]:
884
+ del (self._doc_unigraph[texti][pair[0]])
885
+ self._doc_unigraph[texti][pair[1]] -= self._doc_counts[texti] # 1
886
+ if not self._doc_unigraph[texti][pair[1]]:
887
+ del (self._doc_unigraph[texti][pair[1]])
888
+
889
+ # update the token locations
890
+ self._tok_idx[newtok].add(i)
891
+ self._tok_idx[pair[0]].remove(i)
892
+ self._tok_idx[pair[1]].remove(i + len(pair[0]))
893
+ if not self._tok_idx[pair[0]]:
894
+ del (self._tok_idx[pair[0]])
895
+ if not self._tok_idx[pair[1]]:
896
+ del (self._tok_idx[pair[1]])
897
+
898
+ # delete the pair from the co-occurrence data record
899
+ self._digraph[pair] -= self._doc_counts[texti] # 1
900
+ self._pair_idx[pair].remove(i)
901
+ if not self._pair_idx[pair]:
902
+ del (self._pair_idx[pair])
903
+ if not self._digraph[pair]:
904
+ del (self._digraph[pair])
905
+
906
+ # purpose: divide (split) every occurrence of a given token type into a defined pair of types (wpair)
907
+ # arguments:
908
+ # - wpair: tuple of strs, indicating the two adjacent types into which the given token is split
909
+ # output: NA (no return value); updates self._lefts, self._rights, self._digraph, self._pair_idx, self._unigraph, self._tok_idx and self._doc_unigraph in place
910
+ def split(self, wpair):
911
+ oldtok = "".join(wpair)
912
+ locations = list(self._tok_idx[oldtok]) # snapshot, because self._tok_idx[oldtok] is mutated inside the loop
913
+ for i in sorted(locations): # i is the offset at which oldtok starts
914
+ # update the left/right and consequential digraph indices
915
+ # gather the neighbors and the old/new pairs on each side ('' marks a document boundary)
916
+ lneighbor = self._rights[i][0] # self._rights is keyed by the offset of a pair's second token
917
+ rneighbor = self._lefts[i][1] # self._lefts is keyed by the offset of a pair's first token; read before it is overwritten below
918
+ lpair = (lneighbor, oldtok)
919
+ rpair = (oldtok, rneighbor)
920
+ newlpair = (lneighbor, wpair[0])
921
+ newcpair = wpair
922
+ newrpair = (wpair[1], rneighbor)
923
+
924
+ texti = self._char2docidx[i] # index of the document containing offset i; its count weights every update below
925
+ # weight = self._doc_counts[texti]
926
+ # cpair: the new pair formed by the two pieces themselves
927
+ self._digraph[newcpair] += self._doc_counts[texti] # 1
928
+ self._pair_idx[newcpair].add(i)
929
+ self._lefts[i] = wpair
930
+ self._rights[i + len(wpair[0])] = wpair
931
+
932
+ # lpairs: replace (lneighbor, oldtok) with (lneighbor, wpair[0])
933
+ del (self._rights[i])
934
+ self._rights[i] = newlpair
935
+ del (self._lefts[i - len(lneighbor) if lneighbor else i - 1]) # an empty left neighbor is keyed at i - 1
936
+ self._lefts[i - len(lneighbor) if lneighbor else i - 1] = newlpair
937
+ if lneighbor: # co-occurrence counts are only kept for non-empty neighbors
938
+ self._digraph[newlpair] += self._doc_counts[texti] # 1
939
+ self._digraph[lpair] -= self._doc_counts[texti] # 1
940
+ self._pair_idx[newlpair].add(i - len(lneighbor))
941
+ self._pair_idx[lpair].remove(i - len(lneighbor))
942
+ if not self._digraph[lpair]: # drop entries whose count has reached zero
943
+ del self._digraph[lpair]
944
+ if not self._pair_idx[lpair]: # drop entries with no remaining locations
945
+ del (self._pair_idx[lpair])
946
+
947
+ # rpairs: replace (oldtok, rneighbor) with (wpair[1], rneighbor)
948
+ # del(left_indexed_pairs[i]) # technically, this was just overwritten w/wpair and doesn't need deletion
949
+ self._lefts[i + len(wpair[0])] = newrpair
950
+ # del(right_indexed_pairs[i+len(oldtok)])
951
+ self._rights[i + len(oldtok)] = newrpair
952
+ if rneighbor:
953
+ self._digraph[newrpair] += self._doc_counts[texti] # 1
954
+ self._digraph[rpair] -= self._doc_counts[texti] # 1
955
+ self._pair_idx[newrpair].add(i + len(wpair[0]))
956
+ self._pair_idx[rpair].remove(i)
957
+ if not self._digraph[rpair]:
958
+ del (self._digraph[rpair])
959
+ if not self._pair_idx[rpair]:
960
+ del (self._pair_idx[rpair])
961
+
962
+ # update unigram frequencies
963
+ self._unigraph[oldtok] -= self._doc_counts[texti] # 1
964
+ self._unigraph[wpair[0]] += self._doc_counts[texti] # 1
965
+ self._unigraph[wpair[1]] += self._doc_counts[texti] # 1
966
+ if not self._unigraph[oldtok]:
967
+ del self._unigraph[oldtok]
968
+
969
+ # update the token locations
970
+ self._tok_idx[oldtok].remove(i)
971
+ self._tok_idx[wpair[0]].add(i)
972
+ self._tok_idx[wpair[1]].add(i + len(wpair[0]))
973
+ if not self._tok_idx[oldtok]:
974
+ del (self._tok_idx[oldtok])
975
+
976
+ # update the per-document unigram frequencies in the same way
977
+ self._doc_unigraph[texti][oldtok] -= self._doc_counts[texti] # 1
978
+ if not self._doc_unigraph[texti][oldtok]:
979
+ del (self._doc_unigraph[texti][oldtok])
980
+ self._doc_unigraph[texti][wpair[0]] += self._doc_counts[texti] # 1
981
+ self._doc_unigraph[texti][wpair[1]] += self._doc_counts[texti] # 1
982
+
983
def get_actions(self, batch_size, actions_per_batch):
    """Propose candidate actions for one training batch (abstract hook).

    Subclasses supply the implementation (see: HRBPE from regularized, GreedyBPE from greedy).

    arguments:
    - batch_size: int, the number of potentially-optimizing actions to rank per test batch (merge and split, each)
    - actions_per_batch: int, the number of optimizing actions to sample and test for inclusion as learned rules, per test batch
    output: list of Action objects
    raises: NotImplementedError, always, in this base implementation
    """
    raise NotImplementedError
990
+
991
def rank_actions(self, actions):
    """Order candidate actions for testing (abstract hook).

    Subclasses supply the implementation (see: HRBPE from regularized, GreedyBPE from greedy).

    arguments:
    - actions: list of Action objects
    output: list of Action objects, ordered by decreasing sorting value, such as count
    raises: NotImplementedError, always, in this base implementation
    """
    raise NotImplementedError
997
+
998
def do_break_early(self):
    """Report whether training should halt (default: never).

    Subclasses override this with their own stopping criterion
    (see: HRBPE from regularized, GreedyBPE from greedy).

    arguments: NA
    output: boolean, always False in this base implementation
    """
    return False
1003
+
1004
# purpose: instantiate a standard bpe model that greedily accepts merges of highest co-frequency
# arguments: (see __init__ from base.BPE)
# prereqs: (see base.BPE)
# use methods: (see base.BPE)
# use attributes: (see base.BPE)
class GreedyBPE(BPE):

    # - tok2ind: (optional) dict, used by .load to set the index
    # - covering_vocab: set (or None for an empty set), a collection of strs that the tokenizer should consider as bounds for the result of all possible actions
    # - early_stop: int, vocabulary-size limit; training halts once the number of unigram types reaches it (a falsy value disables the limit)
    def __init__(self, tok2ind=None, covering_vocab=None, early_stop=1_000_000_000):
        # build a fresh set per instance: a `set()` default in the signature would be
        # one object shared by every GreedyBPE created without an explicit covering_vocab
        if covering_vocab is None:
            covering_vocab = set()
        super().__init__(tok2ind=tok2ind, covering_vocab = covering_vocab)
        self._early_stop = early_stop

    # purpose: return a list of merge actions for the currently most frequent adjacent pairs
    # arguments:
    # - batch_size: int, the number of most-common pairs to turn into actions
    # - _: ignored; accepted so the signature matches the base class hook
    # output: list of Action objects
    def get_actions(self, batch_size, _):
        return [Action(pair, type='merge', count=cnt) for pair, cnt in self._digraph.most_common(batch_size)]

    # purpose: rank a list of actions according to each action's count
    # arguments:
    # - actions: list of Action objects
    # output: list of Action objects, ordered by decreasing count
    def rank_actions(self, actions):
        return sorted(actions, reverse=True, key=lambda a: a.count)

    # purpose: halt the given training process when the vocabulary is the desired size or nothing useful is left to merge
    # arguments: NA
    # output: boolean, indicating whether or not a stopping criterion has been reached
    def do_break_early(self):
        ## the vocabulary size has reached the self._early_stop limit
        if self._early_stop and len(self._unigraph) >= self._early_stop:
            return True
        ## no adjacent pairs remain (previously an IndexError), or the largest co-frequency pair has count 1
        best = self.get_actions(1, 1)
        return (not best) or best[0].count == 1
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_saffu.SAFFUTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 1000000000000000019884624838656,
10
+ "tokenizer_class": "SAFFUTokenizer"
11
+ }
vocab.json ADDED
@@ -0,0 +1,4085 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t": 88,
3
+ "\t\"": 1895,
4
+ " ": 12,
5
+ " ": 484,
6
+ " ": 507,
7
+ " ": 819,
8
+ " ": 2176,
9
+ " ": 3203,
10
+ " !": 213,
11
+ " \"": 40,
12
+ " #": 1825,
13
+ " &": 748,
14
+ " '": 304,
15
+ " ''": 200,
16
+ " 'cause": 2101,
17
+ " 'd": 2881,
18
+ " 'em": 1230,
19
+ " 'll": 1497,
20
+ " 'm": 2010,
21
+ " 's": 357,
22
+ " 've": 2664,
23
+ " (": 85,
24
+ " (\"": 1339,
25
+ " ()": 1649,
26
+ " (;": 2745,
27
+ " )": 1448,
28
+ " *": 2870,
29
+ " ,": 37,
30
+ " -": 384,
31
+ " --": 361,
32
+ " .": 21,
33
+ " ..": 1613,
34
+ " ...": 610,
35
+ " 1": 729,
36
+ " 10": 903,
37
+ " 100": 1567,
38
+ " 11": 1746,
39
+ " 12": 1289,
40
+ " 13": 1867,
41
+ " 14": 1824,
42
+ " 15": 1246,
43
+ " 16": 1692,
44
+ " 17": 1855,
45
+ " 18": 1550,
46
+ " 19": 1669,
47
+ " 1950": 3812,
48
+ " 1960": 2912,
49
+ " 1970": 2732,
50
+ " 1980": 2578,
51
+ " 1990": 2684,
52
+ " 2": 697,
53
+ " 20": 893,
54
+ " 200": 3468,
55
+ " 2000": 2672,
56
+ " 2001": 3739,
57
+ " 2002": 3931,
58
+ " 2003": 4057,
59
+ " 2004": 3510,
60
+ " 2005": 3611,
61
+ " 2006": 2675,
62
+ " 2007": 3070,
63
+ " 2008": 2840,
64
+ " 2009": 3198,
65
+ " 2010": 1094,
66
+ " 2011": 1694,
67
+ " 2012": 1917,
68
+ " 2013": 1938,
69
+ " 2014": 2901,
70
+ " 2015": 3476,
71
+ " 2016": 2040,
72
+ " 2017": 2032,
73
+ " 2018": 2916,
74
+ " 2020": 3504,
75
+ " 21": 2313,
76
+ " 22": 2681,
77
+ " 23": 2899,
78
+ " 24": 2085,
79
+ " 25": 1959,
80
+ " 26": 3096,
81
+ " 27": 3246,
82
+ " 28": 3183,
83
+ " 29": 3564,
84
+ " 3": 782,
85
+ " 30": 1373,
86
+ " 31": 3999,
87
+ " 4": 1003,
88
+ " 40": 2550,
89
+ " 5": 1015,
90
+ " 50": 2048,
91
+ " 6": 1305,
92
+ " 60": 3584,
93
+ " 7": 1562,
94
+ " 8": 1484,
95
+ " 9": 1871,
96
+ " :": 1366,
97
+ " ;": 395,
98
+ " =": 112,
99
+ " ?": 41,
100
+ " A": 229,
101
+ " ALL": 3920,
102
+ " AND": 1544,
103
+ " ARE": 3820,
104
+ " About": 3338,
105
+ " Academy": 1985,
106
+ " According": 3164,
107
+ " Act": 2645,
108
+ " Adam": 3860,
109
+ " Africa": 1533,
110
+ " African": 2389,
111
+ " After": 829,
112
+ " Ah": 2985,
113
+ " Air": 2121,
114
+ " Airport": 3998,
115
+ " Alan": 3303,
116
+ " Alexander": 3756,
117
+ " Alice": 3494,
118
+ " All": 905,
119
+ " Also": 3563,
120
+ " Although": 3163,
121
+ " America": 924,
122
+ " American": 317,
123
+ " Americans": 2396,
124
+ " Amy": 3005,
125
+ " An": 2267,
126
+ " And": 356,
127
+ " Andy": 4052,
128
+ " Angeles": 2250,
129
+ " Anne": 1828,
130
+ " Another": 3706,
131
+ " April": 680,
132
+ " Are": 1668,
133
+ " Army": 1515,
134
+ " Art": 3093,
135
+ " Arthur": 3458,
136
+ " Arts": 3497,
137
+ " As": 734,
138
+ " Asia": 2755,
139
+ " Asian": 3239,
140
+ " Assembly": 3467,
141
+ " Association": 2143,
142
+ " At": 781,
143
+ " Atlantic": 3252,
144
+ " August": 689,
145
+ " Aunt": 2626,
146
+ " Australia": 1393,
147
+ " Australian": 2093,
148
+ " Austria": 3763,
149
+ " Award": 1579,
150
+ " Awards": 2807,
151
+ " B": 992,
152
+ " BBC": 4059,
153
+ " BC": 3909,
154
+ " BE": 3032,
155
+ " Bank": 3665,
156
+ " Battle": 3424,
157
+ " Bavaria": 3866,
158
+ " Bay": 2546,
159
+ " Because": 2248,
160
+ " Before": 3442,
161
+ " Bell": 4051,
162
+ " Ben": 2303,
163
+ " Berlin": 3435,
164
+ " Best": 2291,
165
+ " Bible": 3427,
166
+ " Big": 2454,
167
+ " Bill": 2078,
168
+ " Billy": 2677,
169
+ " Black": 2224,
170
+ " Blue": 3523,
171
+ " Board": 3451,
172
+ " Bob": 2808,
173
+ " Book": 3963,
174
+ " Boston": 3072,
175
+ " Both": 3418,
176
+ " Brazil": 2887,
177
+ " Brazilian": 3714,
178
+ " Britain": 2472,
179
+ " British": 669,
180
+ " Brown": 2209,
181
+ " But": 404,
182
+ " By": 1420,
183
+ " C": 1011,
184
+ " California": 1109,
185
+ " Can": 2705,
186
+ " Canada": 1536,
187
+ " Canadian": 2154,
188
+ " Captain": 1614,
189
+ " Carolina": 2826,
190
+ " Catholic": 2252,
191
+ " Center": 2242,
192
+ " Central": 1968,
193
+ " Chairman": 3635,
194
+ " Championship": 2031,
195
+ " Championships": 3513,
196
+ " Charles": 1744,
197
+ " Charlie": 2909,
198
+ " Chicago": 1811,
199
+ " Chief": 2417,
200
+ " China": 1463,
201
+ " Chinese": 1596,
202
+ " Chris": 3193,
203
+ " Christ": 2237,
204
+ " Christian": 1781,
205
+ " Christmas": 1489,
206
+ " Church": 1329,
207
+ " City": 764,
208
+ " Civil": 3438,
209
+ " Club": 2456,
210
+ " College": 1407,
211
+ " Colonel": 2878,
212
+ " Columbia": 3738,
213
+ " Come": 1396,
214
+ " Committee": 2589,
215
+ " Company": 2264,
216
+ " Congress": 2375,
217
+ " Council": 1399,
218
+ " County": 551,
219
+ " Court": 1773,
220
+ " Cup": 1236,
221
+ " D": 1243,
222
+ " DNA": 3455,
223
+ " DO": 3194,
224
+ " DON'T": 3686,
225
+ " Dad": 1903,
226
+ " Daddy": 4053,
227
+ " Dan": 3001,
228
+ " Daniel": 3656,
229
+ " Danny": 3682,
230
+ " David": 1300,
231
+ " Day": 2097,
232
+ " De": 3972,
233
+ " December": 708,
234
+ " Democratic": 2524,
235
+ " Department": 2319,
236
+ " Dick": 3016,
237
+ " Did": 2460,
238
+ " Director": 3644,
239
+ " District": 1453,
240
+ " Division": 2478,
241
+ " Do": 1244,
242
+ " Doctor": 2604,
243
+ " Don": 3367,
244
+ " Don't": 1571,
245
+ " Dr": 1075,
246
+ " Duke": 3560,
247
+ " During": 1665,
248
+ " Dutch": 2379,
249
+ " E": 1492,
250
+ " Each": 3600,
251
+ " Earth": 1594,
252
+ " East": 1172,
253
+ " Eastern": 3087,
254
+ " Education": 3919,
255
+ " Edward": 2740,
256
+ " Egypt": 3122,
257
+ " Elizabeth": 3328,
258
+ " Emperor": 3605,
259
+ " Empire": 2348,
260
+ " England": 869,
261
+ " English": 653,
262
+ " Europe": 1226,
263
+ " European": 1273,
264
+ " Even": 3342,
265
+ " F": 1842,
266
+ " FOR": 2799,
267
+ " Father": 2178,
268
+ " February": 745,
269
+ " Federal": 3924,
270
+ " Festival": 3149,
271
+ " Film": 3503,
272
+ " First": 1833,
273
+ " Florida": 2275,
274
+ " Football": 2969,
275
+ " For": 811,
276
+ " Force": 3067,
277
+ " Forest": 3793,
278
+ " Fox": 2271,
279
+ " France": 631,
280
+ " Francisco": 3322,
281
+ " Frank": 2103,
282
+ " French": 760,
283
+ " Friday": 2748,
284
+ " From": 1595,
285
+ " G": 1880,
286
+ " Games": 2414,
287
+ " General": 1299,
288
+ " George": 1141,
289
+ " Georgia": 3283,
290
+ " German": 925,
291
+ " Germany": 1032,
292
+ " Get": 2554,
293
+ " Girl": 3882,
294
+ " Go": 2306,
295
+ " God": 364,
296
+ " God's": 3471,
297
+ " Golden": 3681,
298
+ " Good": 1433,
299
+ " Government": 2295,
300
+ " Governor": 2637,
301
+ " Grand": 2470,
302
+ " Great": 1386,
303
+ " Greek": 2035,
304
+ " Green": 2096,
305
+ " Group": 2525,
306
+ " H": 2006,
307
+ " HAVE": 3125,
308
+ " Hall": 1832,
309
+ " Harry": 2837,
310
+ " Have": 3052,
311
+ " He": 106,
312
+ " He's": 2269,
313
+ " Health": 3978,
314
+ " Hello": 2646,
315
+ " Henry": 1763,
316
+ " Her": 1320,
317
+ " Here": 2025,
318
+ " Hey": 1298,
319
+ " Hi": 2475,
320
+ " High": 1802,
321
+ " Hill": 2479,
322
+ " His": 568,
323
+ " Holy": 3382,
324
+ " Home": 3724,
325
+ " Hong": 3717,
326
+ " House": 1010,
327
+ " How": 999,
328
+ " However": 970,
329
+ " I": 19,
330
+ " I'd": 606,
331
+ " I'll": 283,
332
+ " I'm": 124,
333
+ " I've": 310,
334
+ " II": 1190,
335
+ " III": 3062,
336
+ " IN": 1597,
337
+ " IS": 2160,
338
+ " IT": 1592,
339
+ " If": 809,
340
+ " Illinois": 2385,
341
+ " In": 179,
342
+ " India": 1153,
343
+ " Indian": 1281,
344
+ " Institute": 2544,
345
+ " International": 1466,
346
+ " Internet": 3913,
347
+ " Iowa": 2773,
348
+ " Iran": 3631,
349
+ " Ireland": 2452,
350
+ " Irish": 3008,
351
+ " Is": 1347,
352
+ " Island": 1656,
353
+ " Islands": 3670,
354
+ " Israel": 3176,
355
+ " It": 119,
356
+ " It's": 796,
357
+ " Italian": 1620,
358
+ " Italy": 1771,
359
+ " Its": 1460,
360
+ " J": 1776,
361
+ " Jack": 1365,
362
+ " Jackson": 3039,
363
+ " James": 1270,
364
+ " Jane": 2907,
365
+ " January": 598,
366
+ " Japan": 1025,
367
+ " Japanese": 1134,
368
+ " Jerry": 3872,
369
+ " Jersey": 3158,
370
+ " Jesus": 1705,
371
+ " Jewish": 2708,
372
+ " Jews": 3848,
373
+ " Jim": 2374,
374
+ " Jimmy": 3134,
375
+ " Jo": 2221,
376
+ " Joe": 1941,
377
+ " John": 571,
378
+ " Johnny": 3201,
379
+ " Johnson": 2993,
380
+ " Jones": 3251,
381
+ " Joseph": 2929,
382
+ " Jr": 3152,
383
+ " July": 683,
384
+ " June": 651,
385
+ " Just": 1853,
386
+ " Justice": 4025,
387
+ " K": 2567,
388
+ " KNOW": 3248,
389
+ " Kansas": 3056,
390
+ " Kentucky": 3937,
391
+ " Kim": 3381,
392
+ " King": 679,
393
+ " Kingdom": 1723,
394
+ " Kong": 3349,
395
+ " Korea": 2982,
396
+ " Korean": 3412,
397
+ " L": 2029,
398
+ " La": 2817,
399
+ " Lady": 2298,
400
+ " Lake": 2346,
401
+ " Later": 3079,
402
+ " Latin": 2585,
403
+ " Law": 3160,
404
+ " League": 990,
405
+ " Lee": 2201,
406
+ " Let": 2488,
407
+ " Let's": 3050,
408
+ " Life": 3217,
409
+ " Like": 2876,
410
+ " Line": 3770,
411
+ " Little": 1989,
412
+ " London": 854,
413
+ " Long": 3980,
414
+ " Look": 2927,
415
+ " Lord": 1005,
416
+ " Los": 2167,
417
+ " Louis": 2593,
418
+ " Love": 3020,
419
+ " M": 1357,
420
+ " ME": 2109,
421
+ " MY": 3018,
422
+ " Majesty": 2814,
423
+ " Major": 2644,
424
+ " Man": 1840,
425
+ " Many": 1893,
426
+ " March": 635,
427
+ " Maria": 3470,
428
+ " Mark": 2343,
429
+ " Martin": 2287,
430
+ " Mary": 1517,
431
+ " Massachusetts": 3450,
432
+ " Master": 2377,
433
+ " Max": 3935,
434
+ " May": 605,
435
+ " Me": 2726,
436
+ " Mexican": 3932,
437
+ " Mexico": 2060,
438
+ " Michael": 1464,
439
+ " Michigan": 3461,
440
+ " Middle": 3296,
441
+ " Mike": 2772,
442
+ " Minister": 1355,
443
+ " Miss": 890,
444
+ " Missouri": 3765,
445
+ " Mom": 2557,
446
+ " Monday": 3208,
447
+ " Moscow": 4035,
448
+ " Most": 2090,
449
+ " Mother": 2324,
450
+ " Mount": 3634,
451
+ " Mr": 292,
452
+ " Mrs": 655,
453
+ " Museum": 2719,
454
+ " Music": 2575,
455
+ " My": 1206,
456
+ " N": 2335,
457
+ " NOT": 3276,
458
+ " National": 714,
459
+ " Nations": 3615,
460
+ " Navy": 3299,
461
+ " Netherlands": 3454,
462
+ " New": 332,
463
+ " News": 4023,
464
+ " No": 359,
465
+ " North": 772,
466
+ " Northern": 2690,
467
+ " Not": 1565,
468
+ " Nothing": 3883,
469
+ " November": 719,
470
+ " Now": 1294,
471
+ " O": 1643,
472
+ " OF": 1052,
473
+ " OK": 1454,
474
+ " ON": 2194,
475
+ " October": 674,
476
+ " Of": 1618,
477
+ " Office": 3417,
478
+ " Oh": 537,
479
+ " Ohio": 3528,
480
+ " Okay": 1348,
481
+ " Old": 1684,
482
+ " Olympic": 3457,
483
+ " Olympics": 2482,
484
+ " On": 704,
485
+ " One": 894,
486
+ " Only": 3344,
487
+ " Open": 3859,
488
+ " Other": 2792,
489
+ " Oxford": 4081,
490
+ " P": 1527,
491
+ " Pacific": 3282,
492
+ " Pakistan": 3408,
493
+ " Paris": 1408,
494
+ " Park": 1342,
495
+ " Parliament": 2614,
496
+ " Party": 1160,
497
+ " Paul": 1371,
498
+ " Pennsylvania": 1661,
499
+ " People": 2422,
500
+ " Peter": 995,
501
+ " Philadelphia": 3870,
502
+ " Please": 3699,
503
+ " Poland": 3691,
504
+ " Police": 3790,
505
+ " Polish": 4034,
506
+ " President": 865,
507
+ " Prime": 2453,
508
+ " Prince": 1675,
509
+ " Princess": 2545,
510
+ " Prize": 3324,
511
+ " Professor": 2627,
512
+ " Province": 2866,
513
+ " Queen": 1697,
514
+ " R": 1591,
515
+ " Radio": 4039,
516
+ " Ray": 3415,
517
+ " Really": 3992,
518
+ " Records": 3098,
519
+ " Red": 1921,
520
+ " Republic": 1800,
521
+ " Republican": 2897,
522
+ " Richard": 1854,
523
+ " Right": 2512,
524
+ " River": 1114,
525
+ " Road": 2175,
526
+ " Robert": 1696,
527
+ " Rock": 3587,
528
+ " Roman": 1663,
529
+ " Rome": 2439,
530
+ " Rose": 3174,
531
+ " Royal": 1792,
532
+ " Russia": 2183,
533
+ " Russian": 1648,
534
+ " S": 1356,
535
+ " Saint": 3264,
536
+ " Sam": 2583,
537
+ " San": 1511,
538
+ " Santa": 3768,
539
+ " Sarah": 3675,
540
+ " Saturday": 2715,
541
+ " School": 962,
542
+ " Science": 3896,
543
+ " Scotland": 2528,
544
+ " Scott": 3063,
545
+ " Scottish": 3464,
546
+ " Sea": 2421,
547
+ " Second": 3323,
548
+ " Secretary": 2739,
549
+ " See": 3053,
550
+ " Senate": 2987,
551
+ " September": 673,
552
+ " Service": 3230,
553
+ " She": 249,
554
+ " She's": 3747,
555
+ " Show": 3472,
556
+ " Simon": 3752,
557
+ " Since": 2561,
558
+ " Sir": 1404,
559
+ " Smith": 2153,
560
+ " So": 822,
561
+ " Society": 2640,
562
+ " Some": 1177,
563
+ " South": 675,
564
+ " Southern": 2841,
565
+ " Soviet": 1873,
566
+ " Spain": 2120,
567
+ " Spanish": 1736,
568
+ " St": 1017,
569
+ " Star": 3912,
570
+ " State": 942,
571
+ " States": 355,
572
+ " Station": 4069,
573
+ " Steve": 3273,
574
+ " Street": 1416,
575
+ " Summer": 2458,
576
+ " Sun": 3074,
577
+ " Sunday": 2122,
578
+ " Super": 4029,
579
+ " Supreme": 3814,
580
+ " Susan": 4072,
581
+ " Sweden": 2742,
582
+ " Swedish": 3720,
583
+ " Switzerland": 2785,
584
+ " T": 1577,
585
+ " THAT": 1974,
586
+ " THE": 585,
587
+ " THIS": 2898,
588
+ " TO": 908,
589
+ " TV": 1322,
590
+ " Take": 3842,
591
+ " Taylor": 3960,
592
+ " Texas": 1644,
593
+ " Thank": 1847,
594
+ " Thanks": 3490,
595
+ " That": 870,
596
+ " That's": 1225,
597
+ " The": 71,
598
+ " Their": 2255,
599
+ " Then": 891,
600
+ " There": 542,
601
+ " There's": 3740,
602
+ " These": 1018,
603
+ " They": 288,
604
+ " This": 290,
605
+ " Thomas": 2012,
606
+ " Three": 3129,
607
+ " Time": 3254,
608
+ " Times": 3407,
609
+ " To": 1120,
610
+ " Tokyo": 4024,
611
+ " Tom": 1581,
612
+ " Tommy": 3505,
613
+ " Tony": 3105,
614
+ " Town": 3446,
615
+ " Township": 3669,
616
+ " Two": 2207,
617
+ " U": 712,
618
+ " UK": 2390,
619
+ " US": 1303,
620
+ " Uncle": 2796,
621
+ " Union": 1430,
622
+ " United": 300,
623
+ " University": 580,
624
+ " V": 2195,
625
+ " Valley": 2860,
626
+ " Victoria": 3716,
627
+ " Virginia": 2017,
628
+ " W": 2457,
629
+ " WAS": 2936,
630
+ " WE": 3818,
631
+ " WHAT": 3888,
632
+ " WITH": 3520,
633
+ " Wales": 2945,
634
+ " Walter": 3969,
635
+ " War": 802,
636
+ " Washington": 1490,
637
+ " We": 686,
638
+ " We're": 3292,
639
+ " Well": 998,
640
+ " West": 996,
641
+ " Western": 2116,
642
+ " What": 350,
643
+ " What's": 1817,
644
+ " When": 713,
645
+ " Where": 1667,
646
+ " While": 2532,
647
+ " White": 1877,
648
+ " Who": 1674,
649
+ " Why": 1069,
650
+ " Will": 2523,
651
+ " William": 1429,
652
+ " Williams": 3828,
653
+ " With": 1431,
654
+ " Women's": 3399,
655
+ " World": 599,
656
+ " X": 2398,
657
+ " YOU": 815,
658
+ " YOUR": 3206,
659
+ " Yeah": 619,
660
+ " Year": 3127,
661
+ " Yes": 583,
662
+ " York": 593,
663
+ " You": 293,
664
+ " You're": 1500,
665
+ " Young": 2698,
666
+ " Your": 1520,
667
+ " Zealand": 2896,
668
+ " [": 535,
669
+ " ]": 724,
670
+ " _": 779,
671
+ " `": 1058,
672
+ " ``": 800,
673
+ " a": 11,
674
+ " ability": 2413,
675
+ " able": 499,
676
+ " about": 62,
677
+ " above": 935,
678
+ " absolutely": 1801,
679
+ " accept": 1754,
680
+ " accepted": 2502,
681
+ " access": 2041,
682
+ " accident": 1757,
683
+ " according": 1866,
684
+ " account": 1627,
685
+ " acid": 3757,
686
+ " acquired": 4027,
687
+ " across": 716,
688
+ " act": 1186,
689
+ " acting": 2105,
690
+ " action": 1248,
691
+ " actions": 3250,
692
+ " active": 2106,
693
+ " activities": 2688,
694
+ " activity": 2560,
695
+ " actor": 1436,
696
+ " actress": 1981,
697
+ " acts": 3509,
698
+ " actual": 2279,
699
+ " actually": 331,
700
+ " add": 1729,
701
+ " added": 1284,
702
+ " addition": 1970,
703
+ " additional": 3172,
704
+ " address": 2054,
705
+ " administration": 3893,
706
+ " admit": 2911,
707
+ " adopted": 3365,
708
+ " adult": 3270,
709
+ " advance": 3463,
710
+ " advanced": 3484,
711
+ " advantage": 2813,
712
+ " advice": 2443,
713
+ " affair": 3704,
714
+ " affairs": 3811,
715
+ " affected": 3569,
716
+ " afford": 2904,
717
+ " afraid": 818,
718
+ " after": 159,
719
+ " afternoon": 1421,
720
+ " afterwards": 2844,
721
+ " again": 185,
722
+ " against": 333,
723
+ " age": 557,
724
+ " aged": 1646,
725
+ " agent": 2703,
726
+ " ago": 545,
727
+ " agree": 1569,
728
+ " agreed": 1882,
729
+ " agreement": 3113,
730
+ " ah": 2877,
731
+ " ahead": 1022,
732
+ " aid": 3453,
733
+ " ain't": 1008,
734
+ " air": 707,
735
+ " aircraft": 2777,
736
+ " airport": 3391,
737
+ " album": 699,
738
+ " albums": 3642,
739
+ " alive": 1119,
740
+ " all": 50,
741
+ " allow": 1603,
742
+ " allowed": 1163,
743
+ " allows": 3258,
744
+ " almost": 572,
745
+ " alone": 597,
746
+ " along": 480,
747
+ " already": 440,
748
+ " alright": 1170,
749
+ " also": 138,
750
+ " alternative": 4080,
751
+ " although": 1224,
752
+ " always": 250,
753
+ " am": 207,
754
+ " amazing": 1971,
755
+ " among": 813,
756
+ " amount": 1296,
757
+ " an": 60,
758
+ " analysis": 3284,
759
+ " ancient": 1745,
760
+ " and": 10,
761
+ " anger": 3774,
762
+ " angle": 3556,
763
+ " angry": 1688,
764
+ " animal": 1633,
765
+ " animals": 1183,
766
+ " announced": 1390,
767
+ " annual": 3429,
768
+ " another": 248,
769
+ " answer": 794,
770
+ " answered": 1227,
771
+ " answers": 3650,
772
+ " anxious": 3989,
773
+ " any": 153,
774
+ " anybody": 1209,
775
+ " anymore": 1140,
776
+ " anyone": 803,
777
+ " anything": 256,
778
+ " anyway": 1001,
779
+ " anywhere": 1770,
780
+ " apart": 1891,
781
+ " apartment": 2650,
782
+ " apparently": 3107,
783
+ " appear": 2046,
784
+ " appearance": 2320,
785
+ " appeared": 1097,
786
+ " appears": 2261,
787
+ " apple": 3268,
788
+ " application": 3347,
789
+ " applied": 3375,
790
+ " apply": 3364,
791
+ " appointed": 2145,
792
+ " appreciate": 3034,
793
+ " approach": 2562,
794
+ " appropriate": 3910,
795
+ " approximately": 3558,
796
+ " are": 33,
797
+ " area": 473,
798
+ " areas": 1256,
799
+ " aren't": 986,
800
+ " argument": 3097,
801
+ " arm": 1462,
802
+ " armed": 3522,
803
+ " arms": 1260,
804
+ " army": 1287,
805
+ " around": 242,
806
+ " arranged": 3721,
807
+ " arrest": 2863,
808
+ " arrested": 2651,
809
+ " arrive": 4079,
810
+ " arrived": 1632,
811
+ " art": 1068,
812
+ " article": 2962,
813
+ " artist": 2067,
814
+ " artists": 2928,
815
+ " arts": 3668,
816
+ " as": 32,
817
+ " ashamed": 3876,
818
+ " aside": 3306,
819
+ " ask": 427,
820
+ " asked": 465,
821
+ " asking": 1223,
822
+ " asleep": 2216,
823
+ " ass": 1718,
824
+ " assistant": 3232,
825
+ " associated": 2989,
826
+ " association": 3962,
827
+ " assume": 3491,
828
+ " at": 36,
829
+ " ate": 2830,
830
+ " atmosphere": 3685,
831
+ " attached": 4005,
832
+ " attack": 1043,
833
+ " attacked": 2856,
834
+ " attacks": 3352,
835
+ " attempt": 2359,
836
+ " attend": 3868,
837
+ " attended": 2915,
838
+ " attention": 1367,
839
+ " attitude": 3487,
840
+ " audience": 2660,
841
+ " aunt": 3671,
842
+ " author": 2336,
843
+ " authorities": 3392,
844
+ " authority": 2556,
845
+ " available": 1456,
846
+ " average": 1474,
847
+ " avoid": 2770,
848
+ " awake": 3880,
849
+ " award": 2798,
850
+ " awarded": 2485,
851
+ " awards": 3797,
852
+ " aware": 2341,
853
+ " away": 199,
854
+ " awful": 2308,
855
+ " b": 3742,
856
+ " babies": 3199,
857
+ " baby": 547,
858
+ " back": 121,
859
+ " background": 2975,
860
+ " bad": 417,
861
+ " badly": 3495,
862
+ " bag": 1512,
863
+ " balance": 3516,
864
+ " ball": 1045,
865
+ " balls": 3552,
866
+ " band": 850,
867
+ " bank": 1481,
868
+ " banks": 3713,
869
+ " bar": 1975,
870
+ " base": 1722,
871
+ " baseball": 3662,
872
+ " based": 795,
873
+ " basic": 2615,
874
+ " basically": 2066,
875
+ " basis": 2431,
876
+ " basketball": 3286,
877
+ " bastard": 3255,
878
+ " bath": 3180,
879
+ " bathroom": 3515,
880
+ " battle": 1537,
881
+ " be": 30,
882
+ " beach": 2631,
883
+ " bear": 1168,
884
+ " beat": 1368,
885
+ " beating": 3841,
886
+ " beautiful": 663,
887
+ " beauty": 2068,
888
+ " became": 349,
889
+ " because": 147,
890
+ " become": 550,
891
+ " becomes": 1844,
892
+ " becoming": 2053,
893
+ " bed": 720,
894
+ " bedroom": 3723,
895
+ " been": 90,
896
+ " beer": 3110,
897
+ " before": 170,
898
+ " beg": 3397,
899
+ " began": 475,
900
+ " begin": 1507,
901
+ " beginning": 1051,
902
+ " begins": 2890,
903
+ " begun": 3832,
904
+ " behavior": 3337,
905
+ " behind": 611,
906
+ " being": 215,
907
+ " beings": 4068,
908
+ " belief": 3799,
909
+ " believe": 379,
910
+ " believed": 1604,
911
+ " bell": 3555,
912
+ " belong": 2734,
913
+ " belongs": 3499,
914
+ " below": 1412,
915
+ " beneath": 3318,
916
+ " benefit": 3051,
917
+ " beside": 2605,
918
+ " besides": 3869,
919
+ " best": 320,
920
+ " bet": 1615,
921
+ " better": 259,
922
+ " between": 269,
923
+ " beyond": 1645,
924
+ " big": 277,
925
+ " bigger": 1797,
926
+ " biggest": 2055,
927
+ " bill": 2648,
928
+ " billion": 2750,
929
+ " bird": 1470,
930
+ " birds": 1815,
931
+ " birth": 1920,
932
+ " birthday": 1477,
933
+ " bit": 341,
934
+ " bitch": 2164,
935
+ " bite": 2976,
936
+ " bits": 3530,
937
+ " black": 698,
938
+ " blame": 2810,
939
+ " blind": 2761,
940
+ " block": 2387,
941
+ " blocks": 3440,
942
+ " blood": 849,
943
+ " bloody": 2049,
944
+ " blow": 1870,
945
+ " blue": 958,
946
+ " board": 1417,
947
+ " boat": 1370,
948
+ " boats": 4017,
949
+ " bodies": 2489,
950
+ " body": 554,
951
+ " bomb": 3173,
952
+ " bone": 3734,
953
+ " bones": 3336,
954
+ " book": 462,
955
+ " books": 948,
956
+ " border": 2610,
957
+ " born": 377,
958
+ " boss": 1904,
959
+ " both": 308,
960
+ " bother": 2804,
961
+ " bottle": 2434,
962
+ " bottom": 1458,
963
+ " bought": 1208,
964
+ " bound": 2940,
965
+ " bow": 3796,
966
+ " box": 959,
967
+ " boy": 435,
968
+ " boyfriend": 3290,
969
+ " boys": 866,
970
+ " brain": 1415,
971
+ " branch": 2625,
972
+ " branches": 3703,
973
+ " brand": 3705,
974
+ " brave": 3144,
975
+ " bread": 2198,
976
+ " break": 888,
977
+ " breakfast": 2003,
978
+ " breaking": 2868,
979
+ " breath": 2234,
980
+ " bridge": 1788,
981
+ " brief": 3557,
982
+ " bright": 2113,
983
+ " brilliant": 3380,
984
+ " bring": 573,
985
+ " bringing": 2508,
986
+ " brings": 3136,
987
+ " broad": 3295,
988
+ " broadcast": 3602,
989
+ " broke": 1249,
990
+ " broken": 1382,
991
+ " brother": 632,
992
+ " brothers": 1930,
993
+ " brought": 603,
994
+ " brown": 2013,
995
+ " buddy": 3204,
996
+ " budget": 3215,
997
+ " build": 1321,
998
+ " building": 775,
999
+ " buildings": 2193,
1000
+ " built": 770,
1001
+ " bullet": 4011,
1002
+ " bunch": 2730,
1003
+ " buried": 2301,
1004
+ " burn": 3021,
1005
+ " burned": 3465,
1006
+ " burning": 3159,
1007
+ " bus": 1799,
1008
+ " business": 559,
1009
+ " busy": 1501,
1010
+ " but": 58,
1011
+ " butter": 3874,
1012
+ " button": 3269,
1013
+ " buy": 834,
1014
+ " buying": 3538,
1015
+ " by": 45,
1016
+ " bye": 2490,
1017
+ " c": 3479,
1018
+ " ca": 3195,
1019
+ " cake": 2437,
1020
+ " call": 297,
1021
+ " called": 220,
1022
+ " calling": 1410,
1023
+ " calls": 1732,
1024
+ " calm": 2404,
1025
+ " came": 212,
1026
+ " camera": 2199,
1027
+ " camp": 1913,
1028
+ " campaign": 2169,
1029
+ " can": 64,
1030
+ " can't": 166,
1031
+ " cancer": 1898,
1032
+ " candidate": 3500,
1033
+ " cannot": 798,
1034
+ " cap": 4042,
1035
+ " capable": 3534,
1036
+ " capacity": 3591,
1037
+ " capital": 1149,
1038
+ " captain": 2084,
1039
+ " captured": 3325,
1040
+ " car": 393,
1041
+ " carbon": 3609,
1042
+ " card": 1786,
1043
+ " cards": 2857,
1044
+ " care": 432,
1045
+ " career": 743,
1046
+ " careful": 1523,
1047
+ " carefully": 2579,
1048
+ " carried": 1440,
1049
+ " carry": 1293,
1050
+ " carrying": 2718,
1051
+ " cars": 1759,
1052
+ " case": 464,
1053
+ " cases": 1516,
1054
+ " cash": 2596,
1055
+ " cast": 2236,
1056
+ " castle": 2573,
1057
+ " cat": 1402,
1058
+ " catch": 1217,
1059
+ " cats": 3950,
1060
+ " caught": 1136,
1061
+ " cause": 1080,
1062
+ " caused": 1319,
1063
+ " causes": 2769,
1064
+ " cell": 1841,
1065
+ " cells": 2408,
1066
+ " census": 1686,
1067
+ " center": 1491,
1068
+ " central": 1918,
1069
+ " centre": 2080,
1070
+ " centuries": 3676,
1071
+ " century": 827,
1072
+ " ceremony": 3881,
1073
+ " certain": 799,
1074
+ " certainly": 1152,
1075
+ " chain": 2843,
1076
+ " chair": 1387,
1077
+ " chairman": 3908,
1078
+ " challenge": 2913,
1079
+ " champion": 3257,
1080
+ " championship": 3483,
1081
+ " chance": 807,
1082
+ " change": 513,
1083
+ " changed": 853,
1084
+ " changes": 1447,
1085
+ " changing": 2658,
1086
+ " channel": 2803,
1087
+ " character": 1124,
1088
+ " characters": 2082,
1089
+ " charge": 1372,
1090
+ " charged": 3847,
1091
+ " charges": 3546,
1092
+ " chart": 3648,
1093
+ " cheap": 3537,
1094
+ " check": 1061,
1095
+ " checked": 3710,
1096
+ " cheese": 2926,
1097
+ " chemical": 3116,
1098
+ " chest": 3151,
1099
+ " chicken": 2629,
1100
+ " chief": 1901,
1101
+ " child": 579,
1102
+ " childhood": 3798,
1103
+ " children": 409,
1104
+ " children's": 3918,
1105
+ " chocolate": 3680,
1106
+ " choice": 1413,
1107
+ " choose": 1751,
1108
+ " chose": 2864,
1109
+ " chosen": 2670,
1110
+ " church": 1020,
1111
+ " circle": 2697,
1112
+ " circumstances": 3131,
1113
+ " cities": 1834,
1114
+ " citizens": 3396,
1115
+ " city": 338,
1116
+ " civil": 2367,
1117
+ " claim": 2552,
1118
+ " claimed": 3214,
1119
+ " claims": 3834,
1120
+ " class": 993,
1121
+ " classes": 2827,
1122
+ " clean": 1262,
1123
+ " clear": 863,
1124
+ " clearly": 2134,
1125
+ " clever": 3378,
1126
+ " client": 3373,
1127
+ " climate": 2972,
1128
+ " climb": 3971,
1129
+ " clock": 3459,
1130
+ " close": 627,
1131
+ " closed": 1401,
1132
+ " closely": 3265,
1133
+ " closer": 2484,
1134
+ " clothes": 1292,
1135
+ " club": 889,
1136
+ " clubs": 3954,
1137
+ " coach": 1836,
1138
+ " coast": 2249,
1139
+ " coat": 2382,
1140
+ " code": 1957,
1141
+ " coffee": 1493,
1142
+ " cold": 910,
1143
+ " collection": 2378,
1144
+ " college": 1314,
1145
+ " color": 1282,
1146
+ " colors": 3387,
1147
+ " colour": 3084,
1148
+ " combination": 3983,
1149
+ " combined": 3968,
1150
+ " come": 134,
1151
+ " comedy": 2847,
1152
+ " comes": 530,
1153
+ " comfort": 3708,
1154
+ " comfortable": 2656,
1155
+ " coming": 360,
1156
+ " command": 2328,
1157
+ " commercial": 2954,
1158
+ " committed": 3133,
1159
+ " committee": 3038,
1160
+ " common": 879,
1161
+ " commonly": 3223,
1162
+ " commune": 1054,
1163
+ " communication": 3826,
1164
+ " communities": 3846,
1165
+ " community": 1088,
1166
+ " companies": 1960,
1167
+ " company": 630,
1168
+ " compared": 3061,
1169
+ " competed": 3242,
1170
+ " competition": 1937,
1171
+ " complete": 1552,
1172
+ " completed": 2400,
1173
+ " completely": 1297,
1174
+ " complex": 2463,
1175
+ " complicated": 3437,
1176
+ " composed": 3329,
1177
+ " computer": 1361,
1178
+ " computers": 3976,
1179
+ " concept": 3154,
1180
+ " concern": 3361,
1181
+ " concerned": 2185,
1182
+ " concert": 3410,
1183
+ " condition": 2034,
1184
+ " conditions": 2023,
1185
+ " conference": 3186,
1186
+ " confidence": 3782,
1187
+ " confirmed": 3878,
1188
+ " conflict": 3817,
1189
+ " confused": 3806,
1190
+ " connected": 2584,
1191
+ " connection": 2766,
1192
+ " consider": 2065,
1193
+ " considered": 1238,
1194
+ " consists": 3460,
1195
+ " constant": 3175,
1196
+ " construction": 2330,
1197
+ " contact": 1979,
1198
+ " contains": 2531,
1199
+ " content": 2551,
1200
+ " continue": 1628,
1201
+ " continued": 1332,
1202
+ " continues": 2922,
1203
+ " contract": 1997,
1204
+ " control": 857,
1205
+ " conversation": 2345,
1206
+ " convinced": 3993,
1207
+ " cook": 2590,
1208
+ " cooking": 3654,
1209
+ " cool": 1286,
1210
+ " cop": 3852,
1211
+ " copies": 3359,
1212
+ " cops": 3927,
1213
+ " copy": 2680,
1214
+ " corner": 1605,
1215
+ " correct": 2449,
1216
+ " cos": 1060,
1217
+ " cost": 1234,
1218
+ " costs": 2793,
1219
+ " could": 122,
1220
+ " couldn't": 676,
1221
+ " council": 2566,
1222
+ " count": 2011,
1223
+ " countries": 1077,
1224
+ " country": 504,
1225
+ " county": 1029,
1226
+ " couple": 821,
1227
+ " courage": 3393,
1228
+ " course": 299,
1229
+ " court": 1210,
1230
+ " cousin": 2984,
1231
+ " cover": 1335,
1232
+ " covered": 1651,
1233
+ " covers": 3755,
1234
+ " cow": 3060,
1235
+ " crap": 3730,
1236
+ " crash": 3639,
1237
+ " crazy": 1040,
1238
+ " cream": 2819,
1239
+ " create": 1503,
1240
+ " created": 901,
1241
+ " creating": 3482,
1242
+ " creation": 3694,
1243
+ " creature": 3153,
1244
+ " credit": 2654,
1245
+ " crew": 2309,
1246
+ " cried": 1126,
1247
+ " crime": 1657,
1248
+ " criminal": 3049,
1249
+ " crisis": 3991,
1250
+ " critical": 3319,
1251
+ " cross": 1856,
1252
+ " crossed": 3647,
1253
+ " crowd": 2908,
1254
+ " crown": 4046,
1255
+ " cry": 1630,
1256
+ " crying": 2222,
1257
+ " cultural": 3475,
1258
+ " culture": 2005,
1259
+ " cup": 1954,
1260
+ " curious": 3300,
1261
+ " current": 1280,
1262
+ " currently": 1902,
1263
+ " customers": 3940,
1264
+ " cut": 732,
1265
+ " cute": 3041,
1266
+ " cutting": 3803,
1267
+ " d": 3333,
1268
+ " da": 3111,
1269
+ " dad": 1039,
1270
+ " daddy": 1590,
1271
+ " daily": 2957,
1272
+ " damage": 2294,
1273
+ " damn": 1859,
1274
+ " dance": 1221,
1275
+ " dancing": 2711,
1276
+ " danger": 2157,
1277
+ " dangerous": 1711,
1278
+ " dare": 2138,
1279
+ " dark": 989,
1280
+ " darkness": 3238,
1281
+ " darling": 2538,
1282
+ " data": 1264,
1283
+ " date": 1233,
1284
+ " dates": 3986,
1285
+ " dating": 3975,
1286
+ " daughter": 693,
1287
+ " daughters": 3974,
1288
+ " day": 188,
1289
+ " days": 388,
1290
+ " de": 613,
1291
+ " dead": 498,
1292
+ " deal": 735,
1293
+ " dealing": 3376,
1294
+ " dear": 727,
1295
+ " death": 472,
1296
+ " debt": 3745,
1297
+ " debut": 2380,
1298
+ " decide": 2086,
1299
+ " decided": 980,
1300
+ " decision": 1513,
1301
+ " decisions": 3758,
1302
+ " deck": 4047,
1303
+ " declared": 2495,
1304
+ " deep": 1232,
1305
+ " deeply": 3955,
1306
+ " defeat": 3394,
1307
+ " defeated": 2653,
1308
+ " defense": 3957,
1309
+ " defined": 3894,
1310
+ " definitely": 1980,
1311
+ " degree": 1784,
1312
+ " degrees": 2977,
1313
+ " demand": 3140,
1314
+ " department": 872,
1315
+ " depends": 3196,
1316
+ " describe": 3237,
1317
+ " described": 1673,
1318
+ " desert": 3664,
1319
+ " deserve": 3624,
1320
+ " design": 1543,
1321
+ " designed": 1933,
1322
+ " desire": 2574,
1323
+ " desk": 3493,
1324
+ " despite": 3593,
1325
+ " destroy": 2728,
1326
+ " destroyed": 2200,
1327
+ " detail": 3576,
1328
+ " details": 2800,
1329
+ " determined": 2930,
1330
+ " develop": 3030,
1331
+ " developed": 1443,
1332
+ " development": 1278,
1333
+ " device": 3425,
1334
+ " devil": 3813,
1335
+ " did": 108,
1336
+ " didn't": 180,
1337
+ " die": 710,
1338
+ " died": 336,
1339
+ " difference": 1330,
1340
+ " different": 302,
1341
+ " difficult": 1087,
1342
+ " difficulty": 3831,
1343
+ " dinner": 1073,
1344
+ " direct": 2355,
1345
+ " directed": 1313,
1346
+ " direction": 1747,
1347
+ " directly": 2110,
1348
+ " director": 1485,
1349
+ " dirty": 2371,
1350
+ " disappeared": 2903,
1351
+ " discovered": 1623,
1352
+ " discuss": 3103,
1353
+ " discussion": 3762,
1354
+ " disease": 1805,
1355
+ " display": 3772,
1356
+ " distance": 1608,
1357
+ " distributed": 3386,
1358
+ " distribution": 3619,
1359
+ " district": 1012,
1360
+ " districts": 3660,
1361
+ " divided": 2064,
1362
+ " division": 2263,
1363
+ " do": 42,
1364
+ " doctor": 1062,
1365
+ " doctors": 3512,
1366
+ " does": 243,
1367
+ " doesn't": 325,
1368
+ " dog": 883,
1369
+ " dogs": 2102,
1370
+ " doing": 204,
1371
+ " dollars": 1780,
1372
+ " don": 2419,
1373
+ " don't": 70,
1374
+ " done": 257,
1375
+ " door": 490,
1376
+ " doors": 2751,
1377
+ " double": 1988,
1378
+ " doubt": 1525,
1379
+ " down": 130,
1380
+ " downstairs": 4082,
1381
+ " dozen": 3616,
1382
+ " drama": 2388,
1383
+ " draw": 1405,
1384
+ " drawing": 2780,
1385
+ " drawn": 2924,
1386
+ " dream": 1272,
1387
+ " dreams": 2285,
1388
+ " dress": 1635,
1389
+ " dressed": 2515,
1390
+ " drew": 2293,
1391
+ " drink": 840,
1392
+ " drinking": 2591,
1393
+ " drive": 1133,
1394
+ " driven": 3884,
1395
+ " driver": 2622,
1396
+ " driving": 1863,
1397
+ " drop": 1621,
1398
+ " dropped": 1765,
1399
+ " drove": 3043,
1400
+ " drug": 2368,
1401
+ " drugs": 2429,
1402
+ " drunk": 2569,
1403
+ " dry": 1838,
1404
+ " duck": 3578,
1405
+ " dude": 3733,
1406
+ " due": 966,
1407
+ " during": 376,
1408
+ " dust": 3589,
1409
+ " duty": 2028,
1410
+ " dying": 2265,
1411
+ " e": 2665,
1412
+ " each": 312,
1413
+ " ear": 2720,
1414
+ " earlier": 1476,
1415
+ " early": 565,
1416
+ " earn": 4045,
1417
+ " earned": 3092,
1418
+ " ears": 2163,
1419
+ " earth": 1259,
1420
+ " easier": 2203,
1421
+ " easily": 1719,
1422
+ " east": 1341,
1423
+ " eastern": 2905,
1424
+ " easy": 808,
1425
+ " eat": 527,
1426
+ " eaten": 3191,
1427
+ " eating": 1578,
1428
+ " economic": 2340,
1429
+ " economy": 2892,
1430
+ " edge": 2061,
1431
+ " edition": 3737,
1432
+ " editor": 3690,
1433
+ " education": 1309,
1434
+ " effect": 1423,
1435
+ " effective": 3434,
1436
+ " effects": 2491,
1437
+ " effort": 2481,
1438
+ " efforts": 3320,
1439
+ " egg": 2854,
1440
+ " eggs": 2117,
1441
+ " eh": 2714,
1442
+ " eight": 843,
1443
+ " eighty": 2971,
1444
+ " either": 643,
1445
+ " elected": 1638,
1446
+ " election": 1336,
1447
+ " elections": 3462,
1448
+ " electric": 3165,
1449
+ " element": 3432,
1450
+ " elements": 2669,
1451
+ " eleven": 2992,
1452
+ " else": 363,
1453
+ " emergency": 3307,
1454
+ " empty": 2083,
1455
+ " end": 309,
1456
+ " ended": 1444,
1457
+ " ending": 3897,
1458
+ " ends": 2307,
1459
+ " enemies": 3389,
1460
+ " enemy": 1789,
1461
+ " energy": 1132,
1462
+ " engaged": 3014,
1463
+ " engine": 2070,
1464
+ " engineer": 4007,
1465
+ " engineering": 3990,
1466
+ " enjoy": 1909,
1467
+ " enjoyed": 3353,
1468
+ " enough": 330,
1469
+ " enter": 2342,
1470
+ " entered": 1750,
1471
+ " entire": 1242,
1472
+ " entirely": 2520,
1473
+ " entrance": 3736,
1474
+ " environment": 2461,
1475
+ " episode": 2220,
1476
+ " episodes": 3939,
1477
+ " equal": 1219,
1478
+ " equipment": 3130,
1479
+ " er": 193,
1480
+ " era": 3637,
1481
+ " erm": 305,
1482
+ " escape": 1864,
1483
+ " escaped": 3776,
1484
+ " especially": 1182,
1485
+ " established": 1514,
1486
+ " estate": 3355,
1487
+ " etc": 2764,
1488
+ " even": 190,
1489
+ " evening": 1028,
1490
+ " event": 1283,
1491
+ " events": 1553,
1492
+ " eventually": 1829,
1493
+ " ever": 289,
1494
+ " every": 284,
1495
+ " everybody": 983,
1496
+ " everyone": 878,
1497
+ " everything": 370,
1498
+ " everywhere": 2339,
1499
+ " evidence": 1334,
1500
+ " evil": 2038,
1501
+ " exact": 3327,
1502
+ " exactly": 832,
1503
+ " example": 649,
1504
+ " examples": 3317,
1505
+ " excellent": 3109,
1506
+ " except": 1395,
1507
+ " exchange": 3188,
1508
+ " excited": 2689,
1509
+ " exciting": 3632,
1510
+ " exclaimed": 3916,
1511
+ " excuse": 2415,
1512
+ " executive": 3767,
1513
+ " exercise": 3362,
1514
+ " exist": 2366,
1515
+ " existence": 2961,
1516
+ " existing": 3949,
1517
+ " expect": 1449,
1518
+ " expected": 1905,
1519
+ " expensive": 2791,
1520
+ " experience": 1154,
1521
+ " experiment": 3946,
1522
+ " explain": 1482,
1523
+ " explained": 3162,
1524
+ " express": 4038,
1525
+ " expression": 2376,
1526
+ " extended": 3688,
1527
+ " extent": 3925,
1528
+ " extra": 2002,
1529
+ " extremely": 2674,
1530
+ " eye": 1113,
1531
+ " eyes": 401,
1532
+ " f": 3141,
1533
+ " face": 398,
1534
+ " faces": 2673,
1535
+ " fact": 536,
1536
+ " factor": 3851,
1537
+ " factory": 3449,
1538
+ " facts": 3719,
1539
+ " fail": 3856,
1540
+ " failed": 2131,
1541
+ " failure": 2657,
1542
+ " fair": 1409,
1543
+ " fairly": 2812,
1544
+ " faith": 2210,
1545
+ " fall": 973,
1546
+ " fallen": 3010,
1547
+ " falling": 2564,
1548
+ " falls": 3298,
1549
+ " false": 3210,
1550
+ " familiar": 2965,
1551
+ " families": 1822,
1552
+ " family": 328,
1553
+ " famous": 1038,
1554
+ " fan": 3623,
1555
+ " fancy": 3083,
1556
+ " fans": 3809,
1557
+ " far": 438,
1558
+ " farm": 2442,
1559
+ " fashion": 2828,
1560
+ " fast": 984,
1561
+ " faster": 2572,
1562
+ " fat": 2170,
1563
+ " fate": 3218,
1564
+ " father": 367,
1565
+ " father's": 2692,
1566
+ " fault": 1702,
1567
+ " favor": 2763,
1568
+ " favorite": 2420,
1569
+ " favour": 3901,
1570
+ " fear": 1269,
1571
+ " feature": 2955,
1572
+ " featured": 3042,
1573
+ " features": 1977,
1574
+ " federal": 2685,
1575
+ " feed": 2504,
1576
+ " feel": 334,
1577
+ " feeling": 949,
1578
+ " feelings": 2329,
1579
+ " feels": 2317,
1580
+ " feet": 765,
1581
+ " fell": 916,
1582
+ " fellow": 1539,
1583
+ " felt": 709,
1584
+ " female": 1699,
1585
+ " festival": 3914,
1586
+ " few": 368,
1587
+ " fiction": 3895,
1588
+ " field": 1036,
1589
+ " fields": 2721,
1590
+ " fifteen": 2594,
1591
+ " fifth": 2842,
1592
+ " fifty": 1469,
1593
+ " fight": 826,
1594
+ " fighting": 1659,
1595
+ " figure": 1013,
1596
+ " figured": 3722,
1597
+ " figures": 2636,
1598
+ " file": 2459,
1599
+ " fill": 2576,
1600
+ " filled": 2000,
1601
+ " film": 751,
1602
+ " films": 2746,
1603
+ " final": 1027,
1604
+ " finally": 1161,
1605
+ " financial": 2516,
1606
+ " find": 223,
1607
+ " finding": 2402,
1608
+ " finds": 2849,
1609
+ " fine": 540,
1610
+ " finger": 2416,
1611
+ " fingers": 2352,
1612
+ " finish": 1391,
1613
+ " finished": 955,
1614
+ " fire": 662,
1615
+ " fired": 2542,
1616
+ " firm": 2606,
1617
+ " first": 126,
1618
+ " fish": 1090,
1619
+ " fishing": 3388,
1620
+ " fit": 1538,
1621
+ " five": 369,
1622
+ " fix": 1878,
1623
+ " fixed": 2321,
1624
+ " flag": 3377,
1625
+ " flat": 2129,
1626
+ " flew": 2696,
1627
+ " flight": 2204,
1628
+ " floor": 1078,
1629
+ " flow": 3403,
1630
+ " flower": 3065,
1631
+ " flowers": 1588,
1632
+ " fly": 1575,
1633
+ " flying": 2177,
1634
+ " focus": 2505,
1635
+ " folks": 2941,
1636
+ " follow": 1185,
1637
+ " followed": 1131,
1638
+ " following": 741,
1639
+ " follows": 3022,
1640
+ " food": 667,
1641
+ " fool": 2243,
1642
+ " foot": 1267,
1643
+ " football": 805,
1644
+ " for": 22,
1645
+ " force": 1074,
1646
+ " forced": 1862,
1647
+ " forces": 1690,
1648
+ " foreign": 2350,
1649
+ " forest": 1810,
1650
+ " forever": 2152,
1651
+ " forget": 974,
1652
+ " forgive": 2638,
1653
+ " forgot": 1774,
1654
+ " forgotten": 2468,
1655
+ " form": 701,
1656
+ " formed": 1394,
1657
+ " former": 752,
1658
+ " forms": 1987,
1659
+ " formula": 1671,
1660
+ " forth": 2107,
1661
+ " fortune": 3536,
1662
+ " forty": 2128,
1663
+ " forward": 1099,
1664
+ " fought": 2418,
1665
+ " found": 240,
1666
+ " founded": 1398,
1667
+ " four": 322,
1668
+ " fourth": 1761,
1669
+ " frame": 3871,
1670
+ " free": 742,
1671
+ " freedom": 2506,
1672
+ " frequently": 4031,
1673
+ " fresh": 2056,
1674
+ " friend": 483,
1675
+ " friendly": 3259,
1676
+ " friends": 581,
1677
+ " friendship": 3938,
1678
+ " frightened": 3294,
1679
+ " frog": 2192,
1680
+ " from": 44,
1681
+ " front": 688,
1682
+ " fruit": 2836,
1683
+ " fuck": 1337,
1684
+ " fucking": 927,
1685
+ " fuel": 3715,
1686
+ " full": 604,
1687
+ " fully": 2630,
1688
+ " fun": 848,
1689
+ " function": 1589,
1690
+ " functions": 3658,
1691
+ " funeral": 3666,
1692
+ " funny": 1023,
1693
+ " further": 1042,
1694
+ " future": 941,
1695
+ " gain": 3115,
1696
+ " gained": 3390,
1697
+ " game": 485,
1698
+ " games": 1021,
1699
+ " garden": 1666,
1700
+ " gas": 1721,
1701
+ " gate": 2783,
1702
+ " gathered": 3759,
1703
+ " gave": 453,
1704
+ " gay": 2284,
1705
+ " general": 861,
1706
+ " generally": 1935,
1707
+ " generation": 2943,
1708
+ " gentle": 4001,
1709
+ " gentleman": 2633,
1710
+ " gentlemen": 2147,
1711
+ " genus": 3641,
1712
+ " get": 75,
1713
+ " gets": 771,
1714
+ " getting": 396,
1715
+ " ghost": 3951,
1716
+ " giant": 2539,
1717
+ " gift": 2351,
1718
+ " girl": 373,
1719
+ " girlfriend": 3036,
1720
+ " girls": 858,
1721
+ " give": 210,
1722
+ " given": 544,
1723
+ " gives": 1240,
1724
+ " giving": 1066,
1725
+ " glad": 1138,
1726
+ " glass": 1764,
1727
+ " glasses": 3754,
1728
+ " global": 3858,
1729
+ " go": 77,
1730
+ " goal": 1914,
1731
+ " goals": 2693,
1732
+ " god": 1598,
1733
+ " gods": 3334,
1734
+ " goes": 514,
1735
+ " going": 103,
1736
+ " gold": 1079,
1737
+ " golden": 2599,
1738
+ " gone": 471,
1739
+ " gonna": 162,
1740
+ " good": 128,
1741
+ " goodness": 3192,
1742
+ " goods": 3227,
1743
+ " got": 94,
1744
+ " gotta": 756,
1745
+ " gotten": 3279,
1746
+ " government": 628,
1747
+ " governor": 3542,
1748
+ " grab": 3907,
1749
+ " grade": 3805,
1750
+ " graduated": 3626,
1751
+ " grand": 2774,
1752
+ " grandfather": 4022,
1753
+ " granted": 3829,
1754
+ " grass": 2553,
1755
+ " grateful": 3953,
1756
+ " grave": 3055,
1757
+ " gray": 3947,
1758
+ " great": 233,
1759
+ " greater": 2004,
1760
+ " greatest": 1973,
1761
+ " greatly": 4036,
1762
+ " green": 1049,
1763
+ " grew": 1245,
1764
+ " ground": 896,
1765
+ " group": 564,
1766
+ " groups": 1351,
1767
+ " grow": 1427,
1768
+ " growing": 1839,
1769
+ " grown": 2365,
1770
+ " growth": 2717,
1771
+ " guard": 2756,
1772
+ " guess": 652,
1773
+ " guest": 2885,
1774
+ " guests": 3649,
1775
+ " guilty": 2727,
1776
+ " guitar": 3379,
1777
+ " gun": 1158,
1778
+ " guns": 2395,
1779
+ " guy": 532,
1780
+ " guys": 538,
1781
+ " ha": 2071,
1782
+ " had": 54,
1783
+ " hadn't": 2356,
1784
+ " hair": 825,
1785
+ " half": 516,
1786
+ " hall": 2119,
1787
+ " hand": 378,
1788
+ " handle": 2043,
1789
+ " hands": 574,
1790
+ " handsome": 3161,
1791
+ " hang": 1953,
1792
+ " hanging": 2753,
1793
+ " happen": 696,
1794
+ " happened": 386,
1795
+ " happening": 1392,
1796
+ " happens": 975,
1797
+ " happiness": 2953,
1798
+ " happy": 609,
1799
+ " hard": 463,
1800
+ " harder": 3466,
1801
+ " hardly": 2072,
1802
+ " harm": 3080,
1803
+ " has": 81,
1804
+ " hasn't": 1782,
1805
+ " hat": 1626,
1806
+ " hate": 1250,
1807
+ " have": 28,
1808
+ " haven't": 582,
1809
+ " having": 433,
1810
+ " he": 29,
1811
+ " he'd": 1755,
1812
+ " he'll": 1860,
1813
+ " he's": 267,
1814
+ " head": 321,
1815
+ " headed": 3789,
1816
+ " headquarters": 3653,
1817
+ " heads": 2088,
1818
+ " health": 1327,
1819
+ " healthy": 3640,
1820
+ " hear": 408,
1821
+ " heard": 414,
1822
+ " hearing": 2527,
1823
+ " heart": 469,
1824
+ " hearts": 3521,
1825
+ " heat": 2127,
1826
+ " heaven": 2754,
1827
+ " heavily": 3875,
1828
+ " heavy": 1374,
1829
+ " height": 3190,
1830
+ " held": 659,
1831
+ " hell": 768,
1832
+ " hello": 2859,
1833
+ " help": 247,
1834
+ " helped": 1354,
1835
+ " helping": 2423,
1836
+ " helps": 2920,
1837
+ " her": 53,
1838
+ " here": 76,
1839
+ " here's": 2019,
1840
+ " hero": 2701,
1841
+ " herself": 956,
1842
+ " hey": 1634,
1843
+ " hi": 3967,
1844
+ " hidden": 2823,
1845
+ " hide": 1999,
1846
+ " hiding": 2833,
1847
+ " high": 495,
1848
+ " higher": 1442,
1849
+ " highest": 2001,
1850
+ " highly": 2666,
1851
+ " hill": 2273,
1852
+ " hills": 3590,
1853
+ " him": 63,
1854
+ " himself": 424,
1855
+ " hired": 3332,
1856
+ " his": 38,
1857
+ " historical": 3222,
1858
+ " history": 758,
1859
+ " hit": 754,
1860
+ " hm": 4016,
1861
+ " hockey": 3621,
1862
+ " hold": 670,
1863
+ " holding": 1672,
1864
+ " holds": 3121,
1865
+ " hole": 1940,
1866
+ " holiday": 3316,
1867
+ " home": 218,
1868
+ " homes": 3219,
1869
+ " honest": 2189,
1870
+ " honey": 1197,
1871
+ " honor": 2240,
1872
+ " honour": 3603,
1873
+ " hope": 618,
1874
+ " hoped": 4066,
1875
+ " hoping": 3044,
1876
+ " horrible": 3228,
1877
+ " horse": 1139,
1878
+ " horses": 2079,
1879
+ " hospital": 1166,
1880
+ " host": 2811,
1881
+ " hot": 964,
1882
+ " hotel": 2027,
1883
+ " hour": 860,
1884
+ " hours": 737,
1885
+ " house": 278,
1886
+ " houses": 1820,
1887
+ " how": 142,
1888
+ " however": 934,
1889
+ " huge": 1522,
1890
+ " huh": 851,
1891
+ " human": 718,
1892
+ " humans": 2433,
1893
+ " hundred": 646,
1894
+ " hundreds": 3059,
1895
+ " hung": 3167,
1896
+ " hungry": 1922,
1897
+ " hunting": 3091,
1898
+ " hurry": 2125,
1899
+ " hurt": 885,
1900
+ " hurts": 4018,
1901
+ " husband": 838,
1902
+ " i": 263,
1903
+ " i'll": 2310,
1904
+ " i'm": 1568,
1905
+ " ice": 1312,
1906
+ " idea": 448,
1907
+ " ideas": 1576,
1908
+ " identified": 3945,
1909
+ " identity": 3525,
1910
+ " idiot": 3095,
1911
+ " if": 78,
1912
+ " ill": 2462,
1913
+ " illegal": 4067,
1914
+ " image": 2173,
1915
+ " images": 3985,
1916
+ " imagine": 1652,
1917
+ " immediately": 1586,
1918
+ " impact": 3433,
1919
+ " importance": 3168,
1920
+ " important": 452,
1921
+ " impossible": 1894,
1922
+ " improve": 3802,
1923
+ " in": 13,
1924
+ " incident": 3845,
1925
+ " include": 943,
1926
+ " included": 1275,
1927
+ " includes": 1791,
1928
+ " including": 694,
1929
+ " income": 3040,
1930
+ " increase": 2044,
1931
+ " increased": 2609,
1932
+ " increasing": 4030,
1933
+ " incredible": 3995,
1934
+ " indeed": 1317,
1935
+ " independence": 3786,
1936
+ " independent": 2191,
1937
+ " individual": 1775,
1938
+ " individuals": 3124,
1939
+ " industrial": 3780,
1940
+ " industry": 1967,
1941
+ " influence": 2156,
1942
+ " information": 856,
1943
+ " inhabitants": 3371,
1944
+ " initially": 4061,
1945
+ " injured": 3771,
1946
+ " injury": 3702,
1947
+ " innocent": 2950,
1948
+ " inside": 677,
1949
+ " inspired": 3944,
1950
+ " instance": 2738,
1951
+ " instant": 3785,
1952
+ " instead": 940,
1953
+ " insurance": 3287,
1954
+ " intelligence": 3804,
1955
+ " intended": 3157,
1956
+ " interest": 1111,
1957
+ " interested": 1389,
1958
+ " interesting": 1095,
1959
+ " interests": 3620,
1960
+ " international": 1540,
1961
+ " interview": 2888,
1962
+ " into": 125,
1963
+ " introduce": 3586,
1964
+ " introduced": 2166,
1965
+ " invented": 3970,
1966
+ " investigation": 3302,
1967
+ " invited": 2663,
1968
+ " involved": 1188,
1969
+ " iron": 2522,
1970
+ " is": 16,
1971
+ " island": 1235,
1972
+ " islands": 3508,
1973
+ " isn't": 407,
1974
+ " issue": 1689,
1975
+ " issued": 3775,
1976
+ " issues": 1995,
1977
+ " it": 18,
1978
+ " it ": 3221,
1979
+ " it'll": 1886,
1980
+ " it's": 97,
1981
+ " items": 3712,
1982
+ " its": 173,
1983
+ " itself": 952,
1984
+ " jail": 2741,
1985
+ " job": 489,
1986
+ " jobs": 2466,
1987
+ " join": 1557,
1988
+ " joined": 1214,
1989
+ " joint": 3863,
1990
+ " joke": 2448,
1991
+ " journalist": 3930,
1992
+ " journey": 2363,
1993
+ " joy": 2710,
1994
+ " judge": 2274,
1995
+ " juice": 3026,
1996
+ " jump": 2205,
1997
+ " jumped": 3687,
1998
+ " just": 73,
1999
+ " justice": 2900,
2000
+ " keep": 318,
2001
+ " keeping": 2179,
2002
+ " keeps": 2619,
2003
+ " kept": 897,
2004
+ " key": 1290,
2005
+ " keys": 3212,
2006
+ " kick": 3003,
2007
+ " kid": 1105,
2008
+ " kidding": 2608,
2009
+ " kids": 875,
2010
+ " kill": 561,
2011
+ " killed": 594,
2012
+ " killer": 3075,
2013
+ " killing": 2100,
2014
+ " kind": 275,
2015
+ " kinda": 4070,
2016
+ " kinds": 1892,
2017
+ " king": 931,
2018
+ " kingdom": 3025,
2019
+ " kiss": 1816,
2020
+ " kitchen": 1875,
2021
+ " knees": 3788,
2022
+ " knew": 430,
2023
+ " knife": 2716,
2024
+ " knock": 3331,
2025
+ " knocked": 3784,
2026
+ " know": 46,
2027
+ " knowing": 2358,
2028
+ " knowledge": 1738,
2029
+ " known": 271,
2030
+ " knows": 791,
2031
+ " l": 2832,
2032
+ " la": 1731,
2033
+ " label": 3474,
2034
+ " labor": 3801,
2035
+ " lack": 2768,
2036
+ " lad": 3595,
2037
+ " ladies": 2159,
2038
+ " lady": 967,
2039
+ " laid": 1876,
2040
+ " lake": 2662,
2041
+ " land": 672,
2042
+ " lands": 3707,
2043
+ " language": 909,
2044
+ " languages": 2202,
2045
+ " large": 600,
2046
+ " larger": 1934,
2047
+ " largest": 1222,
2048
+ " last": 216,
2049
+ " late": 596,
2050
+ " later": 323,
2051
+ " latter": 2919,
2052
+ " laugh": 1726,
2053
+ " laughed": 2246,
2054
+ " laughing": 2300,
2055
+ " launched": 3182,
2056
+ " law": 835,
2057
+ " laws": 2299,
2058
+ " lawyer": 2206,
2059
+ " lay": 1135,
2060
+ " lead": 1048,
2061
+ " leader": 1486,
2062
+ " leaders": 2980,
2063
+ " leading": 1793,
2064
+ " leads": 3506,
2065
+ " league": 2315,
2066
+ " learn": 1019,
2067
+ " learned": 1505,
2068
+ " learning": 1982,
2069
+ " least": 577,
2070
+ " leave": 371,
2071
+ " leaves": 1528,
2072
+ " leaving": 1046,
2073
+ " led": 991,
2074
+ " left": 264,
2075
+ " leg": 2158,
2076
+ " legal": 2238,
2077
+ " legs": 1541,
2078
+ " length": 1580,
2079
+ " less": 645,
2080
+ " lesson": 3549,
2081
+ " let": 262,
2082
+ " let's": 478,
2083
+ " letter": 985,
2084
+ " letters": 1724,
2085
+ " letting": 3850,
2086
+ " level": 953,
2087
+ " levels": 2802,
2088
+ " library": 2722,
2089
+ " lie": 1378,
2090
+ " lies": 2124,
2091
+ " life": 197,
2092
+ " lift": 3315,
2093
+ " lifted": 3929,
2094
+ " light": 569,
2095
+ " lights": 2497,
2096
+ " like": 59,
2097
+ " liked": 1535,
2098
+ " likely": 1593,
2099
+ " likes": 1952,
2100
+ " limit": 3766,
2101
+ " limited": 2874,
2102
+ " line": 562,
2103
+ " lines": 1506,
2104
+ " link": 3781,
2105
+ " lips": 2549,
2106
+ " list": 960,
2107
+ " listed": 3139,
2108
+ " listen": 881,
2109
+ " listened": 3618,
2110
+ " listening": 2218,
2111
+ " literally": 3973,
2112
+ " literature": 3368,
2113
+ " little": 132,
2114
+ " live": 382,
2115
+ " lived": 691,
2116
+ " lives": 804,
2117
+ " living": 665,
2118
+ " load": 3535,
2119
+ " local": 841,
2120
+ " located": 1122,
2121
+ " location": 2360,
2122
+ " lock": 3023,
2123
+ " locked": 2952,
2124
+ " lonely": 3326,
2125
+ " long": 206,
2126
+ " longer": 842,
2127
+ " look": 167,
2128
+ " looked": 517,
2129
+ " looking": 380,
2130
+ " looks": 656,
2131
+ " loose": 3372,
2132
+ " lord": 2935,
2133
+ " lose": 1092,
2134
+ " losing": 2372,
2135
+ " loss": 2074,
2136
+ " lost": 496,
2137
+ " lot": 255,
2138
+ " lots": 1704,
2139
+ " loud": 2778,
2140
+ " love": 231,
2141
+ " loved": 1251,
2142
+ " lovely": 1450,
2143
+ " loves": 2381,
2144
+ " low": 1112,
2145
+ " lower": 1573,
2146
+ " luck": 1546,
2147
+ " lucky": 1899,
2148
+ " lunch": 1929,
2149
+ " lying": 1612,
2150
+ " m": 2571,
2151
+ " ma'am": 3143,
2152
+ " machine": 1534,
2153
+ " machines": 3921,
2154
+ " mad": 1679,
2155
+ " made": 163,
2156
+ " magazine": 2430,
2157
+ " magic": 2311,
2158
+ " mail": 3879,
2159
+ " main": 747,
2160
+ " mainly": 2257,
2161
+ " major": 944,
2162
+ " majority": 2577,
2163
+ " make": 145,
2164
+ " makes": 621,
2165
+ " making": 509,
2166
+ " male": 2149,
2167
+ " man": 157,
2168
+ " man's": 2668,
2169
+ " manage": 2948,
2170
+ " managed": 2259,
2171
+ " management": 2582,
2172
+ " manager": 1951,
2173
+ " manner": 2353,
2174
+ " many": 186,
2175
+ " map": 2968,
2176
+ " mark": 2412,
2177
+ " marked": 3360,
2178
+ " market": 1388,
2179
+ " marks": 3727,
2180
+ " marriage": 1425,
2181
+ " married": 587,
2182
+ " marry": 1547,
2183
+ " mass": 2111,
2184
+ " massive": 3899,
2185
+ " master": 1655,
2186
+ " match": 1181,
2187
+ " matches": 2426,
2188
+ " mate": 2616,
2189
+ " material": 1819,
2190
+ " materials": 3314,
2191
+ " matter": 479,
2192
+ " matters": 2471,
2193
+ " may": 234,
2194
+ " maybe": 591,
2195
+ " mayor": 3741,
2196
+ " me": 34,
2197
+ " meal": 3234,
2198
+ " mean": 176,
2199
+ " meaning": 1572,
2200
+ " means": 460,
2201
+ " meant": 1115,
2202
+ " measure": 2797,
2203
+ " meat": 2327,
2204
+ " medal": 3076,
2205
+ " media": 2130,
2206
+ " medical": 1845,
2207
+ " medicine": 2607,
2208
+ " meet": 640,
2209
+ " meeting": 1030,
2210
+ " member": 684,
2211
+ " members": 820,
2212
+ " memory": 1742,
2213
+ " men": 342,
2214
+ " men's": 3792,
2215
+ " mental": 2855,
2216
+ " mention": 2383,
2217
+ " mentioned": 1887,
2218
+ " merely": 3289,
2219
+ " mess": 2392,
2220
+ " message": 1487,
2221
+ " met": 769,
2222
+ " metal": 2347,
2223
+ " meters": 3592,
2224
+ " method": 2477,
2225
+ " methods": 3343,
2226
+ " middle": 1064,
2227
+ " might": 252,
2228
+ " mile": 3683,
2229
+ " miles": 1167,
2230
+ " military": 1203,
2231
+ " milk": 1639,
2232
+ " million": 695,
2233
+ " millions": 3652,
2234
+ " mind": 403,
2235
+ " minds": 3948,
2236
+ " mine": 847,
2237
+ " minister": 2671,
2238
+ " minor": 3220,
2239
+ " minus": 1963,
2240
+ " minute": 678,
2241
+ " minutes": 657,
2242
+ " mirror": 3823,
2243
+ " miss": 1411,
2244
+ " missed": 2057,
2245
+ " missing": 1735,
2246
+ " mission": 2316,
2247
+ " mistake": 1772,
2248
+ " mixed": 2702,
2249
+ " mm": 1964,
2250
+ " model": 1432,
2251
+ " models": 3822,
2252
+ " modern": 1326,
2253
+ " mom": 1285,
2254
+ " moment": 548,
2255
+ " moments": 3873,
2256
+ " mommy": 1498,
2257
+ " money": 286,
2258
+ " monster": 3421,
2259
+ " month": 1142,
2260
+ " months": 726,
2261
+ " mood": 3915,
2262
+ " moon": 2095,
2263
+ " moral": 3725,
2264
+ " more": 102,
2265
+ " morning": 431,
2266
+ " most": 205,
2267
+ " mostly": 1609,
2268
+ " mother": 387,
2269
+ " mother's": 2990,
2270
+ " motion": 2862,
2271
+ " mountain": 1848,
2272
+ " mountains": 2652,
2273
+ " mouse": 3836,
2274
+ " mouth": 1044,
2275
+ " move": 543,
2276
+ " moved": 753,
2277
+ " movement": 1510,
2278
+ " moves": 3179,
2279
+ " movie": 501,
2280
+ " movies": 1494,
2281
+ " moving": 1127,
2282
+ " much": 156,
2283
+ " multiple": 2758,
2284
+ " mum": 2921,
2285
+ " municipality": 1556,
2286
+ " murder": 1561,
2287
+ " murdered": 3599,
2288
+ " museum": 3146,
2289
+ " music": 466,
2290
+ " musical": 2500,
2291
+ " musician": 4041,
2292
+ " must": 214,
2293
+ " my": 49,
2294
+ " myself": 584,
2295
+ " mystery": 4056,
2296
+ " n": 2902,
2297
+ " n't": 443,
2298
+ " name": 222,
2299
+ " named": 664,
2300
+ " names": 1360,
2301
+ " narrow": 3478,
2302
+ " nation": 2632,
2303
+ " national": 749,
2304
+ " native": 2535,
2305
+ " natural": 1215,
2306
+ " naturally": 4044,
2307
+ " nature": 1295,
2308
+ " near": 620,
2309
+ " nearby": 3297,
2310
+ " nearly": 1310,
2311
+ " necessarily": 3867,
2312
+ " necessary": 1617,
2313
+ " neck": 1890,
2314
+ " need": 168,
2315
+ " needed": 1150,
2316
+ " needs": 880,
2317
+ " negative": 2099,
2318
+ " neighborhood": 3731,
2319
+ " neither": 2051,
2320
+ " nervous": 2744,
2321
+ " net": 4055,
2322
+ " network": 2258,
2323
+ " never": 169,
2324
+ " new": 203,
2325
+ " news": 900,
2326
+ " newspaper": 2563,
2327
+ " next": 307,
2328
+ " nice": 418,
2329
+ " night": 253,
2330
+ " nights": 3545,
2331
+ " nine": 1024,
2332
+ " nineteen": 2613,
2333
+ " ninety": 2786,
2334
+ " no": 83,
2335
+ " nobody": 1478,
2336
+ " noise": 2139,
2337
+ " nominated": 3189,
2338
+ " none": 1563,
2339
+ " nor": 1428,
2340
+ " normal": 1461,
2341
+ " normally": 2925,
2342
+ " north": 911,
2343
+ " northern": 2155,
2344
+ " northwest": 3849,
2345
+ " nose": 1560,
2346
+ " not": 35,
2347
+ " note": 1869,
2348
+ " noted": 3102,
2349
+ " notes": 2700,
2350
+ " nothing": 315,
2351
+ " notice": 1733,
2352
+ " noticed": 2338,
2353
+ " novel": 2197,
2354
+ " now": 100,
2355
+ " nuclear": 3200,
2356
+ " number": 319,
2357
+ " numbers": 1257,
2358
+ " numerous": 3363,
2359
+ " nurse": 3448,
2360
+ " nuts": 3830,
2361
+ " o": 3047,
2362
+ " o'clock": 2370,
2363
+ " object": 2020,
2364
+ " objects": 2994,
2365
+ " observed": 3456,
2366
+ " obvious": 3166,
2367
+ " obviously": 1850,
2368
+ " occasion": 3679,
2369
+ " occupied": 4009,
2370
+ " occur": 3810,
2371
+ " occurred": 2880,
2372
+ " ocean": 3019,
2373
+ " odd": 3150,
2374
+ " of": 9,
2375
+ " off": 165,
2376
+ " offer": 1532,
2377
+ " offered": 2033,
2378
+ " office": 783,
2379
+ " officer": 1700,
2380
+ " officers": 2357,
2381
+ " official": 1640,
2382
+ " officially": 3135,
2383
+ " often": 511,
2384
+ " oh": 616,
2385
+ " oil": 1767,
2386
+ " ok": 3249,
2387
+ " okay": 296,
2388
+ " old": 217,
2389
+ " older": 1606,
2390
+ " oldest": 2822,
2391
+ " on": 23,
2392
+ " once": 362,
2393
+ " one": 52,
2394
+ " one's": 1944,
2395
+ " ones": 852,
2396
+ " online": 2974,
2397
+ " only": 139,
2398
+ " onto": 2141,
2399
+ " open": 508,
2400
+ " opened": 1034,
2401
+ " opening": 1896,
2402
+ " opera": 3485,
2403
+ " operated": 4050,
2404
+ " operating": 3540,
2405
+ " operation": 2407,
2406
+ " operations": 2973,
2407
+ " opinion": 2233,
2408
+ " opportunity": 1814,
2409
+ " opposed": 3923,
2410
+ " opposite": 2394,
2411
+ " or": 69,
2412
+ " orange": 3106,
2413
+ " order": 586,
2414
+ " ordered": 2058,
2415
+ " orders": 2021,
2416
+ " ordinary": 3073,
2417
+ " organization": 2190,
2418
+ " organizations": 4013,
2419
+ " organized": 3384,
2420
+ " origin": 3636,
2421
+ " original": 1072,
2422
+ " originally": 2039,
2423
+ " other": 127,
2424
+ " others": 666,
2425
+ " otherwise": 2628,
2426
+ " ought": 1642,
2427
+ " our": 140,
2428
+ " ours": 3385,
2429
+ " ourselves": 2062,
2430
+ " out": 66,
2431
+ " outside": 721,
2432
+ " over": 129,
2433
+ " overall": 3235,
2434
+ " owe": 3231,
2435
+ " own": 245,
2436
+ " owned": 2326,
2437
+ " owner": 2851,
2438
+ " p": 2290,
2439
+ " pack": 3502,
2440
+ " page": 1625,
2441
+ " pages": 3532,
2442
+ " paid": 1195,
2443
+ " pain": 1451,
2444
+ " paint": 3058,
2445
+ " painted": 3583,
2446
+ " painting": 2598,
2447
+ " pair": 2059,
2448
+ " palace": 2245,
2449
+ " pale": 3594,
2450
+ " pants": 3606,
2451
+ " paper": 915,
2452
+ " papers": 2364,
2453
+ " parents": 965,
2454
+ " parish": 3601,
2455
+ " park": 1664,
2456
+ " part": 246,
2457
+ " particular": 1110,
2458
+ " particularly": 1889,
2459
+ " parties": 2875,
2460
+ " partner": 2406,
2461
+ " parts": 1067,
2462
+ " party": 681,
2463
+ " pass": 1173,
2464
+ " passed": 1002,
2465
+ " passing": 2649,
2466
+ " passion": 4060,
2467
+ " past": 763,
2468
+ " path": 2009,
2469
+ " patient": 2182,
2470
+ " patients": 3184,
2471
+ " pattern": 3256,
2472
+ " pay": 638,
2473
+ " paying": 2661,
2474
+ " peace": 1524,
2475
+ " pen": 3891,
2476
+ " people": 113,
2477
+ " people's": 3514,
2478
+ " per": 1156,
2479
+ " percent": 1677,
2480
+ " perfect": 1381,
2481
+ " perfectly": 2555,
2482
+ " perform": 3209,
2483
+ " performance": 1939,
2484
+ " performed": 1976,
2485
+ " perhaps": 932,
2486
+ " period": 1007,
2487
+ " permission": 3477,
2488
+ " person": 444,
2489
+ " personal": 1363,
2490
+ " personality": 4037,
2491
+ " personally": 3906,
2492
+ " persons": 3744,
2493
+ " philosophy": 3614,
2494
+ " phone": 824,
2495
+ " photo": 3751,
2496
+ " physical": 2132,
2497
+ " piano": 3012,
2498
+ " pick": 982,
2499
+ " picked": 1884,
2500
+ " picking": 4049,
2501
+ " picture": 954,
2502
+ " pictures": 1794,
2503
+ " piece": 951,
2504
+ " pieces": 1611,
2505
+ " pig": 3241,
2506
+ " pilot": 3928,
2507
+ " pink": 3078,
2508
+ " pity": 3613,
2509
+ " place": 236,
2510
+ " placed": 1743,
2511
+ " places": 1089,
2512
+ " plain": 2886,
2513
+ " plan": 913,
2514
+ " plane": 1965,
2515
+ " planet": 2144,
2516
+ " planned": 2601,
2517
+ " planning": 2276,
2518
+ " plans": 1906,
2519
+ " plant": 1807,
2520
+ " plants": 1993,
2521
+ " plastic": 3911,
2522
+ " plate": 2914,
2523
+ " platform": 3630,
2524
+ " play": 358,
2525
+ " played": 420,
2526
+ " player": 814,
2527
+ " players": 1777,
2528
+ " playing": 711,
2529
+ " plays": 1196,
2530
+ " pleasant": 3086,
2531
+ " please": 392,
2532
+ " pleased": 2641,
2533
+ " pleasure": 1947,
2534
+ " plenty": 2683,
2535
+ " plot": 3952,
2536
+ " plus": 1509,
2537
+ " pocket": 2580,
2538
+ " poem": 4008,
2539
+ " poet": 3501,
2540
+ " poetry": 3903,
2541
+ " point": 415,
2542
+ " pointed": 3548,
2543
+ " points": 1258,
2544
+ " police": 766,
2545
+ " policy": 1827,
2546
+ " political": 1057,
2547
+ " politician": 1255,
2548
+ " politics": 2918,
2549
+ " pool": 3009,
2550
+ " poor": 774,
2551
+ " pop": 2570,
2552
+ " popular": 1091,
2553
+ " population": 788,
2554
+ " port": 3677,
2555
+ " position": 922,
2556
+ " positions": 4004,
2557
+ " positive": 1885,
2558
+ " possibility": 3808,
2559
+ " possible": 728,
2560
+ " possibly": 2266,
2561
+ " post": 2212,
2562
+ " pot": 3977,
2563
+ " potential": 2687,
2564
+ " pound": 2503,
2565
+ " pounds": 1406,
2566
+ " power": 578,
2567
+ " powerful": 1996,
2568
+ " powers": 2747,
2569
+ " practical": 3865,
2570
+ " practice": 1660,
2571
+ " pray": 3356,
2572
+ " precious": 4062,
2573
+ " prefer": 3480,
2574
+ " pregnant": 3346,
2575
+ " prepare": 3853,
2576
+ " prepared": 2217,
2577
+ " presence": 2494,
2578
+ " present": 874,
2579
+ " presented": 2848,
2580
+ " president": 1241,
2581
+ " presidential": 3892,
2582
+ " press": 1932,
2583
+ " pressure": 1762,
2584
+ " pretend": 3436,
2585
+ " pretty": 482,
2586
+ " prevent": 2906,
2587
+ " previous": 2108,
2588
+ " previously": 3024,
2589
+ " price": 1683,
2590
+ " pride": 3374,
2591
+ " priest": 3278,
2592
+ " primary": 2620,
2593
+ " prime": 3340,
2594
+ " prince": 1942,
2595
+ " princess": 3066,
2596
+ " principle": 3965,
2597
+ " print": 3958,
2598
+ " prison": 1629,
2599
+ " prisoners": 3732,
2600
+ " private": 1328,
2601
+ " prize": 3492,
2602
+ " probably": 541,
2603
+ " problem": 521,
2604
+ " problems": 945,
2605
+ " process": 1137,
2606
+ " produce": 2168,
2607
+ " produced": 1216,
2608
+ " producer": 2694,
2609
+ " product": 2251,
2610
+ " production": 1555,
2611
+ " products": 2712,
2612
+ " professional": 1239,
2613
+ " professor": 2704,
2614
+ " program": 1263,
2615
+ " programme": 3663,
2616
+ " programs": 2624,
2617
+ " progress": 3233,
2618
+ " project": 1452,
2619
+ " projects": 3750,
2620
+ " promise": 1349,
2621
+ " promised": 2118,
2622
+ " proof": 3048,
2623
+ " proper": 2450,
2624
+ " properly": 3088,
2625
+ " properties": 4078,
2626
+ " property": 1741,
2627
+ " proposed": 2937,
2628
+ " protect": 1676,
2629
+ " protection": 3028,
2630
+ " proud": 1950,
2631
+ " prove": 2133,
2632
+ " proved": 3271,
2633
+ " provide": 1874,
2634
+ " provided": 2288,
2635
+ " provides": 3697,
2636
+ " province": 1883,
2637
+ " public": 715,
2638
+ " published": 1125,
2639
+ " pull": 1467,
2640
+ " pulled": 2171,
2641
+ " punishment": 4063,
2642
+ " pure": 3089,
2643
+ " purpose": 1769,
2644
+ " push": 1956,
2645
+ " pushed": 3118,
2646
+ " put": 174,
2647
+ " puts": 3769,
2648
+ " putting": 1479,
2649
+ " quality": 2229,
2650
+ " quarter": 2757,
2651
+ " queen": 2729,
2652
+ " question": 641,
2653
+ " questions": 1076,
2654
+ " quick": 1558,
2655
+ " quickly": 1096,
2656
+ " quiet": 1403,
2657
+ " quietly": 3582,
2658
+ " quit": 3071,
2659
+ " quite": 381,
2660
+ " race": 1180,
2661
+ " racing": 3696,
2662
+ " radio": 1488,
2663
+ " railway": 2679,
2664
+ " rain": 1924,
2665
+ " raise": 2411,
2666
+ " raised": 1600,
2667
+ " ran": 923,
2668
+ " range": 1698,
2669
+ " rank": 3861,
2670
+ " rare": 3011,
2671
+ " rate": 1720,
2672
+ " rather": 648,
2673
+ " reach": 1379,
2674
+ " reached": 1071,
2675
+ " reaction": 3335,
2676
+ " read": 555,
2677
+ " reading": 1670,
2678
+ " ready": 590,
2679
+ " real": 451,
2680
+ " reality": 2397,
2681
+ " realize": 2050,
2682
+ " realized": 2771,
2683
+ " really": 152,
2684
+ " reason": 692,
2685
+ " reasons": 2047,
2686
+ " receive": 2498,
2687
+ " received": 882,
2688
+ " recent": 2530,
2689
+ " recently": 2227,
2690
+ " recognize": 3068,
2691
+ " recognized": 3006,
2692
+ " record": 950,
2693
+ " recorded": 1717,
2694
+ " recording": 2794,
2695
+ " records": 2262,
2696
+ " red": 780,
2697
+ " reduced": 3824,
2698
+ " refer": 2467,
2699
+ " reference": 3625,
2700
+ " referred": 2958,
2701
+ " refused": 2865,
2702
+ " regard": 4054,
2703
+ " regarded": 4076,
2704
+ " region": 816,
2705
+ " regional": 4015,
2706
+ " regions": 3588,
2707
+ " regular": 1749,
2708
+ " related": 1955,
2709
+ " relations": 3645,
2710
+ " relationship": 1519,
2711
+ " release": 1678,
2712
+ " released": 601,
2713
+ " relief": 3726,
2714
+ " religion": 2410,
2715
+ " religious": 2188,
2716
+ " remain": 2089,
2717
+ " remained": 1857,
2718
+ " remaining": 3277,
2719
+ " remains": 2325,
2720
+ " remember": 439,
2721
+ " remembered": 2795,
2722
+ " remove": 3749,
2723
+ " removed": 2592,
2724
+ " rent": 3979,
2725
+ " repeated": 3844,
2726
+ " replace": 3857,
2727
+ " replaced": 2087,
2728
+ " replied": 1691,
2729
+ " reply": 4021,
2730
+ " report": 1207,
2731
+ " reported": 2135,
2732
+ " reports": 3119,
2733
+ " represent": 3178,
2734
+ " represented": 2667,
2735
+ " request": 3309,
2736
+ " require": 4003,
2737
+ " required": 2334,
2738
+ " rescue": 4020,
2739
+ " research": 1338,
2740
+ " resources": 2767,
2741
+ " respect": 1622,
2742
+ " response": 2526,
2743
+ " responsibility": 2846,
2744
+ " responsible": 2076,
2745
+ " rest": 637,
2746
+ " restaurant": 3117,
2747
+ " result": 1176,
2748
+ " results": 1872,
2749
+ " retired": 1994,
2750
+ " return": 907,
2751
+ " returned": 1037,
2752
+ " returning": 3840,
2753
+ " revealed": 3568,
2754
+ " review": 3439,
2755
+ " reviews": 3753,
2756
+ " rich": 1473,
2757
+ " rid": 2232,
2758
+ " ride": 1518,
2759
+ " ridiculous": 3310,
2760
+ " riding": 3585,
2761
+ " right": 86,
2762
+ " rights": 1376,
2763
+ " ring": 1318,
2764
+ " rings": 3431,
2765
+ " rise": 2501,
2766
+ " rising": 3791,
2767
+ " risk": 1986,
2768
+ " river": 1035,
2769
+ " rivers": 4075,
2770
+ " road": 806,
2771
+ " roads": 3517,
2772
+ " rock": 1104,
2773
+ " rocks": 2873,
2774
+ " role": 1055,
2775
+ " roles": 2634,
2776
+ " roll": 2568,
2777
+ " romantic": 3395,
2778
+ " roof": 2483,
2779
+ " room": 400,
2780
+ " rooms": 3045,
2781
+ " root": 3420,
2782
+ " rose": 1900,
2783
+ " rough": 3015,
2784
+ " round": 533,
2785
+ " route": 2699,
2786
+ " row": 2725,
2787
+ " royal": 3128,
2788
+ " rule": 1554,
2789
+ " ruled": 3544,
2790
+ " rules": 1682,
2791
+ " run": 497,
2792
+ " running": 867,
2793
+ " runs": 1865,
2794
+ " rush": 3839,
2795
+ " s": 2565,
2796
+ " sad": 1756,
2797
+ " safe": 1143,
2798
+ " safety": 2499,
2799
+ " said": 96,
2800
+ " sake": 2077,
2801
+ " sale": 3441,
2802
+ " sales": 3207,
2803
+ " salt": 3693,
2804
+ " same": 224,
2805
+ " sand": 3240,
2806
+ " sang": 3827,
2807
+ " sat": 1050,
2808
+ " satisfied": 4043,
2809
+ " save": 1014,
2810
+ " saved": 1908,
2811
+ " saw": 316,
2812
+ " say": 135,
2813
+ " saying": 447,
2814
+ " says": 374,
2815
+ " scale": 2782,
2816
+ " scared": 1564,
2817
+ " scene": 1653,
2818
+ " scheme": 3943,
2819
+ " school": 348,
2820
+ " schools": 1480,
2821
+ " science": 1475,
2822
+ " scientific": 2891,
2823
+ " scientists": 3444,
2824
+ " score": 2597,
2825
+ " scored": 2612,
2826
+ " screen": 2759,
2827
+ " sea": 937,
2828
+ " search": 1693,
2829
+ " season": 660,
2830
+ " seasons": 2824,
2831
+ " seat": 1031,
2832
+ " seats": 3383,
2833
+ " second": 329,
2834
+ " seconds": 2162,
2835
+ " secret": 1237,
2836
+ " secretary": 4048,
2837
+ " section": 2073,
2838
+ " secure": 3904,
2839
+ " security": 1787,
2840
+ " see": 91,
2841
+ " seeing": 1229,
2842
+ " seek": 3108,
2843
+ " seem": 994,
2844
+ " seemed": 830,
2845
+ " seems": 812,
2846
+ " seen": 337,
2847
+ " sees": 2835,
2848
+ " selected": 2509,
2849
+ " self": 3783,
2850
+ " sell": 1397,
2851
+ " selling": 2695,
2852
+ " send": 981,
2853
+ " sending": 3304,
2854
+ " senior": 3137,
2855
+ " sense": 846,
2856
+ " sent": 700,
2857
+ " sentence": 2978,
2858
+ " separate": 2075,
2859
+ " separated": 3889,
2860
+ " series": 629,
2861
+ " serious": 1201,
2862
+ " seriously": 2752,
2863
+ " servant": 3898,
2864
+ " serve": 2036,
2865
+ " served": 961,
2866
+ " service": 873,
2867
+ " services": 1631,
2868
+ " serving": 3054,
2869
+ " set": 390,
2870
+ " sets": 3313,
2871
+ " setting": 3007,
2872
+ " settle": 3887,
2873
+ " settled": 2588,
2874
+ " settlement": 3293,
2875
+ " seven": 784,
2876
+ " several": 626,
2877
+ " severe": 4065,
2878
+ " sex": 1457,
2879
+ " sexual": 2917,
2880
+ " shadow": 3348,
2881
+ " shake": 3651,
2882
+ " shall": 519,
2883
+ " shame": 3213,
2884
+ " shape": 2016,
2885
+ " share": 1384,
2886
+ " shared": 3341,
2887
+ " sharp": 2949,
2888
+ " she": 74,
2889
+ " she'd": 3357,
2890
+ " she'll": 3197,
2891
+ " she's": 505,
2892
+ " sheep": 3094,
2893
+ " ship": 1033,
2894
+ " ships": 2469,
2895
+ " shirt": 2970,
2896
+ " shit": 1026,
2897
+ " shock": 3819,
2898
+ " shoes": 1835,
2899
+ " shook": 3366,
2900
+ " shoot": 1650,
2901
+ " shooting": 2737,
2902
+ " shop": 1739,
2903
+ " shopping": 3672,
2904
+ " shore": 3004,
2905
+ " short": 776,
2906
+ " shortly": 3900,
2907
+ " shot": 912,
2908
+ " should": 171,
2909
+ " shoulder": 2946,
2910
+ " shoulders": 3689,
2911
+ " shouldn't": 1400,
2912
+ " show": 324,
2913
+ " showed": 1383,
2914
+ " shower": 4058,
2915
+ " showing": 2292,
2916
+ " shown": 1685,
2917
+ " shows": 1175,
2918
+ " shut": 1343,
2919
+ " sick": 1179,
2920
+ " side": 399,
2921
+ " sides": 2018,
2922
+ " sight": 1530,
2923
+ " sign": 1200,
2924
+ " signal": 2659,
2925
+ " signed": 1599,
2926
+ " significant": 2548,
2927
+ " signs": 3114,
2928
+ " silence": 2543,
2929
+ " silent": 2821,
2930
+ " silly": 2140,
2931
+ " silver": 2126,
2932
+ " similar": 1178,
2933
+ " simple": 1129,
2934
+ " simply": 1302,
2935
+ " since": 375,
2936
+ " sing": 1566,
2937
+ " singer": 1753,
2938
+ " singing": 2037,
2939
+ " single": 725,
2940
+ " sir": 442,
2941
+ " sister": 892,
2942
+ " sisters": 3082,
2943
+ " sit": 739,
2944
+ " site": 1468,
2945
+ " sites": 3674,
2946
+ " sitting": 1086,
2947
+ " situation": 1162,
2948
+ " six": 524,
2949
+ " sixth": 3657,
2950
+ " sixty": 2829,
2951
+ " size": 1266,
2952
+ " skills": 3404,
2953
+ " skin": 1831,
2954
+ " sky": 1734,
2955
+ " sleep": 759,
2956
+ " sleeping": 2302,
2957
+ " slept": 3185,
2958
+ " slide": 3729,
2959
+ " slightly": 2733,
2960
+ " slow": 2148,
2961
+ " slowly": 1881,
2962
+ " small": 436,
2963
+ " smaller": 1852,
2964
+ " smart": 2349,
2965
+ " smell": 2239,
2966
+ " smile": 1931,
2967
+ " smiled": 3646,
2968
+ " smiling": 3890,
2969
+ " smoke": 2428,
2970
+ " snow": 2098,
2971
+ " so": 65,
2972
+ " social": 1116,
2973
+ " society": 1496,
2974
+ " soft": 2063,
2975
+ " software": 3170,
2976
+ " soil": 3419,
2977
+ " sold": 1211,
2978
+ " soldier": 2894,
2979
+ " soldiers": 1730,
2980
+ " solid": 3409,
2981
+ " solution": 2587,
2982
+ " solve": 3205,
2983
+ " some": 105,
2984
+ " somebody": 930,
2985
+ " somehow": 2706,
2986
+ " someone": 546,
2987
+ " something": 160,
2988
+ " sometimes": 722,
2989
+ " somewhat": 3126,
2990
+ " somewhere": 1247,
2991
+ " son": 419,
2992
+ " song": 633,
2993
+ " songs": 1499,
2994
+ " sons": 2749,
2995
+ " soon": 487,
2996
+ " sooner": 3821,
2997
+ " sorry": 354,
2998
+ " sort": 468,
2999
+ " sorts": 3402,
3000
+ " soul": 1495,
3001
+ " sound": 797,
3002
+ " sounds": 1352,
3003
+ " soup": 4071,
3004
+ " source": 2008,
3005
+ " sources": 3633,
3006
+ " south": 1148,
3007
+ " southern": 2268,
3008
+ " space": 1000,
3009
+ " spare": 3541,
3010
+ " speak": 777,
3011
+ " speaking": 1680,
3012
+ " special": 833,
3013
+ " species": 933,
3014
+ " specific": 2304,
3015
+ " speech": 2112,
3016
+ " speed": 1813,
3017
+ " spell": 3519,
3018
+ " spend": 1508,
3019
+ " spending": 3565,
3020
+ " spent": 1198,
3021
+ " spirit": 1879,
3022
+ " spite": 3411,
3023
+ " split": 2547,
3024
+ " spoke": 1438,
3025
+ " spoken": 2322,
3026
+ " sport": 3577,
3027
+ " sports": 2405,
3028
+ " spot": 1809,
3029
+ " spread": 1926,
3030
+ " spring": 1991,
3031
+ " square": 1607,
3032
+ " squared": 3629,
3033
+ " staff": 1949,
3034
+ " stage": 1117,
3035
+ " stairs": 3559,
3036
+ " stand": 864,
3037
+ " standard": 2223,
3038
+ " standing": 1252,
3039
+ " stands": 2966,
3040
+ " star": 1521,
3041
+ " starring": 3761,
3042
+ " stars": 1308,
3043
+ " start": 445,
3044
+ " started": 423,
3045
+ " starting": 1439,
3046
+ " starts": 1943,
3047
+ " state": 446,
3048
+ " stated": 2872,
3049
+ " statement": 2455,
3050
+ " states": 1602,
3051
+ " station": 862,
3052
+ " stations": 2884,
3053
+ " statistics": 2092,
3054
+ " status": 2642,
3055
+ " stay": 510,
3056
+ " stayed": 2314,
3057
+ " staying": 2831,
3058
+ " steal": 3069,
3059
+ " steel": 3531,
3060
+ " step": 1254,
3061
+ " steps": 2297,
3062
+ " stick": 1737,
3063
+ " still": 196,
3064
+ " stock": 3226,
3065
+ " stole": 3202,
3066
+ " stolen": 3643,
3067
+ " stomach": 3673,
3068
+ " stone": 1647,
3069
+ " stones": 3695,
3070
+ " stood": 1041,
3071
+ " stop": 457,
3072
+ " stopped": 1164,
3073
+ " stops": 4002,
3074
+ " store": 1483,
3075
+ " stories": 1441,
3076
+ " storm": 2174,
3077
+ " story": 520,
3078
+ " straight": 1063,
3079
+ " strange": 1253,
3080
+ " stranger": 3547,
3081
+ " stream": 3855,
3082
+ " street": 1228,
3083
+ " streets": 2581,
3084
+ " strength": 1798,
3085
+ " strike": 2834,
3086
+ " string": 2991,
3087
+ " strong": 886,
3088
+ " stronger": 3311,
3089
+ " struck": 2436,
3090
+ " structure": 1961,
3091
+ " struggle": 3489,
3092
+ " stuck": 1972,
3093
+ " student": 1804,
3094
+ " students": 1085,
3095
+ " studied": 1823,
3096
+ " studies": 2332,
3097
+ " studio": 1888,
3098
+ " study": 1106,
3099
+ " studying": 3608,
3100
+ " stuff": 685,
3101
+ " stupid": 1377,
3102
+ " style": 1548,
3103
+ " subject": 1502,
3104
+ " subjects": 3825,
3105
+ " succeeded": 3661,
3106
+ " success": 1531,
3107
+ " successful": 1641,
3108
+ " such": 225,
3109
+ " sudden": 2289,
3110
+ " suddenly": 1359,
3111
+ " suffer": 3698,
3112
+ " suffered": 2883,
3113
+ " suffering": 3064,
3114
+ " sugar": 2815,
3115
+ " suggest": 2852,
3116
+ " suggested": 2655,
3117
+ " suicide": 2988,
3118
+ " suit": 2409,
3119
+ " sum": 3579,
3120
+ " summer": 1274,
3121
+ " sun": 1189,
3122
+ " supper": 3902,
3123
+ " supply": 2805,
3124
+ " support": 929,
3125
+ " supported": 2762,
3126
+ " suppose": 1128,
3127
+ " supposed": 895,
3128
+ " sure": 276,
3129
+ " surely": 3358,
3130
+ " surface": 1713,
3131
+ " surgery": 3414,
3132
+ " surprise": 1826,
3133
+ " surprised": 2172,
3134
+ " surrounded": 3735,
3135
+ " survive": 3081,
3136
+ " suspect": 3426,
3137
+ " swear": 2401,
3138
+ " sweet": 1316,
3139
+ " sweetie": 3354,
3140
+ " swim": 3835,
3141
+ " swimming": 3266,
3142
+ " switch": 3659,
3143
+ " sword": 2595,
3144
+ " symbol": 3748,
3145
+ " system": 567,
3146
+ " systems": 1818,
3147
+ " t": 3224,
3148
+ " table": 868,
3149
+ " tail": 2151,
3150
+ " take": 146,
3151
+ " taken": 624,
3152
+ " takes": 939,
3153
+ " taking": 636,
3154
+ " talk": 294,
3155
+ " talked": 1333,
3156
+ " talking": 437,
3157
+ " talks": 3604,
3158
+ " tall": 2052,
3159
+ " tape": 1998,
3160
+ " target": 3090,
3161
+ " task": 2963,
3162
+ " taste": 2187,
3163
+ " taught": 1796,
3164
+ " tax": 2586,
3165
+ " tea": 1445,
3166
+ " teach": 1861,
3167
+ " teacher": 1709,
3168
+ " teachers": 3285,
3169
+ " teaching": 2496,
3170
+ " team": 410,
3171
+ " teams": 1752,
3172
+ " tears": 2529,
3173
+ " technology": 2181,
3174
+ " teeth": 2045,
3175
+ " telephone": 2934,
3176
+ " television": 887,
3177
+ " tell": 181,
3178
+ " telling": 987,
3179
+ " tells": 1662,
3180
+ " temperature": 2540,
3181
+ " temple": 2979,
3182
+ " ten": 668,
3183
+ " tend": 3301,
3184
+ " term": 1108,
3185
+ " terms": 1159,
3186
+ " terrible": 1549,
3187
+ " territory": 3033,
3188
+ " test": 1344,
3189
+ " tests": 4014,
3190
+ " text": 2514,
3191
+ " th": 2869,
3192
+ " than": 148,
3193
+ " thank": 787,
3194
+ " thanks": 1362,
3195
+ " that": 17,
3196
+ " that ": 3351,
3197
+ " that's": 151,
3198
+ " the": 7,
3199
+ " the ": 2537,
3200
+ " theatre": 3961,
3201
+ " thee": 2391,
3202
+ " their": 104,
3203
+ " them": 80,
3204
+ " theme": 3416,
3205
+ " themselves": 906,
3206
+ " then": 107,
3207
+ " theory": 1619,
3208
+ " there": 67,
3209
+ " there's": 285,
3210
+ " therefore": 1570,
3211
+ " these": 154,
3212
+ " they": 55,
3213
+ " they'd": 2559,
3214
+ " they'll": 1740,
3215
+ " they're": 346,
3216
+ " they've": 1311,
3217
+ " thick": 2923,
3218
+ " thin": 2933,
3219
+ " thing": 189,
3220
+ " things": 191,
3221
+ " think": 84,
3222
+ " thinking": 650,
3223
+ " thinks": 1758,
3224
+ " third": 761,
3225
+ " thirty": 1582,
3226
+ " this": 31,
3227
+ " those": 192,
3228
+ " thou": 1868,
3229
+ " though": 402,
3230
+ " thought": 208,
3231
+ " thoughts": 2445,
3232
+ " thousand": 946,
3233
+ " thousands": 2441,
3234
+ " threat": 3773,
3235
+ " three": 194,
3236
+ " threw": 2254,
3237
+ " throat": 2879,
3238
+ " through": 195,
3239
+ " throughout": 1778,
3240
+ " throw": 1307,
3241
+ " throwing": 3982,
3242
+ " thrown": 3406,
3243
+ " thus": 1897,
3244
+ " thy": 2789,
3245
+ " ticket": 3526,
3246
+ " tickets": 3964,
3247
+ " tie": 3262,
3248
+ " tied": 2996,
3249
+ " tight": 2998,
3250
+ " till": 778,
3251
+ " time": 95,
3252
+ " times": 383,
3253
+ " tiny": 2444,
3254
+ " tired": 1276,
3255
+ " title": 1268,
3256
+ " to": 8,
3257
+ " today": 406,
3258
+ " together": 343,
3259
+ " told": 270,
3260
+ " tomorrow": 859,
3261
+ " tone": 2788,
3262
+ " tongue": 2790,
3263
+ " tonight": 904,
3264
+ " too": 155,
3265
+ " took": 303,
3266
+ " tools": 3339,
3267
+ " top": 588,
3268
+ " total": 1107,
3269
+ " totally": 2104,
3270
+ " touch": 1082,
3271
+ " touched": 2960,
3272
+ " tough": 2686,
3273
+ " tour": 2513,
3274
+ " tournament": 2889,
3275
+ " toward": 1945,
3276
+ " towards": 1103,
3277
+ " tower": 3017,
3278
+ " town": 467,
3279
+ " towns": 3148,
3280
+ " toy": 3988,
3281
+ " toys": 3370,
3282
+ " track": 1526,
3283
+ " tracks": 3245,
3284
+ " trade": 1748,
3285
+ " tradition": 3445,
3286
+ " traditional": 2451,
3287
+ " traffic": 2760,
3288
+ " train": 1100,
3289
+ " trained": 3570,
3290
+ " training": 1422,
3291
+ " trains": 3469,
3292
+ " transfer": 3862,
3293
+ " transferred": 4012,
3294
+ " transport": 3488,
3295
+ " travel": 2030,
3296
+ " treat": 2611,
3297
+ " treated": 2981,
3298
+ " treatment": 2425,
3299
+ " tree": 919,
3300
+ " trees": 1315,
3301
+ " trial": 2272,
3302
+ " trick": 3275,
3303
+ " tried": 744,
3304
+ " tries": 3800,
3305
+ " trip": 1958,
3306
+ " troops": 2369,
3307
+ " tropical": 3575,
3308
+ " trouble": 884,
3309
+ " truck": 2115,
3310
+ " true": 552,
3311
+ " truly": 3031,
3312
+ " trust": 1265,
3313
+ " truth": 902,
3314
+ " try": 412,
3315
+ " trying": 461,
3316
+ " turn": 549,
3317
+ " turned": 687,
3318
+ " turning": 2136,
3319
+ " turns": 2024,
3320
+ " twelve": 1803,
3321
+ " twenty": 963,
3322
+ " twice": 1703,
3323
+ " two": 120,
3324
+ " type": 899,
3325
+ " types": 1907,
3326
+ " typical": 4077,
3327
+ " ugly": 3398,
3328
+ " uh": 314,
3329
+ " um": 1053,
3330
+ " unable": 3473,
3331
+ " uncle": 2281,
3332
+ " under": 311,
3333
+ " understand": 425,
3334
+ " understanding": 2735,
3335
+ " understood": 2600,
3336
+ " union": 2964,
3337
+ " unique": 3261,
3338
+ " unit": 2165,
3339
+ " units": 2393,
3340
+ " universe": 2816,
3341
+ " university": 1983,
3342
+ " unknown": 3057,
3343
+ " unless": 1808,
3344
+ " until": 298,
3345
+ " unusual": 3422,
3346
+ " up": 61,
3347
+ " upon": 455,
3348
+ " upper": 2801,
3349
+ " upset": 2510,
3350
+ " upstairs": 2707,
3351
+ " urban": 4032,
3352
+ " us": 136,
3353
+ " use": 261,
3354
+ " used": 198,
3355
+ " useful": 2517,
3356
+ " uses": 1849,
3357
+ " using": 717,
3358
+ " usual": 2286,
3359
+ " usually": 767,
3360
+ " v": 3779,
3361
+ " valley": 3305,
3362
+ " value": 1191,
3363
+ " values": 3013,
3364
+ " van": 3229,
3365
+ " variety": 2910,
3366
+ " various": 1194,
3367
+ " vast": 3997,
3368
+ " vehicle": 3498,
3369
+ " version": 1375,
3370
+ " very": 115,
3371
+ " via": 3539,
3372
+ " victim": 3401,
3373
+ " victims": 3843,
3374
+ " victory": 2643,
3375
+ " video": 988,
3376
+ " videos": 4033,
3377
+ " view": 1184,
3378
+ " views": 3787,
3379
+ " village": 921,
3380
+ " violence": 3035,
3381
+ " violent": 4006,
3382
+ " vision": 3169,
3383
+ " visit": 1419,
3384
+ " visited": 3518,
3385
+ " voice": 731,
3386
+ " voices": 3610,
3387
+ " volume": 3543,
3388
+ " vote": 1919,
3389
+ " votes": 3922,
3390
+ " wait": 560,
3391
+ " waited": 2867,
3392
+ " waiting": 920,
3393
+ " wake": 2213,
3394
+ " walk": 845,
3395
+ " walked": 1636,
3396
+ " walking": 1687,
3397
+ " wall": 1279,
3398
+ " walls": 2337,
3399
+ " wanna": 518,
3400
+ " want": 101,
3401
+ " wanted": 351,
3402
+ " wants": 671,
3403
+ " war": 647,
3404
+ " warm": 1707,
3405
+ " warning": 3529,
3406
+ " was": 20,
3407
+ " wash": 2931,
3408
+ " wasn't": 529,
3409
+ " waste": 2215,
3410
+ " watch": 790,
3411
+ " watched": 2270,
3412
+ " watching": 1358,
3413
+ " water": 339,
3414
+ " waters": 4010,
3415
+ " wave": 3104,
3416
+ " waves": 3566,
3417
+ " way": 141,
3418
+ " ways": 1147,
3419
+ " we": 39,
3420
+ " we'd": 1923,
3421
+ " we'll": 566,
3422
+ " we're": 258,
3423
+ " we've": 612,
3424
+ " weak": 2765,
3425
+ " weapon": 2947,
3426
+ " weapons": 2536,
3427
+ " wear": 1350,
3428
+ " wearing": 1915,
3429
+ " weather": 2180,
3430
+ " web": 3936,
3431
+ " website": 3138,
3432
+ " wedding": 1725,
3433
+ " week": 589,
3434
+ " weekend": 2809,
3435
+ " weeks": 1009,
3436
+ " weight": 1969,
3437
+ " weird": 2091,
3438
+ " welcome": 1610,
3439
+ " well": 150,
3440
+ " went": 209,
3441
+ " were": 68,
3442
+ " weren't": 1437,
3443
+ " west": 1331,
3444
+ " western": 2541,
3445
+ " wet": 2492,
3446
+ " what": 56,
3447
+ " what's": 470,
3448
+ " whatever": 928,
3449
+ " wheel": 3423,
3450
+ " when": 89,
3451
+ " whenever": 3496,
3452
+ " where": 144,
3453
+ " where's": 3132,
3454
+ " whether": 792,
3455
+ " which": 87,
3456
+ " while": 282,
3457
+ " white": 608,
3458
+ " who": 98,
3459
+ " who's": 1151,
3460
+ " whole": 405,
3461
+ " whom": 1056,
3462
+ " whose": 1102,
3463
+ " why": 238,
3464
+ " wide": 1585,
3465
+ " widely": 3709,
3466
+ " wife": 491,
3467
+ " wild": 1712,
3468
+ " will": 79,
3469
+ " willing": 2435,
3470
+ " win": 957,
3471
+ " wind": 1353,
3472
+ " window": 1157,
3473
+ " windows": 2736,
3474
+ " wine": 2211,
3475
+ " wing": 3959,
3476
+ " wings": 2895,
3477
+ " winner": 3291,
3478
+ " winning": 2373,
3479
+ " winter": 2014,
3480
+ " wire": 3917,
3481
+ " wise": 3225,
3482
+ " wish": 755,
3483
+ " wished": 3263,
3484
+ " with": 27,
3485
+ " within": 658,
3486
+ " without": 344,
3487
+ " witness": 3369,
3488
+ " wo": 3481,
3489
+ " woman": 441,
3490
+ " women": 622,
3491
+ " women's": 2776,
3492
+ " won": 488,
3493
+ " won't": 345,
3494
+ " wonder": 1193,
3495
+ " wondered": 3942,
3496
+ " wonderful": 1364,
3497
+ " wondering": 2858,
3498
+ " wood": 1768,
3499
+ " woods": 3280,
3500
+ " word": 502,
3501
+ " words": 614,
3502
+ " wore": 3321,
3503
+ " work": 182,
3504
+ " worked": 723,
3505
+ " workers": 2635,
3506
+ " working": 553,
3507
+ " works": 793,
3508
+ " world": 260,
3509
+ " world's": 3145,
3510
+ " worn": 4074,
3511
+ " worried": 1846,
3512
+ " worry": 828,
3513
+ " worse": 1459,
3514
+ " worst": 2312,
3515
+ " worth": 1202,
3516
+ " would": 82,
3517
+ " would've": 3598,
3518
+ " wouldn't": 563,
3519
+ " wound": 3777,
3520
+ " wounded": 3580,
3521
+ " write": 839,
3522
+ " writer": 1851,
3523
+ " writing": 1192,
3524
+ " written": 801,
3525
+ " wrong": 458,
3526
+ " wrote": 823,
3527
+ " x": 1288,
3528
+ " y": 2253,
3529
+ " ya": 2280,
3530
+ " yard": 3607,
3531
+ " yards": 3553,
3532
+ " ye": 2932,
3533
+ " yeah": 335,
3534
+ " year": 287,
3535
+ " years": 183,
3536
+ " yellow": 1658,
3537
+ " yes": 449,
3538
+ " yesterday": 1542,
3539
+ " yet": 421,
3540
+ " you": 14,
3541
+ " you ": 2839,
3542
+ " you'd": 898,
3543
+ " you'll": 634,
3544
+ " you're": 149,
3545
+ " you've": 450,
3546
+ " young": 434,
3547
+ " younger": 2318,
3548
+ " your": 48,
3549
+ " yours": 1101,
3550
+ " yourself": 531,
3551
+ " youth": 2007,
3552
+ " zero": 2214,
3553
+ " |": 1340,
3554
+ " £": 3655,
3555
+ " –": 2142,
3556
+ " —": 3596,
3557
+ " ‘": 2465,
3558
+ " “": 352,
3559
+ " ♪": 570,
3560
+ "!": 26,
3561
+ "! ": 3100,
3562
+ "!\"": 607,
3563
+ "!”": 1301,
3564
+ "\"": 51,
3565
+ "\" ": 3628,
3566
+ "\")": 1165,
3567
+ "\"),": 3267,
3568
+ "\").": 2781,
3569
+ "\",": 265,
3570
+ "\".": 226,
3571
+ "\"?": 2208,
3572
+ "#": 969,
3573
+ "&": 736,
3574
+ "'": 654,
3575
+ "'Cause": 3312,
3576
+ "'s": 2682,
3577
+ "(": 201,
3578
+ ")": 110,
3579
+ ") ": 1465,
3580
+ "),": 515,
3581
+ ").": 385,
3582
+ "):": 3000,
3583
+ "*": 1715,
3584
+ "+": 2944,
3585
+ ",": 6,
3586
+ ", ": 556,
3587
+ ",\"": 273,
3588
+ ",”": 576,
3589
+ "-": 25,
3590
+ "--": 2621,
3591
+ "-I": 2022,
3592
+ "-No": 2731,
3593
+ "-What": 2775,
3594
+ "-Yeah": 3701,
3595
+ "-Yes": 3345,
3596
+ "-You": 2838,
3597
+ "-year-old": 3934,
3598
+ ".": 5,
3599
+ ". ": 301,
3600
+ ".\"": 158,
3601
+ ".\" ": 1084,
3602
+ ".)": 3260,
3603
+ ".,": 1418,
3604
+ "..": 525,
3605
+ "...": 43,
3606
+ "...\"": 3156,
3607
+ "....": 1830,
3608
+ "...?": 2447,
3609
+ ".]": 3507,
3610
+ ".”": 523,
3611
+ "/": 773,
3612
+ "/&": 3933,
3613
+ "1": 1916,
3614
+ "2": 1927,
3615
+ "3": 2995,
3616
+ "4": 3746,
3617
+ ":": 92,
3618
+ ";": 93,
3619
+ ";&": 1681,
3620
+ ";/": 3807,
3621
+ "<eod>": 3,
3622
+ "<frg>": 4,
3623
+ "<oov>": 1,
3624
+ "<pad>": 0,
3625
+ "<sod>": 2,
3626
+ "=": 416,
3627
+ ">": 3581,
3628
+ "> ": 3405,
3629
+ "?": 15,
3630
+ "? ": 4028,
3631
+ "?\"": 454,
3632
+ "?\" ": 3760,
3633
+ "?”": 1047,
3634
+ "A": 184,
3635
+ "AND": 2432,
3636
+ "About": 2283,
3637
+ "According": 2959,
3638
+ "Actually": 2424,
3639
+ "After": 750,
3640
+ "Again": 3572,
3641
+ "Ah": 786,
3642
+ "All": 279,
3643
+ "Alright": 1936,
3644
+ "Also": 3794,
3645
+ "Although": 2939,
3646
+ "Am": 3524,
3647
+ "An": 1306,
3648
+ "And": 72,
3649
+ "Another": 2403,
3650
+ "Any": 2114,
3651
+ "Anything": 3428,
3652
+ "Anyway": 2446,
3653
+ "Are": 389,
3654
+ "Aren't": 3987,
3655
+ "As": 494,
3656
+ "At": 623,
3657
+ "Aye": 2951,
3658
+ "B": 2354,
3659
+ "Back": 3574,
3660
+ "Be": 1706,
3661
+ "Because": 558,
3662
+ "Before": 2081,
3663
+ "Besides": 3684,
3664
+ "Big": 3638,
3665
+ "Both": 3272,
3666
+ "Bring": 3692,
3667
+ "But": 111,
3668
+ "By": 1155,
3669
+ "Bye": 2137,
3670
+ "C": 1424,
3671
+ "CHAPTER": 3288,
3672
+ "Call": 3181,
3673
+ "Can": 602,
3674
+ "Can't": 1978,
3675
+ "Captain": 3554,
3676
+ "Club": 3046,
3677
+ "Come": 251,
3678
+ "Cos": 2427,
3679
+ "Could": 1992,
3680
+ "D": 1795,
3681
+ "Dad": 2323,
3682
+ "Damn": 2558,
3683
+ "David": 3400,
3684
+ "Dear": 4019,
3685
+ "Did": 534,
3686
+ "Didn't": 2518,
3687
+ "Do": 274,
3688
+ "Does": 1710,
3689
+ "Don't": 268,
3690
+ "Dr": 2386,
3691
+ "During": 2241,
3692
+ "E": 3085,
3693
+ "Early": 2986,
3694
+ "Eh": 3838,
3695
+ "Er": 1083,
3696
+ "Erm": 877,
3697
+ "Even": 1146,
3698
+ "Every": 1821,
3699
+ "Everybody": 2845,
3700
+ "Everyone": 2713,
3701
+ "Everything": 2196,
3702
+ "Exactly": 3511,
3703
+ "Excuse": 1435,
3704
+ "Father": 3247,
3705
+ "Fine": 2331,
3706
+ "First": 1601,
3707
+ "Five": 3486,
3708
+ "For": 503,
3709
+ "Forget": 4040,
3710
+ "Four": 3718,
3711
+ "From": 1144,
3712
+ "Fuck": 2507,
3713
+ "Get": 526,
3714
+ "Give": 1065,
3715
+ "Go": 522,
3716
+ "God": 1380,
3717
+ "Good": 528,
3718
+ "Got": 1911,
3719
+ "Great": 1966,
3720
+ "Ha": 3142,
3721
+ "Has": 3350,
3722
+ "Have": 733,
3723
+ "He": 131,
3724
+ "He'll": 2983,
3725
+ "He's": 340,
3726
+ "Hello": 740,
3727
+ "Help": 2226,
3728
+ "Her": 1583,
3729
+ "Here": 644,
3730
+ "Here's": 2344,
3731
+ "Hey": 291,
3732
+ "Hi": 1081,
3733
+ "His": 871,
3734
+ "History": 1790,
3735
+ "Hmm": 2779,
3736
+ "Hold": 1928,
3737
+ "How": 227,
3738
+ "How's": 2942,
3739
+ "However": 2511,
3740
+ "Huh": 2186,
3741
+ "Hurry": 2806,
3742
+ "I": 24,
3743
+ "I'd": 968,
3744
+ "I'll": 254,
3745
+ "I'm": 118,
3746
+ "I've": 372,
3747
+ "If": 187,
3748
+ "In": 164,
3749
+ "Is": 347,
3750
+ "Isn't": 2228,
3751
+ "It": 133,
3752
+ "It'll": 2871,
3753
+ "It's": 114,
3754
+ "Jack": 3743,
3755
+ "James": 3956,
3756
+ "Jesus": 3077,
3757
+ "John": 1912,
3758
+ "Just": 353,
3759
+ "K": 4073,
3760
+ "Keep": 1948,
3761
+ "La": 3994,
3762
+ "Last": 3216,
3763
+ "Leave": 2440,
3764
+ "Let": 617,
3765
+ "Let's": 391,
3766
+ "Life": 3678,
3767
+ "Like": 917,
3768
+ "Listen": 1174,
3769
+ "Little": 3177,
3770
+ "Look": 422,
3771
+ "Looks": 2997,
3772
+ "Love": 3571,
3773
+ "M": 2603,
3774
+ "MAN": 3815,
3775
+ "Make": 2399,
3776
+ "Man": 2230,
3777
+ "Many": 2534,
3778
+ "May": 1962,
3779
+ "Maybe": 703,
3780
+ "Me": 1925,
3781
+ "Mhm": 1584,
3782
+ "Michael": 3711,
3783
+ "Miss": 2244,
3784
+ "Mm": 500,
3785
+ "Mom": 2487,
3786
+ "More": 2743,
3787
+ "Most": 2361,
3788
+ "Mother": 3622,
3789
+ "Move": 2853,
3790
+ "Mr": 785,
3791
+ "Mrs": 2184,
3792
+ "Music": 3926,
3793
+ "My": 281,
3794
+ "Never": 1812,
3795
+ "New": 2602,
3796
+ "Next": 2473,
3797
+ "Nice": 2277,
3798
+ "No": 116,
3799
+ "Nobody": 2618,
3800
+ "Not": 426,
3801
+ "Nothing": 1204,
3802
+ "Now": 228,
3803
+ "O": 2219,
3804
+ "OH": 3667,
3805
+ "OK": 1323,
3806
+ "Of": 810,
3807
+ "Oh": 117,
3808
+ "Ok": 2476,
3809
+ "Okay": 272,
3810
+ "On": 512,
3811
+ "Once": 2042,
3812
+ "One": 539,
3813
+ "Only": 1529,
3814
+ "Ooh": 1984,
3815
+ "Open": 3112,
3816
+ "Or": 844,
3817
+ "Other": 3274,
3818
+ "Our": 1231,
3819
+ "Out": 3561,
3820
+ "Over": 2724,
3821
+ "P": 3700,
3822
+ "People": 1545,
3823
+ "Perhaps": 2123,
3824
+ "Peter": 3120,
3825
+ "Please": 615,
3826
+ "Probably": 3886,
3827
+ "Put": 1858,
3828
+ "R": 4064,
3829
+ "Really": 1169,
3830
+ "Remember": 2260,
3831
+ "Right": 397,
3832
+ "S": 682,
3833
+ "Say": 2278,
3834
+ "See": 918,
3835
+ "Shall": 3443,
3836
+ "She": 230,
3837
+ "She's": 639,
3838
+ "Shit": 2893,
3839
+ "Should": 3330,
3840
+ "Shut": 2533,
3841
+ "Since": 1806,
3842
+ "Sir": 1574,
3843
+ "Sit": 3027,
3844
+ "So": 109,
3845
+ "Some": 997,
3846
+ "Somebody": 3966,
3847
+ "Someone": 2956,
3848
+ "Something": 2225,
3849
+ "Sometimes": 2384,
3850
+ "Sorry": 936,
3851
+ "Stay": 2519,
3852
+ "Still": 2850,
3853
+ "Stop": 1004,
3854
+ "Sure": 1559,
3855
+ "T": 3308,
3856
+ "THE": 1990,
3857
+ "Take": 855,
3858
+ "Tell": 977,
3859
+ "Thank": 428,
3860
+ "Thanks": 938,
3861
+ "That": 237,
3862
+ "That's": 175,
3863
+ "The": 47,
3864
+ "Their": 3211,
3865
+ "Then": 366,
3866
+ "There": 266,
3867
+ "There's": 459,
3868
+ "These": 976,
3869
+ "They": 211,
3870
+ "They'll": 3452,
3871
+ "They're": 690,
3872
+ "They've": 3029,
3873
+ "Think": 2639,
3874
+ "This": 161,
3875
+ "Those": 1783,
3876
+ "Three": 2094,
3877
+ "Time": 3877,
3878
+ "To": 575,
3879
+ "Today": 2521,
3880
+ "Too": 3551,
3881
+ "Try": 3244,
3882
+ "Two": 1291,
3883
+ "Uh": 757,
3884
+ "Uh-huh": 817,
3885
+ "Um": 1346,
3886
+ "Until": 3984,
3887
+ "Very": 1345,
3888
+ "Wait": 914,
3889
+ "Was": 1728,
3890
+ "Watch": 3123,
3891
+ "We": 137,
3892
+ "We'll": 836,
3893
+ "We're": 456,
3894
+ "We've": 1098,
3895
+ "Welcome": 2678,
3896
+ "Well": 143,
3897
+ "Were": 3864,
3898
+ "What": 99,
3899
+ "What's": 394,
3900
+ "Whatever": 2617,
3901
+ "When": 313,
3902
+ "Where": 476,
3903
+ "Where's": 1654,
3904
+ "Which": 1271,
3905
+ "While": 2486,
3906
+ "Who": 506,
3907
+ "Who's": 2026,
3908
+ "Whoa": 2723,
3909
+ "Why": 235,
3910
+ "Will": 1277,
3911
+ "With": 876,
3912
+ "Without": 3854,
3913
+ "Would": 1472,
3914
+ "Wow": 1760,
3915
+ "YOU": 2480,
3916
+ "Yeah": 123,
3917
+ "Yep": 3612,
3918
+ "Yes": 178,
3919
+ "You": 57,
3920
+ "You'd": 3187,
3921
+ "You'll": 1130,
3922
+ "You're": 241,
3923
+ "You've": 947,
3924
+ "Your": 493,
3925
+ "[": 177,
3926
+ "]": 202,
3927
+ "] ": 1070,
3928
+ "_": 474,
3929
+ "`": 661,
3930
+ "``": 244,
3931
+ "a": 492,
3932
+ "about": 3837,
3933
+ "ah": 3099,
3934
+ "aha": 1426,
3935
+ "all": 1695,
3936
+ "alright": 2296,
3937
+ "also": 3155,
3938
+ "amp": 306,
3939
+ "an": 2882,
3940
+ "and": 172,
3941
+ "are": 1145,
3942
+ "as": 1624,
3943
+ "asked": 3728,
3944
+ "at": 2282,
3945
+ "be": 2820,
3946
+ "because": 1837,
3947
+ "born": 625,
3948
+ "br": 1261,
3949
+ "but": 595,
3950
+ "by": 2967,
3951
+ "can": 971,
3952
+ "come": 1471,
3953
+ "d": 3253,
3954
+ "did": 1325,
3955
+ "do": 762,
3956
+ "does": 3281,
3957
+ "don't": 1843,
3958
+ "e": 3101,
3959
+ "for": 1218,
3960
+ "from": 2231,
3961
+ "get": 3533,
3962
+ "go": 2818,
3963
+ "good": 1946,
3964
+ "gt": 481,
3965
+ "had": 2438,
3966
+ "have": 2247,
3967
+ "he": 789,
3968
+ "he's": 2069,
3969
+ "hello": 3981,
3970
+ "her": 2493,
3971
+ "here": 1199,
3972
+ "here's": 2999,
3973
+ "hey": 2825,
3974
+ "hi": 3617,
3975
+ "him": 3764,
3976
+ "his": 1785,
3977
+ "hm": 1701,
3978
+ "how": 1123,
3979
+ "huh": 2305,
3980
+ "i": 326,
3981
+ "i'll": 2861,
3982
+ "i'm": 1714,
3983
+ "if": 1727,
3984
+ "in": 738,
3985
+ "is": 642,
3986
+ "it": 730,
3987
+ "it's": 706,
3988
+ "just": 2146,
3989
+ "km": 2709,
3990
+ "let": 2150,
3991
+ "let's": 1213,
3992
+ "like": 1616,
3993
+ "little": 3562,
3994
+ "ll": 2676,
3995
+ "look": 1093,
3996
+ "lt": 1121,
3997
+ "m": 1446,
3998
+ "man": 4000,
3999
+ "maybe": 3527,
4000
+ "mhm": 1637,
4001
+ "mm": 1551,
4002
+ "more": 3550,
4003
+ "my": 2362,
4004
+ "nd": 2784,
4005
+ "no": 477,
4006
+ "not": 1504,
4007
+ "now": 1205,
4008
+ "of": 702,
4009
+ "oh": 295,
4010
+ "okay": 411,
4011
+ "on": 1779,
4012
+ "one": 1716,
4013
+ "or": 1059,
4014
+ "put": 2333,
4015
+ "rd": 3171,
4016
+ "re": 3243,
4017
+ "right": 1434,
4018
+ "s": 219,
4019
+ "said": 2235,
4020
+ "say": 3885,
4021
+ "see": 1414,
4022
+ "she": 1220,
4023
+ "she's": 3627,
4024
+ "should": 3597,
4025
+ "so": 926,
4026
+ "some": 3778,
4027
+ "st": 2256,
4028
+ "t": 831,
4029
+ "th": 365,
4030
+ "thank": 2623,
4031
+ "that": 592,
4032
+ "that's": 486,
4033
+ "the": 221,
4034
+ "their": 3795,
4035
+ "then": 2474,
4036
+ "there": 972,
4037
+ "there's": 1766,
4038
+ "they": 1187,
4039
+ "they're": 3236,
4040
+ "this": 1006,
4041
+ "those": 3941,
4042
+ "to": 705,
4043
+ "two": 3147,
4044
+ "uh": 2464,
4045
+ "up": 3996,
4046
+ "ve": 3833,
4047
+ "very": 2938,
4048
+ "want": 3447,
4049
+ "was": 1369,
4050
+ "we": 978,
4051
+ "we'll": 4026,
4052
+ "we're": 3905,
4053
+ "well": 837,
4054
+ "were": 3037,
4055
+ "what": 280,
4056
+ "what's": 746,
4057
+ "when": 2015,
4058
+ "where": 1118,
4059
+ "where's": 1708,
4060
+ "which": 1324,
4061
+ "who": 1385,
4062
+ "who's": 2691,
4063
+ "why": 1587,
4064
+ "will": 3430,
4065
+ "with": 1212,
4066
+ "would": 1910,
4067
+ "wow": 3816,
4068
+ "x": 3002,
4069
+ "yeah": 429,
4070
+ "yes": 1016,
4071
+ "you": 232,
4072
+ "you're": 1171,
4073
+ "your": 2787,
4074
+ "’": 3567,
4075
+ " ": 2647,
4076
+ "´": 1304,
4077
+ "¶": 3573,
4078
+ "–": 1455,
4079
+ "—": 979,
4080
+ "’": 239,
4081
+ "“": 327,
4082
+ "”": 2161,
4083
+ "♪": 413,
4084
+ "♫": 3413
4085
+ }