Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -21,12 +21,14 @@ async def train_tokenizer():
|
|
21 |
vocab_size = 50000
|
22 |
min_frequency = 2
|
23 |
|
24 |
-
dataset_greek = load_dataset("oscar", "unshuffled_deduplicated_el", split="train", streaming=True)
|
25 |
-
|
|
|
|
|
26 |
|
27 |
try:
|
28 |
dataset_code = load_dataset("bigcode/the-stack", split="train", streaming=True)
|
29 |
-
datasets_list = [dataset_greek, dataset_english, dataset_code]
|
30 |
except:
|
31 |
datasets_list = [dataset_greek, dataset_english]
|
32 |
|
|
|
21 |
vocab_size = 50000
|
22 |
min_frequency = 2
|
23 |
|
24 |
+
#dataset_greek = load_dataset("oscar", "unshuffled_deduplicated_el", split="train", streaming=True)
|
25 |
+
dataset_greek = load_dataset("wikipedia", "20231101.el", split="train", streaming=True)
|
26 |
+
dataset_english = load_dataset("wikipedia", "20231101.en", split="train", streaming=True)
|
27 |
+
|
28 |
|
29 |
try:
|
30 |
dataset_code = load_dataset("bigcode/the-stack", split="train", streaming=True)
|
31 |
+
datasets_list = [dataset_greek, dataset_english]
|
32 |
except:
|
33 |
datasets_list = [dataset_greek, dataset_english]
|
34 |
|