Itsme5 committed on
Commit
283d1fe
·
verified ·
1 Parent(s): 75183a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -3
app.py CHANGED
@@ -21,12 +21,14 @@ async def train_tokenizer():
21
  vocab_size = 50000
22
  min_frequency = 2
23
 
24
- dataset_greek = load_dataset("oscar", "unshuffled_deduplicated_el", split="train", streaming=True)
25
- dataset_english = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
 
 
26
 
27
  try:
28
  dataset_code = load_dataset("bigcode/the-stack", split="train", streaming=True)
29
- datasets_list = [dataset_greek, dataset_english, dataset_code]
30
  except:
31
  datasets_list = [dataset_greek, dataset_english]
32
 
 
21
  vocab_size = 50000
22
  min_frequency = 2
23
 
24
+ #dataset_greek = load_dataset("oscar", "unshuffled_deduplicated_el", split="train", streaming=True)
25
+ dataset_greek = load_dataset("wikipedia", "20231101.el", split="train", streaming=True)
26
+ dataset_english = load_dataset("wikipedia", "20231101.en", split="train", streaming=True)
27
+
28
 
29
  try:
30
  dataset_code = load_dataset("bigcode/the-stack", split="train", streaming=True)
31
+ datasets_list = [dataset_greek, dataset_english]
32
  except:
33
  datasets_list = [dataset_greek, dataset_english]
34