Update app.py
Browse files
app.py
CHANGED
@@ -9,21 +9,29 @@ from train_tokenizer import train_tokenizer
|
|
9 |
from tokenizers import Tokenizer
|
10 |
from datasets import load_dataset
|
11 |
|
12 |
-
def create_iterator(files=None, dataset_name=None, split="train", streaming=True):
|
13 |
-
"""
|
14 |
-
Δημιουργεί έναν iterator που διαβάζει κείμενο είτε από τοπικά αρχεία είτε από Hugging Face Dataset.
|
15 |
-
"""
|
16 |
if dataset_name:
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
elif files:
|
22 |
for file in files:
|
23 |
with open(file.name, 'r', encoding='utf-8') as f:
|
24 |
for line in f:
|
25 |
-
|
26 |
-
yield line.strip()
|
27 |
|
28 |
def enhanced_validation(tokenizer, test_text):
|
29 |
"""
|
@@ -97,7 +105,7 @@ def train_and_test(files, dataset_name, split, vocab_size, min_freq, test_text):
|
|
97 |
"histogram": validation["token_length_distribution"]
|
98 |
}
|
99 |
|
100 |
-
#
|
101 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
102 |
gr.Markdown("## Προχωρημένος BPE Tokenizer Trainer")
|
103 |
|
|
|
9 |
from tokenizers import Tokenizer
|
10 |
from datasets import load_dataset
|
11 |
|
12 |
+
def create_iterator(files=None, dataset_name=None, dataset_config=None, split="train", streaming=True):
    """Yield text examples either from uploaded local files or a Hugging Face dataset.

    Args:
        files: iterable of objects exposing a ``.name`` path attribute
            (e.g. Gradio file uploads). Used only when ``dataset_name`` is falsy.
        dataset_name: Hugging Face dataset identifier (e.g. ``"wikitext"`` or
            ``"user/dataset"``). Takes precedence over ``files``.
        dataset_config: optional dataset configuration/subset name.
        split: dataset split to read (default ``"train"``).
        streaming: if True, stream the dataset lazily instead of downloading it.

    Yields:
        str: one text line (files) or one ``example['text']`` value (dataset).

    Raises:
        gr.Error: if the dataset name is malformed or loading the dataset fails.
    """
    if dataset_name:
        # Local import: `import re` is not visible in the file's import block,
        # so keep the regex dependency self-contained here.
        import re
        try:
            # Validate the dataset identifier before touching the Hub to give
            # the user a clear error instead of an opaque HTTP failure.
            if not re.match(r'^[\w\-\.]+(/[\w\-\.]+)*$', dataset_name):
                raise ValueError(f"Μη έγκυρο όνομα dataset: {dataset_name}")

            # Load the dataset, passing the config name only when one was given.
            dataset = load_dataset(
                dataset_name,
                name=dataset_config if dataset_config else None,
                split=split,
                streaming=streaming
            )
            for example in dataset:
                # NOTE(review): assumes every example has a 'text' column —
                # verify against the datasets actually offered in the UI.
                yield example['text']
        except Exception as e:
            # Chain the original exception so the real cause stays debuggable.
            raise gr.Error(f"Σφάλμα φόρτωσης dataset: {str(e)}") from e
    elif files:
        for file in files:
            # `file` is a Gradio upload object; `.name` is the temp-file path.
            with open(file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    yield line.strip()
|
|
|
35 |
|
36 |
def enhanced_validation(tokenizer, test_text):
|
37 |
"""
|
|
|
105 |
"histogram": validation["token_length_distribution"]
|
106 |
}
|
107 |
|
108 |
+
# Ενημερωμένο Gradio Interface
|
109 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
110 |
gr.Markdown("## Προχωρημένος BPE Tokenizer Trainer")
|
111 |
|