Itsme5 committed on
Commit
a3780db
·
verified ·
1 Parent(s): bb5cb64

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -53
app.py CHANGED
@@ -1,58 +1,8 @@
1
  from fastapi import FastAPI
2
- from transformers import PreTrainedTokenizerFast
3
- from tokenizers import ByteLevelBPETokenizer
4
- from datasets import load_dataset
5
- from contextlib import asynccontextmanager
6
- import logging
7
 
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
- @asynccontextmanager
12
- async def lifespan(app: FastAPI):
13
- logger.info("Application starting...")
14
- await train_tokenizer()
15
- yield
16
- logger.info("Application shutting down...")
17
-
18
- app = FastAPI(lifespan=lifespan)
19
-
20
- async def train_tokenizer():
21
- vocab_size = 50000
22
- min_frequency = 2
23
-
24
- #dataset_greek = load_dataset("oscar", "unshuffled_deduplicated_el", split="train", streaming=True)
25
- dataset_greek = load_dataset("wikipedia", "20231101.el", split="train", streaming=True)
26
- dataset_english = load_dataset("wikipedia", "20231101.en", split="train", streaming=True)
27
-
28
-
29
- try:
30
- dataset_code = load_dataset("bigcode/the-stack", split="train", streaming=True)
31
- datasets_list = [dataset_greek, dataset_english]
32
- except:
33
- datasets_list = [dataset_greek, dataset_english]
34
-
35
- def preprocess_data(dataset):
36
- for item in dataset:
37
- text = item["text"]
38
- text = text.strip().lower()
39
- if text:
40
- yield text
41
-
42
- combined_data = (preprocess_data(dataset.take(1000)) for dataset in datasets_list)
43
-
44
- tokenizer = ByteLevelBPETokenizer()
45
-
46
- tokenizer.train_from_iterator(
47
- combined_data,
48
- vocab_size=vocab_size,
49
- min_frequency=min_frequency,
50
- special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
51
- )
52
-
53
- tokenizer.save_model(".")
54
- logger.info("Tokenizer training completed!")
55
 
56
  @app.get("/")
57
  async def root():
58
- return {"message": "Custom Tokenizer Training Completed and Saved"}
 
1
  from fastapi import FastAPI
 
 
 
 
 
2
 
3
+ # Δημιουργία FastAPI εφαρμογής
4
+ app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  @app.get("/")
7
  async def root():
8
+ return {"message": "Welcome to your basic FastAPI application!"}