Commit
·
11d3b20
1
Parent(s):
fca027b
feat: add dataset preparator script
Browse files- .gitignore +1 -0
- dataset.py +20 -0
.gitignore
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
venv
|
| 2 |
flagged
|
| 3 |
.env
|
|
|
|
|
|
| 1 |
venv
|
| 2 |
flagged
|
| 3 |
.env
|
| 4 |
+
*.csv
|
dataset.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
# Subsampling stride: every `divider`-th row of each split is kept further down (1 keeps all rows).
divider = 1
|
| 5 |
+
# Nominal row count per split after subsampling (floor division by the stride).
# Presumably 25000 is the number of rows in each IMDB split — TODO confirm.
data_size = 25000 // divider
|
| 6 |
+
# Half of data_size; used only to build the output CSV file names.
# Presumably the per-label row count, assuming both labels are equally represented — TODO confirm.
case_size = data_size // 2
|
| 7 |
+
|
| 8 |
+
# Load the "imdb" dataset through the `datasets` library; its 'train' and 'test' splits are used below.
dataset = load_dataset("imdb")
|
| 9 |
+
|
| 10 |
+
# Materialize the 'train' split as a pandas DataFrame.
train_df = pd.DataFrame(dataset['train'])
|
| 11 |
+
# Materialize the 'test' split as a pandas DataFrame.
test_df = pd.DataFrame(dataset['test'])
|
| 12 |
+
|
| 13 |
+
# Keep every `divider`-th row of the training frame. The explicit copy detaches the
# result from the original frame, so the later in-place 'label' column assignment
# does not write into a slice (avoids pandas' SettingWithCopyWarning).
train_df = train_df.iloc[::divider, :].copy()
|
| 14 |
+
# Keep every `divider`-th row of the test frame. The explicit copy detaches the
# result from the original frame, so the later in-place 'label' column assignment
# does not write into a slice (avoids pandas' SettingWithCopyWarning).
test_df = test_df.iloc[::divider, :].copy()
|
| 15 |
+
|
| 16 |
+
# Replace the numeric training label with its text form: 0 becomes 'NEGATIVE', any other value 'POSITIVE'.
train_df['label'] = train_df['label'].map(lambda label_id: 'POSITIVE' if label_id != 0 else 'NEGATIVE')
|
| 17 |
+
# Replace the numeric test label with its text form: 0 becomes 'NEGATIVE', any other value 'POSITIVE'.
test_df['label'] = test_df['label'].map(lambda label_id: 'POSITIVE' if label_id != 0 else 'NEGATIVE')
|
| 18 |
+
|
| 19 |
+
# Write the training frame to CSV without the index column; case_size appears twice in the file name.
train_df.to_csv(path_or_buf=f'imdb_train_{case_size}_{case_size}.csv', index=False)
|
| 20 |
+
# Write the test frame to CSV without the index column; case_size appears twice in the file name.
test_df.to_csv(path_or_buf=f'imdb_test_{case_size}_{case_size}.csv', index=False)
|