Tbruand committed on
Commit
1efccc5
·
1 Parent(s): 1b785e3

chore(notebook): met à jour le chemin vers le dataset nettoyé FR

Browse files
Files changed (1) hide show
  1. notebooks/01_exploration.ipynb +21 -15
notebooks/01_exploration.ipynb CHANGED
@@ -67,7 +67,7 @@
67
  },
68
  {
69
  "cell_type": "code",
70
- "execution_count": 9,
71
  "id": "0ff4a780",
72
  "metadata": {},
73
  "outputs": [
@@ -92,6 +92,7 @@
92
  " <thead>\n",
93
  " <tr style=\"text-align: right;\">\n",
94
  " <th></th>\n",
 
95
  " <th>comment_text</th>\n",
96
  " <th>toxic</th>\n",
97
  " <th>severe_toxic</th>\n",
@@ -104,6 +105,7 @@
104
  " <tbody>\n",
105
  " <tr>\n",
106
  " <th>0</th>\n",
 
107
  " <td>Hé mec, je n'essaye vraiment pas de modifier l...</td>\n",
108
  " <td>0</td>\n",
109
  " <td>0</td>\n",
@@ -114,6 +116,7 @@
114
  " </tr>\n",
115
  " <tr>\n",
116
  " <th>1</th>\n",
 
117
  " <td>\"\\n\\nFélicitations de ma part également, utili...</td>\n",
118
  " <td>0</td>\n",
119
  " <td>0</td>\n",
@@ -124,6 +127,7 @@
124
  " </tr>\n",
125
  " <tr>\n",
126
  " <th>2</th>\n",
 
127
  " <td>Le point Mitsurugi n'avait aucun sens - pourqu...</td>\n",
128
  " <td>0</td>\n",
129
  " <td>0</td>\n",
@@ -134,6 +138,7 @@
134
  " </tr>\n",
135
  " <tr>\n",
136
  " <th>3</th>\n",
 
137
  " <td>\"\\nPlus\\nJe ne peux pas faire de véritables su...</td>\n",
138
  " <td>0</td>\n",
139
  " <td>0</td>\n",
@@ -144,6 +149,7 @@
144
  " </tr>\n",
145
  " <tr>\n",
146
  " <th>4</th>\n",
 
147
  " <td>alignement sur ce sujet et qui sont contraires...</td>\n",
148
  " <td>0</td>\n",
149
  " <td>0</td>\n",
@@ -157,22 +163,22 @@
157
  "</div>"
158
  ],
159
  "text/plain": [
160
- " comment_text toxic severe_toxic \\\n",
161
- "0 Hé mec, je n'essaye vraiment pas de modifier l... 0 0 \n",
162
- "1 \"\\n\\nFélicitations de ma part également, utili... 0 0 \n",
163
- "2 Le point Mitsurugi n'avait aucun sens - pourqu... 0 0 \n",
164
- "3 \"\\nPlus\\nJe ne peux pas faire de véritables su... 0 0 \n",
165
- "4 alignement sur ce sujet et qui sont contraires... 0 0 \n",
166
  "\n",
167
- " obscene threat insult identity_hate \n",
168
- "0 0 0 0 0 \n",
169
- "1 0 0 0 0 \n",
170
- "2 0 0 0 0 \n",
171
- "3 0 0 0 0 \n",
172
- "4 0 0 0 0 "
173
  ]
174
  },
175
- "execution_count": 9,
176
  "metadata": {},
177
  "output_type": "execute_result"
178
  }
@@ -189,7 +195,7 @@
189
  "plt.rcParams[\"figure.figsize\"] = (10, 6)\n",
190
  "\n",
191
  "# Chargement des données\n",
192
- "df = pd.read_csv(\"../data/jigsaw_toxic_fr_clean.csv\")\n",
193
  "\n",
194
  "# Nettoyage de colonnes inutiles si présentes\n",
195
  "df = df.loc[:, ~df.columns.str.startswith(\"Unnamed\")]\n",
 
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 14,
71
  "id": "0ff4a780",
72
  "metadata": {},
73
  "outputs": [
 
92
  " <thead>\n",
93
  " <tr style=\"text-align: right;\">\n",
94
  " <th></th>\n",
95
+ " <th>id</th>\n",
96
  " <th>comment_text</th>\n",
97
  " <th>toxic</th>\n",
98
  " <th>severe_toxic</th>\n",
 
105
  " <tbody>\n",
106
  " <tr>\n",
107
  " <th>0</th>\n",
108
+ " <td>000113f07ec002fd</td>\n",
109
  " <td>Hé mec, je n'essaye vraiment pas de modifier l...</td>\n",
110
  " <td>0</td>\n",
111
  " <td>0</td>\n",
 
116
  " </tr>\n",
117
  " <tr>\n",
118
  " <th>1</th>\n",
119
+ " <td>00025465d4725e87</td>\n",
120
  " <td>\"\\n\\nFélicitations de ma part également, utili...</td>\n",
121
  " <td>0</td>\n",
122
  " <td>0</td>\n",
 
127
  " </tr>\n",
128
  " <tr>\n",
129
  " <th>2</th>\n",
130
+ " <td>0009801bd85e5806</td>\n",
131
  " <td>Le point Mitsurugi n'avait aucun sens - pourqu...</td>\n",
132
  " <td>0</td>\n",
133
  " <td>0</td>\n",
 
138
  " </tr>\n",
139
  " <tr>\n",
140
  " <th>3</th>\n",
141
+ " <td>0001b41b1c6bb37e</td>\n",
142
  " <td>\"\\nPlus\\nJe ne peux pas faire de véritables su...</td>\n",
143
  " <td>0</td>\n",
144
  " <td>0</td>\n",
 
149
  " </tr>\n",
150
  " <tr>\n",
151
  " <th>4</th>\n",
152
+ " <td>00040093b2687caa</td>\n",
153
  " <td>alignement sur ce sujet et qui sont contraires...</td>\n",
154
  " <td>0</td>\n",
155
  " <td>0</td>\n",
 
163
  "</div>"
164
  ],
165
  "text/plain": [
166
+ " id comment_text toxic \\\n",
167
+ "0 000113f07ec002fd Hé mec, je n'essaye vraiment pas de modifier l... 0 \n",
168
+ "1 00025465d4725e87 \"\\n\\nFélicitations de ma part également, utili... 0 \n",
169
+ "2 0009801bd85e5806 Le point Mitsurugi n'avait aucun sens - pourqu... 0 \n",
170
+ "3 0001b41b1c6bb37e \"\\nPlus\\nJe ne peux pas faire de véritables su... 0 \n",
171
+ "4 00040093b2687caa alignement sur ce sujet et qui sont contraires... 0 \n",
172
  "\n",
173
+ " severe_toxic obscene threat insult identity_hate \n",
174
+ "0 0 0 0 0 0 \n",
175
+ "1 0 0 0 0 0 \n",
176
+ "2 0 0 0 0 0 \n",
177
+ "3 0 0 0 0 0 \n",
178
+ "4 0 0 0 0 0 "
179
  ]
180
  },
181
+ "execution_count": 14,
182
  "metadata": {},
183
  "output_type": "execute_result"
184
  }
 
195
  "plt.rcParams[\"figure.figsize\"] = (10, 6)\n",
196
  "\n",
197
  "# Chargement des données\n",
198
+ "df = pd.read_csv(\"../data/jigsaw-toxic-comment-train-google-fr-cleaned.csv\")\n",
199
  "\n",
200
  "# Nettoyage de colonnes inutiles si présentes\n",
201
  "df = df.loc[:, ~df.columns.str.startswith(\"Unnamed\")]\n",