{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup Working directory" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "if os.path.basename(os.getcwd()) == \"notebooks\":\n", " os.chdir(\"../\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Install Libraries and Download Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2025-01-10T12:31:19.124429Z", "iopub.status.busy": "2025-01-10T12:31:19.123936Z", "iopub.status.idle": "2025-01-10T12:31:27.021410Z", "shell.execute_reply": "2025-01-10T12:31:27.019831Z", "shell.execute_reply.started": "2025-01-10T12:31:19.124383Z" }, "trusted": true }, "outputs": [], "source": [ "!pip install -q underthesea\n", "!pip install -q kaggle" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2025-01-10T12:31:27.023522Z", "iopub.status.busy": "2025-01-10T12:31:27.023199Z", "iopub.status.idle": "2025-01-10T12:31:35.077460Z", "shell.execute_reply": "2025-01-10T12:31:35.076234Z", "shell.execute_reply.started": "2025-01-10T12:31:27.023476Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset URL: https://www.kaggle.com/datasets/phmanhth/vietnamese-text-classification\n", "License(s): unknown\n", "Downloading vietnamese-text-classification.zip to c:\\Users\\MyLaptop\\Desktop\\New folder (4)\\Vietnamese-News-Classification\\notebooks\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " 0%| | 0.00/45.7M [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "os.makedirs(\"visualization\", exist_ok=True)\n", "plt.figure(figsize=(11, 4.5))\n", "sns.countplot(x='label', data=train_df)#, order=label_counts.index)\n", "plt.xticks(rotation=35) # Rotate x-axis labels for better readability\n", "plt.xlabel(\"Label\")\n", "plt.ylabel(\"Number of Samples\")\n", "plt.title(\"Distribution of Labels in Training Data\")\n", "\n", "# Format y-axis ticks with commas as thousands separators\n", "plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))\n", "\n", "# Annotate each bar with its count\n", "for p in plt.gca().patches:\n", " plt.gca().annotate(f'{int(p.get_height()):,}', (p.get_x() + p.get_width() / 2., p.get_height()),\n", " ha='center', va='center', fontsize=10, xytext=(0, 5),\n", " textcoords='offset points')\n", "\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "plt.savefig('visualization/training_data_distribution.png', dpi=300)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2025-01-10T12:36:44.945738Z", "iopub.status.busy": "2025-01-10T12:36:44.945304Z", "iopub.status.idle": "2025-01-10T12:36:44.961340Z", "shell.execute_reply": "2025-01-10T12:36:44.959968Z", "shell.execute_reply.started": "2025-01-10T12:36:44.945704Z" }, "trusted": true }, "outputs": [], "source": [ "train_df = train_df.dropna()\n", "test_df = test_df.dropna()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2025-01-10T12:36:46.243827Z", "iopub.status.busy": "2025-01-10T12:36:46.243428Z", "iopub.status.idle": "2025-01-10T12:36:46.250429Z", "shell.execute_reply": "2025-01-10T12:36:46.249271Z", "shell.execute_reply.started": "2025-01-10T12:36:46.243793Z" }, "trusted": true }, "outputs": [ { "data": { "text/plain": [ "26059" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Word Tokenize in Training Dataset and Test Set" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.status.busy": "2025-01-10T12:31:53.606363Z", "iopub.status.idle": "2025-01-10T12:31:53.606880Z", "shell.execute_reply": "2025-01-10T12:31:53.606634Z" }, "trusted": true }, "outputs": [], "source": [ "train_df['content'] = [word_tokenize(el, format='text') for el in train_df['content']]\n", "test_df['content'] = [word_tokenize(el, format='text') for el in test_df['content']]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save Converted Dataframes to CSV files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.status.busy": "2025-01-10T12:31:53.607935Z", "iopub.status.idle": "2025-01-10T12:31:53.608419Z", "shell.execute_reply": "2025-01-10T12:31:53.608209Z" }, "trusted": true }, "outputs": [], "source": [ "os.makedirs(\"dataset\", exist_ok=True)\n", "\n", "train_df.to_csv(\"dataset/converted_train_dataset.csv\", index=False)\n", "test_df.to_csv(\"dataset/converted_test_dataset.csv\", index=False)" ] } ], "metadata": { "kaggle": { "accelerator": "none", "dataSources": [], "dockerImageVersionId": 30822, "isGpuEnabled": false, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 4 }