Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						9047480
	
0
								Parent(s):
							
							
Duplicate from kya5/milestone-3
Browse files- .gitattributes +5 -0
- .github/workflows/main.yml +0 -0
- .github/workflows/sync_to_hf.yml +20 -0
- README.md +110 -0
- app.py +72 -0
- bert/_bert_model/config.json +44 -0
- bert/_bert_model/pytorch_model.bin +3 -0
- bert/_bert_model/training_args.bin +0 -0
- distilbert/_distilbert_model/config.json +41 -0
- distilbert/_distilbert_model/pytorch_model.bin +3 -0
- distilbert/_distilbert_model/training_args.bin +0 -0
- jigsaw-toxic-comment-classification-challenge/sample_submission.csv +0 -0
- jigsaw-toxic-comment-classification-challenge/test.csv +3 -0
- jigsaw-toxic-comment-classification-challenge/test_labels.csv +0 -0
- jigsaw-toxic-comment-classification-challenge/train.csv +3 -0
- requirements.txt +5 -0
- roberta/_roberta_model/config.json +43 -0
- roberta/_roberta_model/pytorch_model.bin +3 -0
- roberta/_roberta_model/training_args.bin +0 -0
- train.py +156 -0
    	
        .gitattributes
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            bert/_bert_model/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 2 | 
            +
            distilbert/_distilbert_model/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 3 | 
            +
            roberta/_roberta_model/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 4 | 
            +
            jigsaw-toxic-comment-classification-challenge/test.csv filter=lfs diff=lfs merge=lfs -text
         | 
| 5 | 
            +
            jigsaw-toxic-comment-classification-challenge/train.csv filter=lfs diff=lfs merge=lfs -text
         | 
    	
        .github/workflows/main.yml
    ADDED
    
    | 
            File without changes
         | 
    	
        .github/workflows/sync_to_hf.yml
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            name: Sync to Hugging Face hub
         | 
| 2 | 
            +
            on:
         | 
| 3 | 
            +
              push:
         | 
| 4 | 
            +
                branches: [main]
         | 
| 5 | 
            +
             | 
| 6 | 
            +
              # to run this workflow manually from the Actions tab
         | 
| 7 | 
            +
              workflow_dispatch:
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            jobs:
         | 
| 10 | 
            +
              sync-to-hub:
         | 
| 11 | 
            +
                runs-on: ubuntu-latest
         | 
| 12 | 
            +
                steps:
         | 
| 13 | 
            +
                  - uses: actions/checkout@v3
         | 
| 14 | 
            +
                    with:
         | 
| 15 | 
            +
                      fetch-depth: 0
         | 
| 16 | 
            +
                      lfs: true
         | 
| 17 | 
            +
                  - name: Push to hub
         | 
| 18 | 
            +
                    env:
         | 
| 19 | 
            +
                      HF_TOKEN: ${{ secrets.HF_TOKEN }}
         | 
| 20 | 
            +
                    run: git push --force https://jjmakes:$HF_TOKEN@huggingface.co/spaces/jjmakes/cs482-toxic-tweets main
         | 
    	
        README.md
    ADDED
    
    | @@ -0,0 +1,110 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            title: Cs482 Toxic Tweets
         | 
| 3 | 
            +
            emoji: ⚡
         | 
| 4 | 
            +
            colorFrom: green
         | 
| 5 | 
            +
            colorTo: green
         | 
| 6 | 
            +
            sdk: streamlit
         | 
| 7 | 
            +
            sdk_version: 1.17.0
         | 
| 8 | 
            +
            app_file: app.py
         | 
| 9 | 
            +
            pinned: false
         | 
| 10 | 
            +
            duplicated_from: kya5/milestone-3
         | 
| 11 | 
            +
            ---
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            # Finetuning Language Models - Toxic Tweets
         | 
| 14 | 
            +
             | 
| 15 | 
            +
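            [![Sync to Hugging Face hub](https://github.com/jjmakes/cs482-project/actions/workflows/sync_to_hf.yml/badge.svg)](https://github.com/jjmakes/cs482-project/actions/workflows/sync_to_hf.yml)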
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            ## [See the deployed App on HuggingFace](https://huggingface.co/spaces/jjmakes/cs482-toxic-tweets)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            CS 482 Project - [Instructions](https://pantelis.github.io/data-mining/aiml-common/projects/nlp/finetuning-language-models-tweets/index.html)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            ## Milestone 1 - Development Environment
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ## OS Version
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            This project was created in Ubuntu 20.04. Thus, steps for installing and developing in Windows are not included.
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            ```
         | 
| 28 | 
            +
            Distributor ID: Ubuntu
         | 
| 29 | 
            +
            Description: Ubuntu 20.04.6 LTS
         | 
| 30 | 
            +
            Release: 20.04
         | 
| 31 | 
            +
            Codename: focal
         | 
| 32 | 
            +
            ```
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            ## Docker Installation
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            The instructions below will help install Docker on Ubuntu version 20.04.6
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            ```
         | 
| 39 | 
            +
            ## Update list of existing packages
         | 
| 40 | 
            +
            sudo apt update
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            ## Install prerequisite packages
         | 
| 43 | 
            +
            sudo apt install apt-transport-https ca-certificates curl software-properties-common
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            ## Add GPG key for the official Docker repository
         | 
| 46 | 
            +
            curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            ## Add the Docker repository to APT sources
         | 
| 49 | 
            +
            sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            ## Prep to install from docker repo
         | 
| 52 | 
            +
            apt-cache policy docker-ce
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            ## Install docker
         | 
| 55 | 
            +
            sudo apt install docker-ce
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            ## Check if docker is running
         | 
| 58 | 
            +
            sudo systemctl status docker
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            ## Add the current user to the docker group (run docker without sudo)
         | 
| 61 | 
            +
            sudo usermod -aG docker ${USER}
            ```
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            ## VS Code Installation
         | 
| 64 | 
            +
             | 
| 65 | 
            +
            The instructions below will help install VS Code on Ubuntu version 20.04.6
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            [Download the VS Code .deb package (64 bit)](https://code.visualstudio.com/download)
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            ```
         | 
| 70 | 
            +
            ## Navigate to downloads folder
         | 
| 71 | 
            +
            cd ~/Downloads
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            ## Install VS Code (replace <file> with the downloaded package)
         | 
| 74 | 
            +
            sudo apt install ./<file>.deb
         | 
| 75 | 
            +
            ```
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            ## Creating a development environment with docker
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            [Quick Start Development Container](https://code.visualstudio.com/docs/devcontainers/containers#_quick-start-try-a-development-container)
         | 
| 80 | 
            +
             | 
| 81 | 
            +
            1. Press **F1** and run _Dev Containers: Open Folder in Container..._
         | 
| 82 | 
            +
            2. Select starting image
         | 
| 83 | 
            +
             | 
| 84 | 
            +
            Some notable images worth using are:
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            - Alpine: Barebones Linux OS
         | 
| 87 | 
            +
            - Python3: Container for developing Python 3 Applications
         | 
| 88 | 
            +
             | 
| 89 | 
            +
            
         | 
| 90 | 
            +
             | 
| 91 | 
            +
             | 
| 92 | 
            +
            ## Milestone 2
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            App is deployed to [HuggingFace](https://huggingface.co/spaces/jjmakes/cs482-toxic-tweets) via GitHub actions following [instructions provided in this tutorial](https://www.youtube.com/watch?v=8hOzsFETm4I). HuggingFace provides documentation for performing [sentiment analysis with python](https://huggingface.co/blog/sentiment-analysis-python).
         | 
| 95 | 
            +
             | 
| 96 | 
            +
            ### Testing with Streamlit Locally
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            To test with streamlit, install the project dependencies locally with:
         | 
| 99 | 
            +
            ```
         | 
| 100 | 
            +
            pip3 install -r requirements.txt
         | 
| 101 | 
            +
            ```
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            To run the project, use:
         | 
| 104 | 
            +
            ```
         | 
| 105 | 
            +
            streamlit run app.py --server.port 8888
         | 
| 106 | 
            +
            ```
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            The page can be set to hot-reload by selecting `Always Rerun` after a change is made.
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            Models used are pretrained and provided by [HuggingFace](https://huggingface.co/models?pipeline_tag=text-classification&sort=likes&search=sentiment).
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,72 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import streamlit as st
         | 
| 2 | 
            +
            from transformers import AutoModelForSequenceClassification, AutoTokenizer
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import pandas as pd
         | 
| 5 | 
            +
            import random
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            classifiers = ['toxic', 'severe_toxic', 'obscene',
         | 
| 8 | 
            +
                           'threat', 'insult', 'identity_hate']
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            def reset_scores():
         | 
| 12 | 
            +
                global scores_df
         | 
| 13 | 
            +
                scores_df = pd.DataFrame(columns=['Comment'] + classifiers)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def get_score(model_base, text):
         | 
| 17 | 
            +
                if model_base == "bert-base-cased":
         | 
| 18 | 
            +
                    model_dir = "./bert/_bert_model"
         | 
| 19 | 
            +
                elif model_base == "distilbert-base-cased":
         | 
| 20 | 
            +
                    model_dir = "./distilbert/_distilbert_model"
         | 
| 21 | 
            +
                else:
         | 
| 22 | 
            +
                    model_dir = "./roberta/_roberta_model"
         | 
| 23 | 
            +
                model = AutoModelForSequenceClassification.from_pretrained(model_dir)
         | 
| 24 | 
            +
                tokenizer = AutoTokenizer.from_pretrained(model_base)
         | 
| 25 | 
            +
                inputs = tokenizer.encode_plus(
         | 
| 26 | 
            +
                    text, max_length=512, truncation=True, padding=True, return_tensors='pt')
         | 
| 27 | 
            +
                outputs = model(**inputs)
         | 
| 28 | 
            +
                predictions = torch.sigmoid(outputs.logits)
         | 
| 29 | 
            +
                return predictions
         | 
| 30 | 
            +
             | 
| 31 | 
            +
             | 
| 32 | 
            +
            st.title("Toxic Comment Classifier")
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            model_base = st.selectbox("Select a pretrained model",
         | 
| 35 | 
            +
                                      ["roberta-base", "bert-base-cased", "distilbert-base-cased"])
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            text_input = st.text_input("Enter text for toxicity classification",
         | 
| 38 | 
            +
                                       "")
         | 
| 39 | 
            +
            submit_btn = st.button("Submit")
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            if submit_btn and text_input:
         | 
| 43 | 
            +
                result = get_score(model_base, text_input)
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                df = pd.DataFrame([result[0].tolist()], columns=classifiers)
         | 
| 46 | 
            +
                df = df.round(2)  # Round the values to 2 decimal places
         | 
| 47 | 
            +
                df = df.applymap(lambda x: '{:.0%}'.format(x))
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                st.table(df)
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            test_df = pd.read_csv(
         | 
| 52 | 
            +
                "./jigsaw-toxic-comment-classification-challenge/test.csv")
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            sample_df = test_df.sample(n=3)
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            reset_scores()
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            for index, row in sample_df.iterrows():
         | 
| 59 | 
            +
                result = get_score(model_base, row['comment_text'])
         | 
| 60 | 
            +
                scores = result[0].tolist()
         | 
| 61 | 
            +
                scores_df.loc[len(scores_df)] = [row['comment_text']] + scores
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            scores_df = scores_df.round(2)
         | 
| 64 | 
            +
             | 
| 65 | 
            +
             | 
| 66 | 
            +
            st.subheader("Toxicity Scores for Random Comments")
         | 
| 67 | 
            +
            if st.button("Refresh"):
         | 
| 68 | 
            +
                reset_scores()
         | 
| 69 | 
            +
                st.success("New tweets have been loaded!")
         | 
| 70 | 
            +
            st.table(scores_df)
         | 
| 71 | 
            +
             | 
| 72 | 
            +
             | 
    	
        bert/_bert_model/config.json
    ADDED
    
    | @@ -0,0 +1,44 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "vinai/bertweet-base",
         | 
| 3 | 
            +
              "architectures": [
         | 
| 4 | 
            +
                "RobertaForSequenceClassification"
         | 
| 5 | 
            +
              ],
         | 
| 6 | 
            +
              "attention_probs_dropout_prob": 0.1,
         | 
| 7 | 
            +
              "bos_token_id": 0,
         | 
| 8 | 
            +
              "eos_token_id": 2,
         | 
| 9 | 
            +
              "gradient_checkpointing": false,
         | 
| 10 | 
            +
              "hidden_act": "gelu",
         | 
| 11 | 
            +
              "hidden_dropout_prob": 0.1,
         | 
| 12 | 
            +
              "hidden_size": 768,
         | 
| 13 | 
            +
              "id2label": {
         | 
| 14 | 
            +
                "0": "LABEL_0",
         | 
| 15 | 
            +
                "1": "LABEL_1",
         | 
| 16 | 
            +
                "2": "LABEL_2",
         | 
| 17 | 
            +
                "3": "LABEL_3",
         | 
| 18 | 
            +
                "4": "LABEL_4",
         | 
| 19 | 
            +
                "5": "LABEL_5"
         | 
| 20 | 
            +
              },
         | 
| 21 | 
            +
              "initializer_range": 0.02,
         | 
| 22 | 
            +
              "intermediate_size": 3072,
         | 
| 23 | 
            +
              "label2id": {
         | 
| 24 | 
            +
                "LABEL_0": 0,
         | 
| 25 | 
            +
                "LABEL_1": 1,
         | 
| 26 | 
            +
                "LABEL_2": 2,
         | 
| 27 | 
            +
                "LABEL_3": 3,
         | 
| 28 | 
            +
                "LABEL_4": 4,
         | 
| 29 | 
            +
                "LABEL_5": 5
         | 
| 30 | 
            +
              },
         | 
| 31 | 
            +
              "layer_norm_eps": 1e-05,
         | 
| 32 | 
            +
              "max_position_embeddings": 130,
         | 
| 33 | 
            +
              "model_type": "roberta",
         | 
| 34 | 
            +
              "num_attention_heads": 12,
         | 
| 35 | 
            +
              "num_hidden_layers": 12,
         | 
| 36 | 
            +
              "pad_token_id": 1,
         | 
| 37 | 
            +
              "position_embedding_type": "absolute",
         | 
| 38 | 
            +
              "problem_type": "multi_label_classification",
         | 
| 39 | 
            +
              "tokenizer_class": "BertweetTokenizer",
         | 
| 40 | 
            +
              "transformers_version": "4.8.0",
         | 
| 41 | 
            +
              "type_vocab_size": 1,
         | 
| 42 | 
            +
              "use_cache": true,
         | 
| 43 | 
            +
              "vocab_size": 64001
         | 
| 44 | 
            +
            }
         | 
    	
        bert/_bert_model/pytorch_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:c1c171ff9ebed4a7224889a84edd1ea084ed01f4bcda6c6a637bb1ed63d3d196
         | 
| 3 | 
            +
            size 539702389
         | 
    	
        bert/_bert_model/training_args.bin
    ADDED
    
    | Binary file (2.56 kB). View file | 
|  | 
    	
        distilbert/_distilbert_model/config.json
    ADDED
    
    | @@ -0,0 +1,41 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "distilbert-base-cased",
         | 
| 3 | 
            +
              "activation": "gelu",
         | 
| 4 | 
            +
              "architectures": [
         | 
| 5 | 
            +
                "DistilBertForSequenceClassification"
         | 
| 6 | 
            +
              ],
         | 
| 7 | 
            +
              "attention_dropout": 0.1,
         | 
| 8 | 
            +
              "dim": 768,
         | 
| 9 | 
            +
              "dropout": 0.1,
         | 
| 10 | 
            +
              "hidden_dim": 3072,
         | 
| 11 | 
            +
              "id2label": {
         | 
| 12 | 
            +
                "0": "LABEL_0",
         | 
| 13 | 
            +
                "1": "LABEL_1",
         | 
| 14 | 
            +
                "2": "LABEL_2",
         | 
| 15 | 
            +
                "3": "LABEL_3",
         | 
| 16 | 
            +
                "4": "LABEL_4",
         | 
| 17 | 
            +
                "5": "LABEL_5"
         | 
| 18 | 
            +
              },
         | 
| 19 | 
            +
              "initializer_range": 0.02,
         | 
| 20 | 
            +
              "label2id": {
         | 
| 21 | 
            +
                "LABEL_0": 0,
         | 
| 22 | 
            +
                "LABEL_1": 1,
         | 
| 23 | 
            +
                "LABEL_2": 2,
         | 
| 24 | 
            +
                "LABEL_3": 3,
         | 
| 25 | 
            +
                "LABEL_4": 4,
         | 
| 26 | 
            +
                "LABEL_5": 5
         | 
| 27 | 
            +
              },
         | 
| 28 | 
            +
              "max_position_embeddings": 512,
         | 
| 29 | 
            +
              "model_type": "distilbert",
         | 
| 30 | 
            +
              "n_heads": 12,
         | 
| 31 | 
            +
              "n_layers": 6,
         | 
| 32 | 
            +
              "output_past": true,
         | 
| 33 | 
            +
              "pad_token_id": 0,
         | 
| 34 | 
            +
              "problem_type": "multi_label_classification",
         | 
| 35 | 
            +
              "qa_dropout": 0.1,
         | 
| 36 | 
            +
              "seq_classif_dropout": 0.2,
         | 
| 37 | 
            +
              "sinusoidal_pos_embds": false,
         | 
| 38 | 
            +
              "tie_weights_": true,
         | 
| 39 | 
            +
              "transformers_version": "4.8.0",
         | 
| 40 | 
            +
              "vocab_size": 28996
         | 
| 41 | 
            +
            }
         | 
    	
        distilbert/_distilbert_model/pytorch_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a4276639fc9c2f4f22680df4f17412ba1cf058f6e3a0b4f77a6df203cea934b9
         | 
| 3 | 
            +
            size 263185709
         | 
    	
        distilbert/_distilbert_model/training_args.bin
    ADDED
    
    | Binary file (2.56 kB). View file | 
|  | 
    	
        jigsaw-toxic-comment-classification-challenge/sample_submission.csv
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        jigsaw-toxic-comment-classification-challenge/test.csv
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:c2513ce4abb98c4d1d216e3ca0d4377d57589a0989aa8c06a840509a16c786e8
         | 
| 3 | 
            +
            size 60354593
         | 
    	
        jigsaw-toxic-comment-classification-challenge/test_labels.csv
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        jigsaw-toxic-comment-classification-challenge/train.csv
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
         | 
| 3 | 
            +
            size 68802655
         | 
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            streamlit
         | 
| 2 | 
            +
            numpy
         | 
| 3 | 
            +
            transformers
         | 
| 4 | 
            +
            tensorflow
         | 
| 5 | 
            +
            torch
         | 
    	
        roberta/_roberta_model/config.json
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "roberta-base",
         | 
| 3 | 
            +
              "architectures": [
         | 
| 4 | 
            +
                "RobertaForSequenceClassification"
         | 
| 5 | 
            +
              ],
         | 
| 6 | 
            +
              "attention_probs_dropout_prob": 0.1,
         | 
| 7 | 
            +
              "bos_token_id": 0,
         | 
| 8 | 
            +
              "eos_token_id": 2,
         | 
| 9 | 
            +
              "gradient_checkpointing": false,
         | 
| 10 | 
            +
              "hidden_act": "gelu",
         | 
| 11 | 
            +
              "hidden_dropout_prob": 0.1,
         | 
| 12 | 
            +
              "hidden_size": 768,
         | 
| 13 | 
            +
              "id2label": {
         | 
| 14 | 
            +
                "0": "LABEL_0",
         | 
| 15 | 
            +
                "1": "LABEL_1",
         | 
| 16 | 
            +
                "2": "LABEL_2",
         | 
| 17 | 
            +
                "3": "LABEL_3",
         | 
| 18 | 
            +
                "4": "LABEL_4",
         | 
| 19 | 
            +
                "5": "LABEL_5"
         | 
| 20 | 
            +
              },
         | 
| 21 | 
            +
              "initializer_range": 0.02,
         | 
| 22 | 
            +
              "intermediate_size": 3072,
         | 
| 23 | 
            +
              "label2id": {
         | 
| 24 | 
            +
                "LABEL_0": 0,
         | 
| 25 | 
            +
                "LABEL_1": 1,
         | 
| 26 | 
            +
                "LABEL_2": 2,
         | 
| 27 | 
            +
                "LABEL_3": 3,
         | 
| 28 | 
            +
                "LABEL_4": 4,
         | 
| 29 | 
            +
                "LABEL_5": 5
         | 
| 30 | 
            +
              },
         | 
| 31 | 
            +
              "layer_norm_eps": 1e-05,
         | 
| 32 | 
            +
              "max_position_embeddings": 514,
         | 
| 33 | 
            +
              "model_type": "roberta",
         | 
| 34 | 
            +
              "num_attention_heads": 12,
         | 
| 35 | 
            +
              "num_hidden_layers": 12,
         | 
| 36 | 
            +
              "pad_token_id": 1,
         | 
| 37 | 
            +
              "position_embedding_type": "absolute",
         | 
| 38 | 
            +
              "problem_type": "multi_label_classification",
         | 
| 39 | 
            +
              "transformers_version": "4.8.0",
         | 
| 40 | 
            +
              "type_vocab_size": 1,
         | 
| 41 | 
            +
              "use_cache": true,
         | 
| 42 | 
            +
              "vocab_size": 50265
         | 
| 43 | 
            +
            }
         | 
    	
        roberta/_roberta_model/pytorch_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:56b176692120cdc3c43be9880d33b1e6fa138146784a91f6c473cc3c701c81ce
         | 
| 3 | 
            +
            size 498688117
         | 
    	
        roberta/_roberta_model/training_args.bin
    ADDED
    
    | Binary file (2.56 kB). View file | 
|  | 
    	
        train.py
    ADDED
    
    | @@ -0,0 +1,156 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import pandas as pd
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, RobertaTokenizer, RobertaForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
         | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            from torch.utils.data import Dataset
         | 
| 6 | 
            +
            # Runs once at the top of the script, before any model is loaded.
            # NOTE(review): presumably meant to start training with as much free
            # GPU memory as possible; confirm it is still needed, since nothing
            # has been allocated by this process yet at this point.
            torch.cuda.empty_cache()
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            class MultiLabelClassifierDataset(Dataset):
                """Map-style dataset pairing tokenizer encodings with label vectors.

                Args:
                    encodings: Mapping from feature name to an indexable sequence
                        holding one entry per example (lists, arrays or tensors).
                    labels: Indexable sequence holding one label vector per
                        example; its length defines the dataset length.
                """

                def __init__(self, encodings, labels):
                    self.encodings = encodings
                    self.labels = labels

                def __getitem__(self, idx):
                    """Return a dict of feature tensors plus a float ``'labels'`` tensor."""
                    # as_tensor accepts lists, arrays and existing tensors alike;
                    # torch.tensor() on an existing tensor warns and forces a copy.
                    item = {key: torch.as_tensor(val[idx])
                            for key, val in self.encodings.items()}
                    # Build the label vector directly as float instead of creating
                    # an integer tensor first and casting it afterwards.
                    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
                    return item

                def __len__(self):
                    """Return the number of examples, i.e. the number of label rows."""
                    return len(self.labels)
         | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
            # Fine-tunes three pretrained checkpoints on the same 10% sample of
            # train.csv and saves each result under its own directory next to
            # this file.
            work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
            dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'

            # Label columns read from train.csv; the models get one output per column.
            classifiers = ['toxic', 'severe_toxic', 'obscene',
                           'threat', 'insult', 'identity_hate']

            df = pd.read_csv(dataset_dir + 'train.csv')
            df = df.sample(frac=1).reset_index(drop=True)  # Shuffle (unseeded)

            # Only the first 10% of the shuffled rows is used for training.
            train_df = df[:int(len(df)*0.1)]

            train_texts = train_df['comment_text'].tolist()
            train_labels = train_df[classifiers].to_numpy()

            # Detect CUDA instead of assuming it: the fp16 flag passed to
            # TrainingArguments below is only valid on CUDA devices.
            use_cuda = torch.cuda.is_available()
            device = torch.device('cuda' if use_cuda else 'cpu')
            print("Using device: ", device)


            def _fine_tune(title, checkpoint, save_path, num_train_epochs,
                           per_device_eval_batch_size, tokenizer_cls=AutoTokenizer):
                """Tokenize the training texts, fine-tune ``checkpoint`` and save it.

                Args:
                    title: Heading printed before the run starts.
                    checkpoint: Pretrained model identifier used for both the
                        tokenizer and the sequence-classification model.
                    save_path: Directory passed to ``trainer.save_model``.
                    num_train_epochs: Number of training epochs for this run.
                    per_device_eval_batch_size: Eval batch size forwarded to
                        ``TrainingArguments`` (no evaluation is run here).
                    tokenizer_cls: Class whose ``from_pretrained`` builds the
                        tokenizer; defaults to ``AutoTokenizer``.
                """
                print(title)
                print("Model base: ", checkpoint)

                # NOTE(review): warmup_steps is fixed at 500 for every run; confirm
                # it does not exceed the total number of optimizer steps of the
                # single-epoch runs on the 10% sample.
                training_args = TrainingArguments(
                    output_dir='./results',
                    num_train_epochs=num_train_epochs,
                    per_device_train_batch_size=32,
                    per_device_eval_batch_size=per_device_eval_batch_size,
                    warmup_steps=500,
                    weight_decay=0.01,
                    logging_dir='./logs',
                    logging_steps=10,
                    fp16=use_cuda,
                )

                tokenizer = tokenizer_cls.from_pretrained(
                    checkpoint, model_max_length=128)
                train_encodings = tokenizer(
                    train_texts, truncation=True, padding=True)

                print("Training model to be stored in " + save_path)

                print("Creating dataset")
                train_dataset = MultiLabelClassifierDataset(
                    train_encodings, train_labels)

                print("Loading model for training...")
                model = AutoModelForSequenceClassification.from_pretrained(
                    checkpoint, num_labels=len(classifiers))

                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=train_dataset,
                )
                trainer.train()
                trainer.save_model(save_path)


            bert_dir = work_dir + 'bert/'
            _fine_tune("BERT", "vinai/bertweet-base", bert_dir + '_bert_model',
                       num_train_epochs=2, per_device_eval_batch_size=64)

            # The RoBERTa run keeps the explicit RobertaTokenizer class it used before.
            roberta_dir = work_dir + 'roberta/'
            _fine_tune("RoBERTa", 'roberta-base', roberta_dir + '_roberta_model',
                       num_train_epochs=1, per_device_eval_batch_size=16,
                       tokenizer_cls=RobertaTokenizer)

            distilbert_dir = work_dir + 'distilbert/'
            _fine_tune("DISTILBERT", 'distilbert-base-cased',
                       distilbert_dir + '_distilbert_model',
                       num_train_epochs=1, per_device_eval_batch_size=64)
         |