Spaces:

suryadev1
/

astra

Sleeping

File size: 17,718 Bytes

8b6cbfb
 
 
 
 
 
cecfca1
 
 
9f91555
8b6cbfb
 
 
 
 
9f91555
8b6cbfb
cecfca1
 
 
 
8b6cbfb
 
 
 
cecfca1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f91555
cecfca1
 
 
 
 
 
8b6cbfb
 
 
 
 
 
 
 
 
cecfca1
8b6cbfb
 
 
 
 
cecfca1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b6cbfb
 
 
9f91555
cecfca1
9f91555
 
cecfca1
 
9f91555
 
 
cecfca1
 
 
9f91555
 
cecfca1
 
 
 
9f91555
 
 
 
cecfca1
9f91555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecfca1
9f91555
cecfca1
 
 
 
9f91555
 
 
cecfca1
9f91555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecfca1
9f91555
 
 
 
 
cecfca1
 
 
 
9f91555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecfca1
9f91555
cecfca1
 
9f91555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecfca1
 
9f91555
cecfca1
 
 
9f91555
 
 
 
cecfca1
 
9f91555
cecfca1
 
9f91555
cecfca1
 
 
 
 
 
 
9f91555
cecfca1
 
 
 
 
 
 
 
 
 
 
9f91555
 
cecfca1
 
 
 
9f91555
8b6cbfb
9f91555

import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import tqdm
import random
from .vocab import Vocab
import pickle
import copy
import os

class TokenizerDataset(Dataset):
    """
        Class name: TokenizerDataset
        Tokenize the data in the dataset.

        Every non-blank line of ``dataset_path`` is one sample: a
        tab-separated sequence that ``vocab.to_seq`` converts to ids and that
        is then padded up to ``seq_len`` with the ``[PAD]`` id.

        When ``label_path`` is given, one integer label is read per non-blank
        line, and feature vectors are read on a best-effort basis from the
        file whose path is ``label_path`` with "label" replaced by "info".

        Feat length: 17 (original author's note; the real length depends on
        the contents of the info file).
    """

    # Step names after which a 'FinalAnswer-' token is replaced by '[UNK]'.
    _OPTIONAL_STEPS = frozenset([
        "OptionalTask_1", "EquationAnswer", "NumeratorFactor",
        "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2",
        "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow",
    ])

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        """Load sequences, and optionally labels and features, into memory.

        Args:
            dataset_path: text file with one tab-separated sequence per line.
            label_path: text file with one integer label per line, or a falsy
                value for unlabelled data.
            vocab: object providing ``to_seq(line, seq_len)`` and a ``vocab``
                mapping that contains ``'[PAD]'``.
            seq_len: length every returned sequence is padded to.

        Raises:
            OSError: if ``dataset_path`` or ``label_path`` cannot be opened.
            ValueError: if a label line is not an integer.
        """
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to input dataset file
        self.lines = []
        self.labels = []
        self.feats = []

        # Files are read through context managers and are not kept on the
        # instance: file objects cannot be pickled, which would prevent the
        # dataset from being sent to worker processes.
        if self.label_path:
            with open(self.label_path, "r") as label_file:
                for line in label_file:
                    line = line.strip()
                    if line:
                        self.labels.append(int(line))

            # Comment this section if you are not using feat attribute
            self._load_feats(self.label_path.replace("label", "info"))

        with open(self.dataset_path, "r") as data_file:
            for line in data_file:
                line = line.strip()
                if line:
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)

    def _load_feats(self, info_path):
        """Best-effort read of feature vectors from ``info_path``.

        A missing file or a malformed line is reported with ``print`` and
        stops the read; vectors parsed before the failure are kept.
        """
        try:
            with open(info_path, "r") as info_file:
                for line in info_file:
                    line = line.strip()
                    if not line:
                        continue

                    fields = line.split(",")
                    # highGRschool_w_prior_w_diffskill_wo_fa: third-to-last
                    # column plus the second-to-last column without its
                    # first entry. (highGRschool_w_prior uses only the
                    # third-to-last column.)
                    feat_vec = [float(i) for i in fields[-3].split("\t")]
                    feat2 = [float(i) for i in fields[-2].split("\t")]
                    feat_vec.extend(feat2[1:])

                    if not self.feats:
                        # Report the feature length once, on the first vector.
                        print(len(feat_vec))

                    self.feats.append(feat_vec)
        except (OSError, ValueError, IndexError) as e:
            print(e)

    def __len__(self):
        """Return the number of loaded sequences."""
        return self.len

    def _mask_optional_final_answers(self, line):
        """Replace 'FinalAnswer-' tokens seen from the first optional step on with '[UNK]'."""
        masked = []
        in_optional = False
        for token in line.split("\t"):
            if token in self._OPTIONAL_STEPS:
                in_optional = True
            if in_optional and 'FinalAnswer-' in token:
                masked.append('[UNK]')
            else:
                masked.append(token)
        return "\t".join(masked)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Keys: 'input' (padded ids), 'label' (0 when no ``label_path``),
        'feat' (0 when no features were loaded) and 'segment_label'.
        """
        dup_line = self._mask_optional_final_answers(self.lines[item])
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # This is like tokenizer and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1] * len(s1)
        s1_feat = self.feats[item] if self.feats else 0

        # Both lists are padded with the [PAD] id, exactly as before.
        pad_len = max(0, self.seq_len - len(s1))
        padding = [self.vocab.vocab['[PAD]']] * pad_len
        s1.extend(padding)
        segment_label.extend(padding)

        output = {'input': s1,
                  'label': s1_label,
                  'feat': s1_feat,
                  'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
        
class TokenizerwSkillsDataset(Dataset):
    """
        Tokenized sequences with skill feature vectors.

        Features come from the file whose path is ``label_path`` with "label"
        replaced by "info". Only rows whose info line parses successfully are
        kept; labels and sequences are selected by the same line index.

        Feature length: 17 (original author's note; the real length depends
        on the contents of the info file).
    """

    # Step names after which a 'FinalAnswer-' token is replaced by '[UNK]'.
    _OPTIONAL_STEPS = frozenset([
        "OptionalTask_1", "EquationAnswer", "NumeratorFactor",
        "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2",
        "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow",
    ])

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        """Load sequences, labels and skill features into memory.

        Args:
            dataset_path: text file with one tab-separated sequence per line.
            label_path: text file with one integer label per line, or a falsy
                value for unlabelled data (then every sequence is kept and no
                features are loaded).
            vocab: object providing ``to_seq(line, seq_len)`` and a ``vocab``
                mapping that contains ``'[PAD]'``.
            seq_len: length every returned sequence is padded to.

        Raises:
            OSError: if one of the input files cannot be opened.
            ValueError: if a selected label line is not an integer.
        """
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")

        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Line indices whose info row parsed; a set keeps the membership
        # tests below O(1) instead of scanning a list for every line.
        selected_lines = set()

        print("TokenizerwSkillsDataset...............................")

        if self.label_path:
            # Comment this section if you are not using feat attribute
            with open(self.label_path.replace("label", "info"), "r") as info_file:
                dataset_info_file = info_file.readlines()
            print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
            for idex, line in enumerate(dataset_info_file):
                line = line.strip()
                if not line:
                    continue
                try:
                    fields = line.split(",")
                    feat_vec = [float(i) for i in fields[-9].split("\t")]
                    feat2 = [float(i) for i in fields[-8].split("\t")]
                    feat_vec.extend(feat2[1:])
                except (ValueError, IndexError) as e:
                    # Malformed row: report it and leave the sample out.
                    print("................>")
                    print(e)
                    print("Error at index: ", idex)
                    continue

                if not self.feats:
                    # Show the first feature vector once as a sanity check.
                    print(";;;;", len(feat_vec), feat_vec)
                self.feats.append(feat_vec)
                selected_lines.add(idex)

            with open(self.label_path, "r") as label_file:
                for idex, line in enumerate(label_file):
                    line = line.strip()
                    if line and idex in selected_lines:
                        self.labels.append(int(line))

        # Without a label file there is nothing to filter on, so keep every
        # sequence instead of producing an empty dataset.
        keep_all = not self.label_path
        with open(self.dataset_path, "r") as data_file:
            for idex, line in enumerate(data_file):
                line = line.strip()
                if line and (keep_all or idex in selected_lines):
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)

    def __len__(self):
        """Return the number of kept sequences."""
        return self.len

    def _mask_optional_final_answers(self, line):
        """Replace 'FinalAnswer-' tokens seen from the first optional step on with '[UNK]'."""
        masked = []
        in_optional = False
        for token in line.split("\t"):
            if token in self._OPTIONAL_STEPS:
                in_optional = True
            if in_optional and 'FinalAnswer-' in token:
                masked.append('[UNK]')
            else:
                masked.append(token)
        return "\t".join(masked)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Keys: 'input' (padded ids), 'label' (0 when no ``label_path``),
        'feat' (0 when no features were loaded) and 'segment_label'.
        """
        dup_line = self._mask_optional_final_answers(self.lines[item])
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # This is like tokenizer and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1] * len(s1)
        s1_feat = self.feats[item] if self.feats else 0

        # Both lists are padded with the [PAD] id, exactly as before.
        pad_len = max(0, self.seq_len - len(s1))
        padding = [self.vocab.vocab['[PAD]']] * pad_len
        s1.extend(padding)
        segment_label.extend(padding)

        output = {'input': s1,
                  'label': s1_label,
                  'feat': s1_feat,
                  'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}


class TokenizerwTimeDataset(Dataset):
    """
        Tokenized sequences with time feature vectors.

        For each row of the info file (``label_path`` with "label" replaced
        by "info"), comma-separated columns 0, 2, 3 and 4 form the key used
        to look up the time columns in a pickled table. Only rows whose
        lookup succeeds are kept; labels and sequences are selected by the
        same line index.

        Feature length: 4 (faopt_time, total_time, opt_time, nonopt_time)
    """

    # Step names after which a 'FinalAnswer-' token is replaced by '[UNK]'.
    _OPTIONAL_STEPS = frozenset([
        "OptionalTask_1", "EquationAnswer", "NumeratorFactor",
        "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2",
        "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow",
    ])

    # Location used when the caller does not pass ``time_info_path``.
    _DEFAULT_TIME_INFO_PATH = "ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl"

    # Order in which the time columns are placed in the feature vector.
    _TIME_COLUMNS = ("faopt_time", "total_time", "opt_time", "nonopt_time")

    def __init__(self, dataset_path, label_path, vocab, seq_len=30, time_info_path=None):
        """Load sequences, labels and time features into memory.

        Args:
            dataset_path: text file with one tab-separated sequence per line.
            label_path: text file with one integer label per line, or a falsy
                value for unlabelled data (then every sequence is kept and no
                features are loaded).
            vocab: object providing ``to_seq(line, seq_len)`` and a ``vocab``
                mapping that contains ``'[PAD]'``.
            seq_len: length every returned sequence is padded to.
            time_info_path: pickle file holding the time table; defaults to
                the project's original hard-coded location.

        Raises:
            OSError: if one of the input files cannot be opened.
            ValueError: if a selected label line is not an integer.
        """
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")

        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Line indices whose info row resolved; a set keeps the membership
        # tests below O(1) instead of scanning a list for every line.
        selected_lines = set()

        print("TokenizerwTimeDataset...............................")

        if self.label_path:
            if time_info_path is None:
                time_info_path = self._DEFAULT_TIME_INFO_PATH
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at a trusted file.
            with open(time_info_path, "rb") as time_file:
                time_df = pickle.load(time_file)
            print("time: ?? ", time_df.shape)

            # Comment this section if you are not using feat attribute
            with open(self.label_path.replace("label", "info"), "r") as info_file:
                dataset_info_file = info_file.readlines()
            print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
            for idex, line in enumerate(dataset_info_file):
                line = line.strip()
                if not line:
                    continue
                try:
                    feat_vec = self._time_features(time_df, line.split(","))
                except Exception as e:
                    # Deliberately broad: a malformed row or a failed table
                    # lookup is reported and the sample is left out.
                    print("................>")
                    print(e)
                    print("Error at index: ", idex)
                    continue

                if not self.feats:
                    # Show the first feature vector once as a sanity check.
                    print(";;;;", len(feat_vec), feat_vec)
                self.feats.append(feat_vec)
                selected_lines.add(idex)

            with open(self.label_path, "r") as label_file:
                for idex, line in enumerate(label_file):
                    line = line.strip()
                    if line and idex in selected_lines:
                        self.labels.append(int(line))

        # Without a label file there is nothing to filter on, so keep every
        # sequence instead of producing an empty dataset.
        keep_all = not self.label_path
        with open(self.dataset_path, "r") as data_file:
            for idex, line in enumerate(data_file):
                line = line.strip()
                if line and (keep_all or idex in selected_lines):
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)

    @classmethod
    def _time_features(cls, time_df, fields):
        """Look up the time values for one info row (single table lookup)."""
        sch, stu, progress, prob_id = fields[0], fields[2], fields[3], fields[4]
        row = time_df.loc[(sch, stu, progress, prob_id)]
        return [row[column].item() for column in cls._TIME_COLUMNS]

    def __len__(self):
        """Return the number of kept sequences."""
        return self.len

    def _mask_optional_final_answers(self, line):
        """Replace 'FinalAnswer-' tokens seen from the first optional step on with '[UNK]'."""
        masked = []
        in_optional = False
        for token in line.split("\t"):
            if token in self._OPTIONAL_STEPS:
                in_optional = True
            if in_optional and 'FinalAnswer-' in token:
                masked.append('[UNK]')
            else:
                masked.append(token)
        return "\t".join(masked)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Keys: 'input' (padded ids), 'label' (0 when no ``label_path``),
        'feat' (0 when no features were loaded) and 'segment_label'.
        """
        dup_line = self._mask_optional_final_answers(self.lines[item])
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # This is like tokenizer and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1] * len(s1)
        s1_feat = self.feats[item] if self.feats else 0

        # Both lists are padded with the [PAD] id, exactly as before.
        pad_len = max(0, self.seq_len - len(s1))
        padding = [self.vocab.vocab['[PAD]']] * pad_len
        s1.extend(padding)
        segment_label.extend(padding)

        output = {'input': s1,
                  'label': s1_label,
                  'feat': s1_feat,
                  'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
        
class TokenizerwSkillsTimeDataset(Dataset):
    """
        Tokenized sequences with skill and time feature vectors.

        For each row of the info file (``label_path`` with "label" replaced
        by "info"), the skill values are parsed from the ninth- and
        eighth-to-last comma-separated columns, and columns 0, 2, 3 and 4
        form the key used to look up the time columns in a pickled table.
        Only rows that parse and resolve are kept; labels and sequences are
        selected by the same line index.

        Feature length: 17+4 = 21 (original author's note; the skill part
        depends on the contents of the info file)
    """

    # Step names after which a 'FinalAnswer-' token is replaced by '[UNK]'.
    _OPTIONAL_STEPS = frozenset([
        "OptionalTask_1", "EquationAnswer", "NumeratorFactor",
        "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2",
        "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow",
    ])

    # Location used when the caller does not pass ``time_info_path``.
    _DEFAULT_TIME_INFO_PATH = "ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl"

    # Order in which the time columns are appended to the feature vector.
    _TIME_COLUMNS = ("faopt_time", "total_time", "opt_time", "nonopt_time")

    def __init__(self, dataset_path, label_path, vocab, seq_len=30, time_info_path=None):
        """Load sequences, labels and skill+time features into memory.

        Args:
            dataset_path: text file with one tab-separated sequence per line.
            label_path: text file with one integer label per line, or a falsy
                value for unlabelled data (then every sequence is kept and no
                features are loaded).
            vocab: object providing ``to_seq(line, seq_len)`` and a ``vocab``
                mapping that contains ``'[PAD]'``.
            seq_len: length every returned sequence is padded to.
            time_info_path: pickle file holding the time table; defaults to
                the project's original hard-coded location.

        Raises:
            OSError: if one of the input files cannot be opened.
            ValueError: if a selected label line is not an integer.
        """
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")

        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Line indices whose info row resolved; a set keeps the membership
        # tests below O(1) instead of scanning a list for every line.
        selected_lines = set()

        print("TokenizerwSkillsTimeDataset...............................")

        if self.label_path:
            if time_info_path is None:
                time_info_path = self._DEFAULT_TIME_INFO_PATH
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at a trusted file.
            with open(time_info_path, "rb") as time_file:
                time_df = pickle.load(time_file)
            print("time: ", time_df.shape)

            # Comment this section if you are not using feat attribute
            with open(self.label_path.replace("label", "info"), "r") as info_file:
                dataset_info_file = info_file.readlines()
            print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
            for idex, line in enumerate(dataset_info_file):
                line = line.strip()
                if not line:
                    continue
                try:
                    feat_vec = self._skill_time_features(time_df, line.split(","))
                except Exception as e:
                    # Deliberately broad: a malformed row or a failed table
                    # lookup is reported and the sample is left out.
                    print("................>")
                    print(e)
                    print("Error at index: ", idex)
                    continue

                if not self.feats:
                    # Show the first feature vector once as a sanity check.
                    print(";;;;", len(feat_vec), feat_vec)
                self.feats.append(feat_vec)
                selected_lines.add(idex)

            with open(self.label_path, "r") as label_file:
                for idex, line in enumerate(label_file):
                    line = line.strip()
                    if line and idex in selected_lines:
                        self.labels.append(int(line))

        # Without a label file there is nothing to filter on, so keep every
        # sequence instead of producing an empty dataset.
        keep_all = not self.label_path
        with open(self.dataset_path, "r") as data_file:
            for idex, line in enumerate(data_file):
                line = line.strip()
                if line and (keep_all or idex in selected_lines):
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)

    @classmethod
    def _skill_time_features(cls, time_df, fields):
        """Build one feature vector: skill values followed by time values."""
        feat_vec = [float(i) for i in fields[-9].split("\t")]
        feat2 = [float(i) for i in fields[-8].split("\t")]
        feat_vec.extend(feat2[1:])

        sch, stu, progress, prob_id = fields[0], fields[2], fields[3], fields[4]
        # One table lookup per row; every time column is read from it.
        row = time_df.loc[(sch, stu, progress, prob_id)]
        feat_vec.extend(row[column].item() for column in cls._TIME_COLUMNS)
        return feat_vec

    def __len__(self):
        """Return the number of kept sequences."""
        return self.len

    def _mask_optional_final_answers(self, line):
        """Replace 'FinalAnswer-' tokens seen from the first optional step on with '[UNK]'."""
        masked = []
        in_optional = False
        for token in line.split("\t"):
            if token in self._OPTIONAL_STEPS:
                in_optional = True
            if in_optional and 'FinalAnswer-' in token:
                masked.append('[UNK]')
            else:
                masked.append(token)
        return "\t".join(masked)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Keys: 'input' (padded ids), 'label' (0 when no ``label_path``),
        'feat' (0 when no features were loaded) and 'segment_label'.
        """
        dup_line = self._mask_optional_final_answers(self.lines[item])
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # This is like tokenizer and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1] * len(s1)
        s1_feat = self.feats[item] if self.feats else 0

        # Both lists are padded with the [PAD] id, exactly as before.
        pad_len = max(0, self.seq_len - len(s1))
        padding = [self.vocab.vocab['[PAD]']] * pad_len
        s1.extend(padding)
        segment_label.extend(padding)

        output = {'input': s1,
                  'label': s1_label,
                  'feat': s1_feat,
                  'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}