ksatzke committed (verified)
Commit 3485f9a · 1 Parent(s): e6a0876

Delete app.py

Files changed (1)
app.py +0 -299
app.py DELETED
@@ -1,299 +0,0 @@
- from pathlib import Path
- import json
- import math
- import statistics
- import sys
- import time
-
- from datasets import concatenate_datasets, Dataset, load_dataset
-
- import pandas as pd
- import numpy as np
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
- from evaluate import load
-
-
- # 1. record each file name included
- # 1.1 read different file formats depending on parameters (i.e., filetype)
- # 2. determine column types and report how many rows hold each type (format check)
- #    (in a well-formatted dataset, each column should only have one type)
- # 3. report on the null values
- # 4. for certain column types, report statistics
- # 4.1 uniqueness: if all rows are drawn from a small set (< 10) of <string> values, treat the column as 'categorical'
- # 4.2 strings: length ranges
- # 4.3 lists: length ranges
- # 4.4 int/float/double: percentiles, min, max, mean
-
- CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
- CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
-
- # permille cut points: 1 -> 0.1th percentile, 500 -> median, 999 -> 99.9th percentile
- PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
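-
- # For illustration (hypothetical values, not produced by this commit), the
- # per-column report assembled below looks like
- # {"cell_types": {"<class 'str'>": 3668},
- #  "cell_length_ranges": {"<class 'str'>": {"38-235": 1205, ...}},
- #  "cell_categories": {}, "cell_stats": {}, "cell_missing": 0}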
-
- def read_data(all_files, filetype):
-     df = None
-
-     if filetype in ["parquet", "csv", "json"]:
-         if filetype == "parquet":
-             read_func = pd.read_parquet
-         elif filetype == "csv":
-             read_func = pd.read_csv
-         elif filetype == "json":
-             read_func = pd.read_json
-
-         df = pd.concat(read_func(f) for f in all_files)
-
-     elif filetype == "arrow":
-         ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
-         df = pd.DataFrame(data=ds)
-
-     elif filetype == "jsonl":
-         all_lines = []
-         for fname in all_files:
-             with open(fname, "r") as f:
-                 all_lines.extend(f.readlines())
-
-         df = pd.DataFrame([json.loads(line) for line in all_lines])
-
-     return df
-
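- # Example (hypothetical paths): read every JSONL shard under data/ with
- # read_data(sorted(Path("data").glob("*.jsonl")), "jsonl")
-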
- def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
-     cell_length_ranges = {}
-     string_categorical = ""
-     if 0 < len(cell_unique_string_values) <= 10:
-         # with at most 10 unique values this is probably a 'categorical'
-         # (i.e., 'classes' in HuggingFace) column, so no need to treat it
-         # as a normal string
-         string_categorical = str(len(cell_unique_string_values)) + " class(es)"
-
-     elif cell_lengths:
-         cell_lengths = sorted(cell_lengths)
-         min_val = cell_lengths[0]
-         max_val = cell_lengths[-1]
-         if min_val != max_val:
-             # split [min_val, max_val] into at most ten equal-width buckets
-             distance = math.ceil((max_val - min_val) / 10.0)
-             ranges = list(range(min_val, max_val, distance))
-             for j in range(len(ranges) - 1):
-                 cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
-             ranges.append(max_val)
-
-             # walk the sorted lengths once, flushing each bucket's count as we
-             # pass its upper bound
-             j = 1
-             c = 0
-             for k in cell_lengths:
-                 while j < len(ranges) - 1 and k >= ranges[j]:
-                     cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
-                     j += 1
-                     c = 0
-                 c += 1
-
-             cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c
-
-         else:
-             # all cells have the same length, so report a single bucket
-             cell_length_ranges[str(min_val)] = len(cell_lengths)
-
-     return cell_length_ranges, string_categorical
-
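- # Example (hypothetical): lengths 1..100 bucket into ten equal-width ranges,
- # {"1-11": 10, "11-21": 10, ..., "91-100": 10}; a string column with only three
- # unique values instead yields the categorical label "3 class(es)".
-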
- def _compute_percentiles(values, percentiles=PERCENTILES):
-     # 'percentiles' holds permille cut points (500 -> median); result keys are
-     # percent values (500 is reported as 50.0)
-     result = {}
-     quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method='inclusive')
-     for p in percentiles:
-         result[p / 10] = quantiles[p - 1]
-     return result
-
- def compute_cell_value_statistics(cell_values):
-     stats = {}
-     if cell_values:
-         cell_values = sorted(cell_values)
-
-         stats["min"] = cell_values[0]
-         stats["max"] = cell_values[-1]
-         stats["mean"] = statistics.mean(cell_values)
-         if len(cell_values) > 1:
-             # stdev, variance and quantiles all need at least two data points
-             stats["stdev"] = statistics.stdev(cell_values)
-             stats["variance"] = statistics.variance(cell_values)
-             stats["percentiles"] = _compute_percentiles(cell_values)
-
-     return stats
-
- def check_null(cell, cell_type):
-     if cell_type == "<class 'float'>":
-         # missing values in numeric columns surface as NaN
-         return math.isnan(cell)
-     return cell is None
-
- def compute_property(data_path, glob, filetype):
-     output = {}
-
-     data_dir = Path(data_path)
-
-     # record each matched file, with its path relative to data_path
-     filenames = []
-     all_files = list(data_dir.glob(glob))
-     for f in all_files:
-         print(str(f))
-         base_fname = str(f)[len(str(data_path)):]
-         if not data_path.endswith("/"):
-             base_fname = base_fname[1:]
-         filenames.append(base_fname)
-
-     output["filenames"] = filenames
-
-     df = read_data(all_files, filetype)
-
-     column_info = {}
-
-     for col_name in df.columns:
-         column_info[col_name] = {}
-
-         cell_types = {}
-         cell_lengths = {}
-         cell_unique_string_values = {}
-         cell_values = {}
-         null_count = 0
-         for cell in df[col_name].to_list():
-             cell_type = str(type(cell))
-             if check_null(cell, cell_type):
-                 null_count += 1
-                 continue
-
-             cell_types[cell_type] = cell_types.get(cell_type, 0) + 1
-
-             if cell_type in CELL_TYPES_LENGTH:
-                 cell_lengths.setdefault(cell_type, []).append(len(cell))
-                 if cell_type == "<class 'str'>":
-                     cell_unique_string_values[cell] = True
-
-             elif cell_type in CELL_TYPES_NUMERIC:
-                 cell_values.setdefault(cell_type, []).append(cell)
-
-             else:
-                 print("unexpected cell type:", cell_type)
-
-         clrs = {}
-         ccs = {}
-         for cell_type in CELL_TYPES_LENGTH:
-             if cell_type in cell_lengths:
-                 # only string columns can be categorical; pass an empty dict for lists
-                 uniques = cell_unique_string_values if cell_type == "<class 'str'>" else {}
-                 clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], uniques)
-                 clrs[cell_type] = clr
-                 ccs[cell_type] = cc
-
-         css = {}
-         for cell_type in CELL_TYPES_NUMERIC:
-             if cell_type in cell_values:
-                 css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])
-
-         column_info[col_name]["cell_types"] = cell_types
-         column_info[col_name]["cell_length_ranges"] = clrs
-         column_info[col_name]["cell_categories"] = ccs
-         column_info[col_name]["cell_stats"] = css
-         column_info[col_name]["cell_missing"] = null_count
-
-     output["column_info"] = column_info
-     output["number_of_items"] = len(df)
-     output["timestamp"] = time.time()
-
-     return output
-
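- # Example (hypothetical layout): compute_property("data/", "**/*.parquet", "parquet")
- # returns a JSON-serializable dict with "filenames", "column_info",
- # "number_of_items" and "timestamp" keys.
-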
- # note: these two callbacks rely on the module-level `tokenizer` and `metric`
- # globals set in __main__ below
- def preprocess_function(examples):
-     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
-
- def compute_metrics(eval_pred):
-     predictions, labels = eval_pred
-     predictions = np.argmax(predictions, axis=1)
-     return metric.compute(predictions=predictions, references=labels)
-
- def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
-     tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
-     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
-     batch_size = 16
-     args = TrainingArguments(
-         "test-glue",
-         evaluation_strategy="epoch",
-         learning_rate=5e-5,
-         seed=42,
-         lr_scheduler_type="linear",
-         per_device_train_batch_size=batch_size,
-         per_device_eval_batch_size=batch_size,
-         num_train_epochs=3,
-         weight_decay=0.01,
-         load_best_model_at_end=False,
-         metric_for_best_model="accuracy",
-         report_to="none"
-     )
-
-     trainer = Trainer(
-         model,
-         args,
-         train_dataset=tokenized_datasets["train"],
-         eval_dataset=tokenized_datasets["validation"],
-         tokenizer=tokenizer,
-         compute_metrics=compute_metrics
-     )
-     # no training here: evaluate the checkpoint as-is
-     return trainer.evaluate()
-
-
- if __name__ == "__main__":
-
-     # both branches currently force in_container to False; the container path
-     # below is kept for when this script runs inside the evaluation image
-     if len(sys.argv) > 1:
-         model_checkpoint = sys.argv[1]
-         dataset_name = sys.argv[2]
-         metric = sys.argv[3]
-         in_container = False
-     else:
-         model_checkpoint = "sgugger/glue-mrpc"
-         dataset_name = "nyu-mll/glue"
-         metric = ["glue", "mrpc"]
-         in_container = False
-
-     print(model_checkpoint, dataset_name, metric)
-
-     # note: the metric chosen above is overridden by the hard-coded GLUE/MRPC pair
-     raw_datasets = load_dataset(dataset_name, "mrpc")
-     metric = load("glue", "mrpc")
-     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-     output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
-     print(json.dumps(output))
-
-     if in_container:
-         with open("/tmp/outputs/computation_result.json", "w") as f:
-             json.dump(output, f, indent=4, sort_keys=True)
-     else:
-         print(json.dumps(output, indent=4, sort_keys=True))
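
For reference, a minimal sketch of how the deleted script was invoked (the positional arguments follow the sys.argv handling above; "accuracy" is a placeholder metric name, not confirmed by the source):

    python app.py sgugger/glue-mrpc nyu-mll/glue accuracy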