Spaces:

ksatzke
/

klsStreamlitTestSpace

Sleeping

App Files Files Community

ksatzke committed on Feb 26

Commit

e711649

verified ·

1 Parent(s): dc2bd70

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -228

app.py CHANGED Viewed

@@ -1,243 +1,30 @@
 import streamlit as st
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import logging
 from typing import List, Dict
 import gc
 import os
-from pathlib import Path
-import io
 import json
-import math
-import statistics
-import sys
-import time
 from datasets import concatenate_datasets, Dataset
 from datasets import load_dataset
 from huggingface_hub import hf_hub_url
-import pandas as pd
-import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
 from evaluate import load
-# 1. record each file name included
-# 1.1 read different file formats depending on parameters (i.e., filetype)
-# 2. determine column types and report how many rows for each type (format check)
-# (in a well-formatted dataset, each column should only have one type)
-# 3. report on the null values
-# 4. for certain column types, report statistics
-# 4.1 uniqueness: if all rows are of a small number of <string> values, treat the column as 'categorical' < 10.
-# 4.2 strings: length ranges
-# 4.3 lists: length ranges
-# 4.3 int/float/double: their percentiles, min, max, mean
-CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
-CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
-PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
-def read_data(all_files, filetype):
-    df = None
-    func_name = ""
-    if filetype in ["parquet", "csv", "json"]:
-        if filetype == "parquet":
-            func_name = pd.read_parquet
-        elif filetype == "csv":
-            func_name = pd.read_csv
-        elif filetype == "json":
-            func_name = pd.read_json
-        df = pd.concat(func_name(f) for f in all_files)
-    elif filetype == "arrow":
-        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
-        df = pd.DataFrame(data=ds)
-    elif filetype == "jsonl":
-        func_name = pd.read_json
-        all_lines = []
-        for fname in all_files:
-            with open(fname, "r") as f:
-                all_lines.extend(f.readlines())
-        df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])
-    return df
-def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
-    cell_length_ranges = {}
-    cell_length_ranges = {}
-    string_categorical = {}
-    # this is probably a 'categorical' (i.e., 'classes' in HuggingFace) value
-    # with few unique items (need to check that while reading the cell),
-    # so no need to treat it as a normal string
-    if len(cell_unique_string_values) > 0 and len(cell_unique_string_values) <= 10:
-        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
-    elif cell_lengths:
-        cell_lengths = sorted(cell_lengths)
-        min_val = cell_lengths[0]
-        max_val = cell_lengths[-1]
-        distance = math.ceil((max_val - min_val) / 10.0)
-        ranges = []
-        if min_val != max_val:
-            for j in range(min_val, max_val, distance):
-                ranges.append(j)
-            for j in range(len(ranges)-1):
-                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j+1])] = 0
-            ranges.append(max_val)
-            j = 1
-            c = 0
-            for k in cell_lengths:
-                if j == len(ranges):
-                    c += 1
-                elif k < ranges[j]:
-                    c += 1
-                else:
-                    cell_length_ranges[str(ranges[j-1]) + "-" + str(ranges[j])] = c
-                    j += 1
-                    c = 1
-            cell_length_ranges[str(ranges[j-1]) + "-" + str(max_val)] = c
-        else:
-            ranges = [min_val]
-            c = 0
-            for k in cell_lengths:
-                c += 1
-            cell_length_ranges[str(min_val)] = c
-    return cell_length_ranges, string_categorical
-def _compute_percentiles(values, percentiles=PERCENTILES):
-    result = {}
-    quantiles = statistics.quantiles(values, n=max(PERCENTILES)+1, method='inclusive')
-    for p in percentiles:
-        result[p/10] = quantiles[p-1]
-    return result
-def compute_cell_value_statistics(cell_values):
-    stats = {}
-    if cell_values:
-        cell_values = sorted(cell_values)
-        stats["min"] = cell_values[0]
-        stats["max"] = cell_values[-1]
-        stats["mean"] = statistics.mean(cell_values)
-        stats["stdev"] = statistics.stdev(cell_values)
-        stats["variance"] = statistics.variance(cell_values)
-        stats["percentiles"] = _compute_percentiles(cell_values)
-    return stats
-def check_null(cell, cell_type):
-    if cell_type == "<class 'float'>":
-        if math.isnan(cell):
-            return True
-    elif cell is None:
-        return True
-    return False
-def compute_property(data_path, glob, filetype):
-    output = {}
-    data_dir = Path(data_path)
-    filenames = []
-    all_files = list(data_dir.glob(glob))
-    for f in all_files:
-        print(str(f))
-        base_fname = str(f)[len(str(data_path)):]
-        if not data_path.endswith("/"):
-            base_fname = base_fname[1:]
-        filenames.append(base_fname)
-    output["filenames"] = filenames
-    df = read_data(all_files, filetype)
-    column_info = {}
-    for col_name in df.columns:
-        if col_name not in column_info:
-            column_info[col_name] = {}
-        cell_types = {}
-        cell_lengths = {}
-        cell_unique_string_values = {}
-        cell_values = {}
-        null_count = 0
-        col_values = df[col_name].to_list()
-        for cell in col_values:
-        # for index, row in df.iterrows():
-        #     cell = row[col_name]
-            cell_type = str(type(cell))
-            cell_type = str(type(cell))
-            # print(cell, cell_type)
-            if check_null(cell, cell_type):
-                null_count += 1
-                continue
-            if cell_type not in cell_types:
-                cell_types[cell_type] = 1
-            else:
-                cell_types[cell_type] += 1
-            if cell_type in CELL_TYPES_LENGTH:
-                cell_length = len(cell)
-                if cell_type not in cell_lengths:
-                    cell_lengths[cell_type] = []
-                cell_lengths[cell_type].append(cell_length)
-                if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
-                    cell_unique_string_values[cell] = True
-            elif cell_type in CELL_TYPES_NUMERIC:
-                if cell_type not in cell_values:
-                    cell_values[cell_type] = []
-                cell_values[cell_type].append(cell)
-            else:
-                print(cell_type)
-        clrs = {}
-        ccs = {}
-        for cell_type in CELL_TYPES_LENGTH:
-            if cell_type in cell_lengths:
-                clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
-                clrs[cell_type] = clr
-                ccs[cell_type] = cc
-        css = {}
-        for cell_type in CELL_TYPES_NUMERIC:
-            if cell_type in cell_values:
-                cell_stats = compute_cell_value_statistics(cell_values[cell_type])
-                css[cell_type] = cell_stats
-        column_info[col_name]["cell_types"] = cell_types
-        column_info[col_name]["cell_length_ranges"] = clrs
-        column_info[col_name]["cell_categories"] = ccs
-        column_info[col_name]["cell_stats"] = css
-        column_info[col_name]["cell_missing"] = null_count
-    output["column_info"] = column_info
-    output["number_of_items"] = len(df)
-    output["timestamp"] = time.time()
-    return output
 def preprocess_function(examples):
     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
@@ -280,10 +67,9 @@ if __name__ == "__main__":
     st.title("Hugging Face Model Evaluation Demo")
-    with st.form("my_form"):
         # Create an input text box
-        #input_text = st.text_area("Enter model and dataset identifiers", "")
         dataset_name = st.text_input("Enter dataset identifier", "")
         model_checkpoint = st.text_input("Enter model identifier", "")
         # Every form must have a submit button.
@@ -292,12 +78,12 @@ if __name__ == "__main__":
         if submitted:
             print(dataset_name, model_checkpoint)
             # hardcode input data
-            model_checkpoint = "sgugger/glue-mrpc"
-            dataset_name = "nyu-mll/glue"
             metric = load("glue", "mrpc")
             tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-            in_container = False
             model_checkpoint = model_checkpoint
             raw_datasets = load_dataset(dataset_name, "mrpc")
             metric = load("glue", "mrpc")

 import streamlit as st
 import gradio as gr
+import torch
 import logging
 from typing import List, Dict
 import gc
 import os
+import pandas as pd
+import numpy as np
+#from pathlib import Path
+#import io
 import json
+#import math
+#import statistics
+#import sys
+#import time
 from datasets import concatenate_datasets, Dataset
 from datasets import load_dataset
 from huggingface_hub import hf_hub_url
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
 from evaluate import load
 def preprocess_function(examples):
     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
     st.title("Hugging Face Model Evaluation Demo")
+    with st.form("my_st_form"):
         # Create an input text box
         dataset_name = st.text_input("Enter dataset identifier", "")
         model_checkpoint = st.text_input("Enter model identifier", "")
         # Every form must have a submit button.
         if submitted:
             print(dataset_name, model_checkpoint)
             # hardcode input data
+            #model_checkpoint = "sgugger/glue-mrpc"
+            #dataset_name = "nyu-mll/glue"
             metric = load("glue", "mrpc")
             tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+            #in_container = False
             model_checkpoint = model_checkpoint
             raw_datasets = load_dataset(dataset_name, "mrpc")
             metric = load("glue", "mrpc")