ksatzke committed
Commit 9cc9a99 · verified · 1 Parent(s): 1218270

Update app.py

Files changed (1): app.py (+271, -0)
app.py CHANGED
@@ -339,6 +339,277 @@ class GradioInterface:
 
         return demo
 
+ from pathlib import Path
+ import io
+ import json
+ import math
+ import statistics
+ import sys
+ import time
+
+ from datasets import concatenate_datasets, Dataset
+ from datasets import load_dataset
+
+ from huggingface_hub import hf_hub_url
+
+ import pandas as pd
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+ from evaluate import load
+
+
+ # 1. record each file name included
+ # 1.1 read different file formats depending on parameters (i.e., filetype)
+ # 2. determine column types and report how many rows there are of each type (format check)
+ #    (in a well-formatted dataset, each column should only have one type)
+ # 3. report on the null values
+ # 4. for certain column types, report statistics
+ # 4.1 uniqueness: if a <string> column has only a few unique values (<= 10), treat it as 'categorical'
+ # 4.2 strings: length ranges
+ # 4.3 lists: length ranges
+ # 4.4 int/float/double: percentiles, min, max, mean
+
+ CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
+ CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
+
+ # per-mille points: 1 -> 0.1th percentile, 500 -> median, 999 -> 99.9th percentile
+ PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
+
+ def read_data(all_files, filetype):
+     df = None
+
+     if filetype in ["parquet", "csv", "json"]:
+         if filetype == "parquet":
+             func_name = pd.read_parquet
+         elif filetype == "csv":
+             func_name = pd.read_csv
+         elif filetype == "json":
+             func_name = pd.read_json
+
+         df = pd.concat(func_name(f) for f in all_files)
+
+     elif filetype == "arrow":
+         ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
+         df = pd.DataFrame(data=ds)
+
+     elif filetype == "jsonl":
+         all_lines = []
+         for fname in all_files:
+             with open(fname, "r") as f:
+                 all_lines.extend(f.readlines())
+
+         df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])
+
+     return df
+
+ def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
+     cell_length_ranges = {}
+     string_categorical = {}
+     # a string column with only a handful of unique values is probably
+     # 'categorical' (i.e., 'classes' in HuggingFace terms),
+     # so no need to treat it as a normal string
+     if 0 < len(cell_unique_string_values) <= 10:
+         string_categorical = str(len(cell_unique_string_values)) + " class(es)"
+
+     elif cell_lengths:
+         cell_lengths = sorted(cell_lengths)
+         min_val = cell_lengths[0]
+         max_val = cell_lengths[-1]
+         # split the observed length span into ~10 equal-width buckets
+         distance = math.ceil((max_val - min_val) / 10.0)
+         ranges = []
+         if min_val != max_val:
+             for j in range(min_val, max_val, distance):
+                 ranges.append(j)
+             for j in range(len(ranges) - 1):
+                 cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
+             ranges.append(max_val)
+
+             # walk the sorted lengths once, counting how many fall in each bucket
+             j = 1
+             c = 0
+             for k in cell_lengths:
+                 if j == len(ranges):
+                     c += 1
+                 elif k < ranges[j]:
+                     c += 1
+                 else:
+                     cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
+                     j += 1
+                     c = 1
+
+             cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c
+
+         else:
+             # all lengths are identical: a single degenerate bucket
+             ranges = [min_val]
+             c = 0
+             for k in cell_lengths:
+                 c += 1
+             cell_length_ranges[str(min_val)] = c
+
+     return cell_length_ranges, string_categorical
+
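+ # illustrative example (values made up, not part of the commit): a string
+ # column that only ever contains "pos" and "neg" is reported as categorical:
+ # compute_cell_length_ranges([3, 3], {"pos": True, "neg": True}) == ({}, "2 class(es)")
+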
+ def _compute_percentiles(values, percentiles=PERCENTILES):
+     result = {}
+     # 999 cut points at per-mille steps, so quantiles[p-1] is the p-th per-mille value
+     quantiles = statistics.quantiles(values, n=max(PERCENTILES) + 1, method='inclusive')
+     for p in percentiles:
+         # keys are percentiles expressed in percent (e.g., 500 -> 50.0)
+         result[p / 10] = quantiles[p - 1]
+     return result
+
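+ # illustrative (not part of the commit): for the values 1..1000,
+ # _compute_percentiles(list(range(1, 1001)))[50.0] == 500.5 -- the key is the
+ # percentile in percent, the value the interpolated cut point (here the median)
+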
+ def compute_cell_value_statistics(cell_values):
+     stats = {}
+     if cell_values:
+         cell_values = sorted(cell_values)
+
+         stats["min"] = cell_values[0]
+         stats["max"] = cell_values[-1]
+         stats["mean"] = statistics.mean(cell_values)
+         # stdev, variance, and quantiles all require at least two data points
+         if len(cell_values) > 1:
+             stats["stdev"] = statistics.stdev(cell_values)
+             stats["variance"] = statistics.variance(cell_values)
+             stats["percentiles"] = _compute_percentiles(cell_values)
+
+     return stats
+
+ def check_null(cell, cell_type):
+     if cell_type == "<class 'float'>":
+         if math.isnan(cell):
+             return True
+     elif cell is None:
+         return True
+     return False
+
+ def compute_property(data_path, glob, filetype):
+     output = {}
+
+     data_dir = Path(data_path)
+
+     filenames = []
+     all_files = list(data_dir.glob(glob))
+     for f in all_files:
+         print(str(f))
+         # record the path relative to data_path
+         base_fname = str(f)[len(str(data_path)):]
+         if not data_path.endswith("/"):
+             base_fname = base_fname[1:]
+         filenames.append(base_fname)
+
+     output["filenames"] = filenames
+
+     df = read_data(all_files, filetype)
+
+     column_info = {}
+
+     for col_name in df.columns:
+         if col_name not in column_info:
+             column_info[col_name] = {}
+
+         cell_types = {}
+
+         cell_lengths = {}
+         cell_unique_string_values = {}
+         cell_values = {}
+         null_count = 0
+         col_values = df[col_name].to_list()
+         for cell in col_values:
+             cell_type = str(type(cell))
+             if check_null(cell, cell_type):
+                 null_count += 1
+                 continue
+
+             if cell_type not in cell_types:
+                 cell_types[cell_type] = 1
+             else:
+                 cell_types[cell_type] += 1
+
+             if cell_type in CELL_TYPES_LENGTH:
+                 cell_length = len(cell)
+                 if cell_type not in cell_lengths:
+                     cell_lengths[cell_type] = []
+
+                 cell_lengths[cell_type].append(cell_length)
+                 if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
+                     cell_unique_string_values[cell] = True
+
+             elif cell_type in CELL_TYPES_NUMERIC:
+                 if cell_type not in cell_values:
+                     cell_values[cell_type] = []
+
+                 cell_values[cell_type].append(cell)
+
+             else:
+                 # neither a length-typed nor a numeric cell: just log its type
+                 print(cell_type)
+
+         clrs = {}
+         ccs = {}
+         for cell_type in CELL_TYPES_LENGTH:
+             if cell_type in cell_lengths:
+                 clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
+                 clrs[cell_type] = clr
+                 ccs[cell_type] = cc
+
+         css = {}
+         for cell_type in CELL_TYPES_NUMERIC:
+             if cell_type in cell_values:
+                 cell_stats = compute_cell_value_statistics(cell_values[cell_type])
+                 css[cell_type] = cell_stats
+
+         column_info[col_name]["cell_types"] = cell_types
+         column_info[col_name]["cell_length_ranges"] = clrs
+         column_info[col_name]["cell_categories"] = ccs
+         column_info[col_name]["cell_stats"] = css
+         column_info[col_name]["cell_missing"] = null_count
+
+     output["column_info"] = column_info
+     output["number_of_items"] = len(df)
+     output["timestamp"] = time.time()
+
+     return output
+
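+ # illustrative output shape (all values made up, not part of the commit):
+ # {"filenames": ["train.parquet"],
+ #  "column_info": {"text": {"cell_types": {"<class 'str'>": 100},
+ #                           "cell_length_ranges": {...}, "cell_categories": {...},
+ #                           "cell_stats": {...}, "cell_missing": 0}},
+ #  "number_of_items": 100, "timestamp": 1700000000.0}
+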
+ # note: preprocess_function and compute_metrics read the module-level
+ # `tokenizer` and `metric`, so both must be assigned before they are used
+ def preprocess_function(examples):
+     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     predictions = np.argmax(predictions, axis=1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
+     tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
+     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
+     batch_size = 16
+     args = TrainingArguments(
+         "test-glue",
+         evaluation_strategy="epoch",
+         learning_rate=5e-5,
+         seed=42,
+         lr_scheduler_type="linear",
+         per_device_train_batch_size=batch_size,
+         per_device_eval_batch_size=batch_size,
+         num_train_epochs=3,
+         weight_decay=0.01,
+         load_best_model_at_end=False,
+         metric_for_best_model="accuracy",
+         report_to="none"
+     )
+
+     trainer = Trainer(
+         model,
+         args,
+         train_dataset=tokenized_datasets["train"],
+         eval_dataset=tokenized_datasets["validation"],
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics
+     )
+     result = trainer.evaluate()
+     return result
+
+
 def main():
     try:
         interface = GradioInterface()
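
For reference, a minimal usage sketch of the helpers this commit adds. The data directory, glob pattern, and model checkpoint below are illustrative assumptions, not part of the commit; note that preprocess_function and compute_metrics read `tokenizer` and `metric` from module scope, so both must be assigned before compute_model_card_evaluation_results is called:

    import json

    # profile a local directory of Parquet files (path and pattern are assumptions)
    report = compute_property("data", "*.parquet", "parquet")
    print(json.dumps(report, indent=2, default=str))

    # evaluate a checkpoint on GLUE/MRPC, which has the sentence1/sentence2
    # columns preprocess_function expects (checkpoint name is an assumption)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")
    metric = load("glue", "mrpc")
    print(compute_model_card_evaluation_results(tokenizer, "bert-base-cased", raw_datasets, metric))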