Commit 81414ba · Parent(s): 0fb0bdd
Preprocess data & train model
- .gitattributes +8 -0
- models/content/dataset_dict/dataset_dict.json +1 -0
- models/content/dataset_dict/train/cache-b56665f85f005b13.arrow +3 -0
- models/content/dataset_dict/train/data-00000-of-00002.arrow +3 -0
- models/content/dataset_dict/train/data-00001-of-00002.arrow +3 -0
- models/content/dataset_dict/train/dataset_info.json +112 -0
- models/content/dataset_dict/train/state.json +16 -0
- models/content/dataset_dict/validation/cache-0fb09a456da0a13c.arrow +3 -0
- models/content/dataset_dict/validation/cache-56a4339a2c8de01a.arrow +3 -0
- models/content/dataset_dict/validation/cache-8107f3f237676f0e.arrow +3 -0
- models/content/dataset_dict/validation/cache-f992378180dfe232.arrow +3 -0
- models/content/dataset_dict/validation/data-00000-of-00001.arrow +3 -0
- models/content/dataset_dict/validation/dataset_info.json +112 -0
- models/content/dataset_dict/validation/state.json +13 -0
- models/preprocessing.py +49 -0
- models/train.py +173 -0
.gitattributes
ADDED
@@ -0,0 +1,8 @@
+models/content/dataset_dict/train/cache-b56665f85f005b13.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/train/data-00000-of-00002.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/train/data-00001-of-00002.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/validation/cache-0fb09a456da0a13c.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/validation/cache-56a4339a2c8de01a.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/validation/cache-8107f3f237676f0e.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/validation/cache-f992378180dfe232.arrow filter=lfs diff=lfs merge=lfs -text
+models/content/dataset_dict/validation/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
models/content/dataset_dict/dataset_dict.json
ADDED
@@ -0,0 +1 @@
+{"splits": ["train", "validation"]}
models/content/dataset_dict/train/cache-b56665f85f005b13.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95c4796d4a7e5d26b19049d858dbde51415ecd026957fb0ee8529b16ec286d55
+size 658677464
models/content/dataset_dict/train/data-00000-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca9f363c02cb2049734a724c1324a27d659e66b4f5fe90e820b6dcbca3caac0f
+size 309302984
models/content/dataset_dict/train/data-00001-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aed3e86a963f7d8f374839818366af5ee654a2793ca5b751214e7dd62aa68d7
+size 326984520
models/content/dataset_dict/train/dataset_info.json
ADDED
@@ -0,0 +1,112 @@
+{
+  "builder_name": "hupd",
+  "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthors={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n",
+  "config_name": "sample",
+  "dataset_size": 1848322042,
+  "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": {
+      "num_bytes": 6665746,
+      "checksum": null
+    },
+    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": {
+      "num_bytes": 387636489,
+      "checksum": null
+    }
+  },
+  "download_size": 394302235,
+  "features": {
+    "patent_number": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "decision": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "abstract": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "claims": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "background": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "summary": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "description": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "cpc_label": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "ipc_label": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "filing_date": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "patent_issue_date": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "date_published": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "examiner_id": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://github.com/suzgunmirac/hupd",
+  "license": "",
+  "size_in_bytes": 2242624277,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 1184126558,
+      "num_examples": 16153,
+      "shard_lengths": [
+        7000,
+        7000,
+        2153
+      ],
+      "dataset_name": "hupd"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 664195484,
+      "num_examples": 9094,
+      "shard_lengths": [
+        7000,
+        2094
+      ],
+      "dataset_name": "hupd"
+    }
+  },
+  "supervised_keys": {
+    "input": "claims",
+    "output": "decision"
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
models/content/dataset_dict/train/state.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00002.arrow"
+    },
+    {
+      "filename": "data-00001-of-00002.arrow"
+    }
+  ],
+  "_fingerprint": "5fe802206a6f6c6f",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
models/content/dataset_dict/validation/cache-0fb09a456da0a13c.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da9135daa052e81c84a26b153b6794de3089413cec1d15e2ea7bd2bc3f407913
+size 367685864
models/content/dataset_dict/validation/cache-56a4339a2c8de01a.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7312bb14fe9365dcc94d783bf6ae32574dcda815a5cffe309b8ea84687989988
+size 367685864
models/content/dataset_dict/validation/cache-8107f3f237676f0e.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47409881486805513c0efdb578039ef5ebd96d6912948565e1174ae45d90e838
+size 367685864
models/content/dataset_dict/validation/cache-f992378180dfe232.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecbe5a4baa1c1f422305e6b7a59da5a5cc6b5133850cc7d3bea2e833873b34f4
+size 367685864
models/content/dataset_dict/validation/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dac0232941a79f4bea7b714d57931bba46e0c67e97d474bb1722107dafa59e4a
+size 355132024
models/content/dataset_dict/validation/dataset_info.json
ADDED
@@ -0,0 +1,112 @@
+{
+  "builder_name": "hupd",
+  "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthors={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n",
+  "config_name": "sample",
+  "dataset_size": 1848322042,
+  "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n",
+  "download_checksums": {
+    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": {
+      "num_bytes": 6665746,
+      "checksum": null
+    },
+    "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": {
+      "num_bytes": 387636489,
+      "checksum": null
+    }
+  },
+  "download_size": 394302235,
+  "features": {
+    "patent_number": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "decision": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "abstract": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "claims": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "background": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "summary": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "description": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "cpc_label": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "ipc_label": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "filing_date": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "patent_issue_date": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "date_published": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "examiner_id": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://github.com/suzgunmirac/hupd",
+  "license": "",
+  "size_in_bytes": 2242624277,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 1184126558,
+      "num_examples": 16153,
+      "shard_lengths": [
+        7000,
+        7000,
+        2153
+      ],
+      "dataset_name": "hupd"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 664195484,
+      "num_examples": 9094,
+      "shard_lengths": [
+        7000,
+        2094
+      ],
+      "dataset_name": "hupd"
+    }
+  },
+  "supervised_keys": {
+    "input": "claims",
+    "output": "decision"
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
models/content/dataset_dict/validation/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "da58beb6a9e5af41",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}
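The dataset_dict tree above is the on-disk layout produced by DatasetDict.save_to_disk in models/preprocessing.py: each split's state.json lists its Arrow shards and fingerprint, while dataset_info.json carries the HUPD features, split sizes, and supervised keys. A minimal sketch of reloading the directory and inspecting that metadata through the datasets API (assuming the /app/models/content/ path used by the scripts):

# Sketch: reload the saved DatasetDict and inspect the metadata recorded in
# dataset_info.json / state.json. The path follows preprocessing.py and train.py.
from datasets import load_from_disk

dataset_dict = load_from_disk('/app/models/content/dataset_dict')
print(dataset_dict)                            # splits and row counts
print(dataset_dict['train'].features)          # HUPD columns: title, abstract, claims, decision, ...
print(dataset_dict['train'].info.description)  # description carried over from dataset_info.json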
models/preprocessing.py
ADDED
@@ -0,0 +1,49 @@
+from datasets import load_dataset
+
+# Initializing global variables
+file_path = '/app/models/content/'
+
+"""## Loading the Dataset
+
+We will be fine-tuning the DistilBERT model on a subset of patents filed in January 2016. For the train-validation
+split, patents filed on or before January 21st, 2016 go into the training set, and patents filed on or after
+January 22nd, 2016 go into the validation set.
+"""
+
+dataset_dict = load_dataset('HUPD/hupd',
+                            name='sample',
+                            data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+                            icpr_label=None,
+                            train_filing_start_date='2016-01-01',
+                            train_filing_end_date='2016-01-21',
+                            val_filing_start_date='2016-01-22',
+                            val_filing_end_date='2016-01-31',
+                            )
+
+print(dataset_dict)
+print(f'Train dataset size: {dataset_dict["train"].shape}')
+print(f'Validation dataset size: {dataset_dict["validation"].shape}')
+
+"""## Pre-Processing Steps
+
+Our model will only predict rejections or acceptances, so we filter out every other decision label
+from the training and validation sets before proceeding.
+"""
+
+# Label-to-index mapping for the decision status field
+decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
+
+# Helper function: replace the textual decision label with its integer index
+def map_decision_to_string(example):
+    return {'decision': decision_to_str[example['decision']]}
+
+# Re-labeling/mapping.
+# Filtering out any decisions that are not 'REJECTED' or 'ACCEPTED'.
+for name in ['train', 'validation']:
+    dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
+    # Remove the pending and CONT-* patent applications
+    dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)
+
+
+# Save the dataset dictionary to disk
+dataset_dict.save_to_disk(file_path + 'dataset_dict')
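As a quick sanity check on the preprocessing above, the saved splits should now contain only the two labels the model is trained on, 0 (REJECTED) and 1 (ACCEPTED). A minimal sketch, assuming the dataset_dict directory written by this script:

# Sketch: confirm that only REJECTED (0) and ACCEPTED (1) decisions survived the filter.
from collections import Counter
from datasets import load_from_disk

dataset_dict = load_from_disk('/app/models/content/dataset_dict')
for split in ['train', 'validation']:
    counts = Counter(dataset_dict[split]['decision'])
    assert set(counts) <= {0, 1}, f'unexpected labels in {split}: {sorted(counts)}'
    print(split, dict(counts))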
models/train.py
ADDED
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+"""Finetuning Language Models - Can I Patent This?.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1x9XfLKvGNBsajOK8rztsZnCoD2ucGfO6
+
+# Finetuning Language Models - Can I Patent This?
+
+Using the [Harvard USPTO patent dataset](https://github.com/suzgunmirac/hupd), we will fine-tune a DistilBERT model
+obtained from Hugging Face to predict whether a patent is accepted or rejected based on its abstract and claims.
+"""
+import gc
+import argparse
+import numpy as np
+
+import torch
+from torch.utils.data import DataLoader
+from torch.optim import AdamW
+
+from datasets import load_from_disk
+from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
+
+
+# Initializing global variables
+file_path = '/app/models/content/'
+decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
+criterion = torch.nn.CrossEntropyLoss()
+
+
+def create_dataloaders(dataset_dict, section):
+    # Initializing the tokenizer
+    model_name = 'distilbert-base-uncased'
+    tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)
+
+    train_set, val_set = dataset_dict['train'], dataset_dict['validation']
+
+    # Training set
+    train_set = train_set.map(
+        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
+        batched=True)
+
+    # Validation set
+    val_set = val_set.map(
+        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
+        batched=True)
+
+    train_set.set_format(type='torch',
+                         columns=['input_ids', 'attention_mask', 'decision'])
+
+    val_set.set_format(type='torch',
+                       columns=['input_ids', 'attention_mask', 'decision'])
+
+    train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
+    val_loader = DataLoader(val_set, batch_size=8, shuffle=False)
+
+    return train_loader, val_loader, tokenizer
+
+
+def measure_accuracy(outputs, labels):
+    # This function accepts a model's outputs and the actual decisions
+    # and returns the number of correct predictions and the number of samples.
+
+    preds = np.argmax(outputs, axis=1).flatten()
+    labels = labels.flatten()
+    correct = np.sum(preds == labels)
+
+    return correct, len(labels)
+
+def validation(model, val_loader):
+    # This function accepts a model and a validation set DataLoader as its parameters
+    # and returns the validation accuracy as a percentage.
+
+    model.eval()
+
+    total_correct = 0
+    total_samples = 0
+
+    for batch in val_loader:
+        input_ids = batch['input_ids'].to(device)
+        labels = batch['decision'].to(device)
+
+        with torch.no_grad():
+            outputs = model(input_ids=input_ids, labels=labels)
+
+        logits = outputs.logits
+        num_correct, num_samples = measure_accuracy(logits.cpu().numpy(), labels.cpu().numpy())
+
+        total_correct += num_correct
+        total_samples += num_samples
+
+        del input_ids, labels, logits
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    return (total_correct / total_samples) * 100
+
+
+def train(device, model, tokenizer, train_loader, val_loader, section):
+    # This function accepts the device, model, tokenizer, training and validation
+    # DataLoaders, and the section name, and returns the trained model.
+
+    model.train()
+
+    # Define optimizer.
+    optim = AdamW(model.parameters(), lr=5e-5)
+    num_epochs = 5
+    best_val_acc = 0
+
+    for epoch in range(num_epochs):
+        for batch in train_loader:
+            optim.zero_grad()
+
+            input_ids = batch['input_ids'].to(device, non_blocking=True)
+            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
+            labels = batch['decision'].to(device, non_blocking=True)
+
+            outputs = model(input_ids, attention_mask=attention_mask, labels=labels).logits
+
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optim.step()
+
+            del input_ids, attention_mask, labels
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        # Calculate validation accuracy.
+        val_acc = validation(model, val_loader)
+
+        # Save the model that yields the best validation accuracy.
+        if best_val_acc < val_acc:
+            best_val_acc = val_acc
+
+            model.save_pretrained(file_path + section + '/')
+            tokenizer.save_pretrained(file_path + section + '_model_tokenizer/')
+
+        model.train()
+
+    return model
+
+
+if __name__ == '__main__':
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--section', type=str)
+
+    args = parser.parse_args()
+    section = args.section
+
+    dataset_dict = load_from_disk(file_path + 'dataset_dict')
+
+    train_loader, val_loader, tokenizer = create_dataloaders(dataset_dict, section)
+
+    del dataset_dict
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # Defining the model.
+    config = DistilBertConfig(num_labels=2, output_hidden_states=False)
+    model = DistilBertForSequenceClassification(config=config)
+    model.to(device)
+
+    # Train the model.
+    model = train(device, model, tokenizer, train_loader, val_loader, section)
+
+    val_acc = validation(model, val_loader)
+
+    print(f'*** Accuracy on the validation set ({section}): {val_acc}')
+
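train.py expects --section to name the HUPD text column fed to the model (for example 'abstract' or 'claims'), tokenizes it, and saves the best checkpoint under /app/models/content/<section>/ with the tokenizer beside it. Two things worth noting: the validation loop does not pass the attention_mask that the training loop uses, and DistilBertForSequenceClassification(config=config) builds a randomly initialized model, so as written the script trains from scratch rather than fine-tuning the pretrained checkpoint the docstring describes. A minimal sketch of the pretrained-weights variant and of reloading a saved checkpoint afterwards (the 'abstract' section name is only an example):

# Sketch: start from the pretrained DistilBERT weights instead of a fresh config,
# then reload the best checkpoint that train.py saves after a run such as
#   python models/train.py --section abstract
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

section = 'abstract'  # example section name; any tokenized text column works
model = DistilBertForSequenceClassification.from_pretrained('/app/models/content/' + section + '/')
tokenizer = DistilBertTokenizer.from_pretrained('/app/models/content/' + section + '_model_tokenizer/')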