import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import pandas as pd


# path to the fine-tuned patent-acceptance classifier on the Hugging Face Hub
language_model_path = "juliaannjose/finetuned_model"

# load the HUPD sample split so the UI can use the
# patent number, abstract, and claims columns
with st.spinner("Loading..."):
    dataset_dict = load_dataset(
        "HUPD/hupd",
        name="sample",
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date="2016-01-01",
        train_filing_end_date="2016-01-21",
        val_filing_start_date="2016-01-22",
        val_filing_end_date="2016-01-31",
    )
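    # convert both splits to dataframes and combine them for the UI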
    df_train = pd.DataFrame(dataset_dict["train"])
    df_val = pd.DataFrame(dataset_dict["validation"])
    df = pd.concat([df_train, df_val], ignore_index=True)


# drop-down menu of patent numbers
_patent_id = st.selectbox(
    "Select the Patent Number",
    options=df["patent_number"],
)


# look up the abstract and claims for the selected patent number
def get_abs_claim(_pid):
    # rows matching this patent number; the selectbox guarantees a match
    row = df.loc[df["patent_number"] == _pid]
    return row["abstract"].values[0], row["claims"].values[0]


_abstract, _claim = get_abs_claim(_patent_id)
st.title("Abstract:")  # display abstract
st.write(_abstract)
st.title("Claim:")  # display claims
st.write(_claim)


# model and tokenizer initialization (cached so they are loaded only once)
@st.cache_resource
def load_model(language_model_path):
    tokenizer = AutoTokenizer.from_pretrained(language_model_path)
    model = AutoModelForSequenceClassification.from_pretrained(language_model_path)
    return tokenizer, model


tokenizer, model = load_model(language_model_path)
# model input: abstract and claims concatenated into a single string
input_text = _abstract + _claim
# tokenize, truncating to the model's maximum length and returning PyTorch tensors
inputs = tokenizer(
    input_text,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# label mapping for the binary classifier
id2label = {0: "REJECTED", 1: "ACCEPTED"}
# when the Submit button is clicked, run the model and display the result
if st.button("Submit"):
    with torch.no_grad():
        outputs = model(**inputs)
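        # convert logits to class probabilities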
        probability = torch.nn.functional.softmax(outputs.logits, dim=1)

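    # pick the class with the higher probability and map it to its label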
    predicted_class_id = probability.argmax().item()
    pred_label = id2label[predicted_class_id]
    st.title("Predicted Patentability")
    if probability[0][0] > probability[0][1]:
        st.write("Rejection Score:")
        st.write(probability[0][0].item())
    else:
        st.write("Acceptance Score:")
        st.write(probability[0][1].item())
    st.write("Result:", pred_label)