Spaces:

minemile
/

arxiv-tag-classification

Sleeping

App Files Files Community

evstifeev.stepan commited on Apr 7

Commit

920d148

0 Parent(s):

init

Browse files

Files changed (4) hide show

.gitattributes +35 -0
README.md +17 -0
app.py +177 -0
requirements.txt +4 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+title: Arxiv Tag Classifier
+emoji: 🌖
+colorFrom: green
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.44.1
+app_file: app.py
+pinned: false
+license: mit
+short_description: Classify ArXiv paper with ease
+---
+# ArXiv Paper Classifier
+A Streamlit web application to predict subject category (or categories) for scientific papers from
+[arXiv](https://arxiv.org/), using a fine-tuned transformer model.

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import logging
+import numpy as np
+import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+MAPPING_FROM_TAG_TO_CATEGORY = {
+    "cs.AI": ["Computer Science", "Artificial Intelligence"],
+    "cs.CL": ["Computer Science", "Computation and Language"],
+    "cs.CV": ["Computer Science", "Computer Vision and Pattern Recognition"],
+    "cs.NE": ["Computer Science", "Neural and Evolutionary Computing"],
+    "stat.ML": ["Statistics", "Machine Learning"],
+    "cs.LG": ["Computer Science", "Machine Learning"],
+    "physics.soc-ph": ["Physics", "Physics and Society"],
+    "stat.AP": ["Statistics", "Applications"],
+    "cs.RO": ["Computer Science", "Robotics"],
+    "cs.MA": ["Computer Science", "Multiagent Systems"],
+    "math.OC": ["Mathematics", "Optimization and Control"],
+    "cs.IR": ["Computer Science", "Information Retrieval"],
+    "stat.ME": ["Statistics", "Methodology"],
+    "cs.DC": ["Computer Science", "Distributed, Parallel, and Cluster Computing"],
+    "stat.CO": ["Statistics", "Computation"],
+    "q-bio.NC": ["Quantitative Biology", "Neurons and Cognition"],
+    "cs.GT": ["Computer Science", "Computer Science and Game Theory"],
+    "cs.MM": ["Computer Science", "Multimedia"],
+    "cs.CR": ["Computer Science", "Cryptography and Security"],
+    "cs.HC": ["Computer Science", "Human-Computer Interaction"],
+    "cs.SD": ["Computer Science", "Sound"],
+    "cs.GR": ["Computer Science", "Graphics"],
+    "cs.CY": ["Computer Science", "Computers and Society"],
+    "math.ST": ["Mathematics", "Statistics Theory"],
+    "stat.TH": ["Statistics", "Statistics Theory"],
+    "cs.IT": ["Computer Science", "Information Theory"],
+    "math.IT": ["Mathematics", "Information Theory"],
+    "cs.SI": ["Computer Science", "Social and Information Networks"],
+    "cs.DB": ["Computer Science", "Databases"],
+    "cs.LO": ["Computer Science", "Logic in Computer Science"],
+    "cs.SY": ["Computer Science", "Systems and Control"],
+    "q-bio.QM": ["Quantitative Biology", "Quantitative Methods"],
+    "cs.DS": ["Computer Science", "Data Structures and Algorithms"],
+    "cs.NA": ["Computer Science", "Numerical Analysis"],
+    "cs.CE": ["Computer Science", "Computational Engineering, Finance, and Science"],
+}
+MODEL_PATH = "minemile/arxiv-tag-classifier"
+MODEL_MAX_LENGTH = 512
+CUM_PROB_THRESHOLD = 0.95
+@st.cache_resource
+def load_model_and_tokenizer(model_path):
+    logger.info("Loading model and tokenizer from %s", model_path)
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_path, device_map="cpu"
+        )
+        model.eval()
+        logger.info("Model and tokenizer loaded successfully.")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"An error occurred during prediction: {e}")
+        return None, None
+def inference(model, tokenizer, title, summary=None):
+    if not model or not tokenizer:
+        st.error("Model or tokenizer not loaded. Cannot predict.")
+        return None, None
+    combined_text = title
+    if summary is not None:
+        combined_text += " " + summary
+    logger.info("Predicting for text: %s", combined_text)
+    try:
+        tokenized_inputs = tokenizer(
+            combined_text,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=MODEL_MAX_LENGTH,
+        )
+        with torch.no_grad():
+            logits = model(**tokenized_inputs).logits
+            probabilities = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
+        return probabilities
+    except Exception as e:
+        st.error(f"An error occurred during prediction: {e}")
+        return None, None
+def prepare_output(probabilities, id2label):
+    top_indices = np.argsort(probabilities)[::-1]
+    cum_sum = 0.0
+    top_tags = []
+    top_category = []
+    top_subcategory = []
+    top_probas = []
+    for indx in top_indices:
+        tag = id2label[indx]
+        if tag not in MAPPING_FROM_TAG_TO_CATEGORY:
+            top_tags.append(f"{tag}")
+            top_category.append("Unknown")
+            top_subcategory.append("Unknown")
+            logger.warning("Tag %s not found in mapping from tag to category.", tag)
+        else:
+            top_tags.append(f"{tag}")
+            top_category.append(MAPPING_FROM_TAG_TO_CATEGORY[tag][0])
+            top_subcategory.append(MAPPING_FROM_TAG_TO_CATEGORY[tag][1])
+        top_probas.append(f"{probabilities[indx]*100:.2f}%")
+        cum_sum += probabilities[indx]
+        if cum_sum >= CUM_PROB_THRESHOLD or len(top_tags) >= 5:
+            break
+    return {
+        "Tag": top_tags,
+        "Category": top_category,
+        "Subcategory": top_subcategory,
+        "Probability": top_probas,
+    }
+def main():
+    st.set_page_config(page_title="ArXiv Category Tag Classifier", layout="wide")
+    st.title("ArXiv Category Tag Classifier")
+    with st.spinner("Loading model and tokenizer..."):
+        model, tokenizer = load_model_and_tokenizer(MODEL_PATH)
+    if model is None or tokenizer is None:
+        st.error("Failed to load model/tokenizer.")
+        return
+    st.markdown(
+        f"Enter the title (required) and summary (abstract) of an ArXiv paper to predict its "
+        f"ArXiv category using a transformer model. There are {len(model.config.id2label)} available categories."
+    )
+    st.divider()
+    col1, col2 = st.columns(2)
+    with col1:
+        title = st.text_input(
+            "Title",
+            placeholder="Enter the title of the paper (Required)",
+        )
+    with col2:
+        paper_summary = st.text_area(
+            "Paper Summary (Optional):",
+            placeholder="Paste the paper's abstract here. It can increase the accuracy of the prediction.",
+            height=180,
+        )
+    predict_button = st.button("Predict Category", type="primary")
+    st.markdown("---")
+    if predict_button:
+        if not title:
+            st.error("Title of the paper is required!")
+        else:
+            with st.spinner("Predicting category..."):
+                class_probabilities = inference(model, tokenizer, title, paper_summary)
+            if class_probabilities is not None:
+                st.subheader("Top Tags Predictions:")
+                output = prepare_output(class_probabilities, model.config.id2label)
+                logger.info("Output: %s", output)
+                st.dataframe(output, use_container_width=True)
+    st.markdown("---")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit
+transformers
+torch
+numpy