|
import gradio as gr |
|
from transformers import pipeline |
|
import PyPDF2 |
|
from docx import Document |
|
|
|
|
|
# Zero-shot classifier used to decide whether a document reads like a contract.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# NER pipeline used to pull party names out of the document.
# ``aggregation_strategy="simple"`` is the current spelling of the deprecated
# ``grouped_entities=True`` flag: word pieces are still merged into whole
# entities that expose an ``entity_group`` key.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
|
|
|
|
|
def read_file(file_obj):
    """Extract the plain text of an uploaded .txt, .pdf or .docx document.

    Args:
        file_obj: Either a file-like object exposing ``name`` and ``read()``,
            or a path-like value (a plain path string, or an object whose
            ``name`` attribute holds the path) that is opened from disk.

    Returns:
        The document text, or the literal string ``"Unsupported file format"``
        when the extension is not one of the three handled here.
    """
    name = str(getattr(file_obj, "name", file_obj))
    # Use the stream when one was supplied; otherwise fall back to the path so
    # that path-only uploads work as well.
    is_stream = hasattr(file_obj, "read")
    source = file_obj if is_stream else name
    # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    extension = name.lower()

    if extension.endswith(".txt"):
        if is_stream:
            raw = file_obj.read()
        else:
            with open(name, "rb") as handle:
                raw = handle.read()
        if isinstance(raw, bytes):
            # Substitute undecodable bytes instead of aborting the whole
            # request on a file that is not strictly UTF-8.
            return raw.decode("utf-8", errors="replace")
        return raw

    if extension.endswith(".pdf"):
        reader = PyPDF2.PdfReader(source)
        # Extract each page once; pages with no extractable text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(page_text for page_text in page_texts if page_text)

    if extension.endswith(".docx"):
        doc = Document(source)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"
|
|
|
|
|
def is_contract(text):
    """Classify the opening of *text* as a contract or not.

    Only the first 1000 characters are passed to the module-level zero-shot
    ``classifier`` with the candidate labels "contract" and "not a contract".

    Returns:
        A ``(flag, raw_result)`` tuple: ``flag`` is True when the first entry
        of the result's ``labels`` is "contract"; ``raw_result`` is the
        classifier output itself.
    """
    candidate_labels = ["contract", "not a contract"]
    excerpt = text[:1000]
    outcome = classifier(excerpt, candidate_labels)
    top_label = outcome["labels"][0]
    return top_label == "contract", outcome
|
|
|
|
|
def extract_parties(text):
    """Return the organisation and person names found at the start of *text*.

    Runs the module-level ``ner`` pipeline over the first 1000 characters and
    keeps the entities whose group is ``ORG`` or ``PER``.

    Returns:
        A sorted list of unique, whitespace-trimmed entity strings.
    """
    entities = ner(text[:1000])
    # Aggregated words may carry stray surrounding whitespace (subword
    # tokenizers often emit a leading space), which would otherwise make
    # "Acme" and " Acme" count as two different parties.
    names = {
        ent["word"].strip()
        for ent in entities
        if ent["entity_group"] in ("ORG", "PER")
    }
    names.discard("")
    # Sort so the result is stable from run to run (set order is arbitrary).
    return sorted(names)
|
|
|
|
|
def process_file(file):
    """Gradio callback: classify an uploaded document and list its parties.

    Args:
        file: The upload handed over by the ``gr.File`` input, or ``None``
            when the form was submitted without a file.

    Returns:
        A ``(message, parties)`` tuple. ``parties`` is a list of names for a
        contract, an empty list for a non-contract, and ``None`` when the
        file could not be analysed at all.
    """
    if file is None:
        return "Empty or unreadable file.", None

    text = read_file(file)
    if not text or not text.strip():
        return "Empty or unreadable file.", None
    # read_file reports an unknown extension in-band; surface that message
    # instead of running the classifier on the error string itself.
    if text == "Unsupported file format":
        return text, None

    is_contract_flag, _ = is_contract(text)
    # The status icons are written as escapes (U+274C cross mark, U+2705 check
    # mark) so they survive a mis-encoded save of this file.
    if not is_contract_flag:
        return "\u274c This is NOT a contract.", []
    return "\u2705 This is a contract.", extract_parties(text)
|
|
|
|
|
# Web UI: one file upload in, a status line plus the detected parties out.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        # process_file returns a list of names (or None). gr.Label only takes
        # a string, a number or a {label: confidence} dict and rejects lists,
        # so the parties are rendered with gr.JSON instead.
        gr.JSON(label="Detected Parties"),
    ],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa.",
)

iface.launch()