Spaces:
Sleeping
Sleeping
File size: 2,513 Bytes
c1ccd2b 3318c67 c1ccd2b 3318c67 c1ccd2b 3318c67 c1ccd2b 3318c67 c1ccd2b 3318c67 07be6f3 c1ccd2b 07be6f3 c1ccd2b 3318c67 c1ccd2b 3318c67 c1ccd2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import gradio as gr
from transformers import pipeline
import PyPDF2
from docx import Document
import re
# Load both Hugging Face pipelines once, at import time, so every request
# handled by the app reuses the same model instances.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# aggregation_strategy="simple" is the supported spelling of the deprecated
# grouped_entities=True flag: sub-word tokens are merged into whole entities
# and each result carries an "entity_group" key.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
# File reading
def read_file(file_obj):
    """Return the text content of an uploaded .txt, .pdf or .docx file.

    Args:
        file_obj: Either a file-like object exposing ``.name`` (and ``.read()``
            for text files) or a plain path string. NOTE(review): which of the
            two Gradio passes depends on the installed version -- confirm.

    Returns:
        The extracted text, or the literal string ``"Unsupported file format"``
        when the extension is not one of the three supported ones.
    """
    is_path = isinstance(file_obj, str)
    path = file_obj if is_path else file_obj.name
    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered = path.lower()

    if lowered.endswith(".txt"):
        if is_path:
            with open(file_obj, "rb") as handle:
                raw = handle.read()
        else:
            raw = file_obj.read()
        # A handle opened in text mode already yields str; only bytes need decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if lowered.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Extract each page once; pages without extractable text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)

    if lowered.endswith(".docx"):
        doc = Document(file_obj)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"
# Contract classification
def is_contract(text):
    """Zero-shot classify the start of *text* as contract / not a contract.

    Only the first 1000 characters are sent to the module-level ``classifier``
    pipeline.

    Returns:
        A ``(flag, raw_result)`` tuple: ``flag`` is True when the first entry
        of the result's ``labels`` list is ``"contract"``; ``raw_result`` is
        the unmodified pipeline output.
    """
    candidate_labels = ["contract", "not a contract"]
    excerpt = text[:1000]
    prediction = classifier(excerpt, candidate_labels)
    # The first label is taken as the prediction (presumably the
    # highest-scoring one -- confirm against the pipeline documentation).
    top_label = prediction['labels'][0]
    return top_label == 'contract', prediction
# Rule-based + NER-based party extraction
def extract_parties_with_rules(text):
    """Collect candidate party names from *text* using regex rules plus NER.

    Three sources are merged, in this order:
      1. ``between X and Y`` phrases (case-insensitive);
      2. quoted names followed by a ``(... Party A)`` / ``(... Party B)`` tag;
      3. ORG / PER entities found by the module-level ``ner`` pipeline in the
         first 1000 characters of the text.

    Returns:
        A list of unique, whitespace-stripped, non-empty strings in
        first-seen order.
    """
    candidates = []

    # Rule-based: "between X and Y". The phrase may end at '.', ',', a newline
    # or the end of the text, so a trailing clause without punctuation still
    # matches.
    between_pairs = re.findall(
        r'between\s+(.*?)\s+and\s+(.*?)(?:[\.,\n]|$)', text, re.IGNORECASE
    )
    for first, second in between_pairs:
        candidates.extend((first, second))

    # Rule-based: "X" (Party A), etc. Both straight and curly double quotes are
    # accepted; the curly quotes are written as \u escapes so the pattern
    # survives a lossy re-encoding of this file.
    candidates.extend(
        re.findall(r'["\u201c]([^"\u201d]+)["\u201d]\s*\(.*?Party [AB]\)', text)
    )

    # NER fallback: keep organisations and persons only.
    for entity in ner(text[:1000]):
        if entity['entity_group'] in ('ORG', 'PER'):
            candidates.append(entity['word'])

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the displayed order vary from run to run.
    stripped = (candidate.strip() for candidate in candidates)
    return list(dict.fromkeys(name for name in stripped if name))
# Main logic
def process_file(file):
    """Classify an uploaded document and, for contracts, list detected parties.

    Args:
        file: The upload handed over by the Gradio ``File`` input, or ``None``
            when nothing was uploaded.

    Returns:
        A ``(status_message, parties)`` tuple. ``parties`` is a comma-separated
        string for contracts, ``""`` for non-contracts, and ``None`` when the
        file could not be processed at all.
    """
    if file is None:
        return "No file uploaded.", None

    text = read_file(file)
    if not text.strip():
        return "Empty or unreadable file.", None
    # read_file reports an unknown extension with this sentinel string; pass it
    # through instead of running the classifier on an error message.
    if text == "Unsupported file format":
        return text, None

    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        # \u274c is the cross-mark emoji.
        return "\u274c This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    # \u2705 is the check-mark emoji.
    return "\u2705 This is a contract.", ", ".join(parties)
# Gradio interface
# One file input, two text outputs: the classification verdict and the
# comma-separated party list produced by process_file.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
    ],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching.",
)
# Start the web app as soon as the module is executed.
iface.launch()