# Hugging Face Space: manish-aggarwal/file-classification (status: Sleeping)
# File size: 2,513 Bytes
# Revisions shown in the blame view: c1ccd2b, 3318c67, 07be6f3

import gradio as gr
from transformers import pipeline
import PyPDF2
from docx import Document
import re

# Load pipelines
# Zero-shot classifier: scores a text against candidate labels supplied at call time.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Named-entity recognizer. `aggregation_strategy="simple"` is the documented
# replacement for the deprecated `grouped_entities=True` flag; per the
# transformers docs it still merges tokens into whole entities that carry an
# `entity_group` key.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)

# File reading
def read_file(file_obj):
    """Extract plain text from an uploaded .txt, .pdf or .docx document.

    Args:
        file_obj: Either a filesystem path given as ``str`` or a file-like
            object that exposes the file name through a ``name`` attribute.

    Returns:
        The extracted text. For any other extension the literal string
        ``"Unsupported file format"`` is returned.
    """
    is_path = isinstance(file_obj, str)
    name = str(file_obj) if is_path else file_obj.name
    # NOTE(review): PyPDF2.PdfReader and docx.Document are expected to accept
    # either a path or a binary stream — confirm against the installed versions.
    source = name if is_path else file_obj
    # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    suffix = name.lower()

    if suffix.endswith(".txt"):
        if is_path:
            with open(name, "rb") as handle:
                raw = handle.read()
        else:
            raw = file_obj.read()
        # A text-mode stream already yields str; only bytes need decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if suffix.endswith(".pdf"):
        reader = PyPDF2.PdfReader(source)
        # Extract every page exactly once, then drop pages without text.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)

    if suffix.endswith(".docx"):
        doc = Document(source)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"

# Contract classification
def is_contract(text):
    """Decide whether *text* looks like a contract.

    Only the first 1000 characters are sent to the module-level zero-shot
    ``classifier``, together with the two candidate labels below.

    Returns:
        A ``(flag, outcome)`` pair: ``flag`` is True when the first entry of
        the classifier's ``labels`` list is ``'contract'``; ``outcome`` is the
        classifier result, returned unchanged.
    """
    candidate_labels = ["contract", "not a contract"]
    snippet = text[:1000]
    outcome = classifier(snippet, candidate_labels)
    top_label = outcome['labels'][0]
    return top_label == 'contract', outcome

# Rule-based + NER-based party extraction
def extract_parties_with_rules(text):
    """Collect candidate party names from *text*.

    Three sources are merged:
      1. both sides of every "between X and Y" phrase (case-insensitive),
      2. names in curly quotes followed by a "(... Party A)" / "(... Party B)" tag,
      3. ORG and PER entities the module-level ``ner`` pipeline finds in the
         first 1000 characters.

    Args:
        text: Document text to scan.

    Returns:
        A list of unique, whitespace-trimmed names in first-seen order.
    """
    # Dict keys de-duplicate while keeping insertion order, so the result is
    # stable from run to run (a set's iteration order for strings is not).
    found = {}

    def remember(candidates):
        # Trim surrounding whitespace so the same name reported by different
        # strategies collapses to one entry; skip empty captures.
        for candidate in candidates:
            cleaned = candidate.strip()
            if cleaned:
                found.setdefault(cleaned, None)

    # Rule-based: between X and Y
    for pair in re.findall(r'between\s+(.*?)\s+and\s+(.*?)[\.,\n]', text, re.IGNORECASE):
        remember(pair)

    # Rule-based: "X" (Party A), etc.
    remember(re.findall(r'“([^”]+)”\s*\(.*?Party [AB]\)', text))

    # NER pass: always run, merged with the rule-based matches above.
    entities = ner(text[:1000])
    remember(ent['word'] for ent in entities if ent['entity_group'] in ('ORG', 'PER'))

    return list(found)

# Main logic
def process_file(file):
    """Classify an uploaded document and list its parties if it is a contract.

    Args:
        file: The uploaded file, forwarded unchanged to ``read_file``; ``None``
            when no file was supplied.

    Returns:
        A ``(message, parties)`` pair. ``message`` is the human-readable
        classification or error text. ``parties`` is a comma-separated string
        for contracts, ``""`` for non-contracts, and ``None`` when the file
        could not be processed.
    """
    # Guard first: without a file there is nothing to hand to read_file.
    if file is None:
        return "No file uploaded.", None

    text = read_file(file)
    # read_file signals an unknown extension with this sentinel string; report
    # it instead of running the classifier on the sentinel itself.
    if text == "Unsupported file format":
        return "Unsupported file format.", None
    if not text.strip():
        return "Empty or unreadable file.", None

    # The raw classifier output is not needed here, only the verdict.
    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        return "❌ This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    return "✅ This is a contract.", ", ".join(parties)

# Gradio interface
# Input widget: a single upload restricted to the three handled extensions.
_upload_input = gr.File(
    file_types=[".txt", ".pdf", ".docx"],
    label="Upload a document",
)

# Output widgets, in the same order as the pair returned by process_file.
_result_boxes = [
    gr.Textbox(label="Classification Result"),
    gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
]

# Wire the processing function to the widgets and start the app.
iface = gr.Interface(
    fn=process_file,
    inputs=_upload_input,
    outputs=_result_boxes,
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching.",
)

iface.launch()