File size: 2,513 Bytes
c1ccd2b
 
 
 
3318c67
c1ccd2b
 
 
 
 
 
 
 
 
3318c67
c1ccd2b
 
 
 
 
 
 
 
 
 
 
 
 
 
3318c67
 
 
 
 
 
 
 
 
 
 
 
 
 
c1ccd2b
3318c67
 
 
 
c1ccd2b
 
 
 
 
 
 
 
 
3318c67
07be6f3
c1ccd2b
07be6f3
c1ccd2b
 
 
 
 
 
 
3318c67
c1ccd2b
 
3318c67
c1ccd2b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
from transformers import pipeline
import PyPDF2
from docx import Document
import re

# Load pipelines once at import time; the functions below reuse these
# module-level objects on every call.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# `aggregation_strategy="simple"` is the supported replacement for the
# deprecated `grouped_entities=True`: sub-word tokens are merged into whole
# entities and each result carries the `entity_group` / `word` keys that
# `extract_parties_with_rules` reads.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)

# File reading
def read_file(file_obj):
    """Return the text content of an uploaded .txt, .pdf or .docx file.

    Args:
        file_obj: Either a filesystem path (``str``) or an object exposing a
            ``name`` attribute that holds the file name. When the object also
            has a ``read`` method it is used as the stream directly; otherwise
            the path is opened in binary mode.

    Returns:
        The extracted text, or the literal string "Unsupported file format"
        when the extension is not one of the three supported ones.
    """
    # Accept both a plain path string and a file wrapper carrying `.name`.
    name = file_obj if isinstance(file_obj, str) else file_obj.name
    lowered = name.lower()  # match ".PDF" / ".Txt" as well
    if not lowered.endswith((".txt", ".pdf", ".docx")):
        return "Unsupported file format"

    if hasattr(file_obj, "read"):
        return _extract_text(file_obj, lowered)
    # Only a path was supplied: open it ourselves and make sure it is closed.
    with open(name, "rb") as stream:
        return _extract_text(stream, lowered)


def _extract_text(stream, lowered_name):
    """Parse an open stream according to the (lower-cased) file name."""
    if lowered_name.endswith(".txt"):
        data = stream.read()
        # Text-mode streams already yield str; only bytes need decoding.
        if isinstance(data, bytes):
            return data.decode("utf-8", errors="ignore")
        return data
    if lowered_name.endswith(".pdf"):
        reader = PyPDF2.PdfReader(stream)
        # Extract each page once; pages without text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)
    doc = Document(stream)
    return "\n".join(para.text for para in doc.paragraphs)

# Contract classification
def is_contract(text):
    """Run the zero-shot classifier on the first 1000 characters of ``text``.

    Returns:
        A ``(flag, outcome)`` tuple. ``flag`` is True when the first entry of
        ``outcome['labels']`` is ``'contract'`` (presumably the top-scoring
        label; confirm against the pipeline docs). ``outcome`` is the raw
        classifier result.
    """
    candidate_labels = ["contract", "not a contract"]
    snippet = text[:1000]
    outcome = classifier(snippet, candidate_labels)
    top_label = outcome['labels'][0]
    return top_label == 'contract', outcome

# Rule-based + NER-based party extraction
# "between X and Y", ended by '.', ',', a newline, or the end of the text.
_BETWEEN_PARTIES_RE = re.compile(
    r'between\s+(.*?)\s+and\s+(.*?)(?:[\.,\n]|$)', re.IGNORECASE
)
# Curly-quoted name followed by "(... Party A)" / "(... Party B)". The quotes
# are written as \u escapes so the pattern survives encoding round-trips.
_QUOTED_PARTY_RE = re.compile(r'\u201c([^\u201d]+)\u201d\s*\(.*?Party [AB]\)')


def extract_parties_with_rules(text):
    """Collect candidate party names from ``text``.

    Three sources are combined, in this order:
      1. both sides of every "between X and Y" clause,
      2. curly-quoted names followed by "(... Party A)" or "(... Party B)",
      3. ORG / PER entities the NER pipeline finds in the first 1000
         characters.

    Returns:
        A list of unique, whitespace-stripped, non-empty names in first-seen
        order.
    """
    candidates = []

    # Rule-based: between X and Y
    for first, second in _BETWEEN_PARTIES_RE.findall(text):
        candidates.extend((first, second))

    # Rule-based: "X" (Party A), etc.
    candidates.extend(_QUOTED_PARTY_RE.findall(text))

    # NER fallback; skipped when there is nothing but whitespace to analyse.
    snippet = text[:1000]
    entities = ner(snippet) if snippet.strip() else []
    candidates.extend(
        ent['word'] for ent in entities if ent['entity_group'] in ('ORG', 'PER')
    )

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    stripped = (name.strip() for name in candidates)
    return list(dict.fromkeys(name for name in stripped if name))

# Main logic
def process_file(file):
    """Classify an uploaded document and list its parties if it is a contract.

    Args:
        file: The upload to process, passed straight to ``read_file``; may be
            ``None`` when nothing was uploaded.

    Returns:
        A ``(status_message, parties)`` tuple of strings. ``parties`` is a
        comma-separated list for contracts and an empty string otherwise.
    """
    if file is None:
        return "No file uploaded.", ""

    text = read_file(file)
    if not text.strip():
        return "Empty or unreadable file.", ""
    # read_file signals an unknown extension with this literal; report it
    # instead of classifying the message itself.
    if text == "Unsupported file format":
        return text, ""

    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        # \u274c is the cross-mark emoji.
        return "\u274c This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    # \u2705 is the check-mark emoji (the previous literal was mis-encoded).
    return "\u2705 This is a contract.", ", ".join(parties)

# Gradio interface: one file upload in, two text boxes out
# (classification message, detected parties).
upload_input = gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document")
classification_box = gr.Textbox(label="Classification Result")
parties_box = gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)")

iface = gr.Interface(
    fn=process_file,
    inputs=upload_input,
    outputs=[classification_box, parties_box],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching.",
)

# Start the web app as soon as the module is executed.
iface.launch()