Spaces:

manish-aggarwal
/

file-classification

Sleeping

App Files Community

manish-aggarwal committed on May 14

Commit

3318c67

verified ·

1 Parent(s): 07be6f3

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -7

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from transformers import pipeline
 import PyPDF2
 from docx import Document
 # Load pipelines
 classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
@@ -11,7 +12,7 @@ ner = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", grouped_e
 def read_file(file_obj):
     name = file_obj.name
     if name.endswith(".txt"):
-        return file_obj.read().decode("utf-8")
     elif name.endswith(".pdf"):
         reader = PyPDF2.PdfReader(file_obj)
         return " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
@@ -26,10 +27,25 @@ def is_contract(text):
     result = classifier(text[:1000], ["contract", "not a contract"])
     return result['labels'][0] == 'contract', result
-# Party extraction
-def extract_parties(text):
     entities = ner(text[:1000])
-    return list(set(ent['word'] for ent in entities if ent['entity_group'] in ['ORG', 'PER']))
 # Main logic
 def process_file(file):
@@ -39,7 +55,7 @@ def process_file(file):
     is_contract_flag, classification = is_contract(text)
     if is_contract_flag:
-        parties = extract_parties(text)
         return "✅ This is a contract.", ", ".join(parties)
     else:
         return "❌ This is NOT a contract.", ""
@@ -50,10 +66,10 @@ iface = gr.Interface(
     inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
     outputs=[
         gr.Textbox(label="Classification Result"),
-        gr.Textbox(label="Detected Parties (ORG/PER)")
     ],
     title="Contract Classifier with RoBERTa",
-    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa."
 )
 iface.launch()

 from transformers import pipeline
 import PyPDF2
 from docx import Document
+import re
 # Load pipelines
 classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 def read_file(file_obj):
     name = file_obj.name
     if name.endswith(".txt"):
+        return file_obj.read().decode("utf-8", errors="ignore")
     elif name.endswith(".pdf"):
         reader = PyPDF2.PdfReader(file_obj)
         return " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
     result = classifier(text[:1000], ["contract", "not a contract"])
     return result['labels'][0] == 'contract', result
+# Rule-based + NER-based party extraction
+def extract_parties_with_rules(text):
+    results = set()
+    # Rule-based: between X and Y
+    matches = re.findall(r'between\s+(.*?)\s+and\s+(.*?)[\.,\n]', text, re.IGNORECASE)
+    for match in matches:
+        results.update(match)
+    # Rule-based: "X" (Party A), etc.
+    named_matches = re.findall(r'“([^”]+)”\s*\(.*?Party [AB]\)', text)
+    results.update(named_matches)
+    # NER fallback
     entities = ner(text[:1000])
+    ner_parties = [ent['word'] for ent in entities if ent['entity_group'] in ['ORG', 'PER']]
+    results.update(ner_parties)
+    return list(results)
 # Main logic
 def process_file(file):
     is_contract_flag, classification = is_contract(text)
     if is_contract_flag:
+        parties = extract_parties_with_rules(text)
         return "✅ This is a contract.", ", ".join(parties)
     else:
         return "❌ This is NOT a contract.", ""
     inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
     outputs=[
         gr.Textbox(label="Classification Result"),
+        gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)")
     ],
     title="Contract Classifier with RoBERTa",
+    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching."
 )
 iface.launch()