import os
import re
import json

import fitz  # PyMuPDF
import numpy as np
import spacy
import gradio as gr
from spacy.matcher import Matcher
from paddleocr import PPStructure
from pdf2image import convert_from_path
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from tqdm.auto import tqdm

# Ensure Poppler (required by pdf2image) is available on Debian/Ubuntu hosts.
os.system("apt-get update -y && apt-get install -y poppler-utils")
# --- Initialization ---
# PP-Structure performs layout analysis, OCR, and table recognition on page images.
structure_engine = PPStructure(table=True, ocr=True, layout=True)
# The spaCy model must be installed beforehand (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Regex & matcher setup
date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"
party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"
pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
matcher.add("CLAIMANT", [pattern])
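
# Illustrative matches for the rules above (assumed sample strings, not taken
# from any specific document):
#   date_pattern  -> "15-Mar-21", "01.04.19"
#   party_pattern -> "M/s ABC Infrastructure Consortium", "M/s XYZ & Co"
#   CLAIMANT rule -> the token "claimant", optional punctuation, then an ORG entity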

# Load Legal-BERT pipelines.
# Note: nlpaueb/legal-bert-base-uncased is a base (masked-LM) checkpoint, so the
# token-classification and text-classification heads below are newly initialized.
# For meaningful entity groups and clause labels, swap in checkpoints fine-tuned
# for legal NER / clause classification.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
token_model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
clf_pipeline = pipeline("text-classification", model=model_name)

# Helper functions
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of each page with PyMuPDF."""
    doc = fitz.open(pdf_path)
    pages = []
    for i, page in enumerate(doc, start=1):
        pages.append({"page": i, "text": page.get_text("text") or ""})
    doc.close()
    return pages
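
# Example of the structure this returns (illustrative, not real output):
#   [{"page": 1, "text": "ARBITRATION AGREEMENT between ..."},
#    {"page": 2, "text": "..."}]
# Scanned pages without an embedded text layer yield an empty string here and
# are instead covered by the OCR pass below.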


def extract_content_from_images(pdf_path):
    """Rasterise each page and run PP-Structure for OCR text and HTML tables."""
    images = convert_from_path(pdf_path)
    results = []
    for i, img in enumerate(images, start=1):
        img_np = np.array(img)
        res = structure_engine(img_np)
        text_lines, tables = [], []
        for block in res:
            if block['type'] == 'text':
                text_lines += [line['text'] for line in block['res'] if 'text' in line]
            elif block['type'] == 'table' and 'html' in block['res']:
                tables.append(block['res']['html'])
        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
    return results
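
# Example of the per-page OCR structure (illustrative):
#   {"page": 1,
#    "ocr_text": "Statement of Claim ...",
#    "tables_html": ["<table>...</table>"]}
# PP-Structure returns tables as HTML, which preserves row/column structure for
# downstream parsing (e.g. pandas.read_html).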


def extract_metadata(text):
    """Combine regex, spaCy, and Legal-BERT passes into a single metadata dict."""
    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
    # Regex pass
    meta['dates'] = re.findall(date_pattern, text)
    meta['parties'] = re.findall(party_pattern, text)
    # spaCy pass: organisations as parties, geo-political entities as tribunal seats
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
            meta['parties'].append(ent.text)
        if ent.label_ == 'GPE':
            meta['tribunals'].append(ent.text)
    for match_id, start, end in matcher(doc):
        meta['claimants'].append(doc[start:end].text)
    # Legal-BERT NER pass (inputs beyond the model's 512-token limit may fail)
    for ent in ner_pipeline(text):
        grp = ent['entity_group']
        if grp in ('ORG', 'PARTY') and ent['word'] not in meta['parties']:
            meta['parties'].append(ent['word'])
        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
            meta['tribunals'].append(ent['word'])
    # Clause classification on naive sentence splits
    for sent in text.split('. '):
        if len(sent) < 10:
            continue
        try:
            res = clf_pipeline(sent)[0]
            if res['score'] > 0.7:
                meta['clauses'].append({'type': res['label'], 'text': sent})
        except Exception:
            pass
    return meta
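
# Shape of the metadata dict (values below are illustrative assumptions):
#   {"dates": ["15-Mar-21"],
#    "parties": ["M/s ABC Infrastructure Consortium"],
#    "claimants": ["Claimant: ABC Infrastructure"],
#    "tribunals": ["New Delhi"],
#    "relationships": [],          # reserved, not populated yet
#    "clauses": [{"type": "...", "text": "..."}]}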


def process_pdf(file_obj):
    """Full pipeline for one uploaded PDF: text layer, OCR/tables, then metadata."""
    # Gradio hands over the upload as a temporary file; its .name is the path on disk.
    pdf_path = file_obj.name
    # 1. Embedded text layer
    text_pages = extract_text_from_pdf(pdf_path)
    # 2. OCR & tables from rendered page images
    img_content = extract_content_from_images(pdf_path)
    # 3. Per-page metadata from the text layer
    metadata = []
    for page in text_pages:
        metadata.append({"page": page['page'], "metadata": extract_metadata(page['text'])})
    # Combine everything into one JSON-serialisable dict
    output = {
        "text_pages": text_pages,
        "image_content": img_content,
        "metadata": metadata
    }
    return output

# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
    outputs=gr.JSON(label="Extraction Result"),
    title="PDF OCR & Metadata Extractor",
    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata."
)

if __name__ == '__main__':
    iface.launch()
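
# Minimal sketch of programmatic use, bypassing the UI (assumes a local
# "sample.pdf" exists; the Gradio file object is only used for its .name
# attribute, so a plain open file handle works here):
#
#   with open("sample.pdf", "rb") as f:
#       result = process_pdf(f)
#   print(json.dumps(result["metadata"], indent=2))
#
# When launched directly, Gradio serves the UI on http://127.0.0.1:7860 by default.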