"""Extract purchase-order line items from a PDF and export them to Excel.

A small Gradio app: upload a PO PDF, parse its item rows with regular
expressions, and download the result as an .xlsx file.
"""

import pdfplumber
import pandas as pd
import re
import gradio as gr

# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """
    Extracts raw text from a PDF file.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (the shape the original
            code expected from the upload widget).

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` yields nothing contribute an empty string.
    """
    import os  # local import so this block does not depend on file-level changes

    # NOTE(review): depending on the Gradio version / File `type` setting the
    # callback may receive a plain path instead of a temp-file wrapper —
    # accept both rather than assuming `.name` exists.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    with pdfplumber.open(pdf_path) as pdf:
        # `extract_text()` may return None (e.g. a page without a text layer),
        # which previously crashed the `+=` concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last line of one page is not fused with the
    # first line of the next; the parser works line by line.
    return "\n".join(page_texts)

# Function: Clean Description
def clean_description(description, item_number=None):
    """
    Cleans an item description by removing boilerplate and embedded pricing.

    This is the single definition of ``clean_description``. The file used to
    define the function twice, and the later strip-only version shadowed this
    one, so none of the filters below were ever applied.

    Args:
        description: Raw description text; ``None`` or empty yields ``""``.
        item_number: Accepted for backward compatibility. It used to enable a
            special case that removed "300 Sets 4.20 1260.00" for item 7;
            the general quantity/price pattern now covers "Sets" for every
            item, so the value is no longer consulted.

    Returns:
        The cleaned description with surrounding whitespace removed and runs
        of spaces left behind by the removals collapsed to one.
    """
    if not description:
        return ""

    # Each of these markers starts boilerplate that runs to the end of the
    # line, so everything from the marker onwards is dropped.
    trailing_noise = (
        r"Page \d+ of \d+.*",  # page references
        r"TOTAL EX-WORK.*",    # EX-WORK totals
        r"NOTES:.*",           # notes section
        r"HS CODE.*",          # HS code data
        r"DELIVERY:.*",        # delivery instructions
    )
    for pattern in trailing_noise:
        description = re.sub(pattern, "", description)

    # Remove "<qty> <unit> <unit price> <total>" fragments that leak into the
    # description. Units mirror the ones the parser recognises, plus "Sets".
    description = re.sub(
        r"\d+\s+(?:Nos\.|Sets?|pcs)\s+[\d.]+\s+[\d.]+", "", description
    )

    # Removals in the middle of the text leave double spaces behind.
    description = re.sub(r" {2,}", " ", description)
    return description.strip()

# Function to extract PO items with better error handling and checks
def _new_po_item(item_number):
    """Return a fresh PO row; value fields read "Not Found" until a line supplies them."""
    return {
        "Item": item_number,
        "Description": "",
        "Qty": "Not Found",
        "Unit": "",
        "Unit Price": "Not Found",
        "Total Price": "Not Found",
    }


def _store_po_item(rows, item, description_parts):
    """Attach the cleaned, joined description to ``item`` and append it to ``rows``."""
    item["Description"] = clean_description(" ".join(description_parts).strip())
    rows.append(item)


def _split_merged_item_3(rows):
    """Apply the document-specific item 2 / item 3 split and return a new list.

    NOTE(review): this is a hard-coded workaround for one particular PO
    (fixed serial number, quantity and prices). It is kept for compatibility,
    but it should be replaced by a general rule or removed.
    """
    merged_re = re.compile(
        r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)"
    )
    result = []
    for row in rows:
        result.append(row)
        if row["Item"] != "2" or "Mfd:-2022" not in row["Description"]:
            continue
        found = merged_re.search(row["Description"])
        if not found:
            continue
        remainder = row["Description"].replace(found.group(), "").strip()
        if not remainder:
            # The match covers the whole description, i.e. nothing was merged
            # into item 2. Splitting here would just relabel item 2 as item 3
            # (and item 2 would then be dropped as empty), so leave it alone.
            continue
        result.append(
            {
                "Item": "3",
                "Description": found.group().strip(),
                "Qty": "12",
                "Unit": "Nos.",
                "Unit Price": "3.80",
                "Total Price": "45.60",
            }
        )
        row["Description"] = remainder
    return result


def parse_po_items_with_filters(text):
    """
    Parses purchase order items from the extracted text systematically.

    A line starting with an item number (optionally prefixed by "ITEM")
    opens a new item; following lines are appended to that item's
    description. Quantity/unit and unit/total price are taken from the first
    line of the item on which they appear, so later continuation lines can no
    longer overwrite them. Lines before the first item are ignored.

    Args:
        text: Extracted PO text; ``None`` or empty is treated as no input.

    Returns:
        ``(df, status)`` where ``df`` is a ``pandas.DataFrame`` with columns
        Item, Description, Qty, Unit, Unit Price, Total Price and ``status``
        is a human-readable message. ``df`` is ``None`` when no item with a
        non-empty description was found.
    """
    import logging  # local import: diagnostics go to logging instead of stdout

    log = logging.getLogger(__name__)

    # Compiled once, outside the per-line loop.
    item_re = re.compile(r"^(?:ITEM\s+)?(?P<Item>\d+)\s+(?P<Description>.+)")
    qty_re = re.compile(r"(?P<Qty>\d+)\s+(?P<Unit>Nos\.|Set|pcs)")
    price_re = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")

    data = []
    current_item = None
    description_parts = []

    for raw_line in (text or "").splitlines():
        # Strip first so leading/trailing blanks cannot defeat the `^`/`$`
        # anchors; blank lines carry no information.
        line = raw_line.strip()
        if not line:
            continue

        item_match = item_re.match(line)
        if item_match:
            if current_item is not None:
                _store_po_item(data, current_item, description_parts)
                log.debug("Item %s added to data.", current_item["Item"])
            current_item = _new_po_item(item_match.group("Item"))
            description_parts = [item_match.group("Description")]
        elif current_item is None:
            # Header text before the first item: there is no row to update.
            # (Previously this raised TypeError on `None["Qty"] = ...`.)
            continue
        else:
            description_parts.append(line)

        # Only fill fields that are still missing, so a continuation line
        # without numbers does not reset values found on an earlier line.
        if current_item["Qty"] == "Not Found":
            qty_match = qty_re.search(line)
            if qty_match:
                current_item["Qty"] = qty_match.group("Qty")
                current_item["Unit"] = qty_match.group("Unit")

        if current_item["Unit Price"] == "Not Found":
            price_match = price_re.search(line)
            if price_match:
                current_item["Unit Price"] = price_match.group("UnitPrice")
                current_item["Total Price"] = price_match.group("TotalPrice")

    # Finalize the last item
    if current_item is not None:
        _store_po_item(data, current_item, description_parts)
        log.debug("Finalized Item %s", current_item["Item"])

    data = _split_merged_item_3(data)

    # Drop rows whose description ended up empty after cleaning.
    data = [row for row in data if row["Description"]]

    if not data:
        log.info("No items found.")
        return None, "No items found. Please check the PDF file format."

    return pd.DataFrame(data), "Data extracted successfully."

# Example text (as provided); kept at module level under its original name.
text = """ 
ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00
As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022
ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60
As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022
"""


def _run_example():
    """Parse the sample text above and print the status and resulting table."""
    df, status = parse_po_items_with_filters(text)
    print(status)
    if df is not None:
        print(df)


# Run the sample only when the file is executed as a script. Doing this at
# import time made every `import` of the module parse and print the sample
# (and propagate any parser error to the importer).
if __name__ == "__main__":
    _run_example()

# Function: Save to Excel
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Writes ``df`` to an Excel workbook and hands back the path that was used.

    The frame's index is not written. ``output_path`` defaults to
    ``extracted_po_data.xlsx``, resolved relative to the working directory.
    """
    destination = output_path
    df.to_excel(destination, index=False)
    return destination

# Gradio Interface Function
def process_pdf(file):
    """
    Gradio callback: converts an uploaded PDF into an Excel file of PO items.

    Returns a ``(path, status)`` pair. ``path`` is the workbook written by
    ``save_to_excel``, or ``None`` when the parser produced no table or any
    step raised; ``status`` is the parser's message or an error description.
    """
    try:
        raw_text = extract_text_from_pdf(file)
        table, message = parse_po_items_with_filters(raw_text)
        excel_path = save_to_excel(table) if table is not None else None
    except Exception as exc:  # UI boundary: report the failure instead of raising
        return None, f"Error during processing: {exc!s}"
    return excel_path, message

# Gradio Interface Setup
def create_gradio_interface():
    """
    Builds the Gradio UI for PO data extraction.

    One PDF upload input is wired to ``process_pdf``; its two results are
    shown as a downloadable file and a status text box.
    """
    upload_widget = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_widgets = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=upload_widget,
        outputs=result_widgets,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )

if __name__ == "__main__":
    # Build the Gradio interface and start serving it, but only when this
    # file is executed directly rather than imported.
    interface = create_gradio_interface()
    interface.launch()