File size: 5,692 Bytes
10eea43
a715551
 
4fdb3ac
f77bc9a
a715551
10eea43
bbdc667
704df50
bbdc667
10eea43
 
 
704df50
10eea43
 
a715551
 
62e4c88
704df50
62e4c88
704df50
a715551
 
 
 
 
704df50
 
 
 
 
62e4c88
704df50
a715551
704df50
62e4c88
704df50
a715551
62e4c88
704df50
62e4c88
a715551
 
62e4c88
a715551
37c3cef
 
bbdc667
 
 
62e4c88
a715551
62e4c88
a715551
 
62e4c88
 
223273b
a715551
 
 
62e4c88
a715551
 
 
 
 
 
 
 
 
 
62e4c88
a715551
 
62e4c88
a715551
 
 
 
 
fdc0157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a715551
fdc0157
a715551
 
62e4c88
 
a715551
 
f09760f
704df50
62e4c88
 
 
f77bc9a
a715551
62e4c88
 
 
a715551
4fdb3ac
704df50
 
 
4fdb3ac
 
f77bc9a
a715551
10eea43
bbdc667
704df50
bbdc667
10eea43
 
a715551
10eea43
 
 
 
 
a715551
4fdb3ac
a715551
 
704df50
 
 
a715551
10eea43
 
704df50
 
 
 
4fdb3ac
a715551
4fdb3ac
 
 
a715551
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import logging
import re

import gradio as gr
import pandas as pd
import pdfplumber

# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """
    Extracts raw text from a PDF file.

    Args:
        pdf_file: Either a path string, or an object whose ``.name``
            attribute holds the path of the PDF to open.

    Returns:
        The text of every page, joined with newlines. A page for which
        ``extract_text()`` returns a falsy value contributes an empty string.
    """
    # Plain strings are used as the path directly; anything else is expected
    # to expose the path through ``.name``, as the original code required.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with pdfplumber.open(pdf_path) as pdf:
        # ``or ""`` guards against a None result (page without extractable
        # text), which previously raised TypeError on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last line of one page is not fused with the
    # first line of the next; downstream parsing is line based.
    return "\n".join(page_texts)

# Function: Clean Description
def clean_description(description, item_number=None):
    """
    Cleans the description by removing unwanted patterns dynamically.

    Args:
        description: Raw description text of one item.
        item_number: Optional item number; item 7 gets one extra,
            item-specific removal.

    Returns:
        The description with the unwanted fragments removed and surrounding
        whitespace stripped.
    """
    # Each marker is removed together with everything that follows it up to
    # the end of the line (``.`` does not cross newlines).
    trailing_noise_patterns = (
        r"Page \d+ of \d+.*",  # page references
        r"TOTAL EX-WORK.*",  # EX-WORK-related text
        r"NOTES:.*",  # notes section
        r"HS CODE.*",  # HS CODE-related data
        r"DELIVERY:.*",  # delivery instructions
    )
    for pattern in trailing_noise_patterns:
        description = re.sub(pattern, "", description)

    # Remove redundant "<qty> <unit> <unit price> <total>" runs.
    description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description)

    # Specific fix for Item 7. This is a literal replacement: the previous
    # regex left the dots unescaped, so it also matched unrelated text such
    # as "4x20".
    if item_number == 7:
        description = description.replace("300 Sets 4.20 1260.00", "")

    return description.strip()

# Function: Parse PO Items with Filters
# Patterns used by parse_po_items_with_filters, compiled once at import time.
_PO_ITEM_START_RE = re.compile(r"^(?P<Item>\d+)\s+(?P<Description>.+)")
_PO_QTY_UNIT_RE = re.compile(r"(?P<Qty>\d+)\s+(Nos\.|Set)")
_PO_PRICE_PAIR_RE = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")
_PO_ITEM_END_RE = re.compile(r"(Mfd:-2022|\(NT00192\)|SIZE)")


def _finalize_po_item(item, description_parts):
    """Store the cleaned, space-joined description on *item* and return it."""
    item["Description"] = clean_description(
        " ".join(description_parts).strip(),
        item_number=int(item["Item"]),
    )
    return item


def parse_po_items_with_filters(text):
    """
    Parses purchase order items from the extracted text systematically.

    A line that starts with a number followed by text opens a new item. Later
    lines are appended to that item's description until the next item starts
    or a line containing an end marker (``Mfd:-2022``, ``(NT00192)`` or
    ``SIZE``) closes it. Quantity/unit and unit/total price are picked up
    from any line belonging to the open item; prices are not read from the
    end-marker line.

    Args:
        text: Full text of the purchase order, one logical line per row.

    Returns:
        ``(DataFrame, status message)`` when at least one item with a
        non-empty description was found, otherwise ``(None, status message)``.
    """
    log = logging.getLogger(__name__)
    data = []
    current_item = None
    description_accumulator = []

    for line in text.splitlines():
        log.debug("Processing Line: %s", line)

        # Match the start of a new item
        item_match = _PO_ITEM_START_RE.match(line)
        if item_match:
            # Save the previous item before opening the next one
            if current_item is not None:
                data.append(_finalize_po_item(current_item, description_accumulator))
                description_accumulator = []

            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator.append(item_match.group("Description"))
        elif current_item is not None:
            # Continuation line of the current item's description
            description_accumulator.append(line.strip())
        else:
            # No item is open (header text, or text after an end marker), so
            # there is nothing to attach quantities or prices to. Skipping
            # here avoids the TypeError the code used to raise when it
            # indexed into ``None``.
            continue

        # Match Qty and Unit
        qty_match = _PO_QTY_UNIT_RE.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        if _PO_ITEM_END_RE.search(line):
            # End marker: close the item; prices are deliberately not read
            # from this line.
            data.append(_finalize_po_item(current_item, description_accumulator))
            description_accumulator = []
            current_item = None  # Reset for the next item
        else:
            price_match = _PO_PRICE_PAIR_RE.search(line)
            if price_match:
                current_item["Unit Price"] = price_match.group("UnitPrice")
                current_item["Total Price"] = price_match.group("TotalPrice")

    # Save the last item if not already added
    if current_item is not None:
        data.append(_finalize_po_item(current_item, description_accumulator))

    # Remove invalid rows (items whose description was cleaned to nothing)
    data = [row for row in data if row["Description"]]

    if not data:
        return None, "No items found. Please check the PDF file format."
    return pd.DataFrame(data), "Data extracted successfully."

# Function: Save to Excel
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Writes the extracted data to an Excel workbook.

    Args:
        df: Object providing ``to_excel`` (a DataFrame in this file).
        output_path: Destination of the workbook.

    Returns:
        ``output_path``, unchanged, so callers can pass it on.
    """
    # The index is left out of the written sheet.
    writer_options = {"index": False}
    df.to_excel(output_path, **writer_options)
    return output_path

# Gradio Interface Function
def process_pdf(file):
    """
    Processes the uploaded PDF file and returns extracted data and status.

    Args:
        file: The uploaded file as handed over by the interface, or ``None``
            when nothing was uploaded.

    Returns:
        ``(path to the Excel file, status message)`` on success, or
        ``(None, status message)`` when no file was given, no items were
        found, or processing failed.
    """
    # Report a missing upload explicitly instead of letting it surface as an
    # opaque attribute error from the extraction step.
    if file is None:
        return None, "No file uploaded. Please upload a PDF file."

    try:
        text = extract_text_from_pdf(file)
        df, status = parse_po_items_with_filters(text)
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        # UI boundary: turn any failure into a status message for the user.
        return None, f"Error during processing: {str(e)}"

# Gradio Interface Setup
def create_gradio_interface():
    """
    Builds the Gradio interface for PO data extraction.

    Returns:
        A ``gr.Interface`` wired to ``process_pdf`` with one PDF upload input
        and two outputs: the generated file and a status text box.
    """
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_components = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    interface = gr.Interface(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=result_components,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )
    return interface

# Start the web UI only when the file is run as a script, not on import.
if __name__ == "__main__":
    create_gradio_interface().launch()