dschandra committed · Commit f705371 · verified · 1 Parent(s): b61deb6

Update app.py

Files changed (1)
  1. app.py +77 -123
app.py CHANGED
@@ -3,150 +3,104 @@ import pdfplumber
  import pandas as pd
  import re

-
- def extract_item_code(lines, start_index):
-     """
-     Extract the numeric part of the Item Code with better handling of multi-line rows.
-     """
-     item_code = ""
-
-     for line in lines[start_index:]:
-         # Stop processing if a new row starts
-         if line.strip().isdigit():  # Check for new row start
-             break
-
-         # Skip lines with unwanted keywords
-         if any(keyword in line for keyword in ["Calculation Method", "Landed Cost", "SUB TOTAL", "Central GST", "State GST"]):
-             continue
-
-         # Concatenate valid lines
-         item_code += " " + line.strip()
-
-     print(f"Concatenated Item Code Line: {item_code}")  # Debugging
-
-     # Regex to extract numeric Item Code
-     pattern = r"(\d{6,12})"
-     match = re.search(pattern, item_code)
-     if match:
-         return match.group(1)  # Return the numeric Item Code
-     else:
-         print(f"Failed to extract numeric Item Code from: {item_code}")
-         return "MISSING"  # Indicate missing Item Code
-
-
- def extract_row_fields(line):
-     """
-     Extract fields like Unit, Delivery Date, Quantity, Basic Price, etc.
-     """
-     parts = line.split()
-     try:
-         pos = parts[0] if len(parts) > 0 else ""
-         unit = parts[-7] if len(parts) > 6 else ""
-         delivery_date = parts[-6] if len(parts) > 5 else ""
-         quantity = float(parts[-5]) if len(parts) > 4 else 0.0
-         basic_price = float(parts[-4]) if len(parts) > 3 else 0.0
-         discount = float(parts[-3]) if len(parts) > 2 else 0.0
-         cur = parts[-2] if len(parts) > 1 else ""
-         amount = float(parts[-1]) if len(parts) > 0 else 0.0
-
-         return pos, unit, delivery_date, quantity, basic_price, discount, cur, amount
-     except (ValueError, IndexError) as e:
-         print(f"Error extracting row fields: {e}")
-         return "", "", "", 0.0, 0.0, 0.0, "", 0.0
-
-
- def calculate_totals(amount):
      """
-     Calculate CGST, SGST, and Sub Total.
      """
-     cgst = amount * 0.09  # 9% of Amount
-     sgst = amount * 0.09  # 9% of Amount
-     sub_total = amount + cgst + sgst
-     return cgst, sgst, sub_total
-
-
- def extract_data(pdf_file):
-     """
-     Extract data from the uploaded PDF.
-     """
-     data = []
-     skipped_rows = []  # Track rows with missing Item Codes

      with pdfplumber.open(pdf_file) as pdf:
          for page in pdf.pages:
-             text = page.extract_text().splitlines()
-             print(f"Page {page.page_number} Text: {text}")  # Debug raw text
-
-             current_row = {}
-             for i, line in enumerate(text):
-                 parts = line.split()
-                 try:
-                     pos = int(parts[0]) if parts[0].isdigit() else None
-                     if pos and 10 <= pos <= 450:
-                         # Extract numeric Item Code
-                         item_code = extract_item_code(text, i + 1)
-
-                         # Clean the description and append it to the current row
                          if current_row and "Item Code" in current_row:
                              clean_line = re.sub(
-                                 r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|"
-                                 r"\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|"
-                                 r"\s{2,}|[A-Za-z]+:[0-9\.]+)",
                                  "",
                                  line
                              ).strip()
-
                              if clean_line:
                                  current_row["Description"] += f" {clean_line}".strip()

-                         # Extract other row-specific fields
-                         pos, unit, delivery_date, quantity, basic_price, discount, cur, amount = extract_row_fields(line)

-                         # Calculate totals
-                         cgst, sgst, sub_total = calculate_totals(amount)

-                         # Append the row to the data list
-                         data.append([pos, item_code, unit, delivery_date, quantity, basic_price, discount, cur, amount, cgst, sgst, sub_total])

-                     # Combine Item Code and Description once processing is complete
-                     if "Description" in current_row:
-                         current_row["Item Code"] = f"{item_code}\n{current_row['Description']}".strip()
-                         del current_row["Description"]

-                 except Exception as e:
-                     print(f"Error processing line: {line} | Error: {e}")
-                     skipped_rows.append(line)  # Track skipped rows
-                     continue

-     # Create DataFrame
-     df = pd.DataFrame(data, columns=["Pos", "Item Code", "Unit", "Delivery Date",
-                                      "Quantity", "Basic Price", "Discount", "Cur", "Amount",
-                                      "Central GST", "State GST", "Sub Total"])
-
-     # Log skipped rows for debugging
-     if skipped_rows:
-         print(f"Skipped Rows: {skipped_rows}")
-
-     # Save to Excel
-     excel_path = "/tmp/Extracted_PO_Data.xlsx"
-     df.to_excel(excel_path, index=False)
-     return excel_path


  # Gradio interface
- def run_gradio_interface():
-     """
-     Gradio interface for PDF upload and data extraction.
-     """
-     iface = gr.Interface(
-         fn=extract_data,
-         inputs=gr.File(label="Upload PDF"),
-         outputs=gr.File(label="Download Excel"),
-         title="PO Data Extractor",
-         description="Upload a PDF file to extract Purchase Order data."
-     )
-     iface.launch()


  if __name__ == "__main__":
-     run_gradio_interface()
  import pandas as pd
  import re

+ def extract_cleaned_po_data(pdf_file):
      """
+     Extract and clean data from a Toshiba PO PDF file.
      """
+     extracted_data = []
+     current_row = {}

      with pdfplumber.open(pdf_file) as pdf:
          for page in pdf.pages:
+             text = page.extract_text()
+             if text:
+                 lines = text.split("\n")
+                 for line in lines:
+                     line = line.strip()
+
+                     # Match rows starting with POS and numeric Item Code
+                     if re.match(r"^\d+\s+\d{12}\s+", line):
+                         parts = re.split(r'\s+', line, maxsplit=9)  # Split only the first 9 elements to handle descriptions correctly
+                         if len(parts) >= 9:
+                             # Save the previous row if exists
+                             if current_row:
+                                 extracted_data.append(current_row)
+                             current_row = {
+                                 "Pos": parts[0],
+                                 "Item Code": parts[1],
+                                 "Description": "",
+                                 "Unit": parts[2],
+                                 "Delivery Date": parts[3],
+                                 "Quantity": parts[4],
+                                 "Basic Price": parts[5],
+                                 "Discount": parts[6],
+                                 "Cur": parts[7],
+                                 "Amount": parts[8],
+                                 "Sub Total": ""
+                             }
+                     elif "SUB TOTAL" in line and current_row:
+                         # Capture the Sub Total
+                         sub_total_match = re.search(r"SUB TOTAL\s*:\s*(\d+\.\d+)", line)
+                         if sub_total_match:
+                             current_row["Sub Total"] = sub_total_match.group(1)
+                             extracted_data.append(current_row)
+                             current_row = {}
+                     else:
+                         # Clean and append descriptions only
                          if current_row and "Item Code" in current_row:
                              clean_line = re.sub(
+                                 r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)",
                                  "",
                                  line
                              ).strip()
                              if clean_line:
                                  current_row["Description"] += f" {clean_line}".strip()

+     # Add the last row if exists
+     if current_row:
+         extracted_data.append(current_row)

+     # Combine Item Code and Description
+     for row in extracted_data:
+         if "Description" in row:
+             row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip()
+             del row["Description"]

+     # Convert to DataFrame
+     columns = [
+         "Pos", "Item Code", "Unit", "Delivery Date", "Quantity",
+         "Basic Price", "Discount", "Cur", "Amount", "Sub Total"
+     ]
+     df = pd.DataFrame(extracted_data, columns=columns)

+     # Ensure Pos is numeric and filter rows for POS 10 to POS 450
+     df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce')
+     df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]

+     # Identify missing POS numbers
+     expected_pos = set(range(10, 451))
+     extracted_pos = set(df['Pos'].dropna().astype(int))
+     missing_pos = sorted(expected_pos - extracted_pos)

+     print("Missing POS numbers:", missing_pos)  # Debug output to identify skipped POS numbers

+     # Save as Excel for download
+     output_path = "cleaned_extracted_po_data.xlsx"
+     df.to_excel(output_path, index=False)
+     return output_path

  # Gradio interface
+ def process_pdf(file):
+     excel_path = extract_cleaned_po_data(file.name)
+     return excel_path

+ iface = gr.Interface(
+     fn=process_pdf,
+     inputs=gr.File(label="Upload Toshiba PO PDF"),
+     outputs=gr.File(label="Download Cleaned Extracted Excel"),
+     title="Toshiba PO Data Extraction",
+     description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.",
+ )

  if __name__ == "__main__":
+     iface.launch()
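
A note on the new parsing step: a line is treated as the start of an item row only when it begins with a POS number followed by a 12-digit Item Code (re.match(r"^\d+\s+\d{12}\s+", line)), and re.split(r'\s+', line, maxsplit=9) then maps the first nine whitespace-separated tokens onto the fixed columns while leaving any trailing text in parts[9]. A minimal standalone sketch of that behaviour, using a made-up sample line (the real Toshiba PO layout may differ):

import re

# Hypothetical PO line: POS, 12-digit Item Code, Unit, Delivery Date,
# Quantity, Basic Price, Discount, Cur, Amount, then trailing description text.
sample = "10 123456789012 EA 31.01.2024 5.000 1200.00 0.00 INR 6000.00 SPARE KIT"

if re.match(r"^\d+\s+\d{12}\s+", sample):
    parts = re.split(r"\s+", sample, maxsplit=9)
    row = {
        "Pos": parts[0], "Item Code": parts[1], "Unit": parts[2],
        "Delivery Date": parts[3], "Quantity": parts[4],
        "Basic Price": parts[5], "Discount": parts[6],
        "Cur": parts[7], "Amount": parts[8],
    }
    print(row)        # the nine fixed columns
    print(parts[9])   # remainder of the matched line ("SPARE KIT"); app.py starts
                      # Description empty and fills it from the following wrapped lines instead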
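
To exercise extract_cleaned_po_data without the Gradio UI (for example, to inspect the "Missing POS numbers" debug output while tuning the regexes), a quick local call could look like the sketch below; the PDF path is a placeholder, and pdfplumber, pandas, gradio and an Excel writer such as openpyxl need to be installed.

# Quick local check, bypassing the Gradio interface.
# "sample_po.pdf" is a placeholder path, not a file shipped with the Space.
from app import extract_cleaned_po_data

excel_path = extract_cleaned_po_data("sample_po.pdf")
print("Wrote:", excel_path)  # cleaned_extracted_po_data.xlsx in the working directory

Importing app builds the gr.Interface object at module level but does not start the server, since iface.launch() only runs under the __main__ guard.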