Spaces:

amamrnaf
/

data_extraction_demo

Sleeping

App Files Files Community

Amamrnaf committed on Mar 3

Commit

2d39f2f

1 Parent(s): daaaba0

update

Browse files

Files changed (3) hide show

app.py +25 -1
dataSchema.py +1 -1
excel_to_pdf.py +63 -0

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import pymupdf  #type: ignore
 from PIL import Image
 import os
 from functions import get_image_informations
 from dataSchema import *
 # import shutil
@@ -183,7 +184,7 @@ def process_file(file, option):
         if file_extension in ['.pdf']:
             # Process PDF files
             if option == "Noc_timesheet_residential_old":
-                print(file_path)
                 Noc_timeSheet_pdf_to_img(file_path)
                 print("here 2")
                 result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
@@ -221,6 +222,29 @@ def process_file(file, option):
             elif option == "Noc_invoice":
                 result = get_image_informations(file_path, Noc_invoice_prompt, Noc_invoice_parser_v1)
                 return result
         else:
             return "Unsupported file type. Please upload a PDF or image file."
     except Exception as e:

 from PIL import Image
 import os
 from functions import get_image_informations
+from excel_to_pdf import excel_to_pdf
 from dataSchema import *
 # import shutil
         if file_extension in ['.pdf']:
             # Process PDF files
             if option == "Noc_timesheet_residential_old":
                 Noc_timeSheet_pdf_to_img(file_path)
                 print("here 2")
                 result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
             elif option == "Noc_invoice":
                 result = get_image_informations(file_path, Noc_invoice_prompt, Noc_invoice_parser_v1)
                 return result
+        elif file_extension in ['.xls','.xlsx']:
+            if option == "Noc_timesheet_residential_old":
+                Noc_timeSheet_pdf_to_img(file_path)
+                print("here 2")
+                result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
+                return result
+            elif option == "Noc_timesheet_rotational_old":
+                Noc_timeSheet_pdf_to_img(file_path)
+                result = get_image_informations("output.jpg", Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
+                return result
+            elif option == "Noc_PO":
+                result = noc_invoice_extraction(file_path, save_dir)
+                return result
+            elif option =="Noc_timesheet_new":
+                pdf_to_img(file_path)
+                result = get_image_informations("output.jpg", Noc_timesheet_prompt, Noc_timesheet_parser_v1)
+                return result
+            elif option == "Noc_invoice":
+                pdf_to_img(file_path)
+                result = get_image_informations("output.jpg", Noc_invoice_prompt, Noc_invoice_parser_v1)
+                return result
         else:
             return "Unsupported file type. Please upload a PDF or image file."
     except Exception as e:

dataSchema.py CHANGED Viewed

@@ -184,7 +184,7 @@ Based on the provided timesheet details, extract the following information:
 - Number of extended hitch days onshore for rotational personnel
 - Number of over time hours onshore (Over 8 hours)
 - Number of over time hours offshore (Over 12 hours)
-- Number of Per Diem days for onshore/offshore rotational personnel
 - Number of training days
 - Number of travel days

 - Number of extended hitch days onshore for rotational personnel
 - Number of over time hours onshore (Over 8 hours)
 - Number of over time hours offshore (Over 12 hours)
+- Number of Per Diem
 - Number of training days
 - Number of travel days

excel_to_pdf.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import pandas as pd
+import os
+import xlrd  # Reads old .xls files
+from openpyxl import Workbook
+from openpyxl import load_workbook
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import landscape, portrait, A0, A1, A2, A3, A4
+from reportlab.lib.colors import Color, black
+def convert_xls_to_xlsx(xls_path):
+    """Convert .xls to .xlsx while keeping all sheets and data intact."""
+    if not xls_path.endswith(".xls"):
+        return xls_path
+    xlsx_path = xls_path.replace(".xls", ".xlsx")
+    # Open .xls file using xlrd
+    book = xlrd.open_workbook(xls_path)
+    new_book = Workbook()
+    new_book.remove(new_book.active)
+    for sheet_index in range(book.nsheets):
+        sheet = book.sheet_by_index(sheet_index)
+        new_sheet = new_book.create_sheet(title=sheet.name)
+        for row_idx in range(sheet.nrows):
+            for col_idx in range(sheet.ncols):
+                cell_value = sheet.cell(row_idx, col_idx).value
+                new_sheet.cell(row=row_idx + 1, column=col_idx + 1, value=cell_value)
+    new_book.save(xlsx_path)
+    print(f"Converted {xls_path} to {xlsx_path}")
+    return xlsx_path
+def excel_to_pdf(excel_file, pdf_file = "output.pdf"):
+    excel_file = convert_xls_to_xlsx(excel_file)  # Convert if .xls
+    workbook = load_workbook(excel_file, data_only=True)
+    c = canvas.Canvas(pdf_file)
+    for sheet_index, sheet in enumerate(workbook.worksheets):
+        print("hello")
+        num_columns = sheet.max_column
+        num_rows = sheet.max_row
+        page_size = A2  # page size, customize as needed
+        c.setPageSize(page_size)
+        if sheet_index > 0:
+            c.showPage()
+        page_width, page_height = page_size
+        y = page_height - 20  # Start from top
+        for row in sheet.iter_rows():
+            x = 10  # Start from left
+            for cell in row:
+                c.drawString(x, y, str(cell.value or ""))
+                x += 150  # Adjust column width
+            y -= 20  # Adjust row height
+    c.save()