Spaces:

amamrnaf
/

data_extraction_demo

Sleeping

App Files Files Community

Amamrnaf committed on Jan 14

Commit

6e805b9

1 Parent(s): 2cf3347

app done ?

Browse files

Files changed (3) hide show

app.py +158 -13
dataSchema.py +12 -0
functions.py +48 -0

app.py CHANGED Viewed

@@ -1,25 +1,170 @@
 import gradio as gr
-import fitz  # PyMuPDF for handling PDF files
 def process_pdf(file, option):  # Pre-commit handler: extract the PDF's text and echo a preview for the chosen option
     if file is None:  # Nothing was uploaded
         return "Please upload a PDF file."
     try:
-        # Open the PDF at the uploaded file's path
-        doc = fitz.open(file.name)
-        text = ""
-        for page in doc:
-            text += page.get_text()  # Concatenate the text of every page
-        doc.close()
         # Process based on the selected option
-        if option == "Option 1":
-            return f"Option 1 selected. Extracted text:\n{text[:500]}..."  # Only the first 500 characters are returned
-        elif option == "Option 2":
-            return f"Option 2 selected. Extracted text:\n{text[:500]}..."  # Only the first 500 characters are returned
-        else:
-            return "Invalid option selected."
     except Exception as e:  # Any failure is reported back as a message string instead of raising
         return f"An error occurred: {e}"

 import gradio as gr
+import pymupdf  # PyMuPDF for handling PDF files
+from PIL import Image
+import os
+from functions import get_image_informations
+from dataSchema import *
+def Noc_timeSheet_pdf_to_img(pdf_path,output_path,dpi: int = 300, quality: int = 95):
+    """Render page 1 of a PDF and save a cropped, stacked composite as a JPEG.
+
+    Only the left half of the page is used. The band from the top of the page
+    down to 30% of its height is pasted directly above the band from 42% of
+    its height down to the bottom; the band in between is dropped.
+
+    Args:
+        pdf_path: Path of the PDF to open.
+        output_path: Where the JPEG is written.
+        dpi: Resolution used to rasterise the page.
+        quality: JPEG quality handed to ``Image.save``.
+
+    Returns:
+        None. The image is only written to ``output_path``.
+    """
+    pdf_document = pymupdf.open(pdf_path)
+    try:
+        # Rasterise the first page (index 0) and copy its pixels into a PIL image
+        pix = pdf_document.load_page(0).get_pixmap(dpi=dpi)
+        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    finally:
+        # Release the document even when rendering fails; it used to be left open
+        pdf_document.close()
+    width, height = image.size
+    half_width = width // 2
+    upper_band = image.crop((0, 0, half_width, int(height * 0.30)))
+    lower_band = image.crop((0, int(height * 0.42), half_width, height))
+    upper_width, upper_height = upper_band.size
+    # Canvas as wide as the upper band and tall enough to hold both bands
+    combined_image = Image.new('RGB', (upper_width, upper_height + lower_band.size[1]))
+    combined_image.paste(upper_band, (0, 0))
+    combined_image.paste(lower_band, (0, upper_height))
+    combined_image.save(output_path, "JPEG",quality=quality)
+    # TODO: optional S3 upload of output_path (needs S3_BUCKET, S3_REGION, S3_URL), returning the file URL
+def Clauses_in_invoice(pdf_path: str) -> bool:
+    """Report whether the last page of a PDF mentions "clauses".
+
+    The text of the final page is extracted and searched case-insensitively
+    for the substring "clauses".
+
+    Args:
+        pdf_path: Path of the PDF to inspect.
+
+    Returns:
+        True when the last page's text contains "clauses", otherwise False.
+    """
+    pdf_document = pymupdf.open(pdf_path)
+    try:
+        last_page = pdf_document.load_page(pdf_document.page_count - 1)
+        text = last_page.get_text()
+    finally:
+        # Close the document even if the page cannot be loaded or read
+        pdf_document.close()
+    return "clauses" in text.lower()
+def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
+    """Render every page of a PDF to its own JPEG inside ``folder_path``.
+
+    The folder is created when missing. Files are named
+    ``<pdf base name>_page_<n>.jpg`` with ``n`` starting at 1.
+
+    Args:
+        pdf_path: Path of the PDF to render.
+        folder_path: Directory that receives the page images.
+        dpi: Resolution used to rasterise each page.
+        quality: JPEG quality handed to ``Image.save``.
+
+    Returns:
+        The paths of the written images, in page order.
+    """
+    pdf_document = pymupdf.open(pdf_path)
+    try:
+        folder_path = folder_path.rstrip(os.sep)
+        os.makedirs(folder_path, exist_ok=True)
+        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+        image_paths=[]
+        for page_num in range(pdf_document.page_count):
+            pix = pdf_document.load_page(page_num).get_pixmap(dpi=dpi)
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
+            image.save(output_path, "JPEG", quality=quality)
+            # TODO: optional S3 upload (needs S3_BUCKET, S3_REGION, S3_URL); append the S3 URL instead of the local path
+            image_paths.append(output_path)
+    finally:
+        # Close the document even when rendering or saving a page fails
+        pdf_document.close()
+    return image_paths
+def delete_images(image_paths):
+    """Remove each file in ``image_paths``, printing the outcome for every path.
+
+    Missing files and deletion errors are reported on stdout and never raised,
+    so one failure does not stop the remaining deletions.
+    """
+    for path in image_paths:
+        try:
+            # Report and move on when there is nothing to delete
+            if not os.path.exists(path):
+                print(f"File not found: {path}")
+                continue
+            os.remove(path)
+            print(f"Deleted: {path}")
+        except Exception as err:
+            print(f"Error deleting {path}: {err}")
+def noc_invoice_extraction(pdf_path: str,folder_path):
+    """Extract purchase-order data from a PDF by querying the model page by page.
+
+    The PDF is rendered to one image per page, then the pages are read as:
+    page 1 with the first-page prompt, page 2 with the first item-page prompt,
+    the following pages with the item-pages prompt (their "items" are appended
+    to ``data["items"]``), then the total page, and finally the clauses page
+    when ``Clauses_in_invoice`` reports one on the last page.
+
+    Args:
+        pdf_path: Path of the purchase-order PDF.
+        folder_path: Directory used for the temporary page images.
+
+    Returns:
+        A dict merging the results of every page.
+    """
+    image_paths=Noc_invoice_pdf_to_img(pdf_path,folder_path)
+    try:
+        data = {}
+        data.update(get_image_informations(image_paths[0],invoice_first_page_prompt,Noc_PurchaseOrder_information_parser))
+        data.update(get_image_informations(image_paths[1],invoice_item_page1_prompt,Noc_PurchaseOrder_item1_parser))
+        has_clauses = Clauses_in_invoice(pdf_path)
+        # Pages after the item pages: total + clauses, or just the total
+        trailing_pages = 2 if has_clauses else 1
+        for item_page in image_paths[2:len(image_paths) - trailing_pages]:
+            new_item = get_image_informations(item_page,invoice_item_pages_prompt,Noc_PurchaseOrder_items_parser)
+            # setdefault avoids a KeyError when the first item page returned no "items" key
+            data.setdefault("items", []).extend(new_item["items"])
+        # NOTE(review): without a clauses page the total used to be read from image_paths[-2],
+        # a page already consumed as an item page, while the last page was never read.
+        # It is now taken from the last page; confirm against the document layout.
+        data.update(get_image_informations(image_paths[-trailing_pages],invoice_total_page_prompt,Noc_PurchaseOrder_total_parser))
+        if has_clauses:
+            data.update(get_image_informations(image_paths[-1],invoice_clauses_page_prompt,Noc_PurchaseOrder_clauses_parser))
+        return data
+    finally:
+        # Remove the page images even when an extraction step raises
+        delete_images(image_paths)
 def process_pdf(file, option):
+    """Store the uploaded PDF locally and run the extraction chosen by ``option``.
+
+    Args:
+        file: The uploaded file; its ``name`` attribute is used as the source path
+            (a plain path string is accepted as well).
+        option: Name of the extraction to run.
+
+    Returns:
+        The extraction result for a supported option, otherwise a message string
+        (no upload, unsupported option, or the text of any error raised).
+    """
     if file is None:
         return "Please upload a PDF file."
     try:
+        import shutil
+        save_dir = "uploaded_files"
+        os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist
+        # os.path.join discards save_dir when the second part is absolute, so joining the
+        # full upload path would open the upload itself for writing and truncate it.
+        # NOTE(review): Gradio uploads normally expose their temp-file path as `name`; confirm.
+        source_path = getattr(file, "name", file)
+        file_path = os.path.join(save_dir, os.path.basename(source_path))
+        # Copy from disk instead of file.read(), which path-like upload objects do not provide
+        shutil.copyfile(source_path, file_path)
         # Process based on the selected option
+        if option == "Noc_timesheet_resdiential":
+            Noc_timeSheet_pdf_to_img(file_path,"output.jpg")
+            result = get_image_informations("output.jpg",Noc_Res_timesheet_prompt,Noc_Res_timeSheet_parser)
+            return result
+        # Any other option used to fall through and return None
+        return "Invalid option selected."
     except Exception as e:
         return f"An error occurred: {e}"

dataSchema.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from pydantic import BaseModel, Field
 from typing import Optional,List
 class Noc_Residential_TimeSheetInformation(BaseModel):
     """Details of a timesheet entry."""
@@ -88,6 +89,7 @@ class Noc_PurchaseOrderInformation(BaseModel):
     your_reference: Optional[str] = Field(None, description="under Your reference title.")
     incoterms: Optional[str] = Field(None, description="Incoterms applicable to the order.")
     total_value_of_order: str = Field(..., description="Total value of the purchase order.")
     signature_released_by: str = Field(None, description="Name of the person who released the purchase order.")
     signature_date: Optional[str] = Field(None, description="Date the order was signed.")
@@ -155,6 +157,7 @@ Extract the following details from the provided purchase order document:
 - Your Reference: Reference specified under the "Your Reference" section (if present).
 - Incoterms: Any applicable incoterms mentioned in the document (e.g., FOB, CIF).
 - Total Value of the Order: The total monetary value of the purchase order (include currency).
 - Signature Released By: The name of the person who authorized or released the purchase order.
 - Signature Date: The date when the order was signed (format: DD/MM/YYYY).
 """
@@ -199,3 +202,12 @@ extract from the document:
 invoice_clauses_page_prompt = """
 extract from the document the clauses  """

 from pydantic import BaseModel, Field
 from typing import Optional,List
+from langchain_core.output_parsers import JsonOutputParser
 class Noc_Residential_TimeSheetInformation(BaseModel):
     """Details of a timesheet entry."""
     your_reference: Optional[str] = Field(None, description="under Your reference title.")
     incoterms: Optional[str] = Field(None, description="Incoterms applicable to the order.")
     total_value_of_order: str = Field(..., description="Total value of the purchase order.")
+    signed: bool = Field(..., description="Whether the document has been signed or not.")
     signature_released_by: str = Field(None, description="Name of the person who released the purchase order.")
     signature_date: Optional[str] = Field(None, description="Date the order was signed.")
 - Your Reference: Reference specified under the "Your Reference" section (if present).
 - Incoterms: Any applicable incoterms mentioned in the document (e.g., FOB, CIF).
 - Total Value of the Order: The total monetary value of the purchase order (include currency).
+- signed: Whether the document has been signed or not.
 - Signature Released By: The name of the person who authorized or released the purchase order.
 - Signature Date: The date when the order was signed (format: DD/MM/YYYY).
 """
 invoice_clauses_page_prompt = """
 extract from the document the clauses  """
+# One JSON output parser per document / page type, each built from the pydantic model describing that page's fields
+Noc_Res_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Residential_TimeSheetInformation)  # residential timesheet
+Noc_Rot_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Rotational_TimeSheetInformation)  # rotational timesheet
+Noc_PurchaseOrder_information_parser = JsonOutputParser(pydantic_object=Noc_PurchaseOrderInformation)  # purchase order: general information
+Noc_PurchaseOrder_item1_parser = JsonOutputParser(pydantic_object=Noc_Document_Information)  # purchase order: first item page
+Noc_PurchaseOrder_items_parser = JsonOutputParser(pydantic_object=Noc_items)  # purchase order: further item pages
+Noc_PurchaseOrder_total_parser = JsonOutputParser(pydantic_object=Noc_total)  # purchase order: total page
+Noc_PurchaseOrder_clauses_parser = JsonOutputParser(pydantic_object=Noc_Clauses)  # purchase order: clauses page

functions.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from langchain.chains import TransformChain
+from langchain_core.messages import HumanMessage
+from langchain_openai import ChatOpenAI
+from langchain import globals
+from langchain_core.runnables import chain
+import base64
+from typing import Dict,List,Union
+def load_image(inputs: dict) -> dict:
+    """Read the file at ``inputs["image_path"]`` and return it base64-encoded.
+
+    Returns:
+        ``{"image": <base64 text>}``.
+    """
+    with open(inputs["image_path"], "rb") as image_file:
+        raw_bytes = image_file.read()
+    return {"image": base64.b64encode(raw_bytes).decode('utf-8')}
+load_image_chain = TransformChain(  # Chain step that runs load_image
+    input_variables=["image_path"],  # key read from the incoming dict
+    output_variables=["image"],  # key returned by load_image
+    transform=load_image
+)
+@chain
+def image_model(inputs: dict) -> Union[str, List[str], dict]:
+    """Send the prompt, the parser's format instructions and the image to the chat model.
+
+    Reads ``inputs["parser"]``, ``inputs["prompt"]`` and ``inputs["image"]``
+    (base64 text, embedded as a JPEG data URL) and returns the content of the
+    model's reply.
+    """
+    llm = ChatOpenAI(temperature=0.1, model="gpt-4o", max_tokens=1024)
+    output_parser = inputs["parser"]
+    # A single human message carrying the task text, the output format and the image
+    content_parts = [
+        {"type": "text", "text": inputs["prompt"]},
+        {"type": "text", "text": output_parser.get_format_instructions()},
+        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
+    ]
+    reply = llm.invoke([HumanMessage(content=content_parts)])
+    return reply.content
+def get_image_informations(image_path: str,prompt,parser) -> dict:
+    """Run one image through the load-image, model and parser steps.
+
+    Args:
+        image_path: Path of the image to send.
+        prompt: Instruction text passed to the model.
+        parser: Output parser; it supplies the format instructions and parses the reply.
+
+    Returns:
+        Whatever ``parser`` produces from the model's reply.
+    """
+    pipeline = load_image_chain | image_model | parser
+    request = {
+        'image_path': f'{image_path}',
+        'prompt': prompt,
+        'parser': parser,
+    }
+    return pipeline.invoke(request)