Spaces:

masadonline
/

RAG-PDF

Sleeping

App Files Files Community

masadonline committed on May 18

Commit

6c38165

verified ·

1 Parent(s): 6ca1b30

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -10

app.py CHANGED Viewed

@@ -19,30 +19,32 @@ def extract_order_data(pdf_file):
     for page in reader.pages:
         text = page.extract_text()
         if text:
-            # Splitting the text page by page and then by line
             lines = text.strip().split('\n')
-            # Find the start of the table, assuming it begins with a "Order ID" heading
             start_index = next((i for i, line in enumerate(lines) if "Order ID" in line), None)
             if start_index is not None:
-                # Table headers are on the same line
                 headers = [header.strip() for header in lines[start_index].split(",")]
-                # Data starts from the next line
                 for line in lines[start_index + 1:]:
-                    values = [value.strip() for value in line.split(",")]
-                    # Ensure that the number of values matches the number of headers
                     if len(headers) == len(values):
                         order_data.append(dict(zip(headers, values)))
     return order_data
 # Function to fetch order status using GROQ API
 def fetch_order_status_from_groq(order_id, groq_api_key):
     """Fetches order status and customer details from GROQ API."""
     transport = RequestsHTTPTransport(
-        url="https://api.groq.cloud/v1/graphql",  # Replace with your GROQ endpoint
         headers={"Authorization": f"Bearer {groq_api_key}"},
         verify=True,
         retries=3,
@@ -92,7 +94,7 @@ def main():
                 else:
                     st.error("Could not retrieve order status.")
         else:
-            st.error("Failed to extract order data from PDF.")
 if __name__ == "__main__":
     main()

     for page in reader.pages:
         text = page.extract_text()
         if text:
             lines = text.strip().split('\n')
             start_index = next((i for i, line in enumerate(lines) if "Order ID" in line), None)
             if start_index is not None:
                 headers = [header.strip() for header in lines[start_index].split(",")]
+                # Clean headers from extra spaces
+                headers = [h.replace(" ", "") for h in headers] # Remove spaces in header names
                 for line in lines[start_index + 1:]:
+                    values = [v.strip() for v in line.split(",")]
                     if len(headers) == len(values):
                         order_data.append(dict(zip(headers, values)))
+                    elif len(values) > len(headers):
+                         # Handle cases where there are more values than headers (e.g., extra commas)
+                         order_data.append(dict(zip(headers, values[:len(headers)])))
+                    else:
+                        print(f"Skipping line due to header/value mismatch: {line}") # print the problematic line.
     return order_data
 # Function to fetch order status using GROQ API
 def fetch_order_status_from_groq(order_id, groq_api_key):
     """Fetches order status and customer details from GROQ API."""
     transport = RequestsHTTPTransport(
+        url="https://api.groq.cloud/v1/graphql",  # Replace with your GROQ endpoint
         headers={"Authorization": f"Bearer {groq_api_key}"},
         verify=True,
         retries=3,
                 else:
                     st.error("Could not retrieve order status.")
         else:
+            st.error("Failed to extract order data from PDF. Please check the PDF format and try again.")
 if __name__ == "__main__":
     main()