Spaces:

chandinisaisri
/

formiq

Running

App Files Files Community

chandini2595 committed on May 10

Commit

195deb3

1 Parent(s): 39c8302

added

Browse files

Files changed (1) hide show

app.py +44 -30

app.py CHANGED Viewed

@@ -97,8 +97,14 @@ def extract_products(text):
 def extract_with_perplexity_llm(ocr_text):
     prompt = f"""
-Extract all products as a list of {{name, price}} from this receipt text.
-Return the result as a JSON object with a 'products' field (list of objects).
 Text:
 \"\"\"{ocr_text}\"\"\"
@@ -125,6 +131,13 @@ def save_to_dynamodb(data, table_name="Receipts"):
     # data["products"] is a list of dicts
     table.put_item(Item=data)
 def main():
     st.set_page_config(
         page_title="FormIQ - Intelligent Document Parser",
@@ -180,7 +193,6 @@ def main():
     if uploaded_file is not None:
         # Display uploaded image
         if uploaded_file.type == "application/pdf":
-    # Convert first page of PDF to image
             images = convert_from_bytes(uploaded_file.read())
             image = images[0]  # Use the first page
         else:
@@ -200,35 +212,37 @@ def main():
                     # Extract with Perplexity LLM
                     with st.spinner("Extracting structured data with Perplexity LLM..."):
-                        try:
-                            llm_result = extract_with_perplexity_llm(pytesseract.image_to_string(Image.open(temp_path)))
-                            st.subheader("Structured Data (Perplexity LLM)")
-                            st.code(llm_result, language="json")
-                            # Try to parse the JSON from the LLM output
-                            try:
-                                llm_json = extract_json_from_llm_output(llm_result)
-                                if llm_json:
-                                    llm_data = json.loads(llm_json)
-                                    if "products" in llm_data and llm_data["products"]:
-                                        st.subheader("Products (LLM Extracted)")
-                                        st.dataframe(pd.DataFrame(llm_data["products"]))
-                                    else:
-                                        st.warning("Could not extract JSON from LLM output.")
-                            except Exception as e:
-                                st.error(f"Failed to parse LLM output as JSON: {e}")
                         except Exception as e:
-                            st.error(f"LLM extraction failed: {e}")
-                    # Display extracted fields (regex)
-                    st.subheader("Extracted Fields (Regex)")
-                    fields_df = pd.DataFrame([fields])
-                    st.dataframe(fields_df)
-                    if "products" in fields and fields["products"]:
-                        st.subheader("Products (Regex Extracted)")
-                        st.dataframe(pd.DataFrame(fields["products"]))
                 except Exception as e:
                     logger.error(f"Error processing document: {str(e)}")

 def extract_with_perplexity_llm(ocr_text):
     prompt = f"""
+Extract the following fields from this receipt text:
+- name (customer name)
+- date
+- products (list of {{name, price}})
+- amount_paid (total)
+- receipt_no
+Return the result as a JSON object with these fields.
 Text:
 \"\"\"{ocr_text}\"\"\"
     # data["products"] is a list of dicts
     table.put_item(Item=data)
def merge_extractions(regex_fields, llm_fields):
    """Combine regex-extracted and LLM-extracted receipt fields into one dict.

    For each of the keys ``name``, ``date``, ``amount_paid``, ``receipt_no``
    and ``products`` the LLM value is preferred; the regex value is used
    whenever the LLM value is missing or falsy (``None``, ``""``, ``[]``,
    ``0``).  A key absent from both sources maps to ``None``.

    Args:
        regex_fields: Mapping of fields produced by the regex extractor.
        llm_fields: Mapping of fields parsed from the LLM's JSON output.

    Returns:
        A new dict containing exactly the five keys listed above, in that
        order.  Neither input is modified.
    """
    # The LLM payload comes straight from json.loads, which can legally yield
    # a list, string, number or None instead of an object.  Treat anything
    # that is not a dict as "no data" rather than failing on .get().
    llm = llm_fields if isinstance(llm_fields, dict) else {}
    regex = regex_fields if isinstance(regex_fields, dict) else {}

    keys = ("name", "date", "amount_paid", "receipt_no", "products")
    # `or` (not an `is None` test) is deliberate: an empty string or empty
    # product list from the LLM should fall back to the regex result.
    return {key: llm.get(key) or regex.get(key) for key in keys}
 def main():
     st.set_page_config(
         page_title="FormIQ - Intelligent Document Parser",
     if uploaded_file is not None:
         # Display uploaded image
         if uploaded_file.type == "application/pdf":
             images = convert_from_bytes(uploaded_file.read())
             image = images[0]  # Use the first page
         else:
                     # Extract with Perplexity LLM
                     with st.spinner("Extracting structured data with Perplexity LLM..."):
+                        llm_result = extract_with_perplexity_llm(pytesseract.image_to_string(Image.open(temp_path)))
+                        st.subheader("Structured Data (Perplexity LLM)")
+                        st.code(llm_result, language="json")
+                        # Try to parse the JSON from the LLM output
+                        llm_data = {}
+                        try:
+                            llm_json = extract_json_from_llm_output(llm_result)
+                            if llm_json:
+                                llm_data = json.loads(llm_json)
                         except Exception as e:
+                            st.error(f"Failed to parse LLM output as JSON: {e}")
+                    # Merge results
+                    final_data = merge_extractions(fields, llm_data)
+                    # Display merged fields
+                    st.subheader("Final Extracted Fields (Merged)")
+                    st.json(final_data)
+                    # Save to DynamoDB
+                    try:
+                        save_to_dynamodb(final_data)
+                        st.success("Saved to DynamoDB!")
+                    except Exception as e:
+                        st.error(f"Failed to save to DynamoDB: {e}")
+                    # Display extracted products
+                    if "products" in final_data and final_data["products"]:
+                        st.subheader("Products (Final Extracted)")
+                        st.dataframe(pd.DataFrame(final_data["products"]))
                 except Exception as e:
                     logger.error(f"Error processing document: {str(e)}")