chandini2595 committed on
Commit
195deb3
·
1 Parent(s): 39c8302
Files changed (1) hide show
  1. app.py +44 -30
app.py CHANGED
@@ -97,8 +97,14 @@ def extract_products(text):
97
 
98
  def extract_with_perplexity_llm(ocr_text):
99
  prompt = f"""
100
- Extract all products as a list of {{name, price}} from this receipt text.
101
- Return the result as a JSON object with a 'products' field (list of objects).
 
 
 
 
 
 
102
 
103
  Text:
104
  \"\"\"{ocr_text}\"\"\"
@@ -125,6 +131,13 @@ def save_to_dynamodb(data, table_name="Receipts"):
125
  # data["products"] is a list of dicts
126
  table.put_item(Item=data)
127
 
 
 
 
 
 
 
 
128
  def main():
129
  st.set_page_config(
130
  page_title="FormIQ - Intelligent Document Parser",
@@ -180,7 +193,6 @@ def main():
180
  if uploaded_file is not None:
181
  # Display uploaded image
182
  if uploaded_file.type == "application/pdf":
183
- # Convert first page of PDF to image
184
  images = convert_from_bytes(uploaded_file.read())
185
  image = images[0] # Use the first page
186
  else:
@@ -200,35 +212,37 @@ def main():
200
 
201
  # Extract with Perplexity LLM
202
  with st.spinner("Extracting structured data with Perplexity LLM..."):
203
- try:
204
- llm_result = extract_with_perplexity_llm(pytesseract.image_to_string(Image.open(temp_path)))
205
- st.subheader("Structured Data (Perplexity LLM)")
206
- st.code(llm_result, language="json")
207
-
208
- # Try to parse the JSON from the LLM output
209
- try:
210
- llm_json = extract_json_from_llm_output(llm_result)
211
- if llm_json:
212
- llm_data = json.loads(llm_json)
213
- if "products" in llm_data and llm_data["products"]:
214
- st.subheader("Products (LLM Extracted)")
215
- st.dataframe(pd.DataFrame(llm_data["products"]))
216
- else:
217
- st.warning("Could not extract JSON from LLM output.")
218
- except Exception as e:
219
- st.error(f"Failed to parse LLM output as JSON: {e}")
220
 
 
 
 
 
 
 
221
  except Exception as e:
222
- st.error(f"LLM extraction failed: {e}")
223
-
224
- # Display extracted fields (regex)
225
- st.subheader("Extracted Fields (Regex)")
226
- fields_df = pd.DataFrame([fields])
227
- st.dataframe(fields_df)
228
-
229
- if "products" in fields and fields["products"]:
230
- st.subheader("Products (Regex Extracted)")
231
- st.dataframe(pd.DataFrame(fields["products"]))
 
 
 
 
 
 
 
 
 
 
232
 
233
  except Exception as e:
234
  logger.error(f"Error processing document: {str(e)}")
 
97
 
98
  def extract_with_perplexity_llm(ocr_text):
99
  prompt = f"""
100
+ Extract the following fields from this receipt text:
101
+ - name (customer name)
102
+ - date
103
+ - products (list of {{name, price}})
104
+ - amount_paid (total)
105
+ - receipt_no
106
+
107
+ Return the result as a JSON object with these fields.
108
 
109
  Text:
110
  \"\"\"{ocr_text}\"\"\"
 
131
  # data["products"] is a list of dicts
132
  table.put_item(Item=data)
133
 
134
+ def merge_extractions(regex_fields, llm_fields):
135
+ merged = {}
136
+ for key in ["name", "date", "amount_paid", "receipt_no"]:
137
+ merged[key] = llm_fields.get(key) or regex_fields.get(key)
138
+ merged["products"] = llm_fields.get("products") or regex_fields.get("products")
139
+ return merged
140
+
141
  def main():
142
  st.set_page_config(
143
  page_title="FormIQ - Intelligent Document Parser",
 
193
  if uploaded_file is not None:
194
  # Display uploaded image
195
  if uploaded_file.type == "application/pdf":
 
196
  images = convert_from_bytes(uploaded_file.read())
197
  image = images[0] # Use the first page
198
  else:
 
212
 
213
  # Extract with Perplexity LLM
214
  with st.spinner("Extracting structured data with Perplexity LLM..."):
215
+ llm_result = extract_with_perplexity_llm(pytesseract.image_to_string(Image.open(temp_path)))
216
+ st.subheader("Structured Data (Perplexity LLM)")
217
+ st.code(llm_result, language="json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
+ # Try to parse the JSON from the LLM output
220
+ llm_data = {}
221
+ try:
222
+ llm_json = extract_json_from_llm_output(llm_result)
223
+ if llm_json:
224
+ llm_data = json.loads(llm_json)
225
  except Exception as e:
226
+ st.error(f"Failed to parse LLM output as JSON: {e}")
227
+
228
+ # Merge results
229
+ final_data = merge_extractions(fields, llm_data)
230
+
231
+ # Display merged fields
232
+ st.subheader("Final Extracted Fields (Merged)")
233
+ st.json(final_data)
234
+
235
+ # Save to DynamoDB
236
+ try:
237
+ save_to_dynamodb(final_data)
238
+ st.success("Saved to DynamoDB!")
239
+ except Exception as e:
240
+ st.error(f"Failed to save to DynamoDB: {e}")
241
+
242
+ # Display extracted products
243
+ if "products" in final_data and final_data["products"]:
244
+ st.subheader("Products (Final Extracted)")
245
+ st.dataframe(pd.DataFrame(final_data["products"]))
246
 
247
  except Exception as e:
248
  logger.error(f"Error processing document: {str(e)}")