Spaces:
Running
Running
Commit
·
195deb3
1
Parent(s):
39c8302
added
Browse files
app.py
CHANGED
@@ -97,8 +97,14 @@ def extract_products(text):
|
|
97 |
|
98 |
def extract_with_perplexity_llm(ocr_text):
|
99 |
prompt = f"""
|
100 |
-
Extract
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
Text:
|
104 |
\"\"\"{ocr_text}\"\"\"
|
@@ -125,6 +131,13 @@ def save_to_dynamodb(data, table_name="Receipts"):
|
|
125 |
# data["products"] is a list of dicts
|
126 |
table.put_item(Item=data)
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
def main():
|
129 |
st.set_page_config(
|
130 |
page_title="FormIQ - Intelligent Document Parser",
|
@@ -180,7 +193,6 @@ def main():
|
|
180 |
if uploaded_file is not None:
|
181 |
# Display uploaded image
|
182 |
if uploaded_file.type == "application/pdf":
|
183 |
-
# Convert first page of PDF to image
|
184 |
images = convert_from_bytes(uploaded_file.read())
|
185 |
image = images[0] # Use the first page
|
186 |
else:
|
@@ -200,35 +212,37 @@ def main():
|
|
200 |
|
201 |
# Extract with Perplexity LLM
|
202 |
with st.spinner("Extracting structured data with Perplexity LLM..."):
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
st.code(llm_result, language="json")
|
207 |
-
|
208 |
-
# Try to parse the JSON from the LLM output
|
209 |
-
try:
|
210 |
-
llm_json = extract_json_from_llm_output(llm_result)
|
211 |
-
if llm_json:
|
212 |
-
llm_data = json.loads(llm_json)
|
213 |
-
if "products" in llm_data and llm_data["products"]:
|
214 |
-
st.subheader("Products (LLM Extracted)")
|
215 |
-
st.dataframe(pd.DataFrame(llm_data["products"]))
|
216 |
-
else:
|
217 |
-
st.warning("Could not extract JSON from LLM output.")
|
218 |
-
except Exception as e:
|
219 |
-
st.error(f"Failed to parse LLM output as JSON: {e}")
|
220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
except Exception as e:
|
222 |
-
st.error(f"LLM
|
223 |
-
|
224 |
-
#
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
|
233 |
except Exception as e:
|
234 |
logger.error(f"Error processing document: {str(e)}")
|
|
|
97 |
|
98 |
def extract_with_perplexity_llm(ocr_text):
|
99 |
prompt = f"""
|
100 |
+
Extract the following fields from this receipt text:
|
101 |
+
- name (customer name)
|
102 |
+
- date
|
103 |
+
- products (list of {{name, price}})
|
104 |
+
- amount_paid (total)
|
105 |
+
- receipt_no
|
106 |
+
|
107 |
+
Return the result as a JSON object with these fields.
|
108 |
|
109 |
Text:
|
110 |
\"\"\"{ocr_text}\"\"\"
|
|
|
131 |
# data["products"] is a list of dicts
|
132 |
table.put_item(Item=data)
|
133 |
|
134 |
+
def merge_extractions(regex_fields, llm_fields):
    """Combine regex-extracted and LLM-extracted receipt fields into one dict.

    For each of ``name``, ``date``, ``amount_paid``, ``receipt_no`` and
    ``products`` the LLM value is used when it is truthy; otherwise the
    regex value is used. A key missing from both sources maps to ``None``.

    Args:
        regex_fields: Mapping of fields produced by the regex extractor.
            Anything that is not a mapping (e.g. ``None``) is treated as
            empty.
        llm_fields: Mapping parsed from the LLM's JSON output. Anything
            that is not a mapping (e.g. a JSON array or string) is treated
            as empty.

    Returns:
        A new dict containing exactly the five keys listed above, in that
        order.
    """
    from collections.abc import Mapping

    # json.loads() can return a list/str/number when the LLM does not emit a
    # JSON object; fall back to the other source instead of raising
    # AttributeError on `.get`.
    llm = llm_fields if isinstance(llm_fields, Mapping) else {}
    regex = regex_fields if isinstance(regex_fields, Mapping) else {}

    merged_keys = ("name", "date", "amount_paid", "receipt_no", "products")
    # `or` (not `is None`) is deliberate: empty strings / empty lists from
    # the LLM should also fall back to the regex value.
    return {key: llm.get(key) or regex.get(key) for key in merged_keys}
|
140 |
+
|
141 |
def main():
|
142 |
st.set_page_config(
|
143 |
page_title="FormIQ - Intelligent Document Parser",
|
|
|
193 |
if uploaded_file is not None:
|
194 |
# Display uploaded image
|
195 |
if uploaded_file.type == "application/pdf":
|
|
|
196 |
images = convert_from_bytes(uploaded_file.read())
|
197 |
image = images[0] # Use the first page
|
198 |
else:
|
|
|
212 |
|
213 |
# Extract with Perplexity LLM
|
214 |
with st.spinner("Extracting structured data with Perplexity LLM..."):
|
215 |
+
llm_result = extract_with_perplexity_llm(pytesseract.image_to_string(Image.open(temp_path)))
|
216 |
+
st.subheader("Structured Data (Perplexity LLM)")
|
217 |
+
st.code(llm_result, language="json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
+
# Try to parse the JSON from the LLM output
|
220 |
+
llm_data = {}
|
221 |
+
try:
|
222 |
+
llm_json = extract_json_from_llm_output(llm_result)
|
223 |
+
if llm_json:
|
224 |
+
llm_data = json.loads(llm_json)
|
225 |
except Exception as e:
|
226 |
+
st.error(f"Failed to parse LLM output as JSON: {e}")
|
227 |
+
|
228 |
+
# Merge results
|
229 |
+
final_data = merge_extractions(fields, llm_data)
|
230 |
+
|
231 |
+
# Display merged fields
|
232 |
+
st.subheader("Final Extracted Fields (Merged)")
|
233 |
+
st.json(final_data)
|
234 |
+
|
235 |
+
# Save to DynamoDB
|
236 |
+
try:
|
237 |
+
save_to_dynamodb(final_data)
|
238 |
+
st.success("Saved to DynamoDB!")
|
239 |
+
except Exception as e:
|
240 |
+
st.error(f"Failed to save to DynamoDB: {e}")
|
241 |
+
|
242 |
+
# Display extracted products
|
243 |
+
if "products" in final_data and final_data["products"]:
|
244 |
+
st.subheader("Products (Final Extracted)")
|
245 |
+
st.dataframe(pd.DataFrame(final_data["products"]))
|
246 |
|
247 |
except Exception as e:
|
248 |
logger.error(f"Error processing document: {str(e)}")
|