Spaces:
Sleeping
Sleeping
Commit
·
8c81f89
1
Parent(s):
bb9f60a
Remove post-training TensorBoard log charts
Browse files
app.py
CHANGED
@@ -62,10 +62,7 @@ def extract_json_from_llm_output(llm_result):
|
|
62 |
return None
|
63 |
|
64 |
def extract_fields(image_path):
|
65 |
-
# OCR
|
66 |
text = pytesseract.image_to_string(Image.open(image_path))
|
67 |
-
|
68 |
-
# Display OCR output for debugging
|
69 |
st.subheader("Raw OCR Output")
|
70 |
st.code(text)
|
71 |
|
@@ -86,21 +83,24 @@ def extract_fields(image_path):
|
|
86 |
else:
|
87 |
results[field] = None
|
88 |
|
|
|
|
|
89 |
return results
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
def extract_with_perplexity_llm(ocr_text):
|
92 |
prompt = f"""
|
93 |
-
Extract
|
94 |
-
|
95 |
-
- date
|
96 |
-
- product
|
97 |
-
- amount_paid
|
98 |
-
- receipt_no
|
99 |
|
100 |
Text:
|
101 |
\"\"\"{ocr_text}\"\"\"
|
102 |
-
|
103 |
-
Return the result as a JSON object with those fields.
|
104 |
"""
|
105 |
messages = [
|
106 |
{
|
@@ -119,6 +119,11 @@ Return the result as a JSON object with those fields.
|
|
119 |
)
|
120 |
return response.choices[0].message.content
|
121 |
|
|
|
|
|
|
|
|
|
|
|
122 |
def main():
|
123 |
st.set_page_config(
|
124 |
page_title="FormIQ - Intelligent Document Parser",
|
@@ -199,6 +204,10 @@ def main():
|
|
199 |
fields_df = pd.DataFrame([fields])
|
200 |
st.dataframe(fields_df)
|
201 |
|
|
|
|
|
|
|
|
|
202 |
except Exception as e:
|
203 |
st.error(f"LLM extraction failed: {e}")
|
204 |
|
|
|
62 |
return None
|
63 |
|
64 |
def extract_fields(image_path):
|
|
|
65 |
text = pytesseract.image_to_string(Image.open(image_path))
|
|
|
|
|
66 |
st.subheader("Raw OCR Output")
|
67 |
st.code(text)
|
68 |
|
|
|
83 |
else:
|
84 |
results[field] = None
|
85 |
|
86 |
+
# Extract all products
|
87 |
+
results["products"] = extract_products(text)
|
88 |
return results
|
89 |
|
90 |
+
def extract_products(text):
    """Extract product name/price pairs from raw receipt text.

    A product is a run of upper-case letters, digits and spaces, followed by
    whitespace and a price written with exactly two decimals
    (for example ``COFFEE 3.50``).

    Args:
        text: Text to scan. Empty or ``None`` input yields no products.

    Returns:
        A list of ``{"name": str, "price": float}`` dicts, in the order the
        matches appear in ``text``.
    """
    if not text:
        return []

    # Example regex: product name (letters/numbers/spaces), then price (float)
    product_pattern = r"([A-Z0-9 ]+)\s+([0-9]+\.[0-9]{2})"

    products = []
    for raw_name, price in re.findall(product_pattern, text):
        name = raw_name.strip()
        # The name group accepts spaces, so a lower-case name followed by two
        # or more spaces (e.g. "Milk  2.50") matches with a blank name.
        # A nameless product is noise, so skip it.
        if not name:
            continue
        products.append({"name": name, "price": float(price)})
    return products
|
96 |
+
|
97 |
def extract_with_perplexity_llm(ocr_text):
|
98 |
prompt = f"""
|
99 |
+
Extract all products as a list of {{name, price}} from this receipt text.
|
100 |
+
Return the result as a JSON object with a 'products' field (list of objects).
|
|
|
|
|
|
|
|
|
101 |
|
102 |
Text:
|
103 |
\"\"\"{ocr_text}\"\"\"
|
|
|
|
|
104 |
"""
|
105 |
messages = [
|
106 |
{
|
|
|
119 |
)
|
120 |
return response.choices[0].message.content
|
121 |
|
122 |
+
def _floats_to_decimal(value):
    """Return a copy of *value* with every float replaced by a Decimal."""
    from decimal import Decimal

    if isinstance(value, float):
        # Go through str() so 3.5 becomes Decimal("3.5"), not the binary
        # expansion of the float.
        return Decimal(str(value))
    if isinstance(value, dict):
        return {key: _floats_to_decimal(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_floats_to_decimal(item) for item in value]
    return value


def save_to_dynamodb(data, table_name="Receipts"):
    """Store one extracted receipt record with ``table.put_item``.

    Floats anywhere in ``data`` are converted to ``Decimal`` on a copy before
    the write; the caller's ``data`` is left untouched.

    Args:
        data: Item to store. ``data["products"]`` is a list of dicts.
        table_name: Not used by this body.
            NOTE(review): presumably meant to select the table; confirm.

    NOTE(review): ``table`` is not defined inside this function, so it must
    exist at module level or the call raises ``NameError``. It is presumably
    a boto3 DynamoDB ``Table`` resource; confirm against the rest of the file.
    """
    # ... existing code ...
    # data["products"] is a list of dicts
    # boto3's DynamoDB serializer rejects Python float, so convert first.
    table.put_item(Item=_floats_to_decimal(data))
|
126 |
+
|
127 |
def main():
|
128 |
st.set_page_config(
|
129 |
page_title="FormIQ - Intelligent Document Parser",
|
|
|
204 |
fields_df = pd.DataFrame([fields])
|
205 |
st.dataframe(fields_df)
|
206 |
|
207 |
+
if "products" in fields and fields["products"]:
|
208 |
+
st.subheader("Products")
|
209 |
+
st.dataframe(pd.DataFrame(fields["products"]))
|
210 |
+
|
211 |
except Exception as e:
|
212 |
st.error(f"LLM extraction failed: {e}")
|
213 |
|