Spaces:
Running
Running
Commit
·
54478a0
1
Parent(s):
966ffcd
Refactor AzureDIService to enhance document analysis logging and update table extraction logic. Temporarily disable table extraction and improve content type logging. Update TableAgent to reflect changes in context handling.
Browse files
src/agents/__pycache__/field_mapper_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/field_mapper_agent.cpython-312.pyc and b/src/agents/__pycache__/field_mapper_agent.cpython-312.pyc differ
|
|
src/agents/__pycache__/table_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/table_agent.cpython-312.pyc and b/src/agents/__pycache__/table_agent.cpython-312.pyc differ
|
|
src/agents/table_agent.py
CHANGED
@@ -35,9 +35,9 @@ class TableAgent(BaseAgent):
|
|
35 |
|
36 |
# Store both text and tables in context
|
37 |
ctx["text"] = result["text"]
|
38 |
-
ctx["tables"] = result["tables"]
|
39 |
|
40 |
-
self.logger.info(f"Extracted {len(result['text'])} characters of text
|
41 |
return result
|
42 |
|
43 |
except Exception as e:
|
|
|
35 |
|
36 |
# Store the extracted text in context (table storage is temporarily disabled)
|
37 |
ctx["text"] = result["text"]
|
38 |
+
#ctx["tables"] = result["tables"]
|
39 |
|
40 |
+
self.logger.info(f"Extracted {len(result['text'])} characters of text including tables")
|
41 |
return result
|
42 |
|
43 |
except Exception as e:
|
src/services/__pycache__/azure_di_service.cpython-312.pyc
CHANGED
Binary files a/src/services/__pycache__/azure_di_service.cpython-312.pyc and b/src/services/__pycache__/azure_di_service.cpython-312.pyc differ
|
|
src/services/azure_di_service.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
"""Real wrapper goes here – currently not used by stub agents."""
|
2 |
import logging
|
|
|
3 |
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
|
4 |
from azure.core.credentials import AzureKeyCredential
|
5 |
from azure.core.exceptions import HttpResponseError
|
6 |
|
@@ -14,38 +16,35 @@ class AzureDIService:
|
|
14 |
self.logger.info("Starting document analysis with Azure Document Intelligence")
|
15 |
|
16 |
# Analyze the entire document at once
|
17 |
-
poller = self.client.begin_analyze_document("prebuilt-layout", body=pdf_bytes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
result = poller.result()
|
19 |
|
20 |
-
#
|
21 |
-
|
|
|
|
|
|
|
|
|
22 |
if hasattr(result, "content"):
|
23 |
-
|
24 |
-
self.logger.info(f"
|
25 |
|
26 |
-
#
|
27 |
-
tables = []
|
28 |
if hasattr(result, "tables"):
|
29 |
-
self.logger.info(f"
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
for row_idx in range(table.row_count):
|
34 |
-
html += "<tr>"
|
35 |
-
for col_idx in range(table.column_count):
|
36 |
-
cell = next((c for c in table.cells if c.row_index == row_idx and c.column_index == col_idx), None)
|
37 |
-
html += f"<td>{cell.content if cell else ''}</td>"
|
38 |
-
html += "</tr>"
|
39 |
-
html += "</table>"
|
40 |
-
tables.append(html)
|
41 |
-
else:
|
42 |
-
self.logger.warning("No tables found in the document")
|
43 |
|
44 |
-
#
|
45 |
-
return {
|
46 |
-
"text": text_content,
|
47 |
-
"tables": tables
|
48 |
-
}
|
49 |
|
50 |
except HttpResponseError as e:
|
51 |
self.logger.error(f"Azure Document Intelligence API error: {str(e)}")
|
|
|
1 |
"""Real wrapper goes here – currently not used by stub agents."""
|
2 |
import logging
|
3 |
+
import json
|
4 |
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
5 |
+
from azure.ai.documentintelligence.models import DocumentContentFormat
|
6 |
from azure.core.credentials import AzureKeyCredential
|
7 |
from azure.core.exceptions import HttpResponseError
|
8 |
|
|
|
16 |
self.logger.info("Starting document analysis with Azure Document Intelligence")
|
17 |
|
18 |
# Analyze the entire document at once
|
19 |
+
#poller = self.client.begin_analyze_document("prebuilt-layout", body=pdf_bytes)
|
20 |
+
|
21 |
+
poller = self.client.begin_analyze_document(
|
22 |
+
"prebuilt-layout",
|
23 |
+
body=pdf_bytes,
|
24 |
+
content_type="application/octet-stream",
|
25 |
+
output_content_format=DocumentContentFormat.MARKDOWN
|
26 |
+
)
|
27 |
result = poller.result()
|
28 |
|
29 |
+
# Log the raw result structure
|
30 |
+
self.logger.info("Inspecting Azure DI result structure:")
|
31 |
+
self.logger.info(f"Result type: {type(result)}")
|
32 |
+
self.logger.info(f"Result attributes: {dir(result)}")
|
33 |
+
|
34 |
+
# Check if content exists and log its type
|
35 |
if hasattr(result, "content"):
|
36 |
+
self.logger.info(f"Content type: {type(result.content)}")
|
37 |
+
self.logger.info(f"Content preview: {result.content[:500]}")
|
38 |
|
39 |
+
# Check if tables exist and log their structure
|
|
|
40 |
if hasattr(result, "tables"):
|
41 |
+
self.logger.info(f"Number of tables: {len(result.tables)}")
|
42 |
+
if result.tables:
|
43 |
+
self.logger.info(f"First table structure: {dir(result.tables[0])}")
|
44 |
+
self.logger.info(f"First table cells: {[cell.content for cell in result.tables[0].cells]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
# For now, return only the text content until we understand the table structure
|
47 |
+
return {"text": result.content}
|
|
|
|
|
|
|
48 |
|
49 |
except HttpResponseError as e:
|
50 |
self.logger.error(f"Azure Document Intelligence API error: {str(e)}")
|