levalencia committed on
Commit
54478a0
·
1 Parent(s): 966ffcd

Refactor AzureDIService to enhance document analysis logging and update table extraction logic. Temporarily disable table extraction and improve content type logging. Update TableAgent to reflect changes in context handling.

Browse files
src/agents/__pycache__/field_mapper_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/field_mapper_agent.cpython-312.pyc and b/src/agents/__pycache__/field_mapper_agent.cpython-312.pyc differ
 
src/agents/__pycache__/table_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/table_agent.cpython-312.pyc and b/src/agents/__pycache__/table_agent.cpython-312.pyc differ
 
src/agents/table_agent.py CHANGED
@@ -35,9 +35,9 @@ class TableAgent(BaseAgent):
35
 
36
  # Store both text and tables in context
37
  ctx["text"] = result["text"]
38
- ctx["tables"] = result["tables"]
39
 
40
- self.logger.info(f"Extracted {len(result['text'])} characters of text and {len(result['tables'])} tables")
41
  return result
42
 
43
  except Exception as e:
 
35
 
36
  # Store both text and tables in context
37
  ctx["text"] = result["text"]
38
+ #ctx["tables"] = result["tables"]
39
 
40
+ self.logger.info(f"Extracted {len(result['text'])} characters of text including tables")
41
  return result
42
 
43
  except Exception as e:
src/services/__pycache__/azure_di_service.cpython-312.pyc CHANGED
Binary files a/src/services/__pycache__/azure_di_service.cpython-312.pyc and b/src/services/__pycache__/azure_di_service.cpython-312.pyc differ
 
src/services/azure_di_service.py CHANGED
@@ -1,6 +1,8 @@
1
  """Real wrapper goes here – currently not used by stub agents."""
2
  import logging
 
3
  from azure.ai.documentintelligence import DocumentIntelligenceClient
 
4
  from azure.core.credentials import AzureKeyCredential
5
  from azure.core.exceptions import HttpResponseError
6
 
@@ -14,38 +16,35 @@ class AzureDIService:
14
  self.logger.info("Starting document analysis with Azure Document Intelligence")
15
 
16
  # Analyze the entire document at once
17
- poller = self.client.begin_analyze_document("prebuilt-layout", body=pdf_bytes)
 
 
 
 
 
 
 
18
  result = poller.result()
19
 
20
- # Extract text content
21
- text_content = ""
 
 
 
 
22
  if hasattr(result, "content"):
23
- text_content = result.content
24
- self.logger.info(f"Extracted {len(text_content)} characters of text")
25
 
26
- # Extract tables as HTML strings
27
- tables = []
28
  if hasattr(result, "tables"):
29
- self.logger.info(f"Found {len(result.tables)} tables in the document")
30
- for table in result.tables:
31
- # Simple HTML rendering for demo; you can improve this
32
- html = "<table>"
33
- for row_idx in range(table.row_count):
34
- html += "<tr>"
35
- for col_idx in range(table.column_count):
36
- cell = next((c for c in table.cells if c.row_index == row_idx and c.column_index == col_idx), None)
37
- html += f"<td>{cell.content if cell else ''}</td>"
38
- html += "</tr>"
39
- html += "</table>"
40
- tables.append(html)
41
- else:
42
- self.logger.warning("No tables found in the document")
43
 
44
- # Return both text and tables
45
- return {
46
- "text": text_content,
47
- "tables": tables
48
- }
49
 
50
  except HttpResponseError as e:
51
  self.logger.error(f"Azure Document Intelligence API error: {str(e)}")
 
1
  """Real wrapper goes here – currently not used by stub agents."""
2
  import logging
3
+ import json
4
  from azure.ai.documentintelligence import DocumentIntelligenceClient
5
+ from azure.ai.documentintelligence.models import DocumentContentFormat
6
  from azure.core.credentials import AzureKeyCredential
7
  from azure.core.exceptions import HttpResponseError
8
 
 
16
  self.logger.info("Starting document analysis with Azure Document Intelligence")
17
 
18
  # Analyze the entire document at once
19
+ #poller = self.client.begin_analyze_document("prebuilt-layout", body=pdf_bytes)
20
+
21
+ poller = self.client.begin_analyze_document(
22
+ "prebuilt-layout",
23
+ body=pdf_bytes,
24
+ content_type="application/octet-stream",
25
+ output_content_format=DocumentContentFormat.MARKDOWN
26
+ )
27
  result = poller.result()
28
 
29
+ # Log the raw result structure
30
+ self.logger.info("Inspecting Azure DI result structure:")
31
+ self.logger.info(f"Result type: {type(result)}")
32
+ self.logger.info(f"Result attributes: {dir(result)}")
33
+
34
+ # Check if content exists and log its type
35
  if hasattr(result, "content"):
36
+ self.logger.info(f"Content type: {type(result.content)}")
37
+ self.logger.info(f"Content preview: {result.content[:500]}")
38
 
39
+ # Check if tables exist and log their structure
 
40
  if hasattr(result, "tables"):
41
+ self.logger.info(f"Number of tables: {len(result.tables)}")
42
+ if result.tables:
43
+ self.logger.info(f"First table structure: {dir(result.tables[0])}")
44
+ self.logger.info(f"First table cells: {[cell.content for cell in result.tables[0].cells]}")
 
 
 
 
 
 
 
 
 
 
45
 
46
+ # For now, return empty result until we understand the structure
47
+ return {"text": result.content}
 
 
 
48
 
49
  except HttpResponseError as e:
50
  self.logger.error(f"Azure Document Intelligence API error: {str(e)}")