naman1102 committed on
Commit
133d76b
·
1 Parent(s): cfc12ce
Files changed (2) hide show
  1. apt.txt +0 -2
  2. tools.py +84 -22
apt.txt DELETED
@@ -1,2 +0,0 @@
1
- tesseract-ocr
2
- libtesseract-dev
 
 
 
tools.py CHANGED
@@ -88,47 +88,109 @@ def web_search_tool(state: AgentState) -> AgentState:
88
  }
89
 
90
 
 
91
  def ocr_image_tool(state: AgentState) -> AgentState:
92
  """
93
- Expects state["ocr_path"] to be either:
94
- A real local image path (e.g. "./hf_files/abc.png"), or
95
- A Task ID string like "abc123", in which case we GET /files/abc123.
 
 
96
  Returns:
97
- { "ocr_path": None, "ocr_result": "<OCRed text or error string>" }
98
- Always attempts to download the file for the given path or task ID.
 
 
99
  """
100
  print("reached ocr_image_tool")
101
- # path_or_id = state.get("ocr_path", "")
102
- # if not path_or_id:
103
- # return {}
104
 
105
- # Always attempt to download the file, regardless of local existence
106
  local_img = ""
107
- for ext in ("png", "jpg", "jpeg"):
108
- candidate = _download_file_for_task(state.get("task_id"), ext)
109
- if candidate:
110
- local_img = candidate
111
- break
 
 
 
112
 
113
  if not local_img or not os.path.exists(local_img):
114
  return {
115
  "ocr_path": None,
116
- "ocr_result": "Error: No image file found (download failed)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
 
119
- # Run OCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  try:
121
- img = Image.open(local_img)
122
- text = pytesseract.image_to_string(img).strip() or "(no visible text)"
 
 
 
 
 
 
 
 
 
 
123
  except Exception as e:
124
- text = f"Error during OCR: {e}"
125
- print(f"OCRed as ocr_result: {text}")
 
 
 
126
  return {
127
  "ocr_path": None,
128
- "ocr_result": text
129
  }
130
 
131
-
132
  def parse_excel_tool(state: AgentState) -> AgentState:
133
  """
134
  Expects state["excel_path"] to be either:
 
88
  }
89
 
90
 
91
+
92
  def ocr_image_tool(state: AgentState) -> AgentState:
93
  """
94
+ Expects: state["ocr_path"] is either:
95
+ a local image path (e.g. "./hf_files/abc.png"), OR
96
+ a Task ID (e.g. "abc123"), in which case we try downloading
97
+ GET {DEFAULT_API_URL}/files/{task_id} with .png/.jpg/.jpeg extensions.
98
+
99
  Returns:
100
+ {
101
+ "ocr_path": None,
102
+ "ocr_result": "<OCR text + brief caption or an error message>"
103
+ }
104
  """
105
  print("reached ocr_image_tool")
106
+ path_or_id = state.get("ocr_path", "")
107
+ if not path_or_id:
108
+ return {}
109
 
110
+ # 1) Determine local_img: either existing path_or_id or download by Task ID
111
  local_img = ""
112
+ if os.path.exists(path_or_id):
113
+ local_img = path_or_id
114
+ else:
115
+ for ext in ("png", "jpg", "jpeg"):
116
+ candidate = _download_file_for_task(path_or_id, ext)
117
+ if candidate:
118
+ local_img = candidate
119
+ break
120
 
121
  if not local_img or not os.path.exists(local_img):
122
  return {
123
  "ocr_path": None,
124
+ "ocr_result": "Error: No image file found (local nonexistent or download failed)."
125
+ }
126
+
127
+ # 2) Read raw bytes
128
+ try:
129
+ with open(local_img, "rb") as f:
130
+ image_bytes = f.read()
131
+ except Exception as e:
132
+ return {
133
+ "ocr_path": None,
134
+ "ocr_result": f"Error reading image file: {e}"
135
+ }
136
+
137
+ # 3) Prepare HF Inference headers
138
+ hf_token = os.getenv("HF_TOKEN")
139
+ if not hf_token:
140
+ return {
141
+ "ocr_path": None,
142
+ "ocr_result": "Error: HUGGINGFACE_API_KEY not set in environment."
143
  }
144
 
145
+ headers = {"Authorization": f"Bearer {hf_token}"}
146
+
147
+ # 4) Call HF’s vision-ocr to extract text
148
+ ocr_text = ""
149
+ try:
150
+ ocr_resp = requests.post(
151
+ "https://api-inference.huggingface.co/models/google/vit-ocr",
152
+ headers=headers,
153
+ files={"file": image_bytes},
154
+ timeout=30
155
+ )
156
+ ocr_resp.raise_for_status()
157
+ ocr_json = ocr_resp.json()
158
+
159
+ # The JSON has “pages” → list of blocks → “lines” → each line has “text”
160
+ lines = []
161
+ for page in ocr_json.get("pages", []):
162
+ for line in page.get("lines", []):
163
+ lines.append(line.get("text", "").strip())
164
+ ocr_text = "\n".join(lines).strip() or "(no visible text)"
165
+ except Exception as e:
166
+ ocr_text = f"Error during HF OCR: {e}"
167
+
168
+ # 5) Call HF’s image-captioning to get a brief description
169
+ caption = ""
170
  try:
171
+ cap_resp = requests.post(
172
+ "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base",
173
+ headers=headers,
174
+ files={"file": image_bytes},
175
+ timeout=30
176
+ )
177
+ cap_resp.raise_for_status()
178
+ cap_json = cap_resp.json()
179
+ # The response looks like: {"generated_text": "...caption..."}
180
+ caption = cap_json.get("generated_text", "").strip()
181
+ if not caption:
182
+ caption = "(no caption returned)"
183
  except Exception as e:
184
+ caption = f"Error during HF captioning: {e}"
185
+
186
+ # 6) Combine OCR + caption
187
+ combined = f"OCR text:\n{ocr_text}\n\nImage caption:\n{caption}"
188
+
189
  return {
190
  "ocr_path": None,
191
+ "ocr_result": combined
192
  }
193
 
 
194
  def parse_excel_tool(state: AgentState) -> AgentState:
195
  """
196
  Expects state["excel_path"] to be either: