Bhanu-Chander-ABB committed on
Commit
8e63348
·
1 Parent(s): 85a86de

process_attachment tool

Browse files
Files changed (2) hide show
  1. app.py +97 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import os
2
  import gradio as gr
3
  import requests
 
 
 
4
  import pandas as pd
5
  import datetime
6
  from langchain.tools import tool
@@ -264,12 +267,89 @@ def python_executor(code: str) -> str:
264
  return str(result)
265
  except Exception as e:
266
  return f"error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
 
268
  ##-- Tool Discovery ---
269
  # Use @tool for each function.
270
  # Use get_all_tools() to auto-discover all decorated tools.
271
  # tools_list = get_all_tools()
272
  tools_list = [
 
273
  search_tool,
274
  get_weather,
275
  calculator,
@@ -300,11 +380,22 @@ You have access to a set of tools that you can use to answer the question:
300
 
301
  {tool_descriptions}
302
 
 
 
 
 
303
  You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
304
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
305
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
306
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
307
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 
 
 
 
 
 
 
308
  """
309
 
310
  # system_prompt = f"""
@@ -373,8 +464,9 @@ agent = initialize_agent(
373
  agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
374
  agent_kwargs={"system_message": system_prompt},
375
  verbose=True,
376
- max_iterations=10, # Increase as needed
377
- max_execution_time=3000, # Increase as needed
 
378
  handle_parsing_errors=True
379
  )
380
 
 
1
  import os
2
  import gradio as gr
3
  import requests
4
+ import tempfile
5
+ import mimetypes
6
+ import base64
7
  import pandas as pd
8
  import datetime
9
  from langchain.tools import tool
 
267
  return str(result)
268
  except Exception as e:
269
  return f"error: {e}"
270
+
271
+ # --- TOOL 15: Attachment Processing Tool ---
272
@tool
def process_attachment(file_bytes: bytes, filename: str) -> str:
    """
    Processes an input attachment (audio, image, or video) and returns extracted text or a summary suitable for LLM input.
    - For audio: transcribes to text using Whisper.
    - For image: encodes as base64 and returns a prompt for LLMs that support image input.
    - For video: extracts audio, transcribes, and returns the transcript.
    - For unsupported types: returns an error message.
    """
    # Maintainer notes (kept out of the docstring on purpose: the @tool
    # decorator exposes the docstring to the LLM as the tool description, so
    # its wording is runtime text and is left unchanged).
    #
    # Parameters:
    #   file_bytes: raw content of the attachment.
    #   filename:   original file name; only its extension is used, to guess
    #               the MIME type and to name the temporary video file.
    # Returns:
    #   A human-readable string. Failures are never raised to the caller;
    #   they are reported as a string starting with "error: ".
    #
    # NOTE(review): this tool takes two arguments, one of them `bytes`.
    # Confirm the agent type in use can actually call multi-input tools and
    # supply binary content.

    # Detect file type from the extension only (content is not sniffed).
    mime_type, _ = mimetypes.guess_type(filename)
    if not mime_type:
        return "error: Could not determine file type. Skip the file"

    # Handle audio files: send the bytes straight to the transcription API.
    if mime_type.startswith("audio"):
        try:
            transcript = _transcribe_audio(file_bytes, mime_type, timeout=60)
        except Exception as e:
            # Tool boundary: the agent expects a string, not an exception.
            return f"error: {e}"
        if transcript:
            return f"Transcript of the audio: {transcript}"
        return "error: No transcript returned."

    # Handle image files: hand the raw image back as base64 text.
    if mime_type.startswith("image"):
        image_b64 = base64.b64encode(file_bytes).decode()
        return f"Attached image (base64): {image_b64}"

    # Handle video files: extract the audio track, then transcribe it.
    if mime_type.startswith("video"):
        try:
            audio_bytes = _extract_audio_wav(file_bytes, filename)
            transcript = _transcribe_audio(audio_bytes, "audio/wav", timeout=120)
        except Exception as e:
            # Covers ffmpeg failures (missing binary, non-zero exit), file
            # errors, and HTTP/JSON errors from the transcription call.
            return f"error: {e}"
        if transcript:
            return f"Transcript of the video audio: {transcript}"
        return "error: No transcript returned from video audio."

    return "error: Unsupported file type. Please skip the file usage."


def _transcribe_audio(audio_bytes: bytes, content_type: str, timeout: int) -> str:
    """Send audio to the hosted Whisper model and return the transcript text.

    Returns an empty string when the response has no "text" field. Raises on
    HTTP errors, timeouts, or an unexpected response shape; the caller turns
    those into "error: ..." strings.
    """
    api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
    headers = {
        "Authorization": f"Bearer {HF_ACCESS_KEY}",
        "Content-Type": content_type,
    }
    # The audio is sent as the raw request body rather than as a multipart
    # form upload: the Inference API documents a raw-bytes payload for
    # speech recognition.
    resp = requests.post(api_url, headers=headers, data=audio_bytes, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("text", "")


def _extract_audio_wav(video_bytes: bytes, filename: str) -> bytes:
    """Extract the audio track of a video as 16 kHz mono 16-bit PCM WAV bytes.

    Requires the `ffmpeg` executable on PATH. Raises
    `subprocess.CalledProcessError` if ffmpeg exits non-zero and
    `FileNotFoundError` if ffmpeg is not installed.
    """
    import subprocess

    # splitext keeps the leading dot and yields "" when there is no
    # extension, so the temp file gets a real extension (or none at all).
    suffix = os.path.splitext(filename)[1]

    # Both intermediate files live in one temporary directory, which is
    # removed on exit even if ffmpeg or the read fails, so nothing leaks.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "input" + suffix)
        audio_path = os.path.join(workdir, "audio.wav")

        with open(video_path, "wb") as video_file:
            video_file.write(video_bytes)

        subprocess.run(
            [
                "ffmpeg",
                "-nostdin",  # never wait on interactive input
                "-i", video_path,
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                audio_path,
            ],
            check=True,
        )

        with open(audio_path, "rb") as audio_file:
            return audio_file.read()
345
 
346
+
347
  ##-- Tool Discovery ---
348
  # Use @tool for each function.
349
  # Use get_all_tools() to auto-discover all decorated tools.
350
  # tools_list = get_all_tools()
351
  tools_list = [
352
+ process_attachment,
353
  search_tool,
354
  get_weather,
355
  calculator,
 
380
 
381
  {tool_descriptions}
382
 
383
+ If there is a file (image, audio, or video) attached to the question, you should use the process_attachment tool to process it.
384
+ For audio or video attachments, the process_attachment tool will transcribe the audio and return the transcript, which you can use to answer the question.
385
+ For image attachments, the process_attachment tool will return a base64 encoded string of the image. You can use this encoded information to provide answer.
386
+
387
  You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
388
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
389
+
390
+ Example of a valid answer:
391
+ If your response to a question is "The capital of France is Paris", you should return "Paris" as your final answer.
392
+ If your response to a question is "The population of France is 67 million", you should return "67" as your final answer.
393
+ If your response to a question is "4 studio albums were published by Mercedes Sosa between 2000 and 2009", you should return "4" as your final answer.
394
+
395
+ Further instructions:
396
+ - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
397
+ - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
398
+ - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
399
  """
400
 
401
  # system_prompt = f"""
 
464
  agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
465
  agent_kwargs={"system_message": system_prompt},
466
  verbose=True,
467
+ max_iterations=20, # Increase as needed
468
+ max_execution_time=4000, # Increase as needed
469
+ early_stopping_method="generate",
470
  handle_parsing_errors=True
471
  )
472
 
requirements.txt CHANGED
@@ -8,4 +8,5 @@ langchain-huggingface
8
  langchain-community
9
  transformers
10
  langchain-openai
11
- beautifulsoup4
 
 
8
  langchain-community
9
  transformers
10
  langchain-openai
11
+ beautifulsoup4
12
+ mimetype