Commit 8e63348
1 Parent(s): 85a86de
process_attachment tool

Files changed:
- app.py +97 -5
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,6 +1,9 @@
 import os
 import gradio as gr
 import requests
+import tempfile
+import mimetypes
+import base64
 import pandas as pd
 import datetime
 from langchain.tools import tool
@@ -264,12 +267,89 @@ def python_executor(code: str) -> str:
         return str(result)
     except Exception as e:
         return f"error: {e}"
+
+# --- TOOL 15: Attachment Processing Tool ---
+@tool
+def process_attachment(file_bytes: bytes, filename: str) -> str:
+    """
+    Processes an input attachment (audio, image, or video) and returns extracted text or a summary suitable for LLM input.
+    - For audio: transcribes to text using Whisper.
+    - For image: encodes as base64 and returns a prompt for LLMs that support image input.
+    - For video: extracts audio, transcribes, and returns the transcript.
+    - For unsupported types: returns an error message.
+    """
+    # Detect file type
+    mime_type, _ = mimetypes.guess_type(filename)
+    if not mime_type:
+        return "error: Could not determine file type. Skip the file"
+
+    # Handle audio files
+    if mime_type.startswith("audio"):
+        api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
+        headers = {"Authorization": f"Bearer {HF_ACCESS_KEY}"}
+        files = {"file": (filename, file_bytes)}
+        try:
+            resp = requests.post(api_url, headers=headers, files=files, timeout=60)
+            resp.raise_for_status()
+            data = resp.json()
+            transcript = data.get("text", "")
+            if transcript:
+                return f"Transcript of the audio: {transcript}"
+            else:
+                return "error: No transcript returned."
+        except Exception as e:
+            return f"error: {e}"
+
+    # Handle image files
+    elif mime_type.startswith("image"):
+        image_b64 = base64.b64encode(file_bytes).decode()
+        return f"Attached image (base64): {image_b64}"
+
+    # Handle video files (extract audio, then transcribe)
+    elif mime_type.startswith("video"):
+        try:
+            # Save video to temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=filename.split('.')[-1]) as tmp_video:
+                tmp_video.write(file_bytes)
+                tmp_video.flush()
+                video_path = tmp_video.name
+
+            # Extract audio using ffmpeg (requires ffmpeg installed)
+            audio_path = video_path + ".wav"
+            import subprocess
+            subprocess.run([
+                "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_path
+            ], check=True)
+
+            # Read audio bytes
+            with open(audio_path, "rb") as f:
+                audio_bytes = f.read()
+
+            # Transcribe audio
+            api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
+            headers = {"Authorization": f"Bearer {HF_ACCESS_KEY}"}
+            files = {"file": ("audio.wav", audio_bytes)}
+            resp = requests.post(api_url, headers=headers, files=files, timeout=120)
+            resp.raise_for_status()
+            data = resp.json()
+            transcript = data.get("text", "")
+            if transcript:
+                return f"Transcript of the video audio: {transcript}"
+            else:
+                return "error: No transcript returned from video audio."
+        except Exception as e:
+            return f"error: {e}"
+
+    else:
+        return "error: Unsupported file type. Please skip the file usage."
 
+
 ##-- Tool Discovery ---
 # Use @tool for each function.
 # Use get_all_tools() to auto-discover all decorated tools.
 # tools_list = get_all_tools()
 tools_list = [
+    process_attachment,
     search_tool,
     get_weather,
     calculator,
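For a quick local check of the new tool outside the agent loop, a minimal sketch follows. It is not part of the commit: "question_audio.mp3" is a hypothetical file, and the .invoke({...}) call assumes a recent LangChain release where @tool produces a StructuredTool that supports the Runnable interface.

# Minimal local check of process_attachment (a sketch, not part of the commit).
# Assumes a recent LangChain where @tool returns a StructuredTool exposing .invoke();
# "question_audio.mp3" is a hypothetical file used only for illustration.
with open("question_audio.mp3", "rb") as f:
    file_bytes = f.read()

result = process_attachment.invoke(
    {"file_bytes": file_bytes, "filename": "question_audio.mp3"}
)
print(result)  # expected form: "Transcript of the audio: ..."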
@@ -300,11 +380,22 @@ You have access to a set of tools that you can use to answer the question:
 
 {tool_descriptions}
 
+If there is a file (image, audio, or video) attached to the question, you should use the process_attachment tool to process it.
+For audio or video attachments, the process_attachment tool will transcribe the audio and return the transcript, which you can use to answer the question.
+For image attachments, the process_attachment tool will return a base64 encoded string of the image. You can use this encoded information to provide answer.
+
 You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
-
-
-If
+
+Example of a valid answer:
+If your response to a question is "The capital of France is Paris", you should return "Paris" as your final answer.
+If your response to a question is "The population of France is 67 million", you should return "67" as your final answer.
+If your response to a question is "4 studio albums were published by Mercedes Sosa between 2000 and 2009", you should return "4" as your final answer.
+
+Further instructions:
+- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
+- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
+- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
 
 # system_prompt = f"""
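The prompt above interpolates a tool_descriptions placeholder that is built elsewhere in app.py and not shown in this diff. One plausible way it could be assembled from tools_list (an assumption, for illustration only):

# Assumption: tool_descriptions is rendered from the registered tools,
# one "name: description" line per tool; the actual builder is outside this diff.
tool_descriptions = "\n".join(
    f"{t.name}: {t.description}" for t in tools_list
)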
@@ -373,8 +464,9 @@ agent = initialize_agent(
     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
     agent_kwargs={"system_message": system_prompt},
     verbose=True,
-    max_iterations=
-    max_execution_time=
+    max_iterations=20, # Increase as needed
+    max_execution_time=4000, # Increase as needed
+    early_stopping_method="generate",
     handle_parsing_errors=True
 )
 
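Read together with the surrounding context, the updated agent construction would look roughly like the sketch below; llm and the initialize_agent import are defined earlier in app.py and are assumed here.

from langchain.agents import initialize_agent, AgentType

# Sketch of the full call after this change; llm is created elsewhere in app.py.
agent = initialize_agent(
    tools=tools_list,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    agent_kwargs={"system_message": system_prompt},
    verbose=True,
    max_iterations=20,
    max_execution_time=4000,
    early_stopping_method="generate",
    handle_parsing_errors=True,
)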
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ langchain-huggingface
 langchain-community
 transformers
 langchain-openai
-beautifulsoup4
+beautifulsoup4
+mimetype
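requirements.txt only covers Python packages; the video branch of process_attachment also shells out to the ffmpeg binary, which pip does not install. A small runtime sanity check (a sketch, not part of the commit):

import shutil

# The video path in process_attachment calls ffmpeg via subprocess;
# warn early if the binary is missing from the runtime image.
if shutil.which("ffmpeg") is None:
    print("warning: ffmpeg not found on PATH; video attachments cannot be transcribed")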