audio
app.py
CHANGED

@@ -20,8 +20,8 @@ from state import AgentState
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools
-tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool])
+from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools, audio_transcriber_tool
+tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool, audio_transcriber_tool])
 
 llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
 
@@ -45,14 +45,14 @@ def plan_node(state: AgentState) -> AgentState:
     # 2) Build a fresh SystemMessage explaining exactly one dict key
     system_msg = SystemMessage(
         content=(
-            "You
+            "You are an agent that decides whether to call a tool or answer the user directly. "
+            "The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
+            "If you need to call a tool, set exactly one key from the following in a Python dict: "
             " • web_search_query: <search terms>\n"
             " • ocr_path: <path to an image file>\n"
-            " • excel_path: <path to a .xlsx file
-            " •
-            "
-            "Example: {'web_search_query':'Mercedes Sosa discography'}\n"
-            "Respond with only that Python dict literal—no extra text or explanation."
+            " • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
+            " • audio_path: <path to an audio file>\n"
+            "Do not include any extra text or markdown—only return a valid Python dict literal."
         )
     )
     human_msg = HumanMessage(content=user_input)
@@ -73,6 +73,7 @@ def plan_node(state: AgentState) -> AgentState:
         "ocr_path",
         "excel_path",
         "excel_sheet_name",
+        "audio_path",
         "final_answer"
     }
     for k, v in parsed.items():
@@ -110,7 +111,11 @@ def finalize_node(state: AgentState) -> AgentState:
         combined += f"OCR_RESULT: {orc}\n"
     if exr := state.get("excel_result"):
         combined += f"EXCEL_RESULT: {exr}\n"
-    combined += "Based on the above, provide ONLY the final answer. Do not include any explanation or extra text."
+    # Check for both possible transcript keys
+    audio_transcript = state.get("audio_transcript") or state.get("transcript")
+    if audio_transcript:
+        combined += f"AUDIO_TRANSCRIPT: {audio_transcript}\n"
+    combined += "Based on the above, provide ONLY the final answer. Do not include any explanation or extra text."
 
     llm_response = llm([SystemMessage(content=combined)])
     return {"final_answer": llm_response.content.strip()}
@@ -178,11 +183,12 @@ def respond_to_input(user_input: str) -> str:
     system_msg = SystemMessage(
         content=(
             "You are an agent that decides whether to call a tool or answer the user directly. "
-            "The user
+            "The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
             "If you need to call a tool, set exactly one key from the following in a Python dict: "
             " • web_search_query: <search terms>\n"
             " • ocr_path: <path to an image file>\n"
             " • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
+            " • audio_path: <path to an audio file>\n"
             "Do not include any extra text or markdown—only return a valid Python dict literal."
         )
     )
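For reference, plan_node expects the model to reply with nothing but a Python dict literal, e.g. {'audio_path': 'recording.mp3'} or {'final_answer': '42'}. The parsing code sits outside these hunks; the sketch below (names are illustrative, not from the commit) shows how such a reply could be validated against the allowed-key set from the hunk around line 73:

import ast

ALLOWED_KEYS = {
    "web_search_query", "ocr_path", "excel_path",
    "excel_sheet_name", "audio_path", "final_answer",
}

def parse_plan(reply: str) -> dict:
    # Accept only a dict literal and keep only recognised keys.
    try:
        parsed = ast.literal_eval(reply.strip())
    except (ValueError, SyntaxError):
        return {}
    if not isinstance(parsed, dict):
        return {}
    return {k: v for k, v in parsed.items() if k in ALLOWED_KEYS}

# parse_plan("{'audio_path': 'recording.mp3'}") -> {'audio_path': 'recording.mp3'}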
requirements.txt
CHANGED

@@ -8,3 +8,5 @@ openai
 pandas
 langchain_openai
 langchain_community
+pydub
+openai-whisper  # provides the "whisper" module imported in tools.py; the PyPI package named "whisper" is an unrelated project
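pip alone does not make the audio stack work: pydub and Whisper both shell out to the ffmpeg binary (on a Hugging Face Space that means adding ffmpeg to packages.txt), and openai-whisper pulls in torch. A minimal startup check, offered as a sketch rather than part of the commit:

import shutil

if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH; audio transcription will fail")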
state.py
CHANGED

@@ -12,4 +12,7 @@ class AgentState(TypedDict, total=False):
     ocr_result: str
     excel_result: str
     final_answer: str
-    user_input: str
+    user_input: str
+    audio_path: str
+    transcript: str
+    audio_transcript: str
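Read together with finalize_node, the three new keys split roles: audio_path is the request, transcript is what audio_transcriber_tool writes back, and audio_transcript is an alternative spelling that finalize_node also accepts. Assembled, and showing only the fields visible in this diff (the real class presumably defines more above line 12), the state looks like:

from typing import TypedDict

class AgentState(TypedDict, total=False):
    ocr_result: str
    excel_result: str
    final_answer: str
    user_input: str
    audio_path: str        # request: path of the audio file to transcribe
    transcript: str        # written by audio_transcriber_tool
    audio_transcript: str  # alias also checked by finalize_node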
tools.py
CHANGED

@@ -79,4 +79,62 @@ def run_tools(state: AgentState, tool_out: AgentState) -> AgentState:
     This node should be wired as its own graph node, not as a transition function.
     """
     new_state = {**state, **tool_out}
-    return new_state
+    return new_state
+
+import whisper
+import os
+from pydub import AudioSegment
+from pydub.utils import make_chunks
+
+_whisper_model = whisper.load_model("base")
+
+
+def audio_transcriber_tool(state: AgentState) -> AgentState:
+    """
+    LangGraph tool for transcribing audio via Whisper.
+    Expects: state["audio_path"] to be a path to a .wav/.mp3/.m4a file.
+    Returns:
+        {
+            "audio_path": None,
+            "transcript": "<full transcribed text>"
+        }
+    If no valid audio_path is found, returns {} to signal "no-op."
+    """
+    path = state.get("audio_path", "")
+    if not path or not os.path.exists(path):
+        return {}
+
+    try:
+        # Whisper API has a ~25 MB limit per request. If file is small, transcribe directly.
+        max_bytes = 25 * 1024 * 1024
+        if os.path.getsize(path) <= max_bytes:
+            result = _whisper_model.transcribe(path)
+            text = result["text"].strip()
+        else:
+            # For large files, split into 2-minute (120 s) chunks
+            audio = AudioSegment.from_file(path)
+            chunk_length_ms = 120 * 1000
+            chunks = make_chunks(audio, chunk_length_ms)
+
+            transcripts = []
+            for i, chunk in enumerate(chunks):
+                chunk_name = f"temp_chunk_{i}.wav"
+                chunk.export(chunk_name, format="wav")
+                res = _whisper_model.transcribe(chunk_name)
+                transcripts.append(res["text"].strip())
+                os.remove(chunk_name)
+            text = "\n".join(transcripts)
+
+    except Exception as e:
+        text = f"Error during transcription: {e}"
+
+    return {
+        "audio_path": None,
+        "transcript": text
+    }
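Note that the tool returns "audio_path": None, so once run_tools merges the result back into the state the same file will not be transcribed again on a later pass. A quick way to exercise the tool outside the graph (the file name is illustrative):

from tools import audio_transcriber_tool

state = {"user_input": "What is said in the recording?", "audio_path": "recording.mp3"}
out = audio_transcriber_tool(state)

if out:
    print(out["transcript"])   # full transcription, chunked if the file was large
else:
    print("no usable audio_path; the tool returned {} as a no-op")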