naman1102 committed on
Commit 7fb0070 · 1 Parent(s): 8e1dd81
Files changed (4)
  1. app.py +16 -10
  2. requirements.txt +2 -0
  3. state.py +4 -1
  4. tools.py +59 -1
app.py CHANGED
@@ -20,8 +20,8 @@ from state import AgentState
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools
-tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool])
+from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools, audio_transcriber_tool
+tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool, audio_transcriber_tool])
 
 llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
 
@@ -45,14 +45,14 @@ def plan_node(state: AgentState) -> AgentState:
     # 2) Build a fresh SystemMessage explaining exactly one dict key
     system_msg = SystemMessage(
         content=(
-            "You can set exactly one of these keys in a Python dict and nothing else:\n"
+            "You are an agent that decides whether to call a tool or answer the user directly. "
+            "The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
+            "If you need to call a tool, set exactly one key from the following in a Python dict: "
             " • web_search_query: <search terms>\n"
             " • ocr_path: <path to an image file>\n"
-            " • excel_path: <path to a .xlsx file>\n"
-            " • excel_sheet_name: <sheet name>\n"
-            "Or, if no tool is needed, set final_answer: <your answer>.\n"
-            "Example: {'web_search_query':'Mercedes Sosa discography'}\n"
-            "Respond with only that Python dict literal—no extra text or explanation."
+            " • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
+            " • audio_path: <path to an audio file>\n"
+            "Do not include any extra text or markdown—only return a valid Python dict literal."
         )
     )
     human_msg = HumanMessage(content=user_input)
@@ -73,6 +73,7 @@ def plan_node(state: AgentState) -> AgentState:
         "ocr_path",
         "excel_path",
         "excel_sheet_name",
+        "audio_path",
         "final_answer"
     }
     for k, v in parsed.items():
@@ -110,7 +111,11 @@ def finalize_node(state: AgentState) -> AgentState:
         combined += f"OCR_RESULT: {orc}\n"
     if exr := state.get("excel_result"):
         combined += f"EXCEL_RESULT: {exr}\n"
-    combined += "Based on the above, provide the final answer."
+    # Check for both possible transcript keys
+    audio_transcript = state.get("audio_transcript") or state.get("transcript")
+    if audio_transcript:
+        combined += f"AUDIO_TRANSCRIPT: {audio_transcript}\n"
+    combined += "Based on the above, provide ONLY the final answer. Do not include any explanation or extra text."
 
     llm_response = llm([SystemMessage(content=combined)])
     return {"final_answer": llm_response.content.strip()}
@@ -178,11 +183,12 @@ def respond_to_input(user_input: str) -> str:
     system_msg = SystemMessage(
         content=(
             "You are an agent that decides whether to call a tool or answer the user directly. "
-            "The users question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
+            "The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
            "If you need to call a tool, set exactly one key from the following in a Python dict: "
             " • web_search_query: <search terms>\n"
             " • ocr_path: <path to an image file>\n"
             " • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
+            " • audio_path: <path to an audio file>\n"
             "Do not include any extra text or markdown—only return a valid Python dict literal."
         )
     )
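Both prompts ask the model to reply with a bare Python dict literal whose single key is then checked against the whitelist above. A minimal sketch of that parse-and-validate step follows (the helper name parse_plan_reply and the use of ast.literal_eval are assumptions for illustration; the actual parsing code is outside this diff):

import ast

ALLOWED_KEYS = {
    "web_search_query", "ocr_path", "excel_path",
    "excel_sheet_name", "audio_path", "final_answer",
}

def parse_plan_reply(reply: str) -> dict:
    """Turn the LLM's dict-literal reply into a dict, keeping only whitelisted keys."""
    try:
        parsed = ast.literal_eval(reply.strip())
    except (ValueError, SyntaxError):
        return {}
    if not isinstance(parsed, dict):
        return {}
    return {k: v for k, v in parsed.items() if k in ALLOWED_KEYS}

# parse_plan_reply("{'web_search_query': 'Mercedes Sosa discography'}")
# -> {'web_search_query': 'Mercedes Sosa discography'}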
requirements.txt CHANGED
@@ -8,3 +8,5 @@ openai
 pandas
 langchain_openai
 langchain_community
+pydub
+whisper
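The two new packages back the audio code added in tools.py. A small import check (a sketch, not part of the commit) can confirm the environment provides what tools.py expects; note that the whisper.load_model() API used there is normally shipped by the openai-whisper distribution, and pydub needs an ffmpeg binary on PATH for non-WAV input.

import importlib

# Assumption: both packages resolve to the libraries tools.py relies on.
for name in ("whisper", "pydub"):
    module = importlib.import_module(name)
    print(name, "->", getattr(module, "__version__", "imported OK"))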
state.py CHANGED
@@ -12,4 +12,7 @@ class AgentState(TypedDict, total=False):
     ocr_result: str
     excel_result: str
     final_answer: str
-    user_input: str
+    user_input: str
+    audio_path: str
+    transcript: str
+    audio_transcript: str
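For reference, the state keys this commit touches fit together as sketched below (partial: any fields defined above line 12 of state.py are not visible in the diff and are omitted here).

from typing import TypedDict

class AgentState(TypedDict, total=False):
    ocr_result: str
    excel_result: str
    final_answer: str
    user_input: str
    audio_path: str        # consumed (and cleared) by audio_transcriber_tool
    transcript: str        # written by audio_transcriber_tool
    audio_transcript: str  # alternate key checked in finalize_node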
tools.py CHANGED
@@ -79,4 +79,62 @@ def run_tools(state: AgentState, tool_out: AgentState) -> AgentState:
     This node should be wired as its own graph node, not as a transition function.
     """
     new_state = {**state, **tool_out}
-    return new_state
+    return new_state
+
+import whisper
+import os
+from pydub import AudioSegment
+from pydub.utils import make_chunks
+
+_whisper_model = whisper.load_model("base")
+
+
+def audio_transcriber_tool(state: AgentState) -> AgentState:
+    """
+    LangGraph tool for transcribing audio via Whisper.
+    Expects: state["audio_path"] to be a path to a .wav/.mp3/.m4a file.
+    Returns:
+        {
+            "audio_path": None,
+            "transcript": "<full transcribed text>"
+        }
+    If no valid audio_path is found, returns {} to signal "no-op."
+    """
+    path = state.get("audio_path", "")
+    if not path or not os.path.exists(path):
+        return {}
+
+    try:
+        # Whisper API has a ~25 MB limit per request. If file is small, transcribe directly.
+        max_bytes = 25 * 1024 * 1024
+        if os.path.getsize(path) <= max_bytes:
+            result = _whisper_model.transcribe(path)
+            text = result["text"].strip()
+        else:
+            # For large files, split into 2-minute (120 s) chunks
+            audio = AudioSegment.from_file(path)
+            chunk_length_ms = 120 * 1000
+            chunks = make_chunks(audio, chunk_length_ms)
+
+            transcripts = []
+            for i, chunk in enumerate(chunks):
+                chunk_name = f"temp_chunk_{i}.wav"
+                chunk.export(chunk_name, format="wav")
+                res = _whisper_model.transcribe(chunk_name)
+                transcripts.append(res["text"].strip())
+                os.remove(chunk_name)
+            text = "\n".join(transcripts)
+
+    except Exception as e:
+        text = f"Error during transcription: {e}"
+
+    return {
+        "audio_path": None,
+        "transcript": text
+    }
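A quick way to exercise the new tool outside the graph is to call it with a minimal state dict. A sketch (the audio path here is hypothetical, and the Whisper "base" model is downloaded the first time tools.py is imported):

from tools import audio_transcriber_tool

state = {"user_input": "What is said in the recording?", "audio_path": "sample.mp3"}
update = audio_transcriber_tool(state)

if update:
    print(update["transcript"])   # full transcribed text
else:
    print("No valid audio_path on the state; the tool was a no-op.")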