Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- ImageForDoctor.py +54 -0
- README.md +2 -14
- VoiceOfDoctor.py +49 -0
- VoiceOfPatient.py +75 -0
- app.py +48 -0
- requirements.txt +7 -0
ImageForDoctor.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64  # used to turn raw image bytes into a base64 string for the API
import os

from dotenv import load_dotenv
from groq import Groq

# Step 1: load the Groq API key from the .env file and build the client.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)

# Step 2: vision model name kept for reference.
# NOTE(review): AnalyzeImagewithQuery below hardcodes a different model id —
# confirm which one is intended.
model = "llama-3.2-90b-vision-preview"
|
17 |
+
def encodeimage(image_path):
    """Return the file at *image_path* as a base64-encoded UTF-8 string.

    Raises:
        FileNotFoundError: if *image_path* does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")
    with open(image_path, "rb") as img:
        raw = img.read()
    return base64.b64encode(raw).decode("utf-8")
|
22 |
+
|
23 |
+
|
24 |
+
#step3 Setup the grof for vision
|
25 |
+
def AnalyzeImagewithQuery(query, encode_imgae):
    """Send *query* plus a base64-encoded JPEG to the Groq vision model and
    return the model's text reply.

    NOTE(review): the parameter name ``encode_imgae`` (sic) is kept as-is
    because app.py passes it by keyword.
    """
    user_content = [
        {"type": "text", "text": query},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_imgae}"},
        },
    ]
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": user_content}],
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        temperature=0.7,
    )
    return chat_completion.choices[0].message.content
|
50 |
+
|
51 |
+
if __name__ == "__main__":
    # Quick manual smoke test: encode a sample image and query the model.
    query = "What happen with my face can you analyze that?"
    # Bug fix: encodeimage() was called with no argument, which raises
    # TypeError — it requires a path to the image file.
    e_image = encodeimage("acne.jpg")
    print(AnalyzeImagewithQuery(encode_imgae=e_image, query=query))
|
README.md
CHANGED
@@ -1,14 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
emoji: 🐢
|
4 |
-
colorFrom: blue
|
5 |
-
colorTo: purple
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.25.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
short_description: ChatWithDoctorAny is a smart AI-powered assistant designed t
|
12 |
-
---
|
13 |
-
|
14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# ChatWithDoctorAny
|
2 |
+
ChatWithDoctorAny is a smart AI-powered assistant designed to simulate conversations with a virtual doctor. Whether you have a general health concern or just want instant advice, this tool gives you a safe space to ask health-related questions, anytime and anywhere.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
VoiceOfDoctor.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import warnings

import pygame
from dotenv import load_dotenv  # Optional, only if you need environment variables
from gtts import gTTS
from pydub import AudioSegment  # NOTE(review): imported but unused in this module

warnings.filterwarnings("ignore")

# Optional, only if you have any environment variables to load
load_dotenv()
|
11 |
+
|
12 |
+
def text_to_speech_with_gtts(text, mp3_output_path):
    """
    Convert *text* to speech with gTTS, save it as MP3, and play it back.

    Args:
        text (str): The text that will be converted to speech.
        mp3_output_path (str): The path where the MP3 file will be saved.

    Returns:
        str | None: *mp3_output_path* on success, None if an error occurred.
        (Fix: the function previously returned nothing, so app.py's audio
        output component always received None.)
    """
    try:
        # Convert text to speech
        print("Converting text to speech...")
        tts = gTTS(text=text, lang='en')
        tts.save(mp3_output_path)
        print(f"MP3 file saved to {mp3_output_path}")

        # Initialize pygame mixer and play the MP3 file.
        # NOTE(review): playback will fail on a headless server (e.g. a
        # Hugging Face Space) — the saved file path is still returned.
        pygame.mixer.init()
        print("Playing the MP3 file...")
        pygame.mixer.music.load(mp3_output_path)
        pygame.mixer.music.play()

        while pygame.mixer.music.get_busy():
            pygame.time.Clock().tick(14)

        print("Audio playback finished.")
        return mp3_output_path

    except Exception as e:
        print(f"Error during text-to-speech conversion: {e}")
        return None
|
43 |
+
|
44 |
+
|
45 |
+
# Example usage
|
46 |
+
# text = "Hello my name is waris. i am from islamabad, Right now i am struggling to get the job in the field of Art"
|
47 |
+
# mp3_file = r"C:\Users\HP\Desktop\ChatWithDoctorAny\ChatWithDoctorAny\output.mp3"
|
48 |
+
|
49 |
+
# text_to_speech_with_gtts(text, mp3_file)
|
VoiceOfPatient.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# VoiceOfPatient.py
import logging
import os
import warnings
from io import BytesIO

import speech_recognition as sr
from dotenv import load_dotenv
from groq import Groq
from pydub import AudioSegment  # fix: was imported twice
from pydub.utils import which  # NOTE(review): imported but never used

warnings.filterwarnings("ignore")
load_dotenv()

# Get the ffmpeg path from environment and register it with pydub, which
# needs the binary to export MP3 files.
ffmpeg_path = os.getenv("FFMPEG_PATH")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if ffmpeg_path:
    AudioSegment.converter = ffmpeg_path
else:
    raise EnvironmentError("FFMPEG_PATH is not set. Please define it in the .env file.")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
26 |
+
|
27 |
+
def record_audio(file_path, timeout=20, phrase_time_limit=None):
    """
    Capture speech from the default microphone and save it as an MP3 file.

    Args:
        file_path (str): Destination path for the recorded audio file.
        timeout (int): Max time to wait for speech to start (in seconds).
        phrase_time_limit (int or None): Max length of the speech (in seconds).
    """
    recognizer = sr.Recognizer()

    try:
        with sr.Microphone() as mic:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(mic, duration=1)
            logging.info("Start speaking now...")
            captured = recognizer.listen(
                mic, timeout=timeout, phrase_time_limit=phrase_time_limit
            )
            logging.info("Recording complete.")

        # Re-encode the captured WAV data as MP3 via pydub/ffmpeg.
        AudioSegment.from_wav(BytesIO(captured.get_wav_data())).export(
            file_path, format="mp3", bitrate="128k"
        )
        logging.info(f"Audio saved to: {file_path}")

    except Exception as err:
        logging.error(f"An error occurred: {err}")
|
55 |
+
|
56 |
+
# Demo recording target. Bug fix: the record_audio() call previously ran at
# module import time, so importing VoiceOfPatient (as app.py does) grabbed
# the microphone before the app even started. Record only when run directly.
audio_file_path = "patientvoicetest.mp3"
if __name__ == "__main__":
    record_audio(file_path=audio_file_path)
|
58 |
+
|
59 |
+
# Speech-to-text: transcribe the recorded voice with Groq's hosted Whisper.
client = Groq(api_key=GROQ_API_KEY)


def transcribe_with_whisper(audio_file_path, model_name="whisper-large-v3"):
    """
    Transcribe an audio file to English text via Groq's transcription API.

    Args:
        audio_file_path (str): Path to the audio file to transcribe.
        model_name (str): Transcription model id. Fix: the previous default
            was a chat model (llama-4-scout) that the audio transcription
            endpoint rejects; default now matches what app.py passes.

    Returns:
        str: The transcribed text.
    """
    with open(audio_file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model=model_name,
            file=audio_file,
            language="en",
        )
    return transcription.text


if __name__ == "__main__":
    pass
|
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import gradio as gr
from dotenv import load_dotenv

from ImageForDoctor import encodeimage, AnalyzeImagewithQuery
from VoiceOfDoctor import text_to_speech_with_gtts
from VoiceOfPatient import record_audio, transcribe_with_whisper

load_dotenv()

# System prompt prepended to every patient question before it is sent to the
# vision model (kept verbatim — it is part of the runtime behavior).
prompt = """
You are Doctor Any, a virtual medical expert designed to assist with health-related issues. Always respond as a confident, knowledgeable doctor.

When given a patient’s image and question, analyze both thoroughly and provide a clear medical explanation of what might be wrong. Based on your analysis, offer direct, practical advice or treatment suggestions. Act like a real doctor — do not suggest consulting another specialist unless the condition is life-threatening or beyond your scope.

Do not answer questions unrelated to medicine or mental health. Instead, reply: "I am Doctor Any, a virtual medical expert. I only assist with health-related matters."

Keep responses brief—no more than 2 to 3 lines. Be precise, informative, and avoid using extra spaces, special characters, or numbers. Speak clearly and with authority, like a professional doctor guiding a patient.
"""
|
19 |
+
|
20 |
+
#now setup the frontend than integrate the function with i.
|
21 |
+
|
22 |
+
def process_input(audio_filepath, image_filepath):
    """
    Gradio callback: transcribe the patient's voice, ask the vision model
    about the uploaded image, and synthesize the doctor's spoken reply.

    Args:
        audio_filepath (str): Path to the recorded patient audio.
        image_filepath (str or None): Path to the uploaded image, if any.

    Returns:
        tuple: (transcribed text, doctor's text response, path to reply MP3).
    """
    speech_to_text = transcribe_with_whisper(
        audio_file_path=audio_filepath,
        model_name="whisper-large-v3",
    )

    if image_filepath:
        doctor_response = AnalyzeImagewithQuery(
            encode_imgae=encodeimage(image_filepath),
            query=prompt + speech_to_text,
        )
    else:
        # Typo/spacing fixed in the user-facing message.
        doctor_response = "No image for me to analyze. Kindly upload the photo."

    output_audio = "final.mp3"
    text_to_speech_with_gtts(text=doctor_response, mp3_output_path=output_audio)

    # Fix: return the MP3 path explicitly — text_to_speech_with_gtts used to
    # return None, so the audio output component never received a file.
    return speech_to_text, doctor_response, output_audio
|
33 |
+
|
34 |
+
# Wire the callback into a Gradio UI: microphone + image in, text + audio out.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="DoctorAny-Response"),  # fix: label typo "DocorAny"
        # Fix: was gr.Audio("Temp.mp3") — the first positional argument is the
        # component's initial *value*, and Temp.mp3 does not exist. Use a label.
        gr.Audio(label="DoctorAny-Voice"),
    ],
    title="Chat With DoctorAny.",
)

iface.launch(debug=True)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
python-dotenv
groq
gTTS
pygame
pydub
SpeechRecognition
|