PuristanLabs1 committed on
Commit
7eb5f41
·
verified ·
1 Parent(s): ff4eee6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -13
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import gradio as gr
2
  import trafilatura
 
3
  import docling
 
4
  import torch
5
  import soundfile as sf
6
  import numpy as np
@@ -28,23 +30,66 @@ SUPPORTED_TTS_LANGUAGES = {
28
  def fetch_content(url):
29
  """Fetch and extract text from a given URL (HTML or PDF)."""
30
  if url.endswith(".pdf") or "pdf" in url:
31
- text = docling.extract_text(url)
 
32
  else:
33
  downloaded = trafilatura.fetch_url(url)
34
- text = trafilatura.extract(downloaded, output_format="markdown", with_metadata=False)
35
  return text
36
 
37
  ### 2️⃣ Cleaning Function
38
  def extract_and_clean_text(data):
39
- """Removes citations, links, markdown elements, and unnecessary sections."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def clean_text(text):
41
- text = re.sub(r'\[\d+\]', '', text) # Remove citations like [2][4]
42
- text = re.sub(r'http[s]?://\S+', '', text) # Remove URLs
43
- text = re.sub(r'[*_`]', '', text) # Remove markdown formatting
44
- text = re.sub(r'\n\s*\n+', '\n\n', text).strip() # Remove excessive whitespace
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  return text
46
 
47
- return clean_text(data)
 
 
48
 
49
  ### 3️⃣ Language Detection
50
  def detect_language(text):
@@ -60,22 +105,33 @@ def generate_audio_kokoro(text, lang):
60
  """Generate speech using KokoroTTS for supported languages."""
61
  lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a") # Default to English
62
  generator = kokoro_tts(text, voice="af_bella", speed=1, split_pattern=r'\n+')
 
 
63
 
64
- # Combine audio segments into a single file
65
- audio_data = np.concatenate([audio for gs, ps, audio in generator])
 
 
 
 
 
 
 
 
 
66
  output_file = f"audio_{lang}.wav"
67
- sf.write(output_file, audio_data, 24000) # Save as WAV file
68
  return output_file
69
 
70
  ### 5️⃣ Main Processing Function
71
  def process_url(url):
72
  """Processes the URL, extracts text, detects language, and converts to audio."""
73
  content = fetch_content(url)
74
- cleaned_text = extract_and_clean_text(content)
75
  detected_lang = detect_language(cleaned_text)
76
  audio_file = generate_audio_kokoro(cleaned_text, detected_lang)
77
 
78
- return cleaned_text, detected_lang, audio_file
79
 
80
  ### 6️⃣ Gradio Interface
81
  with gr.Blocks() as demo:
 
1
  import gradio as gr
2
  import trafilatura
3
+ from trafilatura import fetch_url, extract
4
  import docling
5
+ from docling.document_converter import DocumentConverter
6
  import torch
7
  import soundfile as sf
8
  import numpy as np
 
def fetch_content(url):
    """Fetch a URL and return its main content as Markdown text.

    PDF-looking URLs are converted with docling; everything else is
    downloaded and extracted with trafilatura.

    Args:
        url: Address of the HTML page or PDF document to read.

    Returns:
        The extracted Markdown as a string. For HTML pages the text starts
        with a metadata header (``with_metadata=True``). An empty string is
        returned when the page cannot be downloaded or nothing is extracted.
    """
    # Heuristic: any URL mentioning "pdf" is treated as a PDF. This already
    # covers the ".pdf" suffix, so no separate endswith() check is needed.
    if "pdf" in url.lower():
        converter = DocumentConverter()
        # convert() returns a ConversionResult object, not text; the document
        # has to be exported explicitly to get a string.
        return converter.convert(url).document.export_to_markdown()

    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # Download failed; give callers an empty string rather than None.
        return ""

    text = extract(
        downloaded,
        output_format="markdown",
        with_metadata=True,  # keep the metadata header in the output
        include_tables=False,
        include_links=False,
        include_formatting=True,
        include_comments=False,
    )
    return text or ""
39
 
40
  ### 2️⃣ Cleaning Function
def extract_and_clean_text(data):
    """Split extracted document text into metadata and cleaned body text.

    ``data`` may begin with a metadata block enclosed between ``---``
    markers. Each ``key: value`` line of that block is stored in a dict;
    values written as ``[...]`` are decoded as JSON lists when possible.
    The remaining text is stripped of links, citations, markdown markers
    and trailing reference-style sections.

    Args:
        data: Extracted document text. ``None`` or an empty string is
            accepted and yields empty results.

    Returns:
        A ``(metadata_dict, cleaned_text)`` tuple.
    """
    if not data:
        return {}, ""

    metadata_dict = {}

    # Step 1: Extract metadata enclosed between "---" at the beginning
    metadata_match = re.match(r"^---(.*?)---", data, re.DOTALL)
    if metadata_match:
        metadata_raw = metadata_match.group(1).strip()
        data = data[metadata_match.end():].strip()  # Remove metadata from text

        # Parse "key: value" lines by hand rather than with a YAML parser
        for line in metadata_raw.split("\n"):
            if ": " not in line:  # Only process lines with key-value pairs
                continue
            key, value = line.split(": ", 1)  # Split at first ": "

            # Strip while the value is still a string: once it has been
            # decoded into a list it no longer has a .strip() method.
            value = value.strip()
            if value.startswith("[") and value.endswith("]"):
                try:
                    value = json.loads(value)  # Convert to list
                except json.JSONDecodeError:
                    pass  # If JSON parsing fails, keep it as a string

            metadata_dict[key.strip()] = value

    # Step 2: Clean the extracted text
    def clean_text(text):
        # Remove markdown links first. If bare URLs were removed first, the
        # URL inside "(...)" would already be gone and "[label](" would
        # be left behind.
        text = re.sub(r'\[[^\]]*\]\(https?://[^\s)]+\)', '', text)

        # Remove inline citations like [2][4]
        text = re.sub(r'\[\d+\]', '', text)

        # Remove remaining direct links
        text = re.sub(r'http[s]?://\S+', '', text)

        # Remove markdown heading markers; only spaces/tabs are consumed so
        # blank lines around headings survive.
        text = re.sub(r'^[ \t]*#+[ \t]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'[*_`]', '', text)  # Remove bold/italic/monospace markers

        # Drop everything from a References / Bibliography / External Links /
        # Comments section onwards. The keyword must start a line, so a
        # sentence that merely contains e.g. "references" does not truncate
        # the article.
        text = re.sub(
            r'^[ \t]*(?:References|Bibliography|External Links|Comments)\b.*',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL | re.MULTILINE,
        )

        # Remove extra whitespace and newlines
        return re.sub(r'\n\s*\n+', '\n\n', text).strip()

    cleaned_text = clean_text(data)

    return metadata_dict, cleaned_text
93
 
94
  ### 3️⃣ Language Detection
95
  def detect_language(text):
 
def generate_audio_kokoro(text, lang):
    """Generate speech using KokoroTTS and save it as a WAV file.

    Args:
        text: Text to synthesize; it is split into segments on newlines.
        lang: Detected language code, used in the output file name.

    Returns:
        Path of the written file, ``audio_<lang>.wav``.

    Raises:
        ValueError: If the TTS generator yields no audio segments.
    """
    # NOTE(review): lang_code is looked up but never passed to kokoro_tts,
    # so the detected language does not currently influence synthesis.
    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
    generator = kokoro_tts(text, voice="af_bella", speed=1, split_pattern=r'\n+')

    # Generate and collect audio data, logging each text segment as it is done
    audio_data_list = []
    for i, (gs, _ps, audio) in enumerate(generator):
        print(f"Processing segment {i + 1}")
        print(gs)  # Print the text segment
        audio_data_list.append(audio)

    if not audio_data_list:
        # np.concatenate fails with an unhelpful message on an empty list.
        raise ValueError("KokoroTTS produced no audio segments for the given text")

    # Concatenate all audio data into a single array
    full_audio = np.concatenate(audio_data_list)

    output_file = f"audio_{lang}.wav"
    sf.write(output_file, full_audio, 24000)  # Save as WAV file
    return output_file
125
 
126
  ### 5️⃣ Main Processing Function
def process_url(url):
    """Run the whole pipeline for one URL.

    Fetches the content, separates metadata from cleaned text, detects the
    language of the cleaned text and converts it to audio.

    Returns:
        A ``(metadata, cleaned_text, detected_lang, audio_file)`` tuple.
    """
    raw_content = fetch_content(url)
    page_metadata, body_text = extract_and_clean_text(raw_content)
    language = detect_language(body_text)
    audio_path = generate_audio_kokoro(body_text, language)
    return page_metadata, body_text, language, audio_path
135
 
136
  ### 6️⃣ Gradio Interface
137
  with gr.Blocks() as demo: