Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,8 +3,7 @@ import os
|
|
3 |
import gradio as gr
|
4 |
import trafilatura
|
5 |
from trafilatura import fetch_url, extract
|
6 |
-
import
|
7 |
-
from docling.document_converter import DocumentConverter
|
8 |
import torch
|
9 |
import soundfile as sf
|
10 |
import numpy as np
|
@@ -39,9 +38,9 @@ AVAILABLE_VOICES = [
|
|
39 |
def fetch_and_display_content(url):
|
40 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
41 |
if url.endswith(".pdf") or "pdf" in url:
|
42 |
-
converter =
|
43 |
#result = converter.convert(source)
|
44 |
-
text = converter.convert(url).
|
45 |
else:
|
46 |
downloaded = trafilatura.fetch_url(url)
|
47 |
text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) # metadata is included in the output (with_metadata=True)
|
|
|
3 |
import gradio as gr
|
4 |
import trafilatura
|
5 |
from trafilatura import fetch_url, extract
|
6 |
+
from markitdown import MarkItDown
|
|
|
7 |
import torch
|
8 |
import soundfile as sf
|
9 |
import numpy as np
|
|
|
38 |
def fetch_and_display_content(url):
|
39 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
40 |
if url.endswith(".pdf") or "pdf" in url:
|
41 |
+
converter = MarkItDown()
|
42 |
#result = converter.convert(source)
|
43 |
+
text = converter.convert(url).text_content
|
44 |
else:
|
45 |
downloaded = trafilatura.fetch_url(url)
|
46 |
text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) # metadata is included in the output (with_metadata=True)
|