Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,8 +3,7 @@ import os
|
|
| 3 |
import gradio as gr
|
| 4 |
import trafilatura
|
| 5 |
from trafilatura import fetch_url, extract
|
| 6 |
-
import
|
| 7 |
-
from docling.document_converter import DocumentConverter
|
| 8 |
import torch
|
| 9 |
import soundfile as sf
|
| 10 |
import numpy as np
|
|
@@ -39,9 +38,9 @@ AVAILABLE_VOICES = [
|
|
| 39 |
def fetch_and_display_content(url):
|
| 40 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
| 41 |
if url.endswith(".pdf") or "pdf" in url:
|
| 42 |
-
converter = DocumentConverter()
|
| 43 |
#result = converter.convert(source)
|
| 44 |
-
text = converter.convert(url).document.export_to_markdown()
|
| 45 |
else:
|
| 46 |
downloaded = trafilatura.fetch_url(url)
|
| 47 |
text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) #without metadata extraction
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import trafilatura
|
| 5 |
from trafilatura import fetch_url, extract
|
| 6 |
+
from markitdown import MarkItDown
|
|
|
|
| 7 |
import torch
|
| 8 |
import soundfile as sf
|
| 9 |
import numpy as np
|
|
|
|
| 38 |
def fetch_and_display_content(url):
|
| 39 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
| 40 |
if url.endswith(".pdf") or "pdf" in url:
|
| 41 |
+
converter = MarkItDown()
|
| 42 |
#result = converter.convert(source)
|
| 43 |
+
text = converter.convert(url).text_content
|
| 44 |
else:
|
| 45 |
downloaded = trafilatura.fetch_url(url)
|
| 46 |
text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) #without metadata extraction
|