PuristanLabs1 committed on
Commit
23b607a
·
verified ·
1 Parent(s): 7e5ccd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -4
app.py CHANGED
@@ -3,8 +3,7 @@ import os
3
  import gradio as gr
4
  import trafilatura
5
  from trafilatura import fetch_url, extract
6
- import docling
7
- from docling.document_converter import DocumentConverter
8
  import torch
9
  import soundfile as sf
10
  import numpy as np
@@ -39,9 +38,9 @@ AVAILABLE_VOICES = [
39
  def fetch_and_display_content(url):
40
  """Fetch and extract text from a given URL (HTML or PDF)."""
41
  if url.endswith(".pdf") or "pdf" in url:
42
- converter = DocumentConverter()
43
  #result = converter.convert(source)
44
- text = converter.convert(url).document.export_to_markdown()
45
  else:
46
  downloaded = trafilatura.fetch_url(url)
47
  text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) #without metadata extraction
 
3
  import gradio as gr
4
  import trafilatura
5
  from trafilatura import fetch_url, extract
6
+ from markitdown import MarkItDown
 
7
  import torch
8
  import soundfile as sf
9
  import numpy as np
 
38
  def fetch_and_display_content(url):
39
  """Fetch and extract text from a given URL (HTML or PDF)."""
40
  if url.endswith(".pdf") or "pdf" in url:
41
+ converter = MarkItDown()
42
  #result = converter.convert(source)
43
+ text = converter.convert(url).text_content
44
  else:
45
  downloaded = trafilatura.fetch_url(url)
46
  text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False) #without metadata extraction