Spaces:

ResearchMAGIC
/

the-big-scraper

Sleeping

App Files Files Community

the-big-scraper / alternative-2.py

rodrigomasini

Update alternative-2.py

06443d3 verified over 1 year ago

raw

history blame

7.73 kB

	###############################################################################################################################################################
	# _____ _ ___ _ ___
	# \|_ _\|\| \|_ ___ \| _ )(_) __ _ / __\| __ _ _ __ _ _ __ ___ _ _
	# \| \| \| ' \ / -_) \| _ \\| \|/ _` \| \__ \/ _\|\| '_\|/ _` \|\| '_ \/ -_)\| '_\|
	# \|_\| \|_\|\|_\|\___\| \|___/\|_\|\__, \| \|___/\__\|\|_\| \__,_\|\| .__/\___\|\|_\|
	# \|___/ \|_\|
	#
	##############################################################################################################################################################
	# _ ______ _ _ _______ _ _
	# _ \| \| (_____ \ \| \| (_) (_______) (_) (_)
	# _____ _ _ _\| \|_ \| \|__ ___ ____ _ _____) ) ___ __\| \| ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
	# (____ \|\| \| \| \|(_ _)\| _ \ / _ \ / ___)(_) \| __ / / _ \ / _ \| / ___)\| \| / _ \| / _ \ \| \|\|_\|\| \|(____ \| /___)\| \|\| _ \ \| \|
	# / ___ \|\| \|_\| \| \| \|_ \| \| \| \|\| \|_\| \|\| \| _ \| \| \ \ \| \|_\| \|( (_\| \|\| \| \| \|( (_\| \|\| \|_\| \| \| \| \| \|/ ___ \|\|___ \|\| \|\| \| \| \|\| \|
	# \_____\|\|____/ \__)\|_\| \|_\| \___/ \|_\| (_) \|_\| \|_\| \___/ \____\|\|_\| \|_\| \___ \| \___/ \|_\| \|_\|\_____\|(___/ \|_\|\|_\| \|_\|\|_\|
	# (_____\|
	###############################################################################################################################################################
	#
	# Last updated on: 8/20/2024
	#
	###############################################################################################################################################################

	# ------------------------------------------------------------------------------
	# IMPORTS
	# ------------------------------------------------------------------------------
	import gradio as gr
	from bs4 import BeautifulSoup as Soup
	from langchain_community.document_loaders import (AsyncHtmlLoader,
	NewsURLLoader, PubMedLoader,
	PlaywrightURLLoader,
	RecursiveUrlLoader,
	SeleniumURLLoader,
	UnstructuredURLLoader,
	WebBaseLoader)
	from selenium import webdriver
	from selenium.common.exceptions import WebDriverException
	from PIL import Image
	from io import BytesIO

	# ------------------------------------------------------------------------------
	# THE BIG SCRAPER METHOD
	# ------------------------------------------------------------------------------

	def extractDataFromUrls(urls: str, loader_type: str):
	    """Extract data from the given URLs using the selected loader.

	    The function always returns a 3-tuple so that it can be bound to three
	    Gradio outputs (JSON view, text view, URL state) on every code path,
	    including the unsupported-loader and error paths.

	    Args:
	        urls (str): Comma-separated URLs to extract data from. Whitespace
	            around each URL is ignored and empty entries are dropped.
	        loader_type (str): Name of the loader to use for data extraction.

	    Returns:
	        tuple: ``(json_data, documents, first_url)`` on success, where
	        ``json_data`` is the list of ``to_json()`` results, ``documents`` is
	        the list returned by the loader's ``load()`` and ``first_url`` is the
	        first URL that was supplied.
	        For an unsupported loader or on any exception, the first two items
	        are human-readable messages; ``first_url`` is the first URL if one
	        could be parsed, otherwise ``None``.
	    """
	    # Kept outside the try so the except branch can still report it.
	    first_url = None

	    try:
	        # "a, b" must yield "b", not " b"; blank entries are meaningless.
	        url_list = [part.strip() for part in urls.split(',') if part.strip()]
	        if not url_list:
	            raise ValueError("No URL provided")
	        first_url = url_list[0]

	        # Instantiate the selected loader based on loader_type
	        if loader_type == 'AsyncHtmlLoader':
	            loader = AsyncHtmlLoader(url_list)

	        elif loader_type == 'UnstructuredURL':
	            loader = UnstructuredURLLoader(urls=url_list)

	        elif loader_type == 'RecursiveURL':
	            # Only the first URL is used as the crawl root.
	            loader = RecursiveUrlLoader(
	                url=first_url,
	                max_depth=2,
	                extractor=lambda x: Soup(x, "html.parser").text,
	            )

	        elif loader_type == 'SeleniumURL':
	            loader = SeleniumURLLoader(urls=url_list)

	        elif loader_type == 'SeleniumURLH':
	            loader = SeleniumURLLoader(urls=url_list, headless=False)

	        elif loader_type == 'PlaywrightURL':
	            loader = PlaywrightURLLoader(urls=url_list)

	        elif loader_type == 'PubMed':
	            # Only the first entry is passed to the PubMed loader.
	            loader = PubMedLoader(first_url)

	        elif loader_type == 'NewsURL':
	            loader = NewsURLLoader(url_list)

	        elif loader_type == 'WebBaseLoader':
	            loader = WebBaseLoader(url_list)

	        else:
	            return (
	                "Not Implemented. Development in Progress",
	                "Work In Progress",
	                first_url,
	            )

	        # Load data using the selected loader
	        data = loader.load()

	        # Convert data to JSON format
	        jsonData = [item.to_json() for item in data]

	        return jsonData, data, first_url

	    except Exception as err:
	        # Top-level UI boundary: report the failure instead of crashing the app.
	        return (
	            "An Error Occurred. Contact Developer" + str(err),
	            "Error Occured. Boom",
	            first_url,
	        )

	# ------------------------------------------------------------------------------
	# WEB DATA AND SCREENSHOT
	# ------------------------------------------------------------------------------
	def take_webdata(url):
	    """Open *url* in headless Chrome and return a screenshot and the page title.

	    Args:
	        url: Address of the page to load. A falsy value (``None`` or ``""``)
	            short-circuits to the placeholder result without starting Chrome.

	    Returns:
	        tuple: ``(image, title)`` where ``image`` is a PIL image of the page
	        and ``title`` is the browser-reported page title. If no URL is given
	        or a ``WebDriverException`` occurs, ``image`` is a 1x1 RGB
	        placeholder and ``title`` is whatever was read before the failure
	        (an empty string if nothing was read).
	    """
	    placeholder_size = (1, 1)

	    # Nothing to load: skip launching a browser entirely.
	    if not url:
	        return Image.new('RGB', placeholder_size), ""

	    options = webdriver.ChromeOptions()
	    options.add_argument('--headless')
	    options.add_argument('--no-sandbox')
	    options.add_argument('--disable-dev-shm-usage')

	    # Bound before the try so the except/finally clauses never hit an
	    # unbound name when the driver fails to start or the page fails to load.
	    wd = None
	    page_title = ""

	    try:
	        wd = webdriver.Chrome(options=options)
	        wd.set_window_size(1080, 720)
	        wd.get(url)
	        wd.implicitly_wait(5)
	        page_title = wd.title
	        screenshot = wd.get_screenshot_as_png()

	    except WebDriverException:
	        return Image.new('RGB', placeholder_size), page_title
	    finally:
	        # Always release the browser process, but only if it was started.
	        if wd is not None:
	            wd.quit()

	    return Image.open(BytesIO(screenshot)), page_title

	# ------------------------------------------------------------------------------
	# GRADIO
	# ------------------------------------------------------------------------------

	# Define choices for the dropdown menu.
	# NOTE: 'Scrapy', 'PySpider' and 'Beautiful Soup' have no matching branch in
	# extractDataFromUrls, so selecting them yields its "Not Implemented" message.
	choices = [
	    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
	    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
	    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
	]

	# Create the Gradio interface
	with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
	    # Carries the first scraped URL from the extraction step to the
	    # screenshot step.
	    extracted_url = gr.State()

	    gr.Markdown("# The Big Scraper")

	    with gr.Tab("Scraped"):
	        with gr.Row():
	            with gr.Column():
	                url_input = gr.Textbox(label="Enter your comma separated URLs here")
	                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
	            with gr.Column():
	                json_output = gr.JSON(label="Extracted Data (JSON)")
	                text_output = gr.Textbox(label="Extracted Data (Text)")

	        btn = gr.Button("Extract Data")

	    with gr.Tab("Images"):
	        with gr.Row():
	            screenshot_output = gr.Image(label="Screenshot")
	            title_output = gr.Textbox(label="Page Title")

	    # Wire events only after every component exists. Previously the chain was
	    # registered while screenshot_output/title_output still referred to hidden
	    # gr.State placeholders, so the visible Image/Textbox were never updated.
	    btn.click(
	        extractDataFromUrls,
	        inputs=[url_input, loader_dropdown],
	        outputs=[json_output, text_output, extracted_url],
	    ).then(
	        take_webdata,
	        inputs=extracted_url,
	        outputs=[screenshot_output, title_output],
	        queue=True,
	    )

	# Launch the Gradio interface
	demo.launch()