Spaces:

ashok2216
/

youtube-data_scraper

Build error

App Files Files Community

youtube-data_scraper / code /youtube1.py

ashok2216

Upload 13 files

4292ffa verified over 1 year ago

raw

history blame

4.51 kB

	# NOTE: the original import order is kept on purpose.  The wildcard import
	# below may bind arbitrary names, and the imports that follow it must keep
	# taking precedence over anything it brings in.
	import time
	import pprint
	import csv
	import os
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from webdriver_manager.chrome import ChromeDriverManager
	from selenium.webdriver.common.by import By
	from youtube_comment_scraper_python import *
	import pandas as pd
	import plotly.express as px
	import re
	import streamlit as st

	st.title('Youtube Channel Analysis')
	st.write('Youtube WebScrap')


	# ------------------------------ CHANNEL DATA ------------------------------

	# Ask for the channel URL first and halt this script run until one has been
	# entered, e.g. https://www.youtube.com/@YasoobKhalid/videos
	url = st.text_input('Paste the Youtube Channel Link',"")
	if not url:
	    st.warning('Please input a Link.')
	    st.stop()
	st.success('Thank you for inputting a link.')

	# The first capitalised word found in the URL is shown as the channel name.
	# Not every URL contains one (e.g. an all-lowercase handle), so fall back to
	# the raw URL instead of raising IndexError on an empty match list.
	name = re.compile(r"[A-Z]\w+")
	inp = name.findall(url)
	out = inp[0] if inp else url
	st.write('Getting Data from', out, 'channel')

	# Start Chrome only once a URL is available, so a run that stops at the
	# prompt above does not launch a browser it will never use.
	driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
	try:
	    driver.get(url)
	except Exception:
	    # Do not leave an orphaned browser behind when navigation fails.
	    driver.quit()
	    raise

	# Channel header: read the displayed text of the handle and of the
	# subscriber count from the page that `driver` currently has open.
	# (A channel-title lookup via the "ytd-channel-name" class used to sit here
	# as well; it is not active.)
	# NOTE(review): neither value is read again by the code visible in this
	# file -- confirm whether these lookups are still needed.
	handle_xpath = '//yt-formatted-string[@id="channel-handle"]'
	subscriber_xpath = '//yt-formatted-string[@id="subscriber-count"]'
	handle = driver.find_element(By.XPATH, handle_xpath).text
	subscriber_count = driver.find_element(By.XPATH, subscriber_xpath).text

	# Scroll to the bottom of the page repeatedly until the document height
	# stops changing between two consecutive measurements.
	WAIT_IN_SECONDS = 5
	page_height_js = "return document.documentElement.scrollHeight"
	last_height = driver.execute_script(page_height_js)

	while True:
	    # Jump to the last known bottom, then pause so new videos can show up.
	    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
	    time.sleep(WAIT_IN_SECONDS)

	    # Measure again; an unchanged height ends the loop.
	    new_height = driver.execute_script(page_height_js)
	    height_unchanged = new_height == last_height
	    last_height = new_height
	    if height_unchanged:
	        break


	# Collect title, view text, thumbnail URL and link for every listed video.
	# This is the last use of the browser in this script, so it is closed in
	# `finally`; otherwise every run would leave a Chrome process behind.
	try:
	    thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
	    views = driver.find_elements(By.XPATH,'//div[@id="metadata-line"]/span[1]')
	    titles = driver.find_elements(By.ID, "video-title")
	    links = driver.find_elements(By.ID, "video-title-link")

	    # The four lists are paired by position; zip() stops at the shortest one.
	    # The values are read here, while the browser session is still open.
	    # (Likes are not collected.)
	    videos = [
	        {
	            'title': title.text,
	            'views': view.text,
	            'thumbnail': thumb.get_attribute('src'),
	            'link': link.get_attribute('href'),
	        }
	        for title, view, thumb, link in zip(titles, views, thumbnails, links)
	    ]
	finally:
	    driver.quit()

	print(videos)

	# With no scraped rows there is no first record to take the CSV header
	# from, so report that and end this script run instead of raising IndexError.
	if not videos:
	    st.warning('No videos were found for this channel.')
	    st.stop()

	to_csv = videos
	keys = to_csv[0].keys()

	# open(..., 'w') does not create missing parent directories.
	os.makedirs('output', exist_ok=True)
	with open('output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
	    dict_writer = csv.DictWriter(output_file, keys)
	    dict_writer.writeheader()
	    dict_writer.writerows(to_csv)

	# Read the file back only after the with-block has closed (and therefore
	# flushed) it, then show the table.
	df = pd.read_csv('output/people.csv')
	st.dataframe(df)

	# Slider for a video count.  The upper bound follows the number of rows in
	# `df` instead of a fixed 607; it is kept at 1 or more so the range is never
	# empty, and the default of 100 is capped to that bound.
	max_count = max(len(df), 1)
	count = st.slider('Select Lower Video Count', 0, max_count, min(100, max_count))
	st.write("You selected", count, 'Videos')
	# NOTE(review): `count` is not read by any later code visible in this file,
	# so the slider currently has no effect on the chart -- confirm the intent.

	# The "views" column holds the scraped display text, so it is converted to
	# numbers before plotting; plotted as text, the y axis would be categorical
	# and the bar heights would carry no meaning.
	# NOTE(review): assumes English-style labels such as "1.2K views" or
	# "1,234 views" -- confirm against the scraped data.
	_VIEW_COUNT_PATTERN = re.compile(r'(\d[\d.,]*)\s*([KMB]?)(?![A-Za-z])')
	_VIEW_SUFFIX_FACTORS = {'': 1, 'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}


	def _parse_view_count(text):
	    """Return the number found in a view label, or NaN when there is none.

	    The first run of digits (commas are dropped as thousands separators) is
	    read as a float and multiplied by the factor of an optional K/M/B
	    suffix.  Anything that cannot be parsed yields NaN.
	    """
	    match = _VIEW_COUNT_PATTERN.search(str(text))
	    if match is None:
	        return float('nan')
	    digits, suffix = match.groups()
	    try:
	        value = float(digits.replace(',', ''))
	    except ValueError:
	        return float('nan')
	    return value * _VIEW_SUFFIX_FACTORS[suffix]


	# Plot from a copy so `df` itself keeps the original text values.
	chart_df = df.assign(views=df['views'].map(_parse_view_count))

	fig = px.bar(chart_df,
	             x="title",
	             y="views", height=600
	             )
	fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

	# Show the same figure twice, once per theme, in separate tabs.
	tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
	with tab1:
	    # Use the Streamlit theme.
	    # This is the default. So you can also omit the theme argument.
	    st.plotly_chart(fig, theme="streamlit", use_container_width=True)
	with tab2:
	    # Use the native Plotly theme.
	    st.plotly_chart(fig, theme=None, use_container_width=True)

	# ----------------------------------------------------------------------------COMMENTS------------------------------------------------------------------------------


	# url = input('Enter Youtube Video Url- ')
	# youtube.open(url)
	# youtube.keypress("pagedown")

	# data = []
	# currentpagesource=youtube.get_page_source()
	# lastpagesource=''

	# while(True):
	# if(lastpagesource==currentpagesource):
	# break

	# lastpagesource=currentpagesource
	# response=youtube.video_comments()

	# for c in response['body']:
	# data.append(c)

	# youtube.scroll()
	# currentpagesource=youtube.get_page_source()


	# df = pd.DataFrame(data)

	# df = df.replace('\n',' ', regex=True)

	# df = df[['Comment', 'Likes']].drop_duplicates(keep="first")
	# # df = df[['Likes']].drop_duplicates(keep="first")

	# df.to_csv('output/data.csv',index=False)

	# df.head()