# QuotesBot / app.py
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"


def get_xml(url=URL):
    # The xpath below is specific to the Formula 1 feed.
    # Use urllib.parse to check whether the URL is formula1.com or another news site.
    return pd.read_xml(url, xpath="channel/item")
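

# Sketch for the comment above, not part of the original file: a hypothetical
# helper that uses urllib.parse to check whether a URL belongs to formula1.com,
# so a different xpath/selector could be chosen for other news sites.
from urllib.parse import urlparse


def is_formula1(url):
    # netloc is e.g. "www.formula1.com"
    host = urlparse(url).netloc
    return host == "formula1.com" or host.endswith(".formula1.com")
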
def check_updates(every=60):
    """Poll the feed every `every` seconds and return the cleaned text of the first new article."""
    previous_xml = get_xml()
    while True:
        time.sleep(every)
        latest_xml = get_xml()
        if not previous_xml.equals(latest_xml):
            print("New articles found")
            # Articles whose guid is not in the previous snapshot are new.
            new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
            for _, row in new_articles_df.iterrows():
                link = row["guid"]
                response = requests.get(link)
                soup = BeautifulSoup(response.content, "html.parser")
                # The class_ below is specific to formula1.com article pages and
                # will be different for other websites.
                s = soup.find(
                    "div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content"
                )
                lines = s.find_all("p")
                text_content = pd.DataFrame(data={"text": []})
                for line in lines:
                    df = pd.DataFrame(data={"text": [line.text]})
                    text_content = pd.concat([text_content, df], ignore_index=True)

                # Collect bolded text (skipping the first <strong>) so it can be
                # filtered out of the paragraph text below.
                strongs = s.find_all("strong")
                strong_content = pd.DataFrame(data={"text": []})
                for i, strong in enumerate(strongs):
                    if i > 0:
                        df = pd.DataFrame(data={"text": [strong.text]})
                        strong_content = pd.concat(
                            [strong_content, df], ignore_index=True
                        )

                # df holds the paragraph text minus the bolded headings.
                df = text_content[
                    ~text_content["text"].isin(strong_content["text"])
                ].reset_index(drop=True)
                return df
        else:
            print("No new article found")
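

# Minimal usage sketch, not part of the original file: running this module
# directly starts the watcher with the default 60-second polling interval.
# The variable name `new_text_df` is an assumption for illustration.
if __name__ == "__main__":
    new_text_df = check_updates(every=60)
    print(new_text_df)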