# NOTE: header text captured along with the source (not code) -- Spaces status: Runtime error
import time

import black
import flair
import isort
import pandas as pd
import requests
from bs4 import BeautifulSoup
# XML feed that is polled for newly published articles.
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"
def get_xml(url, xpath='channel/item'):
    """Parse an XML feed into a DataFrame with one row per matched node.

    Args:
        url: Anything ``pandas.read_xml`` accepts as its input (URL, file
            path or file-like object).
        xpath: XPath selecting the nodes that become rows. The default
            matches the formula1.com feed layout; other sites may need a
            different expression.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_xml``.
    """
    # TODO: use urllib.parse to check for formula1.com website or other news
    # and choose the xpath accordingly.
    return pd.read_xml(url, xpath=xpath)
def _extract_article_text(link):
    """Download one article page and return its body paragraphs.

    Args:
        link: URL of the article page.

    Returns:
        A DataFrame with a single ``"text"`` column holding the text of every
        ``<p>`` inside the article container, minus paragraphs whose text
        equals that of a ``<strong>`` element (the first ``<strong>`` is
        ignored for this filter). Empty if the container is not found.
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    # class_ below will be different for different websites
    container = soup.find(
        "div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content"
    )
    if container is None:
        # Page layout differs from what is expected; nothing to extract.
        return pd.DataFrame(data={"text": []})

    paragraphs = [p.text for p in container.find_all("p")]
    # The first <strong> is deliberately not used for filtering
    # (presumably it is part of the lead paragraph -- TODO confirm).
    strong_texts = {tag.text for tag in container.find_all("strong")[1:]}
    kept = [text for text in paragraphs if text not in strong_texts]
    return pd.DataFrame(data={"text": kept})


def check_updates(every=60):
    """Poll the feed at ``URL`` until new articles appear, then return their text.

    A baseline snapshot of the feed is taken first. The feed is then re-read
    every ``every`` seconds and compared against the previous snapshot by the
    ``guid`` column; each ``guid`` is used as the article's page URL.

    Args:
        every: Seconds to sleep between two polls.

    Returns:
        A DataFrame with a ``"text"`` column containing the body paragraphs
        of all newly found articles, concatenated in feed order.
    """
    previous_xml = get_xml(URL)
    while True:
        time.sleep(every)
        latest_xml = get_xml(URL)
        is_new = ~latest_xml["guid"].isin(previous_xml["guid"])
        new_articles_df = latest_xml[is_new]
        # Always move the baseline forward so the next poll compares against
        # the most recent snapshot.
        previous_xml = latest_xml

        if new_articles_df.empty:
            print('No New article is found')
            continue

        print('New articles found')
        frames = [_extract_article_text(link) for link in new_articles_df["guid"]]
        return pd.concat(frames, ignore_index=True)