File size: 2,000 Bytes
eb67193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import requests
import isort
import black
import flair
import time
from bs4 import BeautifulSoup



# RSS/XML feed listing all latest Formula 1 news articles; consumed by get_xml().
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"

def get_xml(url):
    """Fetch the news feed at *url* and parse its items into a DataFrame.

    Parameters
    ----------
    url : str or file-like
        Location of the RSS/XML feed (URL, path, or open buffer — anything
        accepted by ``pandas.read_xml``).

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` element under the feed's ``<channel>``.
    """
    # xpath is only for formula1
    # use urllib.parse to check for formula1.com website or other news
    # Bug fix: the parsed DataFrame was assigned to a local and discarded;
    # callers (check_updates) need it, so return it.
    return pd.read_xml(url, xpath='channel/item')

def _scrape_article(link):
    """Download one article page and return its body text as a DataFrame.

    Parameters
    ----------
    link : str
        URL of the article (the feed's ``guid`` field).

    Returns
    -------
    pandas.DataFrame
        Single ``text`` column, one row per paragraph, with paragraphs that
        merely duplicate a ``<strong>`` snippet filtered out.
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    # class_ below will be different for different websites
    body = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
    paragraphs = body.find_all("p")
    text_content = pd.DataFrame(data={"text": [p.text for p in paragraphs]})

    # Skip the first <strong> (index 0 is kept in the article text, matching
    # the original `if i > 0` filter); collect the rest for exclusion.
    strongs = body.find_all("strong")
    strong_content = pd.DataFrame(data={"text": [s.text for s in strongs[1:]]})

    # Keep only paragraphs whose text is not one of the <strong> snippets.
    return text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
        drop=True
    )


def check_updates(every=60):
    """Poll the feed until new articles appear, then scrape the first one.

    Parameters
    ----------
    every : int, optional
        Seconds to sleep between polls (default 60).

    Returns
    -------
    pandas.DataFrame
        Paragraph text of the first newly published article (see
        ``_scrape_article``), as in the original's in-loop ``return df``.
    """
    # Bug fix: previous_xml was never defined, so the first comparison
    # raised NameError. Take a baseline snapshot before polling.
    previous_xml = get_xml(URL)
    while True:
        time.sleep(every)
        # Bug fix: get_xml() was called without its required url argument.
        latest_xml = get_xml(URL)
        # Bug fix: `~` is bitwise NOT (~True == -2, truthy) — use `not`
        # for boolean negation of DataFrame.equals.
        if not previous_xml.equals(latest_xml):
            print('New articles found')
            new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
            # Bug fix: the loop bound `article` but the body read an
            # undefined `row`; unpack (index, row) from iterrows instead.
            for _, row in new_articles_df.iterrows():
                return _scrape_article(row["guid"])
        else:
            print('No New article is found')