import streamlit as st                       # web app framework
from gnewsclient import gnewsclient          # for fetching Google News headlines
from newspaper import Article                # to extract text from news articles
from transformers import pipeline            # to summarize text
import spacy                                 # to extract keywords
from annotated_text import annotated_text    # to display keywords
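
# To run locally (a sketch: package names are inferred from the imports above,
# and the file name app.py is assumed):
#   pip install streamlit gnewsclient newspaper3k transformers torch spacy st-annotated-text
#   python -m spacy download en_core_web_lg
#   streamlit run app.py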


# Load the summarization pipeline (defaults to sshleifer/distilbart-cnn-12-6);
# Streamlit caches it so the model is downloaded and initialised only once.
@st.cache(allow_output_mutation=True)
def load_model():
    model = pipeline("summarization")
    return model
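
# Equivalent explicit form, in case you want to pin the checkpoint
# (the summarization pipeline's default at the time of writing):
# model = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")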

data = gnewsclient.NewsClient(max_results=0)   # used only to populate the topic/location dropdowns

# Faster alternative: the Hugging Face Inference API (free tier is limited,
# roughly 30k input characters per month). Requires `import os` and `import requests`.
#API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
#API_KEY = os.getenv("API_KEY")
#headers = {"Authorization": f"Bearer {API_KEY}"}
#def query(payload):
#    response = requests.post(API_URL, headers=headers, json=payload)
#    return response.json()
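
# Sketch of how the API variant would be called, assuming the block above is
# enabled (the Inference API returns a list of {"summary_text": ...} dicts):
# result = query({"inputs": content, "parameters": {"max_length": 60, "min_length": 30}})
# summary = result[0]["summary_text"]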


# Fetch up to 5 readable articles for a topic/location and return their
# contents, titles, publishers and URLs.
def getNews(topic, location):
    count = 0
    contents = []
    titles = []
    authors = []
    urls = []
    client = gnewsclient.NewsClient(language='english', location=location, topic=topic, max_results=10)
    news = client.get_news()
    for item in news:
        url = item['link']
        article = Article(url)
        try:
            article.download()
            article.parse()
            # Google News titles look like "Headline - Publisher";
            # split on the last " - " to separate the two.
            title, sep, publisher = item['title'].rpartition(' - ')
            if not sep:                       # no separator found
                title, publisher = item['title'], ''
            urls.append(url)
            contents.append(article.text)
            titles.append(title)
            authors.append(publisher)
            count += 1
            if count == 5:
                break
        except Exception:                     # skip articles that fail to download or parse
            continue
    return contents, titles, authors, urls
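
# Example call (hypothetical topic/location values):
# contents, titles, authors, urls = getNews('Technology', 'India')
# Each list holds at most 5 entries; titles have the "- Publisher" suffix removed.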


# Summarize each article: at most 60 tokens, at least 30 (or the article's
# own word count, when the article is shorter than 30 words).
def getNewsSummary(contents, summarizer):
    summaries = []
    for content in contents:
        minimum = len(content.split())
        summaries.append(summarizer(content, max_length=60, min_length=min(30, minimum),
                                    do_sample=False, truncation=True)[0]['summary_text'])
    return summaries
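
# Note: the pipeline returns one dict per input, e.g.
# [{'summary_text': 'Condensed version of the article ...'}],
# which is why the code indexes [0]['summary_text'].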


# Extract up to 4 named-entity keywords per article
# (people, organisations, or geopolitical entities).
def generateKeyword(contents):
    keywords = []
    words = []
    nlp = spacy.load("en_core_web_lg")
    labels = ["PERSON", "ORG", "GPE"]
    for content in contents:
        doc = nlp(content)
        keys = []
        for ent in doc.ents:
            key = ent.text.upper()
            if key not in words and key not in keys and ent.label_ in labels:
                keys.append(key)
                # remember individual words so later articles skip near-duplicates
                words.extend(key.split())
            if len(keys) == 4:
                break
        keywords.append(keys)   # always append, keeping keywords aligned with contents
    return keywords
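
# Illustrative output (actual entities depend on the spaCy model):
# generateKeyword(["Apple CEO Tim Cook spoke in California ..."])
# -> [['APPLE', 'TIM COOK', 'CALIFORNIA']]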


# Render each story in Streamlit: linked title, publisher, summary, keywords.
def DisplaySummary(titles, authors, summaries, keywords, urls):
    for i in range(min(5, len(summaries), len(keywords))):
        st.text("")
        st.subheader(f'[{titles[i]}]({urls[i]})')   # no space before the URL, or the markdown link breaks
        st.markdown(f'<b>{authors[i]}</b>', unsafe_allow_html=True)
        st.write(summaries[i])
        if keywords[i]:
            parts = ["KEYWORDS :"]
            for key in keywords[i]:
                parts.extend([(key, "", "#faa"), " "])
            annotated_text(*parts[:-1])   # drop the trailing separator
        st.text("")
        st.text("")
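
# annotated_text mixes plain strings with (text, label, background-colour)
# tuples, e.g. annotated_text("KEYWORDS :", ("NASA", "", "#faa")).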


def main():
    summarizer = load_model()
    st.title('Briefly')
    with st.expander('Read trending news in less than 60 words...', expanded=True):
        with st.form(key='form1'):
            topic = st.selectbox('Category:', data.topics[2:] + ["World"])   # skip the first two topics; re-add "World" at the end
            location = st.selectbox('Location:', data.locations)
            submit_button = st.form_submit_button()

    if submit_button:
        with st.spinner('Fetching news...'):
            contents, titles, authors, urls = getNews(topic, location)
            summaries = getNewsSummary(contents, summarizer)
            keywords = generateKeyword(contents)
        DisplaySummary(titles, authors, summaries, keywords, urls)


if __name__ == '__main__':
    main()