Devika Nair M committed on
Commit
425bb64
·
unverified ·
1 Parent(s): 5284cf4

Add files via upload

Browse files
Files changed (5) hide show
  1. Briefly.ipynb +556 -0
  2. Briefly.py +119 -0
  3. Procfile +1 -0
  4. requirements.txt +6 -0
  5. setup.sh +13 -0
Briefly.ipynb ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Briefly\n",
8
+ "\n",
9
+ "\n",
10
+ "### __Problem Statement__\n",
11
+ "- Obtain news from google news articles\n",
12
+ "- Summarize the articles within 60 words\n",
13
+ "- Obtain keywords from the articles\n",
14
+ "\n",
15
+ "\n",
16
+ "\n",
17
+ "\n",
18
+ "\n",
19
+ "\n",
20
+ "\n",
21
+ "\n",
22
+ "\n",
23
+ "\n",
24
+ "##### Importing all the necessary libraries required to run the following code "
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 1,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "from gnewsclient import gnewsclient # for fetching google news\n",
34
+ "from newspaper import Article # to obtain text from news articles\n",
35
+ "from transformers import pipeline # to summarize text\n",
36
+ "import spacy # for named entity recognition\n",
37
+ "import spacy.displacy as displacy # display keywords"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "metadata": {},
43
+ "source": [
44
+ "##### Load sshleifer/distilbart-cnn-12-6 model"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 2,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "def load_model(): \n",
54
+ " model = pipeline('summarization')\n",
55
+ " return model\n",
56
+ "data = gnewsclient.NewsClient(max_results=0)\n",
57
+ "nlp = spacy.load(\"en_core_web_lg\") "
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {},
63
+ "source": [
64
+ "##### Obtain URLs and their content"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 3,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "def getNews(topic,location): \n",
74
+ " count=0\n",
75
+ " contents=[]\n",
76
+ " titles=[]\n",
77
+ " authors=[]\n",
78
+ " urls=[]\n",
79
+ " data = gnewsclient.NewsClient(language='english',location=location,topic=topic,max_results=10) \n",
80
+ " news = data.get_news() \n",
81
+ " for item in news:\n",
82
+ " url=item['link']\n",
83
+ " article = Article(url)\n",
84
+ " try:\n",
85
+ " article.download()\n",
86
+ " article.parse()\n",
87
+ " temp=item['title'][::-1]\n",
88
+ " index=temp.find(\"-\")\n",
89
+ " temp=temp[:index-1][::-1]\n",
90
+ " urls.append(url)\n",
91
+ " contents.append(article.text)\n",
92
+ " titles.append(item['title'][:-index-1]) \n",
93
+ " authors.append(temp)\n",
94
+ " count+=1\n",
95
+ " if(count==5):\n",
96
+ " break\n",
97
+ " except:\n",
98
+ " continue \n",
99
+ " return contents,titles,authors,urls "
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "metadata": {},
105
+ "source": [
106
+ "##### Summarizes the content- minimum word limit 30 and maximum 60"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 4,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "def getNewsSummary(contents,summarizer): \n",
116
+ " summaries=[] \n",
117
+ " for content in contents:\n",
118
+ " minimum=len(content.split())\n",
119
+ " summaries.append(summarizer(content,max_length=60,min_length=min(30,minimum),do_sample=False,truncation=True)[0]['summary_text']) \n",
120
+ " return summaries"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "markdown",
125
+ "metadata": {},
126
+ "source": [
127
+ "##### Named Entity Recognition"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 5,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# Obtain 4 keywords from content (person,organisation or geopolitical entity) \n",
137
+ "def generateKeyword(contents): \n",
138
+ " keywords=[]\n",
139
+ " words=[] \n",
140
+ " labels=[\"PERSON\",\"ORG\",\"GPE\"]\n",
141
+ " for content in contents:\n",
142
+ " doc=nlp(content)\n",
143
+ " keys=[]\n",
144
+ " limit=0\n",
145
+ " for ent in doc.ents:\n",
146
+ " key=ent.text.upper()\n",
147
+ " label=ent.label_\n",
148
+ " if(key not in words and key not in keywords and label in labels): \n",
149
+ " keys.append(key)\n",
150
+ " limit+=1\n",
151
+ " for element in key.split():\n",
152
+ " words.append(element)\n",
153
+ " if(limit==4):\n",
154
+ " keywords.append(keys)\n",
155
+ " break \n",
156
+ " return keywords\n",
157
+ " "
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "markdown",
162
+ "metadata": {},
163
+ "source": [
164
+ "##### Displaying keywords "
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 6,
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "def printKeywords(keywords):\n",
174
+ " for keyword in keywords:\n",
175
+ " print(keyword)"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "markdown",
180
+ "metadata": {},
181
+ "source": [
182
+ "##### Displaying the Summary with keywords in it highlighted"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 7,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "def printSummary(summaries,titles):\n",
192
+ " colors = {\"ORG\": \"linear-gradient(90deg, #aa9cfc, #fc9ce7)\"}\n",
193
+ " options = {\"ents\": [\"PERSON\",\"ORG\",\"GPE\",\"NORP\",\"PERCENT\"],\"colors\": colors} \n",
194
+ " for summary,title in zip(summaries,titles):\n",
195
+ " doc=nlp(summary) \n",
196
+ " print('\\033[1m' + title.upper() + '\\033[0m\\n')\n",
197
+ " displacy.render(doc, style=\"ent\", options=options,jupyter=True)\n",
198
+ " print(\"\\n\\n\")"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 8,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stderr",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "summarizer=load_model() "
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 9,
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "contents,titles,authors,urls=getNews(\"Sports\",\"India\")"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 10,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "summaries=getNewsSummary(contents,summarizer)"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": 11,
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "keywords=generateKeyword(contents)"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 12,
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "name": "stdout",
252
+ "output_type": "stream",
253
+ "text": [
254
+ "['DWAYNE BRAVO', 'SRI LANKA', 'ICC', 'THE WEST INDIES']\n",
255
+ "['VIRAT KOHLI', 'INDIA', 'SCOTLAND', 'SUPER 12']\n",
256
+ "['AUSTRALIA', 'AFGHANISTAN', 'CRICKET AUSTRALIA', 'CRICBUZZ STAFF •']\n",
257
+ "['GARY STEAD', 'TRENT BOULT', 'COLIN DE GRANDHOMME', 'BLACKCAPS']\n",
258
+ "[\"VIRAT KOHLI'S\", 'TEAM INDIA', 'DHONI', 'UAE']\n"
259
+ ]
260
+ }
261
+ ],
262
+ "source": [
263
+ "printKeywords(keywords)"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 13,
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "name": "stdout",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "\u001b[1mT20 WORLD CUP 2021: WEST INDIES AND CHENNAI SUPER KINGS ALL-ROUNDER DWAYNE BRAVO TO RETIRE AFTER SHOWPIECE... \u001b[0m\n",
276
+ "\n"
277
+ ]
278
+ },
279
+ {
280
+ "data": {
281
+ "text/html": [
282
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\"> \n",
283
+ "<mark class=\"entity\" style=\"background: #c887fb; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
284
+ " West Indies\n",
285
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">NORP</span>\n",
286
+ "</mark>\n",
287
+ " all-rounder \n",
288
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
289
+ " Dwayne Bravo\n",
290
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
291
+ "</mark>\n",
292
+ " will hang his boots at the end of the ICC T20 World Cup 2021 . \n",
293
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
294
+ " Bravo\n",
295
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
296
+ "</mark>\n",
297
+ " told \n",
298
+ "<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
299
+ " ICC\n",
300
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
301
+ "</mark>\n",
302
+ " on the post-match Facebook Live show that he will be drawing the curtains on his international career . \n",
303
+ "<mark class=\"entity\" style=\"background: #c887fb; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
304
+ " West Indies\n",
305
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">NORP</span>\n",
306
+ "</mark>\n",
307
+ " lost to \n",
308
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
309
+ " Sri Lanka\n",
310
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
311
+ "</mark>\n",
312
+ " by 20 runs in</div></span>"
313
+ ],
314
+ "text/plain": [
315
+ "<IPython.core.display.HTML object>"
316
+ ]
317
+ },
318
+ "metadata": {},
319
+ "output_type": "display_data"
320
+ },
321
+ {
322
+ "name": "stdout",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "\n",
326
+ "\n",
327
+ "\n",
328
+ "\u001b[1mHAPPY BIRTHDAY VIRAT KOHLI: INDIAN CRICKET TEAM CAPTAIN TURNS 33 \u001b[0m\n",
329
+ "\n"
330
+ ]
331
+ },
332
+ {
333
+ "data": {
334
+ "text/html": [
335
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\"> \n",
336
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
337
+ " India\n",
338
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
339
+ "</mark>\n",
340
+ " captain \n",
341
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
342
+ " Virat Kohli\n",
343
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
344
+ "</mark>\n",
345
+ " turns 33 on Friday . The 33-year-old is currently leading the national side in the ongoing T20 World Cup . \n",
346
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
347
+ " India\n",
348
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
349
+ "</mark>\n",
350
+ " lost their first two games in the tournament before winning the third . \n",
351
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
352
+ " India\n",
353
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
354
+ "</mark>\n",
355
+ " sit in fourth spot in the Group 2 points table .</div></span>"
356
+ ],
357
+ "text/plain": [
358
+ "<IPython.core.display.HTML object>"
359
+ ]
360
+ },
361
+ "metadata": {},
362
+ "output_type": "display_data"
363
+ },
364
+ {
365
+ "name": "stdout",
366
+ "output_type": "stream",
367
+ "text": [
368
+ "\n",
369
+ "\n",
370
+ "\n",
371
+ "\u001b[1mONE-OFF TEST VS AFGHANISTAN POSTPONED, CONFIRMS CRICKET AUSTRALIA | CRICBUZZ.COM - CRICBUZZ \u001b[0m\n",
372
+ "\n"
373
+ ]
374
+ },
375
+ {
376
+ "data": {
377
+ "text/html": [
378
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\"> \n",
379
+ "<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
380
+ " Cricket Australia's\n",
381
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
382
+ "</mark>\n",
383
+ " one-off Test against \n",
384
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
385
+ " Afghanistan\n",
386
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
387
+ "</mark>\n",
388
+ " has officially been postponed . The historic Test has been hanging in the balance since the CA revealed that they wouldn't support the \n",
389
+ "<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
390
+ " Taliban\n",
391
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
392
+ "</mark>\n",
393
+ " government's stance against the inclusion of women in sports . Instead of cancelling the Test match, CA has vowed to</div></span>"
394
+ ],
395
+ "text/plain": [
396
+ "<IPython.core.display.HTML object>"
397
+ ]
398
+ },
399
+ "metadata": {},
400
+ "output_type": "display_data"
401
+ },
402
+ {
403
+ "name": "stdout",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "\n",
407
+ "\n",
408
+ "\n",
409
+ "\u001b[1mNEW ZEALAND INCLUDE FIVE SPINNERS FOR INDIA TOUR, TRENT BOULT OPTS OUT CITING BUBBLE FATIGUE \u001b[0m\n",
410
+ "\n"
411
+ ]
412
+ },
413
+ {
414
+ "data": {
415
+ "text/html": [
416
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\"> \n",
417
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
418
+ " New Zealand\n",
419
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
420
+ "</mark>\n",
421
+ " name five spinners in 15-man squad for two-Test series against \n",
422
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
423
+ " India\n",
424
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
425
+ "</mark>\n",
426
+ " . Senior pacer \n",
427
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
428
+ " Trent Boult\n",
429
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
430
+ "</mark>\n",
431
+ " and fast-bowling all-rounder \n",
432
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
433
+ " Colin de Grandhomme\n",
434
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
435
+ "</mark>\n",
436
+ " will miss tour due to bio-bubble fatigue . \n",
437
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
438
+ " Ajaz Patel\n",
439
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
440
+ "</mark>\n",
441
+ ", \n",
442
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
443
+ " Will Somerville\n",
444
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
445
+ "</mark>\n",
446
+ " and</div></span>"
447
+ ],
448
+ "text/plain": [
449
+ "<IPython.core.display.HTML object>"
450
+ ]
451
+ },
452
+ "metadata": {},
453
+ "output_type": "display_data"
454
+ },
455
+ {
456
+ "name": "stdout",
457
+ "output_type": "stream",
458
+ "text": [
459
+ "\n",
460
+ "\n",
461
+ "\n",
462
+ "\u001b[1m‘THERE ARE MANY CANDIDATES BUT HE’S THE BEST': SEHWAG PICKS NEXT INDIA CAPTAIN AFTER KOHLI STEPS DOWN AT END OF T20 WC \u001b[0m\n",
463
+ "\n"
464
+ ]
465
+ },
466
+ {
467
+ "data": {
468
+ "text/html": [
469
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\"> \n",
470
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
471
+ " Virat Kohli\n",
472
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
473
+ "</mark>\n",
474
+ " set to step down as T20I captain after this World Cup in \n",
475
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
476
+ " UAE\n",
477
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
478
+ "</mark>\n",
479
+ " and \n",
480
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
481
+ " Oman\n",
482
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
483
+ "</mark>\n",
484
+ " . Many experts are anticipating his deputy \n",
485
+ "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
486
+ " Rohit Sharma\n",
487
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
488
+ "</mark>\n",
489
+ " to fill up the position . Former \n",
490
+ "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
491
+ " India\n",
492
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
493
+ "</mark>\n",
494
+ " opener \n",
495
+ "<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
496
+ " Virender Sehwag\n",
497
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
498
+ "</mark>\n",
499
+ " backed \n",
500
+ "<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
501
+ " Rohit\n",
502
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
503
+ "</mark>\n",
504
+ " as the ideal candidate .</div></span>"
505
+ ],
506
+ "text/plain": [
507
+ "<IPython.core.display.HTML object>"
508
+ ]
509
+ },
510
+ "metadata": {},
511
+ "output_type": "display_data"
512
+ },
513
+ {
514
+ "name": "stdout",
515
+ "output_type": "stream",
516
+ "text": [
517
+ "\n",
518
+ "\n",
519
+ "\n"
520
+ ]
521
+ }
522
+ ],
523
+ "source": [
524
+ "printSummary(summaries,titles)"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": null,
530
+ "metadata": {},
531
+ "outputs": [],
532
+ "source": []
533
+ }
534
+ ],
535
+ "metadata": {
536
+ "kernelspec": {
537
+ "display_name": "Python 3",
538
+ "language": "python",
539
+ "name": "python3"
540
+ },
541
+ "language_info": {
542
+ "codemirror_mode": {
543
+ "name": "ipython",
544
+ "version": 3
545
+ },
546
+ "file_extension": ".py",
547
+ "mimetype": "text/x-python",
548
+ "name": "python",
549
+ "nbconvert_exporter": "python",
550
+ "pygments_lexer": "ipython3",
551
+ "version": "3.8.5"
552
+ }
553
+ },
554
+ "nbformat": 4,
555
+ "nbformat_minor": 4
556
+ }
Briefly.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st #Web App
2
+ from gnewsclient import gnewsclient # for fetching google news
3
+ from newspaper import Article # to obtain text from news articles
4
+ from transformers import pipeline # to summarize text
5
+ import spacy # to obtain keyword
6
+ from annotated_text import annotated_text # to display keywords
7
+
8
+
9
+ # Load sshleifer/distilbart-cnn-12-6 model
10
@st.cache(allow_output_mutation=True)
def load_model():
    """Build and return the Hugging Face summarization pipeline.

    No model name is passed, so ``pipeline`` falls back to its default
    summarization checkpoint. The result is wrapped in ``st.cache`` with
    ``allow_output_mutation=True``.

    Returns:
        The object returned by ``pipeline('summarization')``.
    """
    return pipeline('summarization')
14
+
15
+ data = gnewsclient.NewsClient(max_results=0)  # module-level client; main() reads its .topics and .locations
16
+
17
+
18
+ # Obtain URLs and their content
19
def getNews(topic,location):
    """Fetch up to five Google News articles together with their full text.

    Queries Google News (English, at most 10 results) for ``topic`` in
    ``location``, downloads and parses each linked article with
    ``newspaper.Article``, and stops once five usable articles are collected.

    Args:
        topic: Topic name passed to ``gnewsclient.NewsClient``.
        location: Location name passed to ``gnewsclient.NewsClient``.

    Returns:
        Four parallel lists ``(contents, titles, authors, urls)``:
        article body text, headline, publisher name taken from the feed
        title, and article URL. Items that cannot be downloaded/parsed, or
        whose parsed text is empty, are skipped.
    """
    max_articles = 5
    contents = []
    titles = []
    authors = []
    urls = []

    client = gnewsclient.NewsClient(language='english', location=location,
                                    topic=topic, max_results=10)
    for item in client.get_news():
        url = item['link']
        article = Article(url)
        try:
            raw_title = item['title']
            article.download()
            article.parse()
        except Exception:
            # Best effort: a broken feed item or an unreachable page must not
            # abort the whole fetch. Unlike a bare ``except:``, this still
            # lets KeyboardInterrupt/SystemExit propagate.
            continue

        text = article.text
        if not text or not text.strip():
            # Nothing to summarize (e.g. the page yielded no body text).
            continue

        # Feed titles appear to be "<headline> - <publisher>"; split on the
        # LAST " - " so hyphens inside the headline or publisher survive.
        headline, separator, publisher = raw_title.rpartition(" - ")
        if not separator:
            # No publisher suffix: keep the whole title instead of ending up
            # with an empty headline.
            headline, publisher = raw_title, ""

        urls.append(url)
        contents.append(text)
        titles.append(headline.strip())
        authors.append(publisher.strip())

        if len(contents) == max_articles:
            break

    return contents, titles, authors, urls
46
+
47
+
48
+ # Summarizes the content- minimum word limit 30 and maximum 60
49
def getNewsSummary(contents,summarizer,max_length=60,min_length=30):
    """Summarize each article text with the supplied summarizer.

    Args:
        contents: Iterable of article texts (strings).
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False,
            truncation=True)`` that returns a sequence whose first element is
            a mapping with a ``'summary_text'`` key.
        max_length: Upper length bound forwarded to the summarizer
            (default 60, the previously hard-coded value).
        min_length: Lower length bound forwarded to the summarizer
            (default 30, the previously hard-coded value).

    Returns:
        A list of summary strings, one per entry of ``contents``, in order.
    """
    summaries = []
    for content in contents:
        word_count = len(content.split())
        # Never demand a longer summary than the text has words, and never
        # let the lower bound exceed the upper bound.
        lower_bound = min(min_length, word_count, max_length)
        result = summarizer(
            content,
            max_length=max_length,
            min_length=lower_bound,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return summaries
55
+
56
+
57
+ # Obtain 4 keywords from content (person,organisation or geopolitical entity)
58
def generateKeyword(contents):
    """Extract up to four named-entity keywords per article.

    Runs the spaCy ``en_core_web_lg`` pipeline over each text and keeps, in
    document order, the upper-cased text of entities labelled ``PERSON``,
    ``ORG`` or ``GPE``.

    An entity is skipped when its upper-cased text equals a single word of a
    keyword that was already accepted, in this article or an earlier one
    (the seen-word set is shared across all of ``contents``).

    Args:
        contents: Iterable of article texts (strings).

    Returns:
        A list with exactly one keyword list per article, in the same order
        as ``contents``. Each inner list has between 0 and 4 entries, so the
        result always stays index-aligned with the articles; previously an
        article with fewer than four entities produced no entry at all.
    """
    max_keywords = 4
    wanted_labels = {"PERSON", "ORG", "GPE"}
    # NOTE(review): the model is loaded on every call, which is slow; consider
    # caching it alongside the summarizer.
    nlp = spacy.load("en_core_web_lg")

    keywords = []
    seen_words = set()  # individual words of every keyword accepted so far
    for content in contents:
        keys = []
        for ent in nlp(content).ents:
            key = ent.text.upper()
            if ent.label_ not in wanted_labels or key in seen_words:
                continue
            keys.append(key)
            seen_words.update(key.split())
            if len(keys) == max_keywords:
                break
        # Always record a list, even a short or empty one, so that
        # keywords[i] belongs to contents[i].
        keywords.append(keys)
    return keywords
79
+
80
+
81
+ # Display title,author and summary in streamlit
82
def DisplaySummary(titles,authors,summaries,keywords,urls,limit=5):
    """Render each article's title, publisher, summary and keywords in Streamlit.

    Args:
        titles: Headlines, one per article.
        authors: Publisher names, one per article.
        summaries: Summary strings, one per article.
        keywords: Keyword lists, one per article.
        urls: Article URLs, one per article.
        limit: Maximum number of articles to render (default 5, the
            previously hard-coded value).

    The five sequences are read in lockstep; rendering stops at the shortest
    one or at ``limit``, whichever comes first.
    """
    import html  # local import: only needed for escaping the author markup

    rows = list(zip(titles, authors, summaries, keywords, urls))[:limit]
    for title, author, summary, keys, url in rows:
        st.text("")
        # Markdown links need "[text](url)" with no space between the parts.
        st.subheader(f'[{title}]({url})')
        # The author text comes from the news feed, so escape it before
        # embedding it in raw HTML.
        st.markdown(f'<b>{html.escape(author)}</b>',unsafe_allow_html=True)
        st.write(summary)
        if keys:
            parts = ["KEYWORDS :"]
            for position, key in enumerate(keys):
                if position:
                    parts.append(" ")
                parts.append((key, "", "#faa"))
            annotated_text(*parts)
        st.text("")
        st.text("")
99
+
100
+
101
+ def main():  # Streamlit entry point: build the form, then fetch, summarize and display news on submit
102
+ summarizer=load_model()  # summarization pipeline from load_model()
103
+ st.title('Briefly')
104
+ with st.expander('Read trending news in less than 60 words...', expanded=True):
105
+ with st.form(key='form1'):
106
+ topic=st.selectbox('Category:',data.topics[2:]+["World"])  # skips the first two entries of data.topics and appends "World"
107
+ location=st.selectbox('Location:',data.locations)
108
+ submit_button=st.form_submit_button()
109
+
110
+ if submit_button:  # NOTE(review): original indentation is not visible here; confirm which `with` blocks enclose this branch
111
+ with st.spinner('Fetching news...'):
112
+ contents,titles,authors,urls=getNews(topic,location)
113
+ summaries=getNewsSummary(contents,summarizer)
114
+ keywords=generateKeyword(contents)  # keywords are extracted from the full article texts, not from the summaries
115
+ DisplaySummary(titles,authors,summaries,keywords,urls)
116
+
117
+
118
+ if __name__ == '__main__':  # run main() only when executed as a script
119
+ main()
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: sh setup.sh && streamlit run Briefly.py
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ gnewsclient
3
+ transformers
4
+ newspaper3k
5
+ spacy
6
+ annotated_text
setup.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Write Streamlit's per-user configuration files before the app is started.
mkdir -p ~/.streamlit/

# printf is used instead of `echo "...\n..."`: whether echo expands "\n" is
# implementation-defined, so some shells would write a literal backslash-n
# and produce invalid TOML.
# NOTE(review): the e-mail value looks like a redacted placeholder - confirm.
printf '[general]\nemail = "%s"\n' "[email protected]" > ~/.streamlit/credentials.toml

# Fall back to 8501 when $PORT is unset so that the "port" key never ends up
# with an empty value.
printf '[server]\nheadless = true\nenableCORS=false\nport = %s\n' "${PORT:-8501}" > ~/.streamlit/config.toml