def get_news_article(headline):
    """Return the first scrapable full article Google News finds for a headline.

    The headline is searched through ``GNews.get_news``; the hits are then
    tried in order and the first one whose full text can be retrieved wins.

    Parameters
    ----------
    headline
        Search text handed to ``GNews.get_news``.

    Returns
    -------
    list
        Always a one-element list holding a ``[url, title, content]`` list:
        the scraped article on success, otherwise a row of placeholder
        strings ('no ... found' when the search returned nothing,
        'cannot scrape the ...' when no hit could be scraped).
    """
    # Initialize GNews client
    gnews = GNews()
    articles = gnews.get_news(headline)

    # No articles found for the given title.
    if not articles:
        return [['no url found', 'no title found', 'no content found']]

    for candidate in articles:
        try:
            article = gnews.get_full_article(candidate['url'])
            # NOTE(review): the captured run output ("An error occurred while
            # fetching the article: ...") suggests get_full_article reports
            # download failures itself and hands back nothing usable, so skip
            # such hits explicitly instead of relying on an AttributeError.
            if article is None or not article.text:
                continue
            row = [article.url, article.title, article.text]
        except Exception:
            # Deliberate best-effort: any failure on this link (blocked site,
            # malformed entry, parser error) just means "try the next hit".
            continue
        return [row]

    # Every hit was blocked by the website or came back without text.
    return [['cannot scrape the url', 'cannot scrape the title', 'cannot scrape the content']]
"2a8c4229-adc3-42e7-83e1-6593be1977ec", + "metadata": {}, + "outputs": [], + "source": [ + "# drop the duplicated news\n", + "duplicates = df.duplicated(subset=['Year', 'Headline', 'Region'], keep='first')\n", + "df_uni = df[~duplicates]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eb663cdd-b6c7-4468-9f69-f6e1c818ccd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5710, 17)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_uni.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8520b8bf-e6d0-4533-9b0a-d5003a5e2fe3", + "metadata": {}, + "outputs": [], + "source": [ + "print(datetime.datetime.now())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f3c35a3c-d852-4285-86bf-3f7db7df78e7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":10: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " tes.at[index, 'url'] = results[0][0]\n", + ":11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " tes.at[index, 'title'] = results[0][1]\n", + ":12: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " tes.at[index, 'content'] = results[0][2]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/china/major-container-ports-eastern-china-see-worsening-congestion-after-covid-cases-2021-08-12/ on URL https://news.google.com/rss/articles/CBMifmh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2NoaW5hL21ham9yLWNvbnRhaW5lci1wb3J0cy1lYXN0ZXJuLWNoaW5hLXNlZS13b3JzZW5pbmctY29uZ2VzdGlvbi1hZnRlci1jb3ZpZC1jYXNlcy0yMDIxLTA4LTEyL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/article/britain-equinor-oil-strike-idUKL8N1UG4QA/ on URL https://news.google.com/rss/articles/CBMiSWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2FydGljbGUvYnJpdGFpbi1lcXVpbm9yLW9pbC1zdHJpa2UtaWRVS0w4TjFVRzRRQS_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.moneyweb.co.za/news/south-africa/it-will-take-months-to-clear-durban-port-backlog/ on URL https://news.google.com/rss/articles/CBMiXmh0dHBzOi8vd3d3Lm1vbmV5d2ViLmNvLnphL25ld3Mvc291dGgtYWZyaWNhL2l0LXdpbGwtdGFrZS1tb250aHMtdG8tY2xlYXItZHVyYmFuLXBvcnQtYmFja2xvZy_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 11:42:16 AM - fromstring() returned an invalid string: \n", + "\n", + "\n", + " ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: 
https://www.reuters.com/world/europe/one-dead-cargo-ship-fire-electric-car-suspected-source-dutch-coastguard-2023-07-26/ on URL https://news.google.com/rss/articles/CBMieGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2V1cm9wZS9vbmUtZGVhZC1jYXJnby1zaGlwLWZpcmUtZWxlY3RyaWMtY2FyLXN1c3BlY3RlZC1zb3VyY2UtZHV0Y2gtY29hc3RndWFyZC0yMDIzLTA3LTI2L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://news.usni.org/2023/05/30/authorities-detain-chinese-ship-suspected-of-salvaging-u-k-wwii-wrecks on URL https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vbmV3cy51c25pLm9yZy8yMDIzLzA1LzMwL2F1dGhvcml0aWVzLWRldGFpbi1jaGluZXNlLXNoaXAtc3VzcGVjdGVkLW9mLXNhbHZhZ2luZy11LWstd3dpaS13cmVja3PSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 11:47:56 AM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\t...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.abc.net.au/news/2023-02-18/sydney-storm-outages/101994636 on URL https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3LmFiYy5uZXQuYXUvbmV3cy8yMDIzLTAyLTE4L3N5ZG5leS1zdG9ybS1vdXRhZ2VzLzEwMTk5NDYzNtIBKGh0dHBzOi8vYW1wLmFiYy5uZXQuYXUvYXJ0aWNsZS8xMDE5OTQ2MzY?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with HTTPSConnectionPool(host='www.aljazeera.com', port=443): Read timed out. 
(read timeout=7) on URL https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vbmV3cy8yMDE4LzEwLzIyL3llbWVuLWRlYXRoLXRvbGwtZnJvbS10cm9waWNhbC1zdG9ybS1sdWJhbi1yaXNlcy10by0xMtIBZGh0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vYW1wL25ld3MvMjAxOC8xMC8yMi95ZW1lbi1kZWF0aC10b2xsLWZyb20tdHJvcGljYWwtc3Rvcm0tbHViYW4tcmlzZXMtdG8tMTI?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.iol.co.za/business-report/economy/durban-container-terminal-pier-2-backlog-reduced-to-five-vessels-at-anchor-transnet-fa3b7dbe-3275-4afc-9244-4b8ef0b88822 on URL https://news.google.com/rss/articles/CBMipgFodHRwczovL3d3dy5pb2wuY28uemEvYnVzaW5lc3MtcmVwb3J0L2Vjb25vbXkvZHVyYmFuLWNvbnRhaW5lci10ZXJtaW5hbC1waWVyLTItYmFja2xvZy1yZWR1Y2VkLXRvLWZpdmUtdmVzc2Vscy1hdC1hbmNob3ItdHJhbnNuZXQtZmEzYjdkYmUtMzI3NS00YWZjLTkyNDQtNGI4ZWYwYjg4ODIy0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/sustainable-business/south-africas-busiest-port-durban-hobbled-by-strike-2022-10-12/ on URL https://news.google.com/rss/articles/CBMidWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL3N1c3RhaW5hYmxlLWJ1c2luZXNzL3NvdXRoLWFmcmljYXMtYnVzaWVzdC1wb3J0LWR1cmJhbi1ob2JibGVkLWJ5LXN0cmlrZS0yMDIyLTEwLTEyL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:05:17 PM - fromstring() returned an invalid string: \n", + "\n", + "\t...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/asia-pacific/pakistan-election-candidate-shot-dead-while-campaigning-2024-01-10/ on URL 
https://news.google.com/rss/articles/CBMibmh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2FzaWEtcGFjaWZpYy9wYWtpc3Rhbi1lbGVjdGlvbi1jYW5kaWRhdGUtc2hvdC1kZWFkLXdoaWxlLWNhbXBhaWduaW5nLTIwMjQtMDEtMTAv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/toyota-resume-operations-remaining-halted-plants-tuesday-2023-10-23/ on URL https://news.google.com/rss/articles/CBMiemh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL2F1dG9zLXRyYW5zcG9ydGF0aW9uL3RveW90YS1yZXN1bWUtb3BlcmF0aW9ucy1yZW1haW5pbmctaGFsdGVkLXBsYW50cy10dWVzZGF5LTIwMjMtMTAtMjMv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.timesofisrael.com/liveblog-december-18-2013/ on URL https://news.google.com/rss/articles/CBMiOGh0dHBzOi8vd3d3LnRpbWVzb2Zpc3JhZWwuY29tL2xpdmVibG9nLWRlY2VtYmVyLTE4LTIwMTMv0gE8aHR0cHM6Ly93d3cudGltZXNvZmlzcmFlbC5jb20vbGl2ZWJsb2ctZGVjZW1iZXItMTgtMjAxMy9hbXAv?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:09:23 PM - fromstring() returned an invalid string: \n", + "\n", + "\t...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/uaw-expands-strike-against-gm-walking-out-texas-suv-plant-2023-10-24/ on URL https://news.google.com/rss/articles/CBMie2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL2F1dG9zLXRyYW5zcG9ydGF0aW9uL3Vhdy1leHBhbmRzLXN0cmlrZS1hZ2FpbnN0LWdtLXdhbGtpbmctb3V0LXRleGFzLXN1di1wbGFudC0yMDIzLTEwLTI0L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP 
Forbidden for url: https://www.reuters.com/business/autos-transportation/hyundai-motor-south-korea-union-reach-tentative-wage-deal-union-official-2023-09-12/ on URL https://news.google.com/rss/articles/CBMiigFodHRwczovL3d3dy5yZXV0ZXJzLmNvbS9idXNpbmVzcy9hdXRvcy10cmFuc3BvcnRhdGlvbi9oeXVuZGFpLW1vdG9yLXNvdXRoLWtvcmVhLXVuaW9uLXJlYWNoLXRlbnRhdGl2ZS13YWdlLWRlYWwtdW5pb24tb2ZmaWNpYWwtMjAyMy0wOS0xMi_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://news.usni.org/2018/06/13/trump-kim-summit-statement-casts-doubt-on-us-navy-korea-visits-in-doubt on URL https://news.google.com/rss/articles/CBMiaGh0dHBzOi8vbmV3cy51c25pLm9yZy8yMDE4LzA2LzEzL3RydW1wLWtpbS1zdW1taXQtc3RhdGVtZW50LWNhc3RzLWRvdWJ0LW9uLXVzLW5hdnkta29yZWEtdmlzaXRzLWluLWRvdWJ00gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/asia-pacific/philippines-lifts-tsunami-alert-after-magnitude-74-earthquake-2023-12-03/ on URL https://news.google.com/rss/articles/CBMidGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2FzaWEtcGFjaWZpYy9waGlsaXBwaW5lcy1saWZ0cy10c3VuYW1pLWFsZXJ0LWFmdGVyLW1hZ25pdHVkZS03NC1lYXJ0aHF1YWtlLTIwMjMtMTItMDMv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:13:45 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\t...\n", + "03/11/2024 12:19:12 PM - fromstring() returned an invalid string: \n", + "\n", + " ...\n", + "03/11/2024 12:21:15 PM - fromstring() returned an invalid string: \n", + "\n", + " ...\n", + "03/11/2024 12:21:24 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\t...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An 
error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.houstonchronicle.com/news/houston-weather/article/Live-weather-updates-Houston-snow-winter-storm-15951522.php on URL https://news.google.com/rss/articles/CBMieWh0dHBzOi8vd3d3LmhvdXN0b25jaHJvbmljbGUuY29tL25ld3MvaG91c3Rvbi13ZWF0aGVyL2FydGljbGUvTGl2ZS13ZWF0aGVyLXVwZGF0ZXMtSG91c3Rvbi1zbm93LXdpbnRlci1zdG9ybS0xNTk1MTUyMi5waHDSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:38:54 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + " ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.news.com.au/technology/environment/smoke-settles-over-sydney-as-firefighters-rush-to-reduce-fire-risk-ahead-of-warmer-weather/news-story/3c641dedab3af67e9fd9847176783500 on URL https://news.google.com/rss/articles/CBMitQFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90ZWNobm9sb2d5L2Vudmlyb25tZW50L3Ntb2tlLXNldHRsZXMtb3Zlci1zeWRuZXktYXMtZmlyZWZpZ2h0ZXJzLXJ1c2gtdG8tcmVkdWNlLWZpcmUtcmlzay1haGVhZC1vZi13YXJtZXItd2VhdGhlci9uZXdzLXN0b3J5LzNjNjQxZGVkYWIzYWY2N2U5ZmQ5ODQ3MTc2NzgzNTAw0gG5AWh0dHBzOi8vd3d3Lm5ld3MuY29tLmF1L3RlY2hub2xvZ3kvZW52aXJvbm1lbnQvc21va2Utc2V0dGxlcy1vdmVyLXN5ZG5leS1hcy1maXJlZmlnaHRlcnMtcnVzaC10by1yZWR1Y2UtZmlyZS1yaXNrLWFoZWFkLW9mLXdhcm1lci13ZWF0aGVyL25ld3Mtc3RvcnkvM2M2NDFkZWRhYjNhZjY3ZTlmZDk4NDcxNzY3ODM1MDA_YW1w?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.news.com.au/travel/travel-updates/incidents/fresh-chaos-at-sydney-airport-as-16-more-flights-cancelled/news-story/6d860f4f59f990bdc3fb5c57b7e72798 on URL 
https://news.google.com/rss/articles/CBMingFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90cmF2ZWwvdHJhdmVsLXVwZGF0ZXMvaW5jaWRlbnRzL2ZyZXNoLWNoYW9zLWF0LXN5ZG5leS1haXJwb3J0LWFzLTE2LW1vcmUtZmxpZ2h0cy1jYW5jZWxsZWQvbmV3cy1zdG9yeS82ZDg2MGY0ZjU5Zjk5MGJkYzNmYjVjNTdiN2U3Mjc5ONIBogFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90cmF2ZWwvdHJhdmVsLXVwZGF0ZXMvaW5jaWRlbnRzL2ZyZXNoLWNoYW9zLWF0LXN5ZG5leS1haXJwb3J0LWFzLTE2LW1vcmUtZmxpZ2h0cy1jYW5jZWxsZWQvbmV3cy1zdG9yeS82ZDg2MGY0ZjU5Zjk5MGJkYzNmYjVjNTdiN2U3Mjc5OD9hbXA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:46:02 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " ...\n", + "03/11/2024 12:50:13 PM - fromstring() returned an invalid string: \n", + " ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://english.ahram.org.eg/NewsContent/1/64/404393/Egypt/Politics-/The-Little-Sun-gale-to-drench-Alexandria-for-three.aspx on URL https://news.google.com/rss/articles/CBMifGh0dHBzOi8vZW5nbGlzaC5haHJhbS5vcmcuZWcvTmV3c0NvbnRlbnQvMS82NC80MDQzOTMvRWd5cHQvUG9saXRpY3MtL1RoZS1MaXR0bGUtU3VuLWdhbGUtdG8tZHJlbmNoLUFsZXhhbmRyaWEtZm9yLXRocmVlLmFzcHjSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.legacy.com/us/obituaries/stltoday/name/james-bagby-obituary?id=54529291 on URL https://news.google.com/rss/articles/CBMiU2h0dHBzOi8vd3d3LmxlZ2FjeS5jb20vdXMvb2JpdHVhcmllcy9zdGx0b2RheS9uYW1lL2phbWVzLWJhZ2J5LW9iaXR1YXJ5P2lkPTU0NTI5Mjkx0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/uk/advisor/travel-insurance/2024/01/02/travel-latest-news/ on URL 
https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vdWsvYWR2aXNvci90cmF2ZWwtaW5zdXJhbmNlLzIwMjQvMDEvMDIvdHJhdmVsLWxhdGVzdC1uZXdzL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:50:37 PM - fromstring() returned an invalid string: \n", + "\n", + " ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/europe/port-antwerp-disrupted-by-belgian-farmers-protests-2024-02-13/ on URL https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2V1cm9wZS9wb3J0LWFudHdlcnAtZGlzcnVwdGVkLWJ5LWJlbGdpYW4tZmFybWVycy1wcm90ZXN0cy0yMDI0LTAyLTEzL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://newsbook.com.mt/en/bridge-collapse-blocks-brussels-scheldt-canal-traffic/ on URL https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vbmV3c2Jvb2suY29tLm10L2VuL2JyaWRnZS1jb2xsYXBzZS1ibG9ja3MtYnJ1c3NlbHMtc2NoZWxkdC1jYW5hbC10cmFmZmljL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.ajot.com/news/msc-adds-new-container-service-through-jaxport on URL https://news.google.com/rss/articles/CBMiSGh0dHBzOi8vd3d3LmFqb3QuY29tL25ld3MvbXNjLWFkZHMtbmV3LWNvbnRhaW5lci1zZXJ2aWNlLXRocm91Z2gtamF4cG9ydNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://newsbook.com.mt/en/cargo-ship-runs-aground-in-istanbuls-bosphorus-strait/ on URL 
https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vbmV3c2Jvb2suY29tLm10L2VuL2NhcmdvLXNoaXAtcnVucy1hZ3JvdW5kLWluLWlzdGFuYnVscy1ib3NwaG9ydXMtc3RyYWl0L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.timesofisrael.com/liveblog-february-18-2024/ on URL https://news.google.com/rss/articles/CBMiOGh0dHBzOi8vd3d3LnRpbWVzb2Zpc3JhZWwuY29tL2xpdmVibG9nLWZlYnJ1YXJ5LTE4LTIwMjQv0gE8aHR0cHM6Ly93d3cudGltZXNvZmlzcmFlbC5jb20vbGl2ZWJsb2ctZmVicnVhcnktMTgtMjAyNC9hbXAv?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://sunnewsonline.com/piracy-navy-omsl-rescue-chinese-cargo-ship-off-nigerias-coast/ on URL https://news.google.com/rss/articles/CBMiWGh0dHBzOi8vc3VubmV3c29ubGluZS5jb20vcGlyYWN5LW5hdnktb21zbC1yZXNjdWUtY2hpbmVzZS1jYXJnby1zaGlwLW9mZi1uaWdlcmlhcy1jb2FzdC_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.caranddriver.com/features/a35877638/golden-ray-final-voyage/ on URL https://news.google.com/rss/articles/CBMiSGh0dHBzOi8vd3d3LmNhcmFuZGRyaXZlci5jb20vZmVhdHVyZXMvYTM1ODc3NjM4L2dvbGRlbi1yYXktZmluYWwtdm95YWdlL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsweek.com/watch-massive-container-ships-collide-losing-cargo-overboard-852319 on URL https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vd3d3Lm5ld3N3ZWVrLmNvbS93YXRjaC1tYXNzaXZlLWNvbnRhaW5lci1zaGlwcy1jb2xsaWRlLWxvc2luZy1jYXJnby1vdmVyYm9hcmQtODUyMzE50gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: 
https://www.hapag-lloyd.com/es/company/about-us/newsletter/2023/09/-i-love-the-sailor-s-life-----jan-rusch.html on URL https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3LmhhcGFnLWxsb3lkLmNvbS9lcy9jb21wYW55L2Fib3V0LXVzL25ld3NsZXR0ZXIvMjAyMy8wOS8taS1sb3ZlLXRoZS1zYWlsb3Itcy1saWZlLS0tLS1qYW4tcnVzY2guaHRtbNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 429 Client Error: for url: https://www.rivieramm.com/opinion/opinion/lof-award-supports-french-etv-services-77561 on URL https://news.google.com/rss/articles/CBMiVmh0dHBzOi8vd3d3LnJpdmllcmFtbS5jb20vb3Bpbmlvbi9vcGluaW9uL2xvZi1hd2FyZC1zdXBwb3J0cy1mcmVuY2gtZXR2LXNlcnZpY2VzLTc3NTYx0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://english.alarabiya.net/business/economy/2023/06/21/Suez-Canal-revenues-reach-all-time-high-of-9-4-bln-Official on URL https://news.google.com/rss/articles/CBMidWh0dHBzOi8vZW5nbGlzaC5hbGFyYWJpeWEubmV0L2J1c2luZXNzL2Vjb25vbXkvMjAyMy8wNi8yMS9TdWV6LUNhbmFsLXJldmVudWVzLXJlYWNoLWFsbC10aW1lLWhpZ2gtb2YtOS00LWJsbi1PZmZpY2lhbNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 12:55:18 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\t...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "An error occurred while fetching the article: Article `download()` failed with ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) on URL https://news.google.com/rss/articles/CBMiP2h0dHBzOi8vd3d3LmFuZHJvaWRwb2xpY2UuY29tL2F2b2lkLWZhY2Vib29rLW1hcmtldHBsYWNlLXNjYW1zL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", + "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: 
https://www.fbi.gov/stats-services/publications/financial-crimes-report-2009 on URL https://news.google.com/rss/articles/CBMiTGh0dHBzOi8vd3d3LmZiaS5nb3Yvc3RhdHMtc2VydmljZXMvcHVibGljYXRpb25zL2ZpbmFuY2lhbC1jcmltZXMtcmVwb3J0LTIwMDnSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/11/2024 01:06:27 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "<...\n", + "03/11/2024 01:14:37 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\t...\n", + "03/11/2024 01:18:21 PM - fromstring() returned an invalid string: \n", + "\n", + "\n", + "\n", + "\n", + "