rahulNenavath305 committed on
Commit
94ec2b7
·
1 Parent(s): 7d32cd5

Added streamlit app.py file

Browse files
Files changed (1) hide show
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import requests
import streamlit as st
from streamlit_lottie import st_lottie


def main() -> None:
    """Render the Article Scraper Streamlit app.

    Page layout: an introduction section, a two-column "How it works"
    section with a Lottie animation, and a form that POSTs the submitted
    article URL to the scraper API endpoint (read from the
    ``scraper-api-endpoint`` environment variable) and displays the
    extracted article fields.
    """

    # ----- Loading Assets ----

    def load_lottieurl(lottie_url: str):
        """Download a Lottie animation JSON.

        Returns the parsed JSON dict, or None if the request fails or the
        server does not answer with HTTP 200 (the caller must tolerate None).
        """
        try:
            # Timeout prevents the app from hanging forever on a dead host.
            r = requests.get(url=lottie_url, timeout=10)
        except requests.RequestException:
            return None
        return r.json() if r.status_code == 200 else None

    def fetch(url: str) -> dict:
        """POST the article URL to the scraper API endpoint.

        Returns the decoded JSON response, or an empty dict on any failure —
        the caller treats a falsy result as "scrape failed".
        """
        try:
            result = requests.post(
                url=os.environ.get('scraper-api-endpoint'),
                json={'url': url},
                timeout=30,
            )
            return result.json()
        except Exception:
            # Best-effort: swallow network/JSON errors and signal failure
            # with an empty dict rather than crashing the UI.
            return {}

    # set_page_config must be the first Streamlit command executed.
    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")

    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
        st.write("This service can be utilised in the data collection / curation process of data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")

    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)

        with left_col:
            st.header("How it works?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                """
                **Working**:
                - Download the HTML content from the given Article URL
                - Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames
                - Arrange Information appropriately
                - Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters
                """
            )

        with right_col:
            # The animation download is best-effort; skip rendering if it failed.
            if lottie_animation is not None:
                st_lottie(lottie_animation, height=500)

    with st.container():
        with st.form("my_form"):
            article_url = st.text_input("Article URL", value="", key="article_url")

            submitted = st.form_submit_button("Submit")

            if submitted:
                with st.spinner('Scraping Information ...'):
                    data = fetch(url=article_url)

                # Guard both a failed fetch ({}) and a response that lacks
                # the expected "scraped_content" payload.
                content = data.get("scraped_content") if data else None
                if content:
                    with st.container():
                        st.write("---")
                        st.title('Extracted Article Information')
                        st.write('**Article Title:**')
                        st.header(f"{content.get('article_title')}")
                        st.write(f"**Author:** {content.get('author')}")
                        st.write(f"**Published Date:** {content.get('publish_date')}")
                        st.write(f"**Description:** {content.get('description')}")
                        st.write(f"**Content:** {content.get('article_content')}")
                        st.write(f"**Article URL:** {content.get('article_url')}")
                        st.write(f"**Canonical URL:** {content.get('canonical_url')}")
                        st.write(f"**Publisher Name:** {content.get('publisher_name')}")
                        st.write(f"**Article Image:** {content.get('image')}")
                        st.write(f"**Article Keywords:** {content.get('keywords')}")
                        st.write(f"**Video URL:** {content.get('video_url')}")
                        st.write(f"**Audio URL:** {content.get('audio_url')}")
                else:
                    st.error("Error")


if __name__ == "__main__":
    main()