rahulNenavath305 committed on
Commit
94ec2b7
·
1 Parent(s): 7d32cd5

Added streamlit app.py file

Browse files
Files changed (1) hide show
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import requests
import streamlit as st
from streamlit_lottie import st_lottie


def main() -> None:
    """Render the Article Scraper Streamlit app.

    Page layout: an introduction section, a two-column "How it works"
    section with a Lottie animation, and a form that POSTs the submitted
    article URL to the scraper API endpoint (read from the
    ``scraper-api-endpoint`` environment variable) and displays the
    extracted article fields.
    """

    # ----- Loading Assets ----

    def load_lottieurl(lottie_url: str):
        """Download a Lottie animation JSON.

        Returns the parsed JSON dict, or None if the request fails or the
        server does not answer with HTTP 200 (the caller must tolerate None).
        """
        try:
            # Timeout prevents the app from hanging forever on a dead host.
            r = requests.get(url=lottie_url, timeout=10)
        except requests.RequestException:
            return None
        return r.json() if r.status_code == 200 else None

    def fetch(url: str) -> dict:
        """POST the article URL to the scraper API endpoint.

        Returns the decoded JSON response, or an empty dict on any failure —
        the caller treats a falsy result as "scrape failed".
        """
        try:
            result = requests.post(
                url=os.environ.get('scraper-api-endpoint'),
                json={'url': url},
                timeout=30,
            )
            return result.json()
        except Exception:
            # Best-effort: swallow network/JSON errors and signal failure
            # with an empty dict rather than crashing the UI.
            return {}

    # set_page_config must be the first Streamlit command executed.
    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")

    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
        st.write("This service can be utilised in the data collection / curation process of data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")

    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)

        with left_col:
            st.header("How it works?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                """
                **Working**:
                - Download the HTML content from the given Article URL
                - Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames
                - Arrange Information appropriately
                - Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters
                """
            )

        with right_col:
            # The animation download is best-effort; skip rendering if it failed.
            if lottie_animation is not None:
                st_lottie(lottie_animation, height=500)

    with st.container():
        with st.form("my_form"):
            article_url = st.text_input("Article URL", value="", key="article_url")

            submitted = st.form_submit_button("Submit")

            if submitted:
                with st.spinner('Scraping Information ...'):
                    data = fetch(url=article_url)

                # Guard both a failed fetch ({}) and a response that lacks
                # the expected "scraped_content" payload.
                content = data.get("scraped_content") if data else None
                if content:
                    with st.container():
                        st.write("---")
                        st.title('Extracted Article Information')
                        st.write('**Article Title:**')
                        st.header(f"{content.get('article_title')}")
                        st.write(f"**Author:** {content.get('author')}")
                        st.write(f"**Published Date:** {content.get('publish_date')}")
                        st.write(f"**Description:** {content.get('description')}")
                        st.write(f"**Content:** {content.get('article_content')}")
                        st.write(f"**Article URL:** {content.get('article_url')}")
                        st.write(f"**Canonical URL:** {content.get('canonical_url')}")
                        st.write(f"**Publisher Name:** {content.get('publisher_name')}")
                        st.write(f"**Article Image:** {content.get('image')}")
                        st.write(f"**Article Keywords:** {content.get('keywords')}")
                        st.write(f"**Video URL:** {content.get('video_url')}")
                        st.write(f"**Audio URL:** {content.get('audio_url')}")
                else:
                    st.error("Error")


if __name__ == "__main__":
    main()