thunder-007 committed on
Commit
96dd5a5
·
1 Parent(s): 70ab479
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +41 -0
  3. requirements.txt +69 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import os
5
+ import tempfile
6
+ import re
7
+
8
# Heading text handed to gr.Interface as its ``title``.
title = "Web Scraper"

# Description handed to gr.Interface; currently just a single newline
# (an empty placeholder block).
description = "\n"
11
+
12
+
13
def get_from_url(url, timeout=30):
    """Download a web page and save its text content to a temporary file.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        Path of the temporary ``.txt`` file holding the extracted text, or
        ``None`` when the URL is blank or the page could not be fetched,
        parsed, or written.
    """
    # A blank entry (e.g. an empty line in a pasted list) can never resolve,
    # so skip the network round-trip entirely.
    url = url.strip() if url else ""
    if not url:
        return None

    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server and the caller would hang with it.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator="\n")

        # delete=False: the file has to outlive this function so the caller
        # can still read it from the returned path.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8"
        ) as file:
            filename = file.name
            file.write(text)

        return filename  # Return the temporary file path
    except requests.exceptions.RequestException as e:
        print("Error fetching the URL:", e)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any parsing or file-system failure
        # is reported and turned into None instead of propagating.
        print("Error:", e)
        return None
31
+
32
+
33
def extract_text_from_html(urls):
    """Scrape every URL listed in *urls*, one URL per line.

    Args:
        urls: Newline-separated URLs. Blank lines and surrounding
            whitespace (including a trailing carriage return) are ignored.

    Returns:
        List of temporary file paths, one per URL that was scraped
        successfully, in input order. Empty when there is no usable URL.
    """
    # splitlines() copes with "\r\n" pastes; strip() removes stray spaces.
    candidates = [line.strip() for line in (urls or "").splitlines()]
    scraped = (get_from_url(url) for url in candidates if url)
    # get_from_url signals failure with None; keep only real file paths so
    # the result is a clean list of paths.
    return [path for path in scraped if path is not None]
35
+
36
+
37
# Gradio UI: a single textbox taking newline-separated URLs, wired to
# extract_text_from_html, with a file component for the resulting text files.
interface = gr.Interface(
    fn=extract_text_from_html,
    inputs=[gr.Textbox(label="Url input")],
    outputs=[gr.File(label="Scraped Text")],
    title=title,
    description=description,
)

# Only start the server when run as a script, so importing this module
# (e.g. for testing) does not launch it.
if __name__ == "__main__":
    interface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ annotated-types==0.5.0
6
+ anyio==3.7.1
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ beautifulsoup4==4.12.2
10
+ certifi==2023.7.22
11
+ charset-normalizer==3.2.0
12
+ click==8.1.6
13
+ contourpy==1.1.0
14
+ cycler==0.11.0
15
+ exceptiongroup==1.1.2
16
+ fastapi==0.100.1
17
+ ffmpy==0.3.1
18
+ filelock==3.12.2
19
+ fonttools==4.41.1
20
+ frozenlist==1.4.0
21
+ fsspec==2023.6.0
22
+ gradio==3.39.0
23
+ gradio_client==0.3.0
24
+ h11==0.14.0
25
+ httpcore==0.17.3
26
+ httpx==0.24.1
27
+ huggingface-hub==0.16.4
28
+ idna==3.4
29
+ Jinja2==3.1.2
30
+ jsonschema==4.18.4
31
+ jsonschema-specifications==2023.7.1
32
+ kiwisolver==1.4.4
33
+ linkify-it-py==2.0.2
34
+ markdown-it-py==2.2.0
35
+ MarkupSafe==2.1.3
36
+ matplotlib==3.7.2
37
+ mdit-py-plugins==0.3.3
38
+ mdurl==0.1.2
39
+ multidict==6.0.4
40
+ numpy==1.25.1
41
+ orjson==3.9.2
42
+ packaging==23.1
43
+ pandas==2.0.3
44
+ Pillow==10.0.0
45
+ pydantic==2.1.1
46
+ pydantic_core==2.4.0
47
+ pydub==0.25.1
48
+ pyparsing==3.0.9
49
+ python-dateutil==2.8.2
50
+ python-multipart==0.0.6
51
+ pytz==2023.3
52
+ PyYAML==6.0.1
53
+ referencing==0.30.0
54
+ requests==2.31.0
55
+ rpds-py==0.9.2
56
+ semantic-version==2.10.0
57
+ six==1.16.0
58
+ sniffio==1.3.0
59
+ soupsieve==2.4.1
60
+ starlette==0.27.0
61
+ toolz==0.12.0
62
+ tqdm==4.65.0
63
+ typing_extensions==4.7.1
64
+ tzdata==2023.3
65
+ uc-micro-py==1.0.2
66
+ urllib3==2.0.4
67
+ uvicorn==0.23.1
68
+ websockets==11.0.3
69
+ yarl==1.9.2