acecalisto3 committed on
Commit
35d78ec
·
verified ·
1 Parent(s): 19ffead

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -24
app.py CHANGED
@@ -1,31 +1,19 @@
1
-
2
  import json
3
  import os
4
  import re
5
  import time
6
- try:
7
- import matplotlib
8
- except ImportError:
9
- import subprocess
10
- subprocess.run(["pip", "install", "matplotlib"])
11
- import matplotlib
12
  import logging
13
  import mimetypes
14
- import concurrent.futures
15
- import string
16
  import zipfile
17
  import tempfile
18
  from datetime import datetime
19
- from typing import List, Dict, Optional, Union
20
  from pathlib import Path
21
- from urllib.parse import urlparse
22
  import requests
23
  import validators
24
  import gradio as gr
25
- from diskcache import Cache
26
  from bs4 import BeautifulSoup
27
  from fake_useragent import UserAgent
28
- from ratelimit import limits, sleep_and_retry
29
  from cleantext import clean
30
 
31
  # Setup logging with detailed configuration
@@ -44,7 +32,7 @@ class URLProcessor:
44
  self.session = requests.Session()
45
  self.timeout = 10 # seconds
46
  self.session.headers.update({
47
- 'User-Agent': UserAgent().random,
48
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
49
  'Accept-Language': 'en-US,en;q=0.5',
50
  'Accept-Encoding': 'gzip, deflate, br',
@@ -92,15 +80,10 @@ class URLProcessor:
92
  def fetch_content(self, url: str) -> Optional[Dict]:
93
  """Universal content fetcher with special case handling"""
94
  try:
95
- # Google Drive document handling
96
  if 'drive.google.com' in url:
97
  return self._handle_google_drive(url)
98
-
99
- # Google Calendar ICS handling
100
  if 'calendar.google.com' in url and 'ical' in url:
101
  return self._handle_google_calendar(url)
102
-
103
- # Standard HTML processing
104
  return self._fetch_html_content(url)
105
  except Exception as e:
106
  logger.error(f"Content fetch failed: {e}")
@@ -115,7 +98,7 @@ class URLProcessor:
115
  return None
116
 
117
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
118
- response = self.session.get(direct_url, timeout=self.timeout)
119
  response.raise_for_status()
120
 
121
  return {
@@ -149,14 +132,11 @@ class URLProcessor:
149
 
150
  soup = BeautifulSoup(response.text, 'html.parser')
151
 
152
- # Remove unwanted elements
153
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
154
  element.decompose()
155
 
156
- # Extract main content
157
  main_content = soup.find('main') or soup.find('article') or soup.body
158
 
159
- # Clean and structure content
160
  text_content = main_content.get_text(separator='\n', strip=True)
161
  cleaned_content = self.advanced_text_cleaning(text_content)
162
 
@@ -336,7 +316,6 @@ def create_interface():
336
  json.dump(results, f, ensure_ascii=False, indent=2)
337
 
338
  summary = f"Processed {len(results)} items successfully!"
339
- # Convert Path object to string here
340
  return str(output_path), summary
341
  else:
342
  return None, "No valid content to process."
 
 
1
  import json
2
  import os
3
  import re
4
  import time
 
 
 
 
 
 
5
  import logging
6
  import mimetypes
 
 
7
  import zipfile
8
  import tempfile
9
  from datetime import datetime
10
+ from typing import List, Dict, Optional
11
  from pathlib import Path
 
12
  import requests
13
  import validators
14
  import gradio as gr
 
15
  from bs4 import BeautifulSoup
16
  from fake_useragent import UserAgent
 
17
  from cleantext import clean
18
 
19
  # Setup logging with detailed configuration
 
32
  self.session = requests.Session()
33
  self.timeout = 10 # seconds
34
  self.session.headers.update({
35
+ 'User -Agent': UserAgent().random,
36
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
37
  'Accept-Language': 'en-US,en;q=0.5',
38
  'Accept-Encoding': 'gzip, deflate, br',
 
80
  def fetch_content(self, url: str) -> Optional[Dict]:
81
  """Universal content fetcher with special case handling"""
82
  try:
 
83
  if 'drive.google.com' in url:
84
  return self._handle_google_drive(url)
 
 
85
  if 'calendar.google.com' in url and 'ical' in url:
86
  return self._handle_google_calendar(url)
 
 
87
  return self._fetch_html_content(url)
88
  except Exception as e:
89
  logger.error(f"Content fetch failed: {e}")
 
98
  return None
99
 
100
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
101
+ response = self.session.get(direct_url, timeout = self.timeout)
102
  response.raise_for_status()
103
 
104
  return {
 
132
 
133
  soup = BeautifulSoup(response.text, 'html.parser')
134
 
 
135
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
  element.decompose()
137
 
 
138
  main_content = soup.find('main') or soup.find('article') or soup.body
139
 
 
140
  text_content = main_content.get_text(separator='\n', strip=True)
141
  cleaned_content = self.advanced_text_cleaning(text_content)
142
 
 
316
  json.dump(results, f, ensure_ascii=False, indent=2)
317
 
318
  summary = f"Processed {len(results)} items successfully!"
 
319
  return str(output_path), summary
320
  else:
321
  return None, "No valid content to process."