# tunnel/src/services/image_downloader.py
"""
Image downloading module with multiple search providers
"""
import os
import json
import logging
from typing import List, Tuple
from urllib.parse import quote_plus, urlparse
import requests
from bs4 import BeautifulSoup
from services.appconfig import HEADERS, DOWNLOAD_TIMEOUT, REQUEST_DELAY, ALLOWED_EXTENSIONS
from utils.utils import is_valid_image_file, get_file_extension, clean_up_file, rate_limit_delay
logger = logging.getLogger(__name__)
class ImageDownloader:
    """Download entity logos from multiple providers (Bing, DuckDuckGo, logo CDNs).

    A single ``requests.Session`` is shared across all requests so connection
    pooling and the configured ``HEADERS`` apply uniformly.
    """

    def __init__(self):
        """Initialize a shared HTTP session with the configured request headers."""
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def get_bing_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from Bing image search.

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs (empty on any failure)
        """
        logger.info(f"Searching Bing for {entity} logos...")
        query = f"{entity} logo png transparent high quality"
        encoded_query = quote_plus(query)
        search_url = f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1&tsc=ImageBasicHover"
        try:
            # NOTE(review): verify=False disables TLS certificate checking —
            # confirm this is intentional (e.g. corporate proxy) and consider
            # removing it or pinning a CA bundle.
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=False)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []
            # Bing embeds image metadata as JSON in the 'm' attribute of
            # anchor tags with class 'iusc'; 'murl' is the full-size image,
            # 'turl' the thumbnail fallback.
            img_containers = soup.find_all('a', {'class': 'iusc'})
            for container in img_containers:
                m_attr = container.get('m')
                if not m_attr:
                    continue
                try:
                    img_data = json.loads(m_attr)
                except json.JSONDecodeError:
                    continue
                img_url = img_data.get('murl') or img_data.get('turl')
                if img_url and self._is_valid_image_url(img_url):
                    image_urls.append(img_url)
            # Fallback: scrape plain <img> tags if the structured results were sparse.
            if len(image_urls) < 5:
                for img in soup.find_all('img'):
                    src = img.get('src') or img.get('data-src')
                    if src and self._is_valid_image_url(src) and 'logo' in src.lower():
                        if src.startswith('http'):
                            image_urls.append(src)
            logger.info(f"Found {len(image_urls)} URLs from Bing")
            return image_urls[:num_images]
        except Exception as e:
            # Best-effort provider: log and return empty so other sources can still run.
            logger.error(f"Bing search failed for {entity}: {e}")
            return []

    def get_duckduckgo_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from DuckDuckGo search.

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs (empty on any failure)
        """
        logger.info(f"Searching DuckDuckGo for {entity} logos...")
        query = f"{entity} logo hd png transparent"
        encoded_query = quote_plus(query)
        search_url = f"https://duckduckgo.com/?q={encoded_query}&t=h_&iax=images&ia=images"
        try:
            # NOTE(review): verify=False disables TLS certificate checking — confirm intent.
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=False)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []
            for img in soup.find_all('img'):
                src = img.get('src') or img.get('data-src')
                if src and self._is_valid_image_url(src) and src.startswith('http'):
                    image_urls.append(src)
            logger.info(f"Found {len(image_urls)} URLs from DuckDuckGo")
            return image_urls[:num_images]
        except Exception as e:
            logger.error(f"DuckDuckGo search failed for {entity}: {e}")
            return []

    def get_alternative_logo_sources(self, entity: str) -> List[str]:
        """
        Probe well-known logo CDNs for a direct asset URL.

        Args:
            entity (str): Entity name

        Returns:
            List[str]: URLs that responded 200 to a HEAD request
        """
        urls = []
        entity_clean = entity.lower().replace(' ', '').replace('.', '')
        entity_hyphen = entity.lower().replace(' ', '-')
        # Candidate URLs follow each service's known naming convention.
        logo_sources = [
            f"https://cdn.worldvectorlogo.com/logos/{entity_hyphen}.svg",
            f"https://logos-world.net/wp-content/uploads/2020/11/{entity.replace(' ', '-')}-Logo.png",
            f"https://logoeps.com/wp-content/uploads/2013/03/vector-{entity_clean}-logo.png",
            f"https://1000logos.net/wp-content/uploads/2016/10/{entity.replace(' ', '-')}-Logo.png",
        ]
        for url in logo_sources:
            try:
                # HEAD keeps the probe cheap — we only need the status code.
                response = self.session.head(url, timeout=5)
                if response.status_code == 200:
                    urls.append(url)
                    logger.info(f"Found alternative logo: {url}")
            except Exception:
                # Unreachable host / timeout just means this source is skipped.
                continue
        return urls

    def _is_valid_image_url(self, url: str) -> bool:
        """
        Check whether a URL plausibly points to an image.

        Args:
            url (str): URL to check

        Returns:
            bool: True if the URL contains a known image extension
        """
        if not url:
            return False
        # Loose substring match (not an endswith check) so URLs with query
        # strings like ...logo.png?v=2 still pass.
        url_lower = url.lower()
        return any(ext in url_lower for ext in ALLOWED_EXTENSIONS)

    def download_image(self, url: str, filepath: str) -> bool:
        """
        Download an image from a URL to a local file.

        Args:
            url (str): Image URL
            filepath (str): Local filepath to save the image

        Returns:
            bool: True if the download succeeded and the file validates
        """
        try:
            logger.debug(f"Downloading: {url}")
            # NOTE(review): verify=False disables TLS certificate checking — confirm intent.
            response = self.session.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True, verify=False)
            response.raise_for_status()
            # Reject responses the server does not declare as image/SVG content.
            content_type = response.headers.get('content-type', '').lower()
            if not any(img_type in content_type for img_type in ['image', 'svg']):
                logger.warning(f"Invalid content type for {url}: {content_type}")
                return False
            # Stream to disk in chunks to avoid loading large files into memory.
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            # Validate the bytes on disk; delete anything that isn't a real image.
            if is_valid_image_file(filepath):
                logger.debug(f"Successfully downloaded: {filepath}")
                return True
            clean_up_file(filepath)
            logger.warning(f"Downloaded invalid image: {url}")
            return False
        except Exception as e:
            # Remove any partial file so failed downloads leave no debris.
            clean_up_file(filepath)
            logger.error(f"Download failed for {url}: {e}")
            return False

    def download_logos_for_entity(self, entity: str, entity_folder: str, num_logos: int = 10) -> Tuple[int, List[str]]:
        """
        Download logos for a single entity from all available sources.

        Args:
            entity (str): Entity name
            entity_folder (str): Folder to save logos (created if missing)
            num_logos (int): Number of logos to download

        Returns:
            Tuple[int, List[str]]: (number downloaded, list of downloaded files)
        """
        logger.info(f"Downloading top {num_logos} logos for: {entity}")
        # Collect candidate URLs from all sources, most reliable first.
        all_urls = []
        all_urls.extend(self.get_alternative_logo_sources(entity))
        all_urls.extend(self.get_bing_image_urls(entity, 20))
        all_urls.extend(self.get_duckduckgo_image_urls(entity, 15))
        # Deduplicate while preserving source-priority order (dicts keep
        # insertion order).
        unique_urls = list(dict.fromkeys(all_urls))
        if not unique_urls:
            logger.warning(f"No URLs found for {entity}")
            return 0, []
        logger.info(f"Found {len(unique_urls)} unique URLs for {entity}")
        # Robustness: make sure the destination folder exists before writing.
        os.makedirs(entity_folder, exist_ok=True)
        downloaded_files = []
        downloaded_count = 0
        for url in unique_urls:
            if downloaded_count >= num_logos:
                break
            try:
                extension = get_file_extension(url)
                filename = f"{entity.replace(' ', '_')}_logo_{downloaded_count + 1}{extension}"
                filepath = os.path.join(entity_folder, filename)
                if self.download_image(url, filepath):
                    downloaded_count += 1
                    downloaded_files.append(filepath)
                    # Fixed: log the actual filename (was a corrupted
                    # "(unknown)" placeholder).
                    logger.info(f"Downloaded ({downloaded_count}/{num_logos}): {filename}")
                # Be respectful to servers: pause between every attempt.
                rate_limit_delay(REQUEST_DELAY)
            except Exception as e:
                logger.error(f"Error processing URL {url}: {e}")
                continue
        logger.info(f"Successfully downloaded {downloaded_count}/{num_logos} logos for {entity}")
        return downloaded_count, downloaded_files