"""
Image downloading module with multiple search providers
"""

import os
import re
import json
import logging
from typing import List, Tuple
from urllib.parse import quote_plus, urlparse

import requests
from bs4 import BeautifulSoup

from services.appconfig import HEADERS, DOWNLOAD_TIMEOUT, REQUEST_DELAY, ALLOWED_EXTENSIONS
from utils.utils import is_valid_image_file, get_file_extension, clean_up_file, rate_limit_delay

logger = logging.getLogger(__name__)

# Path separators and characters that are illegal in file names on common
# filesystems; used to keep entity names from escaping the target folder.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class ImageDownloader:
    """Download images from various search providers"""

    def __init__(self, verify_ssl: bool = False):
        """
        Initialize ImageDownloader

        Args:
            verify_ssl (bool): Whether TLS certificates are verified for the
                search and download requests. Defaults to False to preserve
                the historical behavior of this class.
        """
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        # NOTE(review): disabling certificate verification exposes the
        # downloads to man-in-the-middle tampering. Pass verify_ssl=True
        # unless the environment genuinely requires otherwise.
        self.verify_ssl = verify_ssl

    def get_bing_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from Bing search

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs (empty if the search fails)
        """
        logger.info(f"Searching Bing for {entity} logos...")

        query = f"{entity} logo png transparent high quality"
        encoded_query = quote_plus(query)
        search_url = f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1&tsc=ImageBasicHover"

        try:
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=self.verify_ssl)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []

            # Bing stores per-result metadata as JSON in the 'm' attribute
            # of each 'a.iusc' anchor.
            for container in soup.find_all('a', {'class': 'iusc'}):
                m_attr = container.get('m')
                if not m_attr:
                    continue
                try:
                    img_data = json.loads(m_attr)
                except json.JSONDecodeError:
                    continue
                # Skip valid-but-unexpected JSON (list, string, ...) instead
                # of letting one odd entry discard every result.
                if not isinstance(img_data, dict):
                    continue
                img_url = img_data.get('murl') or img_data.get('turl')
                if img_url and self._is_valid_image_url(img_url):
                    image_urls.append(img_url)

            # Fallback: regular img tags, only when the structured results
            # were sparse.
            if len(image_urls) < 5:
                for img in soup.find_all('img'):
                    src = img.get('src') or img.get('data-src')
                    if (
                        src
                        and self._is_valid_image_url(src)
                        and 'logo' in src.lower()
                        and src.startswith('http')
                    ):
                        image_urls.append(src)

            logger.info(f"Found {len(image_urls)} URLs from Bing")
            return image_urls[:num_images]

        except Exception as e:
            # Best-effort provider: a failed search must not abort the run.
            logger.error(f"Bing search failed for {entity}: {e}")
            return []

    def get_duckduckgo_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from DuckDuckGo search

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs (empty if the search fails)
        """
        logger.info(f"Searching DuckDuckGo for {entity} logos...")

        query = f"{entity} logo hd png transparent"
        encoded_query = quote_plus(query)
        search_url = f"https://duckduckgo.com/?q={encoded_query}&t=h_&iax=images&ia=images"

        try:
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=self.verify_ssl)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []

            for img in soup.find_all('img'):
                src = img.get('src') or img.get('data-src')
                if src and self._is_valid_image_url(src) and src.startswith('http'):
                    image_urls.append(src)

            logger.info(f"Found {len(image_urls)} URLs from DuckDuckGo")
            return image_urls[:num_images]

        except Exception as e:
            # Best-effort provider: a failed search must not abort the run.
            logger.error(f"DuckDuckGo search failed for {entity}: {e}")
            return []

    def get_alternative_logo_sources(self, entity: str) -> List[str]:
        """
        Get URLs from alternative logo sources

        Builds candidate URLs on a few logo-hosting sites from the entity
        name and keeps those that answer a HEAD request with status 200.

        Args:
            entity (str): Entity name

        Returns:
            List[str]: List of alternative logo URLs
        """
        urls = []
        entity_clean = entity.lower().replace(' ', '').replace('.', '')
        entity_hyphen = entity.lower().replace(' ', '-')
        entity_title_hyphen = entity.replace(' ', '-')

        # Try various logo services
        logo_sources = [
            f"https://cdn.worldvectorlogo.com/logos/{entity_hyphen}.svg",
            f"https://logos-world.net/wp-content/uploads/2020/11/{entity_title_hyphen}-Logo.png",
            f"https://logoeps.com/wp-content/uploads/2013/03/vector-{entity_clean}-logo.png",
            f"https://1000logos.net/wp-content/uploads/2016/10/{entity_title_hyphen}-Logo.png",
        ]

        for url in logo_sources:
            try:
                response = self.session.head(url, timeout=5)
            except Exception as e:
                # Probing is best-effort; record why a candidate was skipped
                # instead of swallowing the failure silently.
                logger.debug(f"Alternative logo probe failed for {url}: {e}")
                continue
            if response.status_code == 200:
                urls.append(url)
                logger.info(f"Found alternative logo: {url}")

        return urls

    def _is_valid_image_url(self, url: str) -> bool:
        """
        Check if URL is a valid image URL

        Args:
            url (str): URL to check

        Returns:
            bool: True if the lower-cased URL contains any entry of
                ALLOWED_EXTENSIONS as a substring
        """
        if not url:
            return False

        # Check if URL contains image extension
        url_lower = url.lower()
        return any(ext in url_lower for ext in ALLOWED_EXTENSIONS)

    def download_image(self, url: str, filepath: str) -> bool:
        """
        Download image from URL

        Args:
            url (str): Image URL
            filepath (str): Local filepath to save image

        Returns:
            bool: True if download successful and the saved file passes
                is_valid_image_file; False otherwise (any partial file is
                cleaned up)
        """
        try:
            logger.debug(f"Downloading: {url}")

            # The context manager guarantees the streamed response is closed
            # (and its connection released) on every exit path, including
            # the early return below.
            with self.session.get(
                url, timeout=DOWNLOAD_TIMEOUT, stream=True, verify=self.verify_ssl
            ) as response:
                response.raise_for_status()

                # Check content type
                content_type = response.headers.get('content-type', '').lower()
                if not any(img_type in content_type for img_type in ('image', 'svg')):
                    logger.warning(f"Invalid content type for {url}: {content_type}")
                    return False

                # Download with streaming
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Validate downloaded file
            if is_valid_image_file(filepath):
                logger.debug(f"Successfully downloaded: {filepath}")
                return True

            clean_up_file(filepath)
            logger.warning(f"Downloaded invalid image: {url}")
            return False

        except Exception as e:
            clean_up_file(filepath)
            logger.error(f"Download failed for {url}: {e}")
            return False

    @staticmethod
    def _safe_filename_stem(entity: str) -> str:
        """Return the entity name with spaces and path-unsafe characters replaced by '_'."""
        return _UNSAFE_FILENAME_CHARS.sub('_', entity.replace(' ', '_'))

    def download_logos_for_entity(self, entity: str, entity_folder: str, num_logos: int = 10) -> Tuple[int, List[str]]:
        """
        Download logos for a single entity

        Args:
            entity (str): Entity name
            entity_folder (str): Folder to save logos (created if missing)
            num_logos (int): Number of logos to download

        Returns:
            Tuple[int, List[str]]: (number downloaded, list of downloaded files)
        """
        logger.info(f"Downloading top {num_logos} logos for: {entity}")

        # Collect URLs from all sources: alternative logo services first,
        # then Bing, then DuckDuckGo.
        all_urls = []
        all_urls.extend(self.get_alternative_logo_sources(entity))
        all_urls.extend(self.get_bing_image_urls(entity, 20))
        all_urls.extend(self.get_duckduckgo_image_urls(entity, 15))

        # Remove duplicates while preserving order
        unique_urls = list(dict.fromkeys(all_urls))

        if not unique_urls:
            logger.warning(f"No URLs found for {entity}")
            return 0, []

        logger.info(f"Found {len(unique_urls)} unique URLs for {entity}")

        # Make sure the destination exists so downloads do not fail on open().
        if entity_folder:
            os.makedirs(entity_folder, exist_ok=True)

        # Entity names may contain path separators (e.g. "AC/DC"); sanitise
        # once so every file lands directly inside entity_folder.
        file_stem = self._safe_filename_stem(entity)

        # Download images
        downloaded_files = []
        downloaded_count = 0

        for url in unique_urls:
            if downloaded_count >= num_logos:
                break

            try:
                extension = get_file_extension(url)
                filename = f"{file_stem}_logo_{downloaded_count + 1}{extension}"
                filepath = os.path.join(entity_folder, filename)

                if self.download_image(url, filepath):
                    downloaded_count += 1
                    downloaded_files.append(filepath)
                    logger.info(f"Downloaded ({downloaded_count}/{num_logos}): {filename}")

                # Be respectful to servers
                rate_limit_delay(REQUEST_DELAY)

            except Exception as e:
                logger.error(f"Error processing URL {url}: {e}")
                continue

        logger.info(f"Successfully downloaded {downloaded_count}/{num_logos} logos for {entity}")
        return downloaded_count, downloaded_files