File size: 10,212 Bytes
21d27b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Image downloading module with multiple search providers
"""
import os
import json
import logging
from typing import List, Tuple
from urllib.parse import quote_plus, urlparse
import requests
from bs4 import BeautifulSoup

from services.appconfig import HEADERS, DOWNLOAD_TIMEOUT, REQUEST_DELAY, ALLOWED_EXTENSIONS
from utils.utils import is_valid_image_file, get_file_extension, clean_up_file, rate_limit_delay

logger = logging.getLogger(__name__)


class ImageDownloader:
    """Download logo images for an entity from several search providers.

    One ``requests.Session`` carrying the project-wide ``HEADERS`` is created
    per instance and reused for every search and download request.
    """

    def __init__(self):
        """Initialize ImageDownloader with a shared HTTP session."""
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def get_bing_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from Bing search

        Candidate URLs are read from the JSON stored in the ``m`` attribute of
        ``<a class="iusc">`` elements (``murl`` first, then ``turl``). When
        fewer than five are found, plain ``<img>`` tags whose absolute URL
        contains "logo" are added as a fallback.

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs; empty when ``num_images`` is not
            positive or when the search fails for any reason
        """
        # A non-positive limit can never produce results. Returning here also
        # avoids a negative value slicing from the end of the list.
        if num_images <= 0:
            return []

        logger.info("Searching Bing for %s logos...", entity)

        query = f"{entity} logo png transparent high quality"
        encoded_query = quote_plus(query)
        search_url = f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1&tsc=ImageBasicHover"

        try:
            # NOTE(review): verify=False disables TLS certificate verification
            # for this request - confirm this is intentional.
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=False)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []

            # Find image data in Bing's format
            for container in soup.find_all('a', {'class': 'iusc'}):
                m_attr = container.get('m')
                if not m_attr:
                    continue
                try:
                    img_data = json.loads(m_attr)
                except (json.JSONDecodeError, TypeError):
                    continue
                # Valid JSON that is not an object has no 'murl'/'turl';
                # skip it instead of letting it abort the whole search.
                if not isinstance(img_data, dict):
                    continue
                img_url = img_data.get('murl') or img_data.get('turl')
                if isinstance(img_url, str) and self._is_valid_image_url(img_url):
                    image_urls.append(img_url)

            # Fallback: regular img tags
            if len(image_urls) < 5:
                for img in soup.find_all('img'):
                    src = img.get('src') or img.get('data-src')
                    if (
                        src
                        and src.startswith('http')
                        and 'logo' in src.lower()
                        and self._is_valid_image_url(src)
                    ):
                        image_urls.append(src)

            logger.info("Found %s URLs from Bing", len(image_urls))
            return image_urls[:num_images]

        except Exception as e:
            # Best-effort provider: any failure is logged and reported as
            # "no results" so the other providers can still be used.
            logger.error("Bing search failed for %s: %s", entity, e)
            return []

    def get_duckduckgo_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
        """
        Get image URLs from DuckDuckGo search

        Collects the ``src`` / ``data-src`` of every ``<img>`` tag in the
        returned HTML that is an absolute ``http`` URL and passes
        ``_is_valid_image_url``.

        Args:
            entity (str): Entity name to search for
            num_images (int): Maximum number of URLs to return

        Returns:
            List[str]: List of image URLs; empty when ``num_images`` is not
            positive or when the search fails for any reason
        """
        # Same guard as the Bing provider: nothing to return, and a negative
        # limit must not slice from the end of the list.
        if num_images <= 0:
            return []

        logger.info("Searching DuckDuckGo for %s logos...", entity)

        query = f"{entity} logo hd png transparent"
        encoded_query = quote_plus(query)
        search_url = f"https://duckduckgo.com/?q={encoded_query}&t=h_&iax=images&ia=images"

        try:
            # NOTE(review): verify=False disables TLS certificate verification
            # for this request - confirm this is intentional.
            # NOTE(review): only <img> tags present in the static HTML are
            # seen here - confirm the page embeds results without JavaScript.
            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT, verify=False)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = []

            for img in soup.find_all('img'):
                src = img.get('src') or img.get('data-src')
                if src and src.startswith('http') and self._is_valid_image_url(src):
                    image_urls.append(src)

            logger.info("Found %s URLs from DuckDuckGo", len(image_urls))
            return image_urls[:num_images]

        except Exception as e:
            # Best-effort provider: log and report "no results".
            logger.error("DuckDuckGo search failed for %s: %s", entity, e)
            return []

    def get_alternative_logo_sources(self, entity: str) -> List[str]:
        """
        Get URLs from alternative logo sources

        Builds one guessed URL per known logo site from the entity name and
        keeps those that answer a HEAD request with status 200.

        Args:
            entity (str): Entity name

        Returns:
            List[str]: List of alternative logo URLs that responded with 200;
            empty for a blank entity
        """
        # A blank name would only produce meaningless URLs to probe.
        if not entity or not entity.strip():
            return []

        # Spaces are removed or replaced below, so quote_plus never emits '+'
        # here; it percent-encodes characters such as '/', '?' and '#' that
        # would otherwise change the structure of the URL.
        entity_clean = quote_plus(entity.lower().replace(' ', '').replace('.', ''))
        entity_hyphen = quote_plus(entity.lower().replace(' ', '-'))
        entity_hyphen_cased = quote_plus(entity.replace(' ', '-'))

        # Try various logo services
        logo_sources = [
            f"https://cdn.worldvectorlogo.com/logos/{entity_hyphen}.svg",
            f"https://logos-world.net/wp-content/uploads/2020/11/{entity_hyphen_cased}-Logo.png",
            f"https://logoeps.com/wp-content/uploads/2013/03/vector-{entity_clean}-logo.png",
            f"https://1000logos.net/wp-content/uploads/2016/10/{entity_hyphen_cased}-Logo.png",
        ]

        urls = []
        for url in logo_sources:
            try:
                response = self.session.head(url, timeout=5)
            except Exception as e:
                # Probing is best-effort; record why a source was skipped
                # instead of swallowing the error silently.
                logger.debug("Alternative logo probe failed for %s: %s", url, e)
                continue
            if response.status_code == 200:
                urls.append(url)
                logger.info("Found alternative logo: %s", url)

        return urls

    def _is_valid_image_url(self, url: str) -> bool:
        """
        Check if URL is a valid image URL

        The lower-cased URL is searched for each entry of
        ``ALLOWED_EXTENSIONS`` as a substring, so a match anywhere in the URL
        (not only at the end of the path) counts.

        Args:
            url (str): URL to check

        Returns:
            bool: True if the URL contains one of the allowed extensions
        """
        if not url:
            return False

        url_lower = url.lower()
        return any(ext in url_lower for ext in ALLOWED_EXTENSIONS)

    def download_image(self, url: str, filepath: str) -> bool:
        """
        Download image from URL

        The response is streamed to ``filepath`` in 8 KiB chunks and then
        checked with ``is_valid_image_file``; an invalid file is removed via
        ``clean_up_file``.

        Args:
            url (str): Image URL
            filepath (str): Local filepath to save image

        Returns:
            bool: True if download successful, False on any failure
        """
        try:
            logger.debug("Downloading: %s", url)

            # The context manager closes the streamed response on every exit
            # path, including the early return below and any exception.
            # NOTE(review): verify=False disables TLS certificate verification
            # for this request - confirm this is intentional.
            with self.session.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True, verify=False) as response:
                response.raise_for_status()

                # Check content type
                content_type = response.headers.get('content-type', '').lower()
                if not any(img_type in content_type for img_type in ('image', 'svg')):
                    logger.warning("Invalid content type for %s: %s", url, content_type)
                    return False

                # Download with streaming
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Validate downloaded file
            if is_valid_image_file(filepath):
                logger.debug("Successfully downloaded: %s", filepath)
                return True

            clean_up_file(filepath)
            logger.warning("Downloaded invalid image: %s", url)
            return False

        except Exception as e:
            # Remove any partially written file before reporting the failure.
            clean_up_file(filepath)
            logger.error("Download failed for %s: %s", url, e)
            return False

    def download_logos_for_entity(self, entity: str, entity_folder: str, num_logos: int = 10) -> Tuple[int, List[str]]:
        """
        Download logos for a single entity

        URLs are gathered from the alternative logo sites, Bing and
        DuckDuckGo (in that order), de-duplicated while keeping first-seen
        order, and downloaded until ``num_logos`` files have been saved.

        Args:
            entity (str): Entity name
            entity_folder (str): Folder to save logos (created if missing)
            num_logos (int): Number of logos to download

        Returns:
            Tuple[int, List[str]]: (number downloaded, list of downloaded files)
        """
        # Nothing was requested, so skip all searches and downloads.
        if num_logos <= 0:
            return 0, []

        logger.info("Downloading top %s logos for: %s", num_logos, entity)

        # Collect URLs from all sources; call order sets download priority.
        all_urls = [
            *self.get_alternative_logo_sources(entity),
            *self.get_bing_image_urls(entity, 20),
            *self.get_duckduckgo_image_urls(entity, 15),
        ]

        # Remove duplicates while preserving order
        unique_urls = list(dict.fromkeys(all_urls))

        if not unique_urls:
            logger.warning("No URLs found for %s", entity)
            return 0, []

        logger.info("Found %s unique URLs for %s", len(unique_urls), entity)

        # Make sure the destination exists; otherwise every download would
        # fail when opening its output file.
        if entity_folder:
            try:
                os.makedirs(entity_folder, exist_ok=True)
            except OSError as e:
                logger.error("Cannot create folder %s: %s", entity_folder, e)
                return 0, []

        # Path separators in the entity name would move the file outside
        # entity_folder, so they are flattened along with spaces.
        safe_name = entity.replace(' ', '_').replace('/', '_').replace('\\', '_')

        # Download images
        downloaded_files = []

        for url in unique_urls:
            if len(downloaded_files) >= num_logos:
                break

            try:
                extension = get_file_extension(url)
                filename = f"{safe_name}_logo_{len(downloaded_files) + 1}{extension}"
                filepath = os.path.join(entity_folder, filename)

                if self.download_image(url, filepath):
                    downloaded_files.append(filepath)
                    logger.info("Downloaded (%s/%s): %s", len(downloaded_files), num_logos, filename)

                # Be respectful to servers
                rate_limit_delay(REQUEST_DELAY)

            except Exception as e:
                logger.error("Error processing URL %s: %s", url, e)
                continue

        downloaded_count = len(downloaded_files)
        logger.info("Successfully downloaded %s/%s logos for %s", downloaded_count, num_logos, entity)
        return downloaded_count, downloaded_files