Spaces:

theshresthshukla
/

tunnel

Sleeping

App Files Files Community

theshresthshukla committed on Jun 29

Commit

2c01a8f

verified ·

1 Parent(s): 1a3c0d0

tool to download logos from internet

Browse files

streamlit code to download company logo from the internet

Files changed (13) hide show

app.py +142 -0
gradio_app.py +262 -0
requirements.txt +7 -2
services/__pycache__/appconfig.cpython-310.pyc +0 -0
services/__pycache__/entity_extractor.cpython-310.pyc +0 -0
services/__pycache__/image_downloader.cpython-310.pyc +0 -0
services/__pycache__/logo_downloader.cpython-310.pyc +0 -0
services/appconfig.py +60 -0
services/entity_extractor.py +195 -0
services/image_downloader.py +278 -0
services/logo_downloader.py +228 -0
utils/__pycache__/utils.cpython-310.pyc +0 -0
utils/utils.py +178 -0

app.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""
+Streamlit web interface for the Logo Downloader
+"""
+import os
+import logging
+import streamlit as st
+from pathlib import Path
+from typing import Optional
+from services.logo_downloader import LogoDownloader
+from services.appconfig import GEMINI_API_KEY, DEFAULT_LOGOS_PER_ENTITY, MAX_LOGOS_PER_ENTITY
+# Configure the root logger at INFO level when this module is imported, then
+# create the module-level logger used by the functions below.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def process_text_request(text: str, api_key: Optional[str], num_logos: int = DEFAULT_LOGOS_PER_ENTITY):
+    """
+    Process text and download logos through Streamlit interface
+    Args:
+        text (str): Input text to analyze
+        api_key (Optional[str]): Gemini API key typed by the user; blank or
+            None falls back to GEMINI_API_KEY from the app configuration
+        num_logos (int): Number of logos per entity (1..MAX_LOGOS_PER_ENTITY)
+    Returns:
+        Tuple: (status_message, zip_file_path or None, detailed_results).
+        Failures are reported through the returned strings, never raised.
+    """
+    try:
+        if not text or not text.strip():
+            return "❌ Please provide some text to analyze.", None, "No text provided."
+        if num_logos < 1 or num_logos > MAX_LOGOS_PER_ENTITY:
+            return f"❌ Number of logos must be between 1 and {MAX_LOGOS_PER_ENTITY}.", None, f"Invalid number: {num_logos}"
+        user_key = api_key.strip() if api_key else ""
+        final_api_key = user_key or GEMINI_API_KEY
+        downloader = LogoDownloader(gemini_api_key=final_api_key)
+        results = downloader.process_text(text, num_logos)
+        status = results.get('status')
+        if status == 'success' and results['stats']['total_downloads'] > 0:
+            status_msg = f"✅ {downloader.get_stats_summary()}"
+            zip_path = results.get('zip_path')
+            detailed_results = _format_detailed_results(results)
+            return status_msg, zip_path, detailed_results
+        # Look the message up with a default exactly once per branch. Indexing
+        # results['message'] directly raised KeyError when the downloader sent
+        # no message, which hid the intended fallback text behind a generic
+        # "An error occurred: 'message'" response.
+        if status == 'warning':
+            message = results.get('message', 'No details available')
+            return f"⚠️ {message}", None, message
+        message = results.get('message', 'Unknown error')
+        return f"❌ Processing failed: {message}", None, message
+    except Exception as e:
+        # UI boundary: keep the traceback in the log and show a short message.
+        logger.exception("Error in process_text_request: %s", e)
+        return f"❌ An error occurred: {str(e)}", None, f"Error details: {str(e)}"
+def _format_detailed_results(results):
+    """
+    Render the downloader's result dictionary as Markdown text
+    Args:
+        results (dict): Reads the 'results' list and the 'stats' counters
+    Returns:
+        str: Summary counters followed by one line per entity, or a
+        placeholder sentence when there are no per-entity results
+    """
+    entity_results = results.get('results')
+    if not entity_results:
+        return "No detailed results available."
+    stats = results['stats']
+    lines = [
+        "📊 **Processing Summary:**",
+        f"- Total entities found: {stats['total_entities']}",
+        f"- Total logos downloaded: {stats['total_downloads']}",
+        f"- Successful entities: {stats['successful_entities']}",
+        f"- Failed entities: {stats['failed_entities']}",
+        "",
+        "📋 **Entity Details:**",
+    ]
+    for item in entity_results:
+        name = item['entity']
+        downloaded = item['downloaded_count']
+        if downloaded > 0:
+            lines.append(f"✅ **{name}**: {downloaded} logos downloaded")
+            continue
+        # Entities without downloads carry an optional 'error' explanation.
+        reason = item.get('error', 'No logos found')
+        lines.append(f"❌ **{name}**: Failed ({reason})")
+    return "\n".join(lines)
+def main():
+    """
+    Build the Streamlit page: input form, result display, API-key tip and
+    example buttons. On form submission the inputs go through
+    process_text_request and the status, optional ZIP download button and
+    detailed results are rendered.
+    """
+    st.set_page_config(page_title="🎨 Logo Downloader", layout="centered")
+    st.title("🎨 Logo Downloader")
+    st.markdown("Extract entities from text and download their logos automatically.")
+    with st.form(key="logo_form"):
+        # key="text_input" binds this widget to st.session_state["text_input"],
+        # the entry the example buttons below write to. Without the key the
+        # example buttons had no visible effect.
+        text_input = st.text_area(
+            "📝 Enter text containing company names, products, or brands:",
+            placeholder="e.g., We use AWS, Docker, React, and Adobe Photoshop for our projects",
+            height=150,
+            key="text_input"
+        )
+        api_key_input = st.text_input(
+            "🔑 Gemini API Key (optional)",
+            type="password",
+            placeholder="Enter your Gemini API key for enhanced extraction"
+        )
+        num_logos_input = st.slider(
+            "🖼️ Logos per entity",
+            min_value=1,
+            max_value=MAX_LOGOS_PER_ENTITY,
+            value=DEFAULT_LOGOS_PER_ENTITY,
+            step=1
+        )
+        submit_btn = st.form_submit_button("🚀 Download Logos")
+    if submit_btn:
+        with st.spinner("Processing logos..."):
+            status_msg, zip_path, detailed_results = process_text_request(
+                text_input,
+                api_key_input,
+                num_logos_input
+            )
+        st.markdown(status_msg)
+        if zip_path and Path(zip_path).exists():
+            with open(zip_path, "rb") as f:
+                st.download_button(
+                    label="📥 Download Logos ZIP",
+                    data=f,
+                    file_name=Path(zip_path).name,
+                    mime="application/zip"
+                )
+        st.markdown("## 📊 Detailed Results")
+        st.markdown(detailed_results)
+    st.markdown("---")
+    st.info("💡 Tip: Get a free Gemini API key at [Google AI Studio](https://makersuite.google.com/app/apikey) for better extraction accuracy.")
+    st.markdown("## 💡 Examples")
+    examples = [
+        "Our tech stack includes React, Node.js, MongoDB, Docker, AWS, and we use Figma for design, along with GitHub for version control.",
+        "The team uses Microsoft Office, Adobe Creative Suite, Slack for communication, Zoom for meetings, and Salesforce for CRM.",
+        "Popular social media platforms like Instagram, TikTok, Twitter, LinkedIn, and YouTube are essential for digital marketing."
+    ]
+    def _use_example(example_text):
+        # Runs as a button callback, i.e. before the rerun re-creates the text
+        # area, which is when a widget's session-state key may be written.
+        st.session_state["text_input"] = example_text
+    for ex in examples:
+        st.button(f"Use example: {ex[:50]}...", on_click=_use_example, args=(ex,))
+if __name__ == "__main__":
+    main()

gradio_app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""
+Gradio web interface for the Logo Downloader
+"""
+import os
+import gradio as gr
+import logging
+from pathlib import Path
+from typing import Optional
+from services.logo_downloader import LogoDownloader
+from services.appconfig import GEMINI_API_KEY, DEFAULT_LOGOS_PER_ENTITY, MAX_LOGOS_PER_ENTITY
+# Configure the root logger at INFO level when this module is imported, then
+# create the module-level logger used by the functions below.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def process_text_request(text: str, api_key: Optional[str], num_logos: int = DEFAULT_LOGOS_PER_ENTITY):
+    """
+    Process text and download logos through Gradio interface
+    Args:
+        text (str): Input text
+        api_key (str): Optional Gemini API key; blank or None falls back to
+            GEMINI_API_KEY from the app configuration
+        num_logos (int): Number of logos per entity (1..MAX_LOGOS_PER_ENTITY)
+    Returns:
+        Tuple: (status_message, zip_file_path or None, detailed_results).
+        Failures are reported through the returned strings, never raised.
+    """
+    try:
+        # Validate inputs
+        if not text or not text.strip():
+            return "❌ Please provide some text to analyze.", None, "No text provided."
+        if num_logos < 1 or num_logos > MAX_LOGOS_PER_ENTITY:
+            return f"❌ Number of logos must be between 1 and {MAX_LOGOS_PER_ENTITY}.", None, f"Invalid number: {num_logos}"
+        # Prefer the key typed into the UI, else the configured one
+        user_key = api_key.strip() if api_key else ""
+        final_api_key = user_key or GEMINI_API_KEY
+        downloader = LogoDownloader(gemini_api_key=final_api_key)
+        results = downloader.process_text(text, num_logos)
+        status = results.get('status')
+        if status == 'success' and results['stats']['total_downloads'] > 0:
+            status_msg = f"✅ {downloader.get_stats_summary()}"
+            zip_path = results.get('zip_path')
+            detailed_results = _format_detailed_results(results)
+            return status_msg, zip_path, detailed_results
+        # Look the message up with a default exactly once per branch. Indexing
+        # results['message'] directly raised KeyError when the downloader sent
+        # no message, which hid the intended fallback text behind a generic
+        # "An error occurred: 'message'" response.
+        if status == 'warning':
+            message = results.get('message', 'No details available')
+            return f"⚠️ {message}", None, message
+        message = results.get('message', 'Unknown error')
+        return f"❌ Processing failed: {message}", None, message
+    except Exception as e:
+        # UI boundary: keep the traceback in the log and show a short message.
+        logger.exception("Error in process_text_request: %s", e)
+        return f"❌ An error occurred: {str(e)}", None, f"Error details: {str(e)}"
+def _format_detailed_results(results):
+    """
+    Format detailed results for display
+    Args:
+        results (dict): Reads the 'results' list and the 'stats' counters
+    Returns:
+        str: Summary counters followed by one line per entity, or a
+        placeholder sentence when there are no per-entity results
+    """
+    per_entity = results.get('results')
+    if not per_entity:
+        return "No detailed results available."
+    counters = results['stats']
+    report = [
+        "📊 **Processing Summary:**",
+        f"- Total entities found: {counters['total_entities']}",
+        f"- Total logos downloaded: {counters['total_downloads']}",
+        f"- Successful entities: {counters['successful_entities']}",
+        f"- Failed entities: {counters['failed_entities']}",
+        "",
+        "📋 **Entity Details:**",
+    ]
+    for entry in per_entity:
+        label = entry['entity']
+        n_downloaded = entry['downloaded_count']
+        if n_downloaded > 0:
+            report.append(f"✅ **{label}**: {n_downloaded} logos downloaded")
+            continue
+        # Entities without downloads carry an optional 'error' explanation.
+        why = entry.get('error', 'No logos found')
+        report.append(f"❌ **{label}**: Failed ({why})")
+    return "\n".join(report)
+def create_interface():
+    """
+    Create and configure Gradio interface
+    Builds a gr.Blocks layout with an input column (text, API key, logo
+    count, submit button), an output column (status, ZIP file, detailed
+    results), a set of clickable examples and a footer, and wires the submit
+    button to process_text_request.
+    Returns:
+        The configured gr.Blocks instance (not yet launched)
+    """
+    # Custom CSS for better styling
+    # NOTE(review): the .status-* classes are not referenced by any component
+    # created in this function.
+    css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+    }
+    .main-header {
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .status-success {
+        color: #10b981 !important;
+    }
+    .status-error {
+        color: #ef4444 !important;
+    }
+    .status-warning {
+        color: #f59e0b !important;
+    }
+    """
+    with gr.Blocks(css=css, title="Logo Downloader", theme=gr.themes.Soft()) as interface:
+        # Header
+        gr.HTML("""
+        <div class="main-header">
+            <h1>🎨 Logo Downloader</h1>
+            <p>Extract entities from text and download their logos automatically</p>
+        </div>
+        """)
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Input section
+                gr.Markdown("## 📝 Input")
+                text_input = gr.Textbox(
+                    label="Text to analyze",
+                    placeholder="Enter text containing company names, products, or brands (e.g., 'We use AWS, Docker, React, and Adobe Photoshop for our projects')",
+                    lines=5,
+                    max_lines=10
+                )
+                with gr.Row():
+                    api_key_input = gr.Textbox(
+                        label="Gemini API Key (optional)",
+                        placeholder="Enter your Gemini API key for better entity extraction",
+                        type="password",
+                        value=""
+                    )
+                    # Slider bounds mirror the range check in process_text_request
+                    num_logos_input = gr.Slider(
+                        label="Logos per entity",
+                        minimum=1,
+                        maximum=MAX_LOGOS_PER_ENTITY,
+                        value=DEFAULT_LOGOS_PER_ENTITY,
+                        step=1
+                    )
+                process_btn = gr.Button("🚀 Download Logos", variant="primary", size="lg")
+                # API key help
+                gr.Markdown("""
+                💡 **Tip:** Get a free Gemini API key at [Google AI Studio](https://makersuite.google.com/app/apikey) for better entity extraction.
+                Without an API key, the tool will use basic pattern matching.
+                """)
+            with gr.Column(scale=1):
+                # Output section
+                gr.Markdown("## 📊 Results")
+                status_output = gr.Textbox(
+                    label="Status",
+                    interactive=False,
+                    lines=2
+                )
+                download_output = gr.File(
+                    label="Download ZIP",
+                    interactive=False
+                )
+                detailed_output = gr.Textbox(
+                    label="Detailed Results",
+                    interactive=False,
+                    lines=10,
+                    max_lines=15
+                )
+        # Examples section
+        gr.Markdown("## 💡 Examples")
+        # Each example row is [text, api_key, num_logos], matching the order
+        # of `inputs` passed to gr.Examples below.
+        examples = [
+            [
+                "Our tech stack includes React, Node.js, MongoDB, Docker, AWS, and we use Figma for design, along with GitHub for version control.",
+                "",
+                8
+            ],
+            [
+                "The team uses Microsoft Office, Adobe Creative Suite, Slack for communication, Zoom for meetings, and Salesforce for CRM.",
+                "",
+                6
+            ],
+            [
+                "Popular social media platforms like Instagram, TikTok, Twitter, LinkedIn, and YouTube are essential for digital marketing.",
+                "",
+                5
+            ]
+        ]
+        # NOTE(review): with cache_examples=False, clicking an example
+        # presumably only fills the inputs and does not run `fn` — confirm
+        # against the Gradio Examples documentation.
+        gr.Examples(
+            examples=examples,
+            inputs=[text_input, api_key_input, num_logos_input],
+            outputs=[status_output, download_output, detailed_output],
+            fn=process_text_request,
+            cache_examples=False
+        )
+        # Process button click event: the three returned values map onto the
+        # status box, the ZIP file component and the details box, in order.
+        process_btn.click(
+            fn=process_text_request,
+            inputs=[text_input, api_key_input, num_logos_input],
+            outputs=[status_output, download_output, detailed_output],
+            show_progress='minimal'
+        )
+        # Footer
+        gr.HTML("""
+        <div style="text-align: center; margin-top: 2rem; padding: 1rem; border-top: 1px solid #e5e7eb;">
+            <p>🔧 Built with Gradio | 🤖 Powered by Gemini AI</p>
+            <p><small>This tool respects rate limits and downloads publicly available logos.</small></p>
+        </div>
+        """)
+    return interface
+def main():
+    """Log the API-key status, build the Gradio interface and serve it"""
+    logger.info("Starting Logo Downloader application...")
+    # Report which extraction mode the configuration allows
+    if GEMINI_API_KEY:
+        logger.info("Gemini API key found")
+    else:
+        logger.warning("No Gemini API key found in environment variables")
+        logger.info("The application will work with fallback entity extraction")
+    app = create_interface()
+    # The PORT environment variable overrides the default port 7860
+    port = int(os.environ.get("PORT", 7860))
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        share=False,
+        show_error=True,
+        max_threads=4,
+    )
+if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -1,3 +1,8 @@
-altair
-pandas
 streamlit

+google-generativeai==0.5.4
+requests==2.31.0
+beautifulsoup4==4.12.2
+gradio==4.15.0
+python-dotenv==1.0.0
+Pillow==10.0.1
+pathlib
 streamlit

services/__pycache__/appconfig.cpython-310.pyc ADDED Viewed

Binary file (1.69 kB). View file

services/__pycache__/entity_extractor.cpython-310.pyc ADDED Viewed

Binary file (5.48 kB). View file

services/__pycache__/image_downloader.cpython-310.pyc ADDED Viewed

Binary file (8.31 kB). View file

services/__pycache__/logo_downloader.cpython-310.pyc ADDED Viewed

Binary file (6.5 kB). View file

services/appconfig.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""
+Configuration settings for the Logo Downloader application
+"""
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+# Load variables from a .env file (if one is found) before reading settings.
+load_dotenv()
+# API Configuration
+# Empty string when GEMINI_API_KEY is not set in the environment.
+GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', '')
+# Directory Configuration
+# BASE_DIR is the directory containing this file.
+BASE_DIR = Path(__file__).parent
+# Previous location, kept for reference: DOWNLOADS_DIR = BASE_DIR / 'downloads'
+# DOWNLOADS_DIR is a relative path, so it resolves against the process's
+# current working directory, while TEMP_DIR sits next to this file.
+DOWNLOADS_DIR = Path('downloads')
+TEMP_DIR = BASE_DIR / 'temp'
+# Download Configuration
+MAX_ENTITIES = 20
+MAX_LOGOS_PER_ENTITY = 15
+DEFAULT_LOGOS_PER_ENTITY = 10
+DOWNLOAD_TIMEOUT = 15
+REQUEST_DELAY = 1  # seconds between requests
+# File Configuration
+ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.svg', '.webp']
+MIN_FILE_SIZE = 500  # bytes
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+# HTTP Configuration
+# Browser-like request headers.
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate',
+    'Connection': 'keep-alive',
+    'Upgrade-Insecure-Requests': '1',
+}
+# Image signatures for validation (leading bytes of a file)
+# NOTE(review): GIF8 is listed here although '.gif' is not in
+# ALLOWED_EXTENSIONS, and b'RIFF' is the generic RIFF container prefix, so it
+# is not specific to WebP — confirm both are intended.
+IMAGE_SIGNATURES = [
+    b'\x89PNG',      # PNG
+    b'\xff\xd8\xff', # JPEG
+    b'<svg',         # SVG
+    b'RIFF',         # WebP
+    b'GIF8',         # GIF
+]
+# Common tech entities for fallback
+COMMON_TECH_ENTITIES = [
+    'Microsoft', 'Google', 'Apple', 'Amazon', 'Adobe', 'React', 'Angular', 'Vue',
+    'Docker', 'Kubernetes', 'AWS', 'Azure', 'Firebase', 'MongoDB', 'PostgreSQL',
+    'Redis', 'Node.js', 'Python', 'JavaScript', 'TypeScript', 'Figma', 'Sketch',
+    'Photoshop', 'Illustrator', 'AutoCAD', 'Unity', 'Blender', 'GitHub', 'GitLab',
+    'Slack', 'Discord', 'Zoom', 'Teams', 'Spotify', 'Netflix', 'Instagram',
+    'Facebook', 'Twitter', 'LinkedIn', 'TikTok', 'WhatsApp', 'Telegram',
+    'Shopify', 'WordPress', 'Salesforce', 'Microsoft Fabric'
+]

services/entity_extractor.py ADDED Viewed

	@@ -0,0 +1,195 @@

+"""
+Entity extraction module using Gemini AI with fallback methods
+"""
+import re
+import logging
+from typing import List, Optional
+import google.generativeai as genai
+from services.appconfig import GEMINI_API_KEY, COMMON_TECH_ENTITIES, MAX_ENTITIES
+logger = logging.getLogger(__name__)
+class EntityExtractor:
+    """
+    Extract entities from text using Gemini AI or fallback methods
+    extract_entities is the entry point: it tries the Gemini model when one
+    was configured and otherwise (or on failure / empty result) uses pattern
+    matching against COMMON_TECH_ENTITIES plus capitalized words.
+    """
+    # Function words the capitalized-word heuristic must never report as
+    # brands. Generic tech terms ("cloud", "database", ...) are not filtered.
+    _INVALID_WORDS = frozenset({
+        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
+        'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before',
+        'after', 'above', 'below', 'between', 'among'})
+    # Leading list decoration a model may emit despite the prompt:
+    # "- ", "* ", "• ", "1. ", "2) "
+    _LIST_PREFIX_RE = re.compile(r'^(?:[-*•]+\s*|\d+[.)]\s+)')
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize EntityExtractor
+        Args:
+            api_key (str, optional): Gemini API key; falls back to
+                GEMINI_API_KEY from the app configuration when falsy
+        """
+        self.api_key = api_key or GEMINI_API_KEY
+        self.model = None
+        self._setup_gemini()
+    def _setup_gemini(self) -> None:
+        """Configure the Gemini client; leaves self.model as None on failure or when no key is set"""
+        if not self.api_key:
+            logger.warning("No Gemini API key provided, using fallback method")
+            return
+        try:
+            genai.configure(api_key=self.api_key)
+            self.model = genai.GenerativeModel('gemini-2.0-flash-exp')
+            logger.info("Gemini API initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize Gemini API: {e}")
+            self.model = None
+    @staticmethod
+    def _clean_model_line(line: str) -> str:
+        """Strip whitespace and one leading bullet/number marker from a model output line"""
+        return EntityExtractor._LIST_PREFIX_RE.sub('', line.strip(), count=1).strip()
+    @staticmethod
+    def _mentions(text: str, name: str) -> bool:
+        """
+        Return True when ``name`` occurs in ``text`` as a whole token
+        (case-insensitive, not glued to other letters or digits)
+        """
+        pattern = rf'(?<![A-Za-z0-9]){re.escape(name)}(?![A-Za-z0-9])'
+        return re.search(pattern, text, re.IGNORECASE) is not None
+    def extract_with_gemini(self, text: str) -> List[str]:
+        """
+        Extract entities using Gemini AI
+        Args:
+            text (str): Input text
+        Returns:
+            List[str]: List of extracted entities (at most MAX_ENTITIES)
+        Raises:
+            RuntimeError: If no Gemini model was initialized
+            Exception: Any error raised by the model call is logged and re-raised
+        """
+        if not self.model:
+            raise RuntimeError("Gemini model not available")
+        prompt = """
+        Extract company names, product names, software names, tool names, and brand names from this text.
+        Only return names that would have recognizable logos (like Microsoft, Adobe, React, etc.).
+        Return as a simple list, one name per line, no bullet points or numbers.
+        Avoid generic terms like "cloud" or "database".
+        Text: {text}
+        """.format(text=text)
+        try:
+            response = self.model.generate_content(prompt)
+            if not response.text:
+                return []
+            filtered_entities = []
+            for raw_line in response.text.strip().split('\n'):
+                # Bulleted or numbered lines are cleaned rather than discarded;
+                # dropping every "- Name" line lost the whole answer whenever
+                # the model ignored the "no bullet points" instruction.
+                entity = self._clean_model_line(raw_line)
+                if self._is_valid_entity(entity):
+                    filtered_entities.append(entity)
+            logger.info(f"Gemini extracted {len(filtered_entities)} entities")
+            return filtered_entities[:MAX_ENTITIES]
+        except Exception as e:
+            logger.error(f"Gemini extraction failed: {e}")
+            raise
+    def extract_with_fallback(self, text: str) -> List[str]:
+        """
+        Extract entities using fallback pattern matching
+        Args:
+            text (str): Input text
+        Returns:
+            List[str]: Unique entities (case-insensitive, first occurrence
+            kept), at most MAX_ENTITIES
+        """
+        # Known names first. Whole-token matching avoids reporting "Vue" for
+        # "revenue" or "React" for "reaction", which a substring test did.
+        candidates = [name for name in COMMON_TECH_ENTITIES if self._mentions(text, name)]
+        # Capitalized words of three or more letters (likely proper nouns)
+        candidates.extend(
+            word for word in re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', text)
+            if self._is_valid_entity(word)
+        )
+        # Dotted names such as Node.js
+        candidates.extend(re.findall(r'\b[A-Z][a-zA-Z]*\.[a-zA-Z]+\b', text))
+        # Remove duplicates while preserving order
+        unique_entities = []
+        seen = set()
+        for entity in candidates:
+            key = entity.lower()
+            if key not in seen:
+                seen.add(key)
+                unique_entities.append(entity)
+        logger.info(f"Fallback extracted {len(unique_entities)} entities")
+        return unique_entities[:MAX_ENTITIES]
+    def _is_valid_entity(self, entity: str) -> bool:
+        """
+        Check if entity is valid for logo extraction
+        Args:
+            entity (str): Entity name
+        Returns:
+            bool: True if the name is 2-50 characters long, is not a common
+            function word and contains at least one ASCII letter
+        """
+        if len(entity) < 2 or len(entity) > 50:
+            return False
+        if entity.lower() in self._INVALID_WORDS:
+            return False
+        # Must contain at least one letter
+        return re.search(r'[a-zA-Z]', entity) is not None
+    def extract_entities(self, text: str) -> List[str]:
+        """
+        Extract entities from text using available methods
+        Args:
+            text (str): Input text
+        Returns:
+            List[str]: List of extracted entities; empty for blank input
+        """
+        if not text or not text.strip():
+            return []
+        logger.info("Starting entity extraction...")
+        # Try Gemini first; an empty result or any error falls through
+        if self.model:
+            try:
+                entities = self.extract_with_gemini(text)
+                if entities:
+                    logger.info(f"Successfully extracted {len(entities)} entities with Gemini")
+                    return entities
+            except Exception as e:
+                logger.warning(f"Gemini extraction failed, using fallback: {e}")
+        # Use fallback method
+        entities = self.extract_with_fallback(text)
+        logger.info(f"Extracted {len(entities)} entities using fallback method")
+        return entities

services/image_downloader.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""
+Image downloading module with multiple search providers
+"""
+import os
+import json
+import logging
+from typing import List, Tuple
+from urllib.parse import quote_plus, urlparse
+import requests
+from bs4 import BeautifulSoup
+from services.appconfig import HEADERS, DOWNLOAD_TIMEOUT, REQUEST_DELAY, ALLOWED_EXTENSIONS
+from utils.utils import is_valid_image_file, get_file_extension, clean_up_file, rate_limit_delay
+logger = logging.getLogger(__name__)
+class ImageDownloader:
+    """Download images from various search providers"""
+    def __init__(self):
+        """Create the shared HTTP session and preload it with the configured HEADERS"""
+        http_session = requests.Session()
+        http_session.headers.update(HEADERS)
+        self.session = http_session
+    def get_bing_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
+        """
+        Get image URLs from Bing search
+        Args:
+            entity (str): Entity name to search for
+            num_images (int): Maximum number of URLs to return
+        Returns:
+            List[str]: List of image URLs; empty when the search request or
+            parsing fails (the error is logged, not raised)
+        """
+        logger.info(f"Searching Bing for {entity} logos...")
+        query = f"{entity} logo png transparent high quality"
+        encoded_query = quote_plus(query)
+        search_url = f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1&tsc=ImageBasicHover"
+        try:
+            # TLS certificate verification is left at the requests default
+            # (enabled). verify=False accepted any certificate, so a
+            # man-in-the-middle could supply the result page.
+            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            image_urls = []
+            # Result anchors carry a JSON blob in their "m" attribute; "murl"
+            # is preferred and "turl" is the fallback key.
+            for container in soup.find_all('a', {'class': 'iusc'}):
+                m_attr = container.get('m')
+                if not m_attr:
+                    continue
+                try:
+                    img_data = json.loads(m_attr)
+                except json.JSONDecodeError:
+                    continue
+                img_url = img_data.get('murl') or img_data.get('turl')
+                if img_url and self._is_valid_image_url(img_url):
+                    image_urls.append(img_url)
+            # Fallback: regular img tags whose absolute URL mentions "logo"
+            if len(image_urls) < 5:
+                for img in soup.find_all('img'):
+                    src = img.get('src') or img.get('data-src')
+                    if (src and self._is_valid_image_url(src)
+                            and 'logo' in src.lower() and src.startswith('http')):
+                        image_urls.append(src)
+            logger.info(f"Found {len(image_urls)} URLs from Bing")
+            return image_urls[:num_images]
+        except Exception as e:
+            logger.error(f"Bing search failed for {entity}: {e}")
+            return []
+    def get_duckduckgo_image_urls(self, entity: str, num_images: int = 15) -> List[str]:
+        """
+        Get image URLs from DuckDuckGo search
+        Args:
+            entity (str): Entity name to search for
+            num_images (int): Maximum number of URLs to return
+        Returns:
+            List[str]: List of image URLs; empty when the search request or
+            parsing fails (the error is logged, not raised)
+        """
+        logger.info(f"Searching DuckDuckGo for {entity} logos...")
+        query = f"{entity} logo hd png transparent"
+        encoded_query = quote_plus(query)
+        search_url = f"https://duckduckgo.com/?q={encoded_query}&t=h_&iax=images&ia=images"
+        try:
+            # TLS certificate verification is left at the requests default
+            # (enabled). verify=False accepted any certificate, so a
+            # man-in-the-middle could supply the result page.
+            response = self.session.get(search_url, timeout=DOWNLOAD_TIMEOUT)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # NOTE(review): only <img> tags present in the returned HTML are
+            # seen; results the page loads by script would be missed — verify
+            # this provider actually yields URLs.
+            image_urls = []
+            for img in soup.find_all('img'):
+                src = img.get('src') or img.get('data-src')
+                if src and self._is_valid_image_url(src) and src.startswith('http'):
+                    image_urls.append(src)
+            logger.info(f"Found {len(image_urls)} URLs from DuckDuckGo")
+            return image_urls[:num_images]
+        except Exception as e:
+            logger.error(f"DuckDuckGo search failed for {entity}: {e}")
+            return []
+    def get_alternative_logo_sources(self, entity: str) -> List[str]:
+        """
+        Get URLs from alternative logo sources
+        Builds guessed logo URLs on a few logo sites and keeps those that
+        answer a HEAD request with status 200.
+        Args:
+            entity (str): Entity name
+        Returns:
+            List[str]: List of alternative logo URLs that responded with 200
+        """
+        urls = []
+        # Percent-encode every variant before placing it in a URL path so
+        # names such as "C++" or "AC/DC" cannot change the URL structure.
+        # Names made of letters, digits, "-", "_" and "." are unchanged.
+        entity_clean = quote_plus(entity.lower().replace(' ', '').replace('.', ''))
+        entity_hyphen = quote_plus(entity.lower().replace(' ', '-'))
+        entity_title = quote_plus(entity.replace(' ', '-'))
+        # Try various logo services
+        logo_sources = [
+            f"https://cdn.worldvectorlogo.com/logos/{entity_hyphen}.svg",
+            f"https://logos-world.net/wp-content/uploads/2020/11/{entity_title}-Logo.png",
+            f"https://logoeps.com/wp-content/uploads/2013/03/vector-{entity_clean}-logo.png",
+            f"https://1000logos.net/wp-content/uploads/2016/10/{entity_title}-Logo.png",
+        ]
+        for url in logo_sources:
+            try:
+                response = self.session.head(url, timeout=5)
+            except requests.RequestException as e:
+                # Best-effort probe: record the failure and try the next guess
+                logger.debug(f"Alternative logo probe failed for {url}: {e}")
+                continue
+            if response.status_code == 200:
+                urls.append(url)
+                logger.info(f"Found alternative logo: {url}")
+        return urls
+    def _is_valid_image_url(self, url: str) -> bool:
+        """
+        Check if URL is a valid image URL
+        Args:
+            url (str): URL to check
+        Returns:
+            bool: True if the lowercased URL contains any of the
+            ALLOWED_EXTENSIONS anywhere in it; False for empty input
+        """
+        if url:
+            lowered = url.lower()
+            for extension in ALLOWED_EXTENSIONS:
+                if extension in lowered:
+                    return True
+        return False
+    def download_image(self, url: str, filepath: str) -> bool:
+        """
+        Download image from URL
+        Args:
+            url (str): Image URL
+            filepath (str): Local filepath to save image
+        Returns:
+            bool: True if download successful and the saved file passes
+            is_valid_image_file; False otherwise (errors are logged, not raised)
+        """
+        try:
+            logger.debug(f"Downloading: {url}")
+            # With stream=True the connection stays checked out until the body
+            # is consumed or the response is closed. The with-block closes it
+            # on every path, including the early return for a wrong content
+            # type, which previously leaked the connection.
+            # TLS certificate verification is left at the requests default
+            # (enabled) instead of verify=False.
+            with self.session.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) as response:
+                response.raise_for_status()
+                # Check content type
+                content_type = response.headers.get('content-type', '').lower()
+                if not any(img_type in content_type for img_type in ['image', 'svg']):
+                    logger.warning(f"Invalid content type for {url}: {content_type}")
+                    return False
+                # Download with streaming
+                with open(filepath, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+            # Validate downloaded file
+            if is_valid_image_file(filepath):
+                logger.debug(f"Successfully downloaded: {filepath}")
+                return True
+            clean_up_file(filepath)
+            logger.warning(f"Downloaded invalid image: {url}")
+            return False
+        except Exception as e:
+            # Remove any partial file left behind by the failed download
+            clean_up_file(filepath)
+            logger.error(f"Download failed for {url}: {e}")
+            return False
+    def download_logos_for_entity(self, entity: str, entity_folder: str, num_logos: int = 10) -> Tuple[int, List[str]]:
+        """
+        Download logos for a single entity
+        Args:
+            entity (str): Entity name
+            entity_folder (str): Folder to save logos
+            num_logos (int): Number of logos to download
+        Returns:
+            Tuple[int, List[str]]: (number downloaded, list of downloaded files)
+        """
+        logger.info(f"Downloading top {num_logos} logos for: {entity}")
+        # Collect URLs from all sources
+        all_urls = []
+        # Alternative logo services
+        alt_urls = self.get_alternative_logo_sources(entity)
+        all_urls.extend(alt_urls)
+        # Bing search
+        bing_urls = self.get_bing_image_urls(entity, 20)
+        all_urls.extend(bing_urls)
+        # DuckDuckGo search
+        ddg_urls = self.get_duckduckgo_image_urls(entity, 15)
+        all_urls.extend(ddg_urls)
+        # Remove duplicates while preserving order
+        unique_urls = []
+        seen = set()
+        for url in all_urls:
+            if url not in seen:
+                seen.add(url)
+                unique_urls.append(url)
+        if not unique_urls:
+            logger.warning(f"No URLs found for {entity}")
+            return 0, []
+        logger.info(f"Found {len(unique_urls)} unique URLs for {entity}")
+        # Download images
+        downloaded_files = []
+        downloaded_count = 0
+        for i, url in enumerate(unique_urls):
+            if downloaded_count >= num_logos:
+                break
+            try:
+                extension = get_file_extension(url)
+                filename = f"{entity.replace(' ', '_')}_logo_{downloaded_count + 1}{extension}"
+                filepath = os.path.join(entity_folder, filename)
+                if self.download_image(url, filepath):
+                    downloaded_count += 1
+                    downloaded_files.append(filepath)
+                    logger.info(f"Downloaded ({downloaded_count}/{num_logos}): {filename}")
+                # Be respectful to servers
+                rate_limit_delay(REQUEST_DELAY)
+            except Exception as e:
+                logger.error(f"Error processing URL {url}: {e}")
+                continue
+        logger.info(f"Successfully downloaded {downloaded_count}/{num_logos} logos for {entity}")
+        return downloaded_count, downloaded_files

services/logo_downloader.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Main Logo Downloader class that orchestrates the entire process
+"""
+import os
+import zipfile
+import logging
+from pathlib import Path
+from typing import List, Tuple, Dict, Optional
+from services.appconfig import DOWNLOADS_DIR, DEFAULT_LOGOS_PER_ENTITY
+from utils.utils import create_safe_filename, create_directory, format_file_size
+from .entity_extractor import EntityExtractor
+from .image_downloader import ImageDownloader
+logger = logging.getLogger(__name__)
+class LogoDownloader:
+    """Main class for downloading logos based on extracted entities"""
+    def __init__(self, gemini_api_key: str, output_dir: Optional[str] = None):
+        """
+        Initialize LogoDownloader
+        Args:
+            gemini_api_key (str): Gemini API key for entity extraction
+            output_dir (str): Directory to save downloads
+        """
+        self.output_dir = Path(output_dir) if output_dir else DOWNLOADS_DIR
+        self.entity_extractor = EntityExtractor(gemini_api_key)
+        self.image_downloader = ImageDownloader()
+        self.stats = {
+            'total_entities': 0,
+            'total_downloads': 0,
+            'successful_entities': 0,
+            'failed_entities': 0
+        }
+        # Create output directory
+        create_directory(self.output_dir)
+    def process_text(self, text: str, logos_per_entity: int = DEFAULT_LOGOS_PER_ENTITY) -> Dict:
+        """
+        Main processing function: extract entities and download logos
+        Args:
+            text (str): Input text containing entity references
+            logos_per_entity (int): Number of logos to download per entity
+        Returns:
+            Dict: Processing results and statistics
+        """
+        logger.info("Starting logo download process...")
+        # Reset stats
+        self._reset_stats()
+        # Extract entities
+        entities = self.entity_extractor.extract_entities(text)
+        if not entities:
+            logger.warning("No entities found in text")
+            return self._get_results("No entities found in the provided text")
+        self.stats['total_entities'] = len(entities)
+        logger.info(f"Found {len(entities)} entities: {', '.join(entities)}")
+        # Download logos for each entity
+        results = []
+        for i, entity in enumerate(entities, 1):
+            logger.info(f"Processing [{i}/{len(entities)}]: {entity}")
+            try:
+                result = self._process_single_entity(entity, logos_per_entity)
+                results.append(result)
+                if result['downloaded_count'] > 0:
+                    self.stats['successful_entities'] += 1
+                    self.stats['total_downloads'] += result['downloaded_count']
+                else:
+                    self.stats['failed_entities'] += 1
+            except Exception as e:
+                logger.error(f"Failed to process entity {entity}: {e}")
+                self.stats['failed_entities'] += 1
+                results.append({
+                    'entity': entity,
+                    'downloaded_count': 0,
+                    'files': [],
+                    'error': str(e)
+                })
+        # Create zip package if we have downloads
+        zip_path = None
+        if self.stats['total_downloads'] > 0:
+            zip_path = self._create_zip_package()
+        return self._get_results(
+            "Processing completed successfully",
+            entities=entities,
+            results=results,
+            zip_path=zip_path
+        )
+    def _process_single_entity(self, entity: str, logos_per_entity: int) -> Dict:
+        """
+        Process a single entity: create folder and download logos
+        Args:
+            entity (str): Entity name
+            logos_per_entity (int): Number of logos to download
+        Returns:
+            Dict: Processing result for this entity
+        """
+        safe_name = create_safe_filename(entity)
+        entity_folder = self.output_dir / safe_name
+        # Create entity folder
+        if not create_directory(entity_folder):
+            raise Exception(f"Failed to create directory for {entity}")
+        # Download logos
+        downloaded_count, downloaded_files = self.image_downloader.download_logos_for_entity(
+            entity, str(entity_folder), logos_per_entity
+        )
+        return {
+            'entity': entity,
+            'safe_name': safe_name,
+            'downloaded_count': downloaded_count,
+            'files': downloaded_files,
+            'folder': str(entity_folder)
+        }
+    def _create_zip_package(self) -> str:
+        """
+        Create ZIP package of all downloaded logos
+        Returns:
+            str: Path to created ZIP file
+        """
+        zip_filename = f"{self.output_dir.name}_logos.zip"
+        zip_path = self.output_dir.parent / zip_filename
+        logger.info(f"Creating ZIP package: {zip_path}")
+        try:
+            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for root, dirs, files in os.walk(self.output_dir):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        arcname = os.path.relpath(file_path, self.output_dir)
+                        zipf.write(file_path, arcname)
+            file_size = os.path.getsize(zip_path)
+            logger.info(f"ZIP package created: {zip_path} ({format_file_size(file_size)})")
+            return str(zip_path)
+        except Exception as e:
+            logger.error(f"Failed to create ZIP package: {e}")
+            raise
+    def _reset_stats(self) -> None:
+        """Reset processing statistics"""
+        self.stats = {
+            'total_entities': 0,
+            'total_downloads': 0,
+            'successful_entities': 0,
+            'failed_entities': 0
+        }
+    def _get_results(self, message: str, **kwargs) -> Dict:
+        """
+        Get formatted results dictionary
+        Args:
+            message (str): Status message
+            **kwargs: Additional result data
+        Returns:
+            Dict: Formatted results
+        """
+        return {
+            'status': 'success' if self.stats['total_downloads'] > 0 else 'warning',
+            'message': message,
+            'stats': self.stats.copy(),
+            **kwargs
+        }
+    def get_stats_summary(self) -> str:
+        """
+        Get human-readable stats summary
+        Returns:
+            str: Stats summary
+        """
+        if self.stats['total_entities'] == 0:
+            return "No entities processed"
+        avg_downloads = (
+            self.stats['total_downloads'] / self.stats['successful_entities']
+            if self.stats['successful_entities'] > 0 else 0
+        )
+        return (
+            f"Processed {self.stats['total_entities']} entities. "
+            f"Successfully downloaded {self.stats['total_downloads']} logos "
+            f"({avg_downloads:.1f} average per entity). "
+            f"Success rate: {self.stats['successful_entities']}/{self.stats['total_entities']}"
+        )
+def download_logos(text: str, gemini_api_key: str, logos_per_entity: int = DEFAULT_LOGOS_PER_ENTITY) -> Dict:
+    """
+    Convenience function for downloading logos
+    Args:
+        text (str): Text containing entity references
+        gemini_api_key (str): Gemini API key
+        logos_per_entity (int): Number of logos per entity
+    Returns:
+        Dict: Processing results
+    """
+    downloader = LogoDownloader(gemini_api_key)
+    return downloader.process_text(text, logos_per_entity)

utils/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (4.52 kB). View file

utils/utils.py ADDED Viewed

	@@ -0,0 +1,178 @@

+"""
+Utility functions for the Logo Downloader application
+"""
+import os
+import re
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+from urllib.parse import urlparse
+import logging
+from services.appconfig import IMAGE_SIGNATURES, MIN_FILE_SIZE, MAX_FILE_SIZE
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def create_safe_filename(name: str) -> str:
+    """
+    Create a safe filename from entity name
+    Args:
+        name (str): Entity name
+    Returns:
+        str: Safe filename
+    """
+    safe_name = re.sub(r'[^\w\s-]', '', name).strip()
+    safe_name = re.sub(r'[-\s]+', '_', safe_name)
+    return safe_name
+def get_file_extension(url: str) -> str:
+    """
+    Extract file extension from URL
+    Args:
+        url (str): Image URL
+    Returns:
+        str: File extension
+    """
+    parsed_url = urlparse(url)
+    extension = os.path.splitext(parsed_url.path)[1]
+    if not extension or extension.lower() not in ['.png', '.jpg', '.jpeg', '.svg', '.webp']:
+        extension = '.png'
+    return extension
+def is_valid_image_file(filepath: str) -> bool:
+    """
+    Validate if file is a proper image
+    Args:
+        filepath (str): Path to image file
+    Returns:
+        bool: True if valid image
+    """
+    try:
+        # Check file exists and size
+        if not os.path.exists(filepath):
+            return False
+        file_size = os.path.getsize(filepath)
+        if file_size < MIN_FILE_SIZE or file_size > MAX_FILE_SIZE:
+            logger.warning(f"Invalid file size: {file_size}")
+            return False
+        # Check image signature
+        with open(filepath, 'rb') as f:
+            header = f.read(12)
+        for signature in IMAGE_SIGNATURES:
+            if header.startswith(signature):
+                return True
+        return False
+    except Exception as e:
+        logger.error(f"Error validating image: {e}")
+        return False
+def create_directory(path: Path) -> bool:
+    """
+    Create directory if it doesn't exist
+    Args:
+        path (Path): Directory path
+    Returns:
+        bool: True if successful
+    """
+    try:
+        path.mkdir(parents=True, exist_ok=True)
+        return True
+    except Exception as e:
+        logger.error(f"Error creating directory {path}: {e}")
+        return False
+def clean_up_file(filepath: str) -> None:
+    """
+    Remove file if it exists
+    Args:
+        filepath (str): Path to file to remove
+    """
+    try:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+    except Exception as e:
+        logger.error(f"Error removing file {filepath}: {e}")
+def parse_json_safely(json_string: str) -> Optional[dict]:
+    """
+    Safely parse JSON string
+    Args:
+        json_string (str): JSON string to parse
+    Returns:
+        dict or None: Parsed JSON or None if failed
+    """
+    try:
+        return json.loads(json_string)
+    except json.JSONDecodeError:
+        return None
+def rate_limit_delay(delay: float = 1.0) -> None:
+    """
+    Add delay between requests to be respectful to servers
+    Args:
+        delay (float): Delay in seconds
+    """
+    time.sleep(delay)
+def format_file_size(size_bytes: int) -> str:
+    """
+    Format file size in human readable format
+    Args:
+        size_bytes (int): Size in bytes
+    Returns:
+        str: Formatted size string
+    """
+    if size_bytes < 1024:
+        return f"{size_bytes} B"
+    elif size_bytes < 1024 * 1024:
+        return f"{size_bytes / 1024:.1f} KB"
+    else:
+        return f"{size_bytes / (1024 * 1024):.1f} MB"
+def truncate_text(text: str, max_length: int = 100) -> str:
+    """
+    Truncate text to specified length
+    Args:
+        text (str): Text to truncate
+        max_length (int): Maximum length
+    Returns:
+        str: Truncated text
+    """
+    if len(text) <= max_length:
+        return text
+    return text[:max_length - 3] + "..."