Rename ux_components.py to extractor.py
Browse files- extractor.py +109 -0
- ux_components.py +0 -24
    	
        extractor.py
    ADDED
    
    | @@ -0,0 +1,109 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # /extractor.py
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            """
         | 
| 4 | 
            +
            Handles content extraction from various sources like files, images, and websites.
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            This module encapsulates the logic for parsing different file formats (PDF, DOCX),
         | 
| 7 | 
            +
            performing Optical Character Recognition (OCR) on images, and scraping web content.
         | 
| 8 | 
            +
            """
         | 
| 9 | 
            +
            import mimetypes
         | 
| 10 | 
            +
            import os
         | 
| 11 | 
            +
            import re
         | 
| 12 | 
            +
            from urllib.parse import urlparse, urljoin
         | 
| 13 | 
            +
            import logging
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            import PyPDF2
         | 
| 16 | 
            +
            import docx
         | 
| 17 | 
            +
            import requests
         | 
| 18 | 
            +
            from bs4 import BeautifulSoup
         | 
| 19 | 
            +
             | 
| 20 | 
            +
# --- Setup Logging ---
# NOTE(review): calling basicConfig at import time configures the root logger
# for the whole process; library modules conventionally use
# logging.getLogger(__name__) instead — confirm this global setup is intended.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Optional OCR Imports ---
# OCR support is optional: if any of cv2/numpy/pytesseract is missing we
# degrade gracefully (OCR_AVAILABLE=False) instead of failing at import time.
# extract_text_from_image checks this flag before attempting OCR.
try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            def extract_text_from_image(image_path: str) -> str:
         | 
| 34 | 
            +
                """Extracts text from an image file using Tesseract OCR."""
         | 
| 35 | 
            +
                if not OCR_AVAILABLE:
         | 
| 36 | 
            +
                    return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
         | 
| 37 | 
            +
                try:
         | 
| 38 | 
            +
                    pytesseract.get_tesseract_version()
         | 
| 39 | 
            +
                except Exception:
         | 
| 40 | 
            +
                    return "Error: Tesseract OCR is not installed or not in your PATH."
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                try:
         | 
| 43 | 
            +
                    image = cv2.imread(image_path)
         | 
| 44 | 
            +
                    if image is None:
         | 
| 45 | 
            +
                        return "Error: Could not read image file."
         | 
| 46 | 
            +
                    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
         | 
| 47 | 
            +
                    text = pytesseract.image_to_string(gray)
         | 
| 48 | 
            +
                    return text.strip() or "No text found in image."
         | 
| 49 | 
            +
                except Exception as e:
         | 
| 50 | 
            +
                    logging.error(f"OCR extraction failed: {e}")
         | 
| 51 | 
            +
                    return f"Error during OCR: {e}"
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            def extract_text_from_file(file_path: str) -> str:
         | 
| 54 | 
            +
                """Extracts text from a variety of file types."""
         | 
| 55 | 
            +
                if not file_path:
         | 
| 56 | 
            +
                    return ""
         | 
| 57 | 
            +
                ext = os.path.splitext(file_path)[1].lower()
         | 
| 58 | 
            +
                try:
         | 
| 59 | 
            +
                    if ext == ".pdf":
         | 
| 60 | 
            +
                        with open(file_path, "rb") as f:
         | 
| 61 | 
            +
                            reader = PyPDF2.PdfReader(f)
         | 
| 62 | 
            +
                            return "\n".join(page.extract_text() or "" for page in reader.pages)
         | 
| 63 | 
            +
                    elif ext == ".docx":
         | 
| 64 | 
            +
                        doc = docx.Document(file_path)
         | 
| 65 | 
            +
                        return "\n".join(p.text for p in doc.paragraphs)
         | 
| 66 | 
            +
                    elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
         | 
| 67 | 
            +
                        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
         | 
| 68 | 
            +
                            return f.read()
         | 
| 69 | 
            +
                    elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
         | 
| 70 | 
            +
                        return extract_text_from_image(file_path)
         | 
| 71 | 
            +
                    else:
         | 
| 72 | 
            +
                        return f"Unsupported file type: {ext}"
         | 
| 73 | 
            +
                except Exception as e:
         | 
| 74 | 
            +
                    logging.error(f"Error extracting text from {file_path}: {e}")
         | 
| 75 | 
            +
                    return f"Error extracting text: {e}"
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            def extract_website_content(url: str) -> str:
         | 
| 78 | 
            +
                """Scrapes and returns the primary HTML content of a given URL."""
         | 
| 79 | 
            +
                try:
         | 
| 80 | 
            +
                    headers = {
         | 
| 81 | 
            +
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         | 
| 82 | 
            +
                    }
         | 
| 83 | 
            +
                    response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
         | 
| 84 | 
            +
                    response.raise_for_status()
         | 
| 85 | 
            +
                    response.encoding = response.apparent_encoding
         | 
| 86 | 
            +
                    soup = BeautifulSoup(response.text, 'html.parser')
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                    # Make all resource links absolute
         | 
| 89 | 
            +
                    for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
         | 
| 90 | 
            +
                        for item in soup.find_all(tag):
         | 
| 91 | 
            +
                            if item.has_attr(attr):
         | 
| 92 | 
            +
                                item[attr] = urljoin(url, item[attr])
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                    title = soup.title.string if soup.title else "N/A"
         | 
| 95 | 
            +
                    # Return a prettified version of the body content for context
         | 
| 96 | 
            +
                    body_content = soup.body.prettify() if soup.body else str(soup)
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                    # Truncate for prompt
         | 
| 99 | 
            +
                    if len(body_content) > 15000:
         | 
| 100 | 
            +
                         body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                    return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                except requests.RequestException as e:
         | 
| 105 | 
            +
                    logging.error(f"Website extraction failed for {url}: {e}")
         | 
| 106 | 
            +
                    return f"Error: Could not fetch content from the URL. Details: {e}"
         | 
| 107 | 
            +
                except Exception as e:
         | 
| 108 | 
            +
                    logging.error(f"An unexpected error occurred during website extraction: {e}")
         | 
| 109 | 
            +
                    return f"Error: An unexpected error occurred. Details: {e}"
         | 
    	
        ux_components.py
    DELETED
    
    | @@ -1,24 +0,0 @@ | |
| 1 | 
            -
            import gradio as gr
         | 
| 2 | 
            -
            from config import DEMO_LIST
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            def create_top_demo_cards(input_textbox):
         | 
| 5 | 
            -
                """Creates a Gradio Column with buttons for the top 3 demo examples."""
         | 
| 6 | 
            -
                with gr.Column(visible=True) as quick_examples_col:
         | 
| 7 | 
            -
                    for i, demo_item in enumerate(DEMO_LIST[:3]):
         | 
| 8 | 
            -
                        demo_card = gr.Button(
         | 
| 9 | 
            -
                            value=demo_item['title'],
         | 
| 10 | 
            -
                            variant="secondary",
         | 
| 11 | 
            -
                            size="sm",
         | 
| 12 | 
            -
                            elem_id=f"demo_card_{i}"  # Add an ID for potential styling
         | 
| 13 | 
            -
                        )
         | 
| 14 | 
            -
                        demo_card.click(
         | 
| 15 | 
            -
                            fn=lambda idx=i: gr.update(value=DEMO_LIST[idx]['description']),
         | 
| 16 | 
            -
                            outputs=input_textbox,
         | 
| 17 | 
            -
                        )
         | 
| 18 | 
            -
                return quick_examples_col
         | 
| 19 | 
            -
             | 
| 20 | 
            -
            if __name__ == "__main__":
         | 
| 21 | 
            -
                with gr.Blocks() as demo:
         | 
| 22 | 
            -
                    input_textbox = gr.Textbox(label="Input")
         | 
| 23 | 
            -
                    create_top_demo_cards(input_textbox)
         | 
| 24 | 
            -
                demo.launch()
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
