Spaces:

rein0421
/

AIdentify

Runtime error

App Files Files

syurein committed on Jul 1

Commit

b42a7a4

1 Parent(s): 8851713

print

Browse files

Files changed (6) hide show

__pycache__/LLM_package.cpython-312.pyc +0 -0
__pycache__/search.cpython-312.pyc +0 -0
app.py +4 -1
search.py +175 -0
templates/index.html +18 -0
test.py +18 -30

__pycache__/LLM_package.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/LLM_package.cpython-312.pyc and b/__pycache__/LLM_package.cpython-312.pyc differ

__pycache__/search.cpython-312.pyc ADDED Viewed

Binary file (11.5 kB). View file

app.py CHANGED Viewed

@@ -244,6 +244,7 @@ def llm_to_process_image_simple(risk_level, image_path, point1, point2, threshol
     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     mask_llm = np.zeros(image.shape[:2], dtype=np.uint8)
     llm_results = Objectdetector.detect_objects(image_path)
     for result in llm_results:
             bbox=result['box_2d']
             x1, y1 = int(bbox[1]* image.shape[1]), int(bbox[0]* image.shape[0])
@@ -270,13 +271,15 @@ def llm_to_process_image_simple_auto(risk_level, image_path, point1, point2, thr
     Objectdetector = ObjectDetector(API_KEY=GEMINI_API_KEY)
     debug_image_path='/test_llm.jpg'
     response=Objectdetector.detect_auto(image_path)
     Objectdetector.prompt_objects=response["objects_to_remove"]
     # 画像の読み込みとRGB変換
     image = cv2.imread(image_path)
     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     mask_llm = np.zeros(image.shape[:2], dtype=np.uint8)
     llm_results = Objectdetector.detect_objects(image_path)
     for result in llm_results:
             bbox=result['box_2d']
             x1, y1 = int(bbox[1]* image.shape[1]), int(bbox[0]* image.shape[0])

     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     mask_llm = np.zeros(image.shape[:2], dtype=np.uint8)
     llm_results = Objectdetector.detect_objects(image_path)
+    print(f"LLM Results: {llm_results}")
     for result in llm_results:
             bbox=result['box_2d']
             x1, y1 = int(bbox[1]* image.shape[1]), int(bbox[0]* image.shape[0])
     Objectdetector = ObjectDetector(API_KEY=GEMINI_API_KEY)
     debug_image_path='/test_llm.jpg'
     response=Objectdetector.detect_auto(image_path)
+    print(response)
     Objectdetector.prompt_objects=response["objects_to_remove"]
     # 画像の読み込みとRGB変換
+    print(f"Objectdetector.prompt_objects: {Objectdetector.prompt_objects}")
     image = cv2.imread(image_path)
     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     mask_llm = np.zeros(image.shape[:2], dtype=np.uint8)
     llm_results = Objectdetector.detect_objects(image_path)
+    print(f"llm_results: {llm_results}")
     for result in llm_results:
             bbox=result['box_2d']
             x1, y1 = int(bbox[1]* image.shape[1]), int(bbox[0]* image.shape[0])

search.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import asyncio
+from playwright.async_api import async_playwright, Page, Browser
+from bs4 import BeautifulSoup
+from bs4.element import Comment # BeautifulSoupのコメント削除用
+from urllib.parse import urlparse, parse_qs
+from typing import List, Dict, Optional
class WebScraper:
    """Search DuckDuckGo, fetch the result pages, and reduce their HTML to text.

    A single Playwright Chromium browser is kept on the instance and reused
    for every page opened by the helper methods.
    """

    def __init__(self, headless: bool = True, default_timeout: int = 30000):
        """Initialise the scraper.

        Args:
            headless: Whether Playwright runs the browser in headless mode
                (default: True).
            default_timeout: Default timeout applied to every new page, in
                milliseconds (default: 30000 = 30 seconds).
        """
        self.headless = headless
        self.default_timeout = default_timeout
        # Browser instance shared by all pages opened through this scraper.
        self._browser: Optional["Browser"] = None
        # Playwright driver started by _launch_browser(); stays None when the
        # browser was supplied by an enclosing `async with async_playwright()`.
        self._playwright = None

    async def _launch_browser(self) -> "Browser":
        """Return a connected browser, launching one if none is available."""
        if not self._browser or not self._browser.is_connected():
            # async_playwright() only returns a context manager; the driver has
            # to be started explicitly before `.chromium` can be used.
            if self._playwright is None:
                self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=self.headless)
        return self._browser

    async def _close_browser(self):
        """Close the browser and stop any Playwright driver this instance started."""
        try:
            if self._browser and self._browser.is_connected():
                await self._browser.close()
        finally:
            self._browser = None
            if self._playwright is not None:
                driver, self._playwright = self._playwright, None
                await driver.stop()

    async def _get_new_page(self) -> "Page":
        """Open a new page (tab) with the configured default timeout."""
        browser = await self._launch_browser()
        page = await browser.new_page()
        page.set_default_timeout(self.default_timeout)
        return page

    @staticmethod
    def _unwrap_duckduckgo_redirect(url: str) -> str:
        """Return the target of a DuckDuckGo ``/l/?uddg=...`` redirect link.

        URLs whose path is not ``/l/`` are returned unchanged. A redirect link
        without a usable ``uddg`` parameter is also returned unchanged rather
        than being replaced by an empty string.
        """
        parsed_url = urlparse(url)
        if parsed_url.path != '/l/':
            return url
        target = parse_qs(parsed_url.query).get('uddg', [''])[0]
        return target or url

    async def search_duckduckgo(self, query: str, num_results: int = 3) -> List[Dict[str, str]]:
        """Search DuckDuckGo and return the top results.

        Args:
            query: Search query.
            num_results: Maximum number of results to return.

        Returns:
            A list of dicts with the keys 'title' and 'url'. Errors are printed
            and whatever was collected before the error is returned.
        """
        results = []
        page: Optional["Page"] = None
        try:
            page = await self._get_new_page()
            print(f"DuckDuckGoで '{query}' を検索中...")
            # NOTE(review): the selectors below depend on DuckDuckGo's page
            # markup - confirm they still match the live site.
            await page.goto("https://duckduckgo.com/")
            await page.fill("#search_form_input_homepage", query)
            await page.press("#search_form_input_homepage", "Enter")
            await page.wait_for_selector("#links .result__a", timeout=10000)
            search_elements = await page.query_selector_all("#links .result")
            for element in search_elements[:max(num_results, 0)]:
                title_element = await element.query_selector(".result__a")
                url_element = await element.query_selector(".result__url")
                # text_content() / get_attribute() may return None; fall back to
                # the placeholders so one incomplete result cannot abort the loop.
                title = (await title_element.text_content() if title_element else None) or "タイトルなし"
                url = (await url_element.get_attribute("href") if url_element else None) or "URLなし"
                if url != "URLなし":
                    # Decode DuckDuckGo redirect links into the real target URL.
                    url = self._unwrap_duckduckgo_redirect(url)
                results.append({"title": title.strip(), "url": url.strip()})
        except Exception as e:
            print(f"DuckDuckGo検索中にエラーが発生しました: {e}")
        finally:
            if page:
                await page.close()
        print(f"検索が完了しました。{len(results)} 件の結果が見つかりました。")
        return results

    async def _get_raw_html_content(self, url: str) -> Optional[str]:
        """Fetch the raw HTML of ``url``; return None (after printing) on any error."""
        page: Optional["Page"] = None
        try:
            page = await self._get_new_page()
            print(f"  URL: {url} のコンテンツを取得中...")
            await page.goto(url)
            return await page.content()
        except Exception as e:
            print(f"  URL: {url} のコンテンツ取得中にエラーが発生しました: {e}")
            return None
        finally:
            if page:
                await page.close()

    def _clean_html_to_text(self, html_content: str) -> str:
        """Strip scripts, styles, head <link> tags and comments; return plain text.

        The result has one non-empty, whitespace-trimmed line per text fragment.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove <script> and <style> elements.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()
        # Remove <link> tags (CSS etc.) inside <head>.
        if soup.head:
            for link_tag in soup.head.find_all('link'):
                link_tag.decompose()
        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Drop blank lines so consecutive newlines collapse into one.
        cleaned_text = soup.get_text(separator='\n', strip=True)
        cleaned_text_lines = [line.strip() for line in cleaned_text.splitlines() if line.strip()]
        return '\n'.join(cleaned_text_lines)

    async def get_processed_documents(self, search_query: str, num_search_results: int = 2) -> List[Dict[str, str]]:
        """Search DuckDuckGo and return cleaned text for the top results.

        Args:
            search_query: Search query.
            num_search_results: Number of search results to fetch.

        Returns:
            A list of processed documents, each with the keys 'title',
            'original_url' and 'cleaned_html_content'. Pages that could not be
            fetched are skipped.
        """
        processed_documents = []
        # The Playwright context manager owns the driver for this whole run.
        async with async_playwright() as p:
            # Launch the browser once and keep it on the instance for reuse.
            self._browser = await p.chromium.launch(headless=self.headless)
            try:
                top_results = await self.search_duckduckgo(search_query, num_search_results)
                if not top_results:
                    print("検索結果が見つからなかったため、処理をスキップします。")
                for i, result in enumerate(top_results):
                    print(f"\n--- 処理中の記事 {i+1} ---")
                    print(f"タイトル: {result['title']}")
                    print(f"元URL: {result['url']}")
                    # Fetch and clean the content of each result URL.
                    raw_html = await self._get_raw_html_content(result['url'])
                    if not raw_html:
                        print("  クリーンなコンテンツを取得できませんでした。")
                        continue
                    cleaned_content = self._clean_html_to_text(raw_html)
                    processed_documents.append({
                        "title": result['title'],
                        "original_url": result['url'],
                        "cleaned_html_content": cleaned_content
                    })
                    print(f"  クリーンなコンテンツの長さ: {len(cleaned_content)} 文字")
                    print(f"  クリーンなコンテンツ（一部）:\n{cleaned_content[:500]}...")
            finally:
                # Close the browser even if processing failed part-way.
                await self._close_browser()
        return processed_documents
+# Usage example for this class: see main() in test.py.

templates/index.html CHANGED Viewed

@@ -185,6 +185,8 @@
                         <option value="simple_lama">Simple Lamaインペイント</option>
                         <option value="stamp">stampインペイント</option>
                         <option value="mosaic">mosaicインペイント</option>
                     </select>
                 </div>
                 <div class="slider-container">
@@ -440,6 +442,22 @@
                 apiEndpoint = "/create-mask-and-inpaint-stamp";
             } else if (processingType === "mosaic") {
                 apiEndpoint = "/create-mask-and-inpaint-mosaic";
             }
             processImageRequest(formData, "https://rein0421-aidentify.hf.space" + apiEndpoint);
         }

                         <option value="simple_lama">Simple Lamaインペイント</option>
                         <option value="stamp">stampインペイント</option>
                         <option value="mosaic">mosaicインペイント</option>
+                        <option value="llm-auto">llm-autoインペイント</option>
+                        <option value="llm">llmインペイント</option>
                     </select>
                 </div>
                 <div class="slider-container">
                 apiEndpoint = "/create-mask-and-inpaint-stamp";
             } else if (processingType === "mosaic") {
                 apiEndpoint = "/create-mask-and-inpaint-mosaic";
+            } else if (processingType === "llm-auto") {
+                apiEndpoint = "/create-mask-and-inpaint-sum-llm-auto";
+                formData.append('x1', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('y1', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('x2', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('y2', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+            } else if (processingType === "llm") {
+                apiEndpoint = "/create-mask-and-inpaint-sum-llm-auto";
+                formData.append('x1', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('y1', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('x2', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+                formData.append('y2', 0.001); // FastAPIで使うデフォルト値と同じ値を設定
+            } else {
+                alert("無効な処理方法が選択されました。");
+                hideLoadingSpinner();
+                return;
             }
             processImageRequest(formData, "https://rein0421-aidentify.hf.space" + apiEndpoint);
         }

test.py CHANGED Viewed

@@ -4,36 +4,24 @@ from dotenv import load_dotenv
 import numpy as np
 import cv2
 from PIL import Image
 load_dotenv(dotenv_path='../.env')
-def llm_to_process_image(risk_level, image_path, point1, point2, thresholds=None):
-    print(risk_level, image_path, point1, point2, thresholds)
-    print('point1,point2', point1, point2)
-    GEMINI_API_KEY=os.getenv('GEMINI_API_KEY')
-    # 画像処理のロジックをここに追加
-    Objectdetector = ObjectDetector(API_KEY=GEMINI_API_KEY)
-    debug_image_path='/test_llm.jpg'
-    Objectdetector.prompt_objects={'face', 'poster', 'Name tag', 'License plate', 'Digital screens',
-        'signboard', 'sign', 'logo', 'manhole', 'electricity pole', 'cardboard'}
-    # 画像の読み込みとRGB変換
-    image = cv2.imread(image_path)
-    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    mask_llm = np.zeros(image.shape[:2], dtype=np.uint8)
-    llm_results = Objectdetector.detect_objects(image_path)
-    for result in llm_results:
-            bbox=result['box_2d']
-            x1, y1 = int(bbox[1]* image.shape[1]), int(bbox[0]* image.shape[0])
-            x2, y2 = int(bbox[3]* image.shape[1]), int(bbox[2]* image.shape[0])
-            mask_llm[y1:y2, x1:x2] = 255  # テキスト領域をマスク
-    p1_x, p1_y = int(point1[0] * image.shape[1]), int(point1[1] * image.shape[0])
-    p2_x, p2_y = int(point2[0] * image.shape[1]), int(point2[1] * image.shape[0])
-    x_min, y_min = max(0, min(p1_x, p2_x)), max(0, min(p1_y, p2_y))
-    x_max, y_max = min(image.shape[1], max(p1_x, p2_x)), min(image.shape[0], max(p1_y, p2_y))
-    mask_llm[y_min:y_max, x_min:x_max] = 0  # 範囲を黒に設定
-    save_dir = "./saved_images"
-    os.makedirs(save_dir, exist_ok=True)
-    debug_image_pil = Image.fromarray(mask_llm)
-    debug_image_pil.save(save_dir + debug_image_path)
-llm_to_process_image(50, "../../16508.jpg", (0, 0), (0, 0), thresholds=None)

 import numpy as np
 import cv2
 from PIL import Image
+from search import WebScraper
 load_dotenv(dotenv_path='../.env')
async def main():
    """Run a sample search with WebScraper and print a summary of each document."""
    scraper = WebScraper(headless=True)  # run without a visible browser window
    # Search for personal-data-leak case studies and fetch the cleaned content
    # of the top two results.
    personal_breach_docs = await scraper.get_processed_documents(
        search_query="個人情報流出 事例",
        num_search_results=2
    )
    if personal_breach_docs:
        print("\n--- 全ての処理済みドキュメントの概要 ---")
        for doc in personal_breach_docs:
            print(f"タイトル: {doc['title']}")
            print(f"URL: {doc['original_url']}")
            print(f"コンテンツサイズ: {len(doc['cleaned_html_content'])} 文字")
            print("-" * 30)
    else:
        print("処理されたドキュメントはありませんでした。")


if __name__ == "__main__":
    # Imported locally so this block does not rely on a top-of-file import.
    import asyncio

    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never awaited, so it must be driven by an event loop.
    asyncio.run(main())