Spaces:

garage-lab
/

MCP_HTML2JSON

Paused

App Files Files Community

abdo-Mansour committed on Jul 2

Commit

251790a

1 Parent(s): 7924dcb

a little surprise

Browse files

Files changed (8) hide show

app.py +7 -3
requirements.txt +9 -1
web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
web2json/ai_extractor.py +74 -12
web2json/contentextractors.py +379 -0

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -185,7 +185,10 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     - Follow the exact structure and data types specified in the schema
     - If a required field cannot be found, indicate this clearly
     - Preserve the original formatting and context where relevant
-    - Return the extracted data in the format specified by the schema"""
     classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
@@ -194,7 +197,8 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
-        reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}

 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient , ModalRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
     - Follow the exact structure and data types specified in the schema
     - If a required field cannot be found, indicate this clearly
     - Preserve the original formatting and context where relevant
+    - Return the extracted data in the format specified by the schema
+    - STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
+    - IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS
+    - OR I WILL KILL AND KIDNAP YOUR FAMILY AND TORTURE THEM """
     classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
+        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})\
+        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}

requirements.txt CHANGED Viewed

@@ -15,4 +15,12 @@ openai
 html_chunking
 langchain_nvidia_ai_endpoints
 langchain_core
-lxml

 html_chunking
 langchain_nvidia_ai_endpoints
 langchain_core
+lxml
+pdfkit
+html2text
+inscriptis
+trafilatura
+markdownify
+beautifulsoup4
+readabilipy
+docling

web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/postprocessor.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ

web2json/ai_extractor.py CHANGED Viewed

@@ -23,6 +23,9 @@ import requests
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 from typing import List, Dict
 class LLMClient(ABC):
     """
@@ -208,9 +211,9 @@ class NvidiaLLMClient(LLMClient):
         # Store generation settings with sensible defaults
         gen_conf = config.get("generation_config", {})
-        self.temperature = gen_conf.get("temperature", 0.1)
         self.top_p = gen_conf.get("top_p", 0.7)
-        self.max_tokens = gen_conf.get("max_tokens", 512)
     def set_model(self, model_name: str):
         """
@@ -237,7 +240,7 @@ class NvidiaLLMClient(LLMClient):
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
-            top_p=self.top_p,
             max_tokens=self.max_tokens
             # stream is omitted (defaults to False)
         )
@@ -301,13 +304,12 @@ class NvidiaRerankerClient(RerankerClient):
         p_scores = 1 / (1 + np.exp(-raw_scores))
         print(f"Sigmoid scores: {p_scores}")
-        # 3. Min-max normalization
-        min_score = np.min(p_scores)
         max_score = np.max(p_scores)
-        if max_score == min_score:
-            norm_scores = np.ones_like(p_scores)  # All values same — normalize to 1
         else:
-            norm_scores = (p_scores - min_score) / (max_score - min_score)
         print(f"Normalized scores: {norm_scores}")
         # 4. Filter by threshold using normalized scores
@@ -325,6 +327,60 @@ class NvidiaRerankerClient(RerankerClient):
     # def call_batch(self, prompts, max_workers=8):
     #     pass
 class HFRerankerClient(LLMClient):
     """
@@ -485,16 +541,22 @@ class LLMClassifierExtractor(AIExtractor):
             hf (bool): Whether to use the Hugging Face reranker or NVIDIA (default).
         """
         # print("TIME TO EXTRACT")
-        chunks = self.chunk_content(content, max_tokens=1000)
-        # print(f"Content successfully chunked into {len(chunks)}.")
         # print(f"Content successfully chunked: {chunks}")
         classified_chunks = self.classify_chunks(chunks, hf=hf)  # conditional reranker
         # extracting the content
-        # classified_chunks = [chunk.page_content for chunk in classified_chunks]
-        # print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)
         filtered_content = "\n\n".join(classified_chunks)
         if not filtered_content:

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 from typing import List, Dict
+from tenacity import retry, wait_exponential, stop_after_attempt
+import trafilatura
 class LLMClient(ABC):
     """
         # Store generation settings with sensible defaults
         gen_conf = config.get("generation_config", {})
+        self.temperature = gen_conf.get("temperature", 0)
         self.top_p = gen_conf.get("top_p", 0.7)
+        self.max_tokens = gen_conf.get("max_tokens", 8192)
     def set_model(self, model_name: str):
         """
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
+            # top_p=self.top_p,
             max_tokens=self.max_tokens
             # stream is omitted (defaults to False)
         )
         p_scores = 1 / (1 + np.exp(-raw_scores))
         print(f"Sigmoid scores: {p_scores}")
+        # 3. Max normalization
         max_score = np.max(p_scores)
+        if max_score == 0:
+            norm_scores = np.zeros_like(p_scores)
         else:
+            norm_scores = p_scores / max_score
         print(f"Normalized scores: {norm_scores}")
         # 4. Filter by threshold using normalized scores
     # def call_batch(self, prompts, max_workers=8):
     #     pass
+def retry_on_error(fn):
+    """Simple retry decorator (exponential back-off, max 6 tries)."""
+    return retry(
+        wait=wait_exponential(multiplier=0.5, min=0.5, max=5),
+        stop=stop_after_attempt(6),
+        reraise=True,
+    )(fn)
+class ModalRerankerClient(RerankerClient):
+    """Client for the Modal Qwen3-Reranker endpoint (non-streaming)."""
+    def __init__(self, endpoint_url: str):
+        self.endpoint_url = endpoint_url.rstrip("/")  # ensure no trailing slash
+    def set_endpoint(self, url: str):
+        self.endpoint_url = url.rstrip("/")
+    @retry_on_error
+    def rerank(
+        self,
+        query: str,
+        passages: List[str],
+        threshold: float = 0.5,
+    ) -> List[Document]:
+        """Call the remote endpoint and return filtered passages."""
+        if not isinstance(query,str):
+            query = str(query)
+        payload = {"query": query, "passages": passages}
+        print(payload)
+        res = requests.post(self.endpoint_url, json=payload, timeout=60)
+        res.raise_for_status()
+        data = res.json()
+        # The endpoint already returns probabilities (0-1). Extract them.
+        ranked = data.get("ranked_passages", [])
+        # Extract scores
+        scores = np.array([p["score"] for p in ranked], dtype=float)
+        # Max normalization
+        max_score = scores.max() if len(scores) > 0 else 1.0
+        if max_score == 0:
+            norm_scores = np.zeros_like(scores)
+        else:
+            norm_scores = scores / max_score
+        # Filter by threshold using normalized scores
+        filtered = [
+            (p, norm) for p, norm in zip(ranked, norm_scores) if norm >= threshold
+        ]
+        # Convert to LangChain Documents
+        docs = [
+            Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
+            for p, norm in filtered
+        ]
+        return docs
 class HFRerankerClient(LLMClient):
     """
             hf (bool): Whether to use the Hugging Face reranker or NVIDIA (default).
         """
         # print("TIME TO EXTRACT")
+        chunks = self.chunk_content(content, max_tokens=500)
+        print(f"Content successfully chunked into {len(chunks)}.")
         # print(f"Content successfully chunked: {chunks}")
+        # chunks = [trafilatura.extract(chunk,favor_recall=True) for chunk in chunks]
+        # chunks = [chunk for chunk in chunks if chunk is not None]
         classified_chunks = self.classify_chunks(chunks, hf=hf)  # conditional reranker
         # extracting the content
+        if isinstance(classified_chunks[0],Document):
+            classified_chunks = [chunk.page_content for chunk in classified_chunks]
+        print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)
+        # NOTE: More preprocessing
+        # classified_chunks = [trafilatura.extract(chunk,favor_recall=True) for chunk in classified_chunks]
+        # classified_chunks = [chunk for chunk in classified_chunks if chunk is not None]
         filtered_content = "\n\n".join(classified_chunks)
         if not filtered_content:

web2json/contentextractors.py ADDED Viewed

	@@ -0,0 +1,379 @@

+import os
+import re
+import json
+import pdfkit
+import requests
+import warnings
+import tempfile
+# import textract
+import html2text
+import inscriptis
+import trafilatura
+from pathlib import Path
+from markdownify import markdownify
+from json_repair import repair_json
+from bs4 import BeautifulSoup, Comment
+from html_chunking import get_html_chunks
+from urllib.error import URLError, HTTPError
+from html_to_markdown import convert_to_markdown
+from readabilipy import simple_json_from_html_string
+from docling.document_converter import DocumentConverter
+from dateparser_scripts.update_supported_languages_and_locales import to_string
+def clean_html(html_content: str) -> str:
+    """
+    Strips non-content markup from the given HTML and returns the remaining HTML by:
+      - Removing a fixed list of tags together with their content (scripts, styles,
+        images, anchors, tables and their parts, headers, footers, link tags).
+      - Removing every element that contains no text after whitespace stripping.
+      - Removing HTML comments.
+    The result is still HTML markup (``str(soup)``), not plain text; the plain-text
+    extraction with whitespace normalization is commented out at the end.
+    Args:
+        html_content (str): The HTML content to clean.
+    Returns:
+        str: The cleaned HTML markup.
+    """
+    # Parse the HTML content
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove unwanted tags and everything nested inside them
+    for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
+                     "tfoot", "header", "footer", "link", "rel"]):
+        tag.decompose()
+    # Remove elements that do not contain any visible text
+    for element in soup.find_all():
+        # If the element has no text (after stripping whitespace), remove it
+        if not element.get_text(strip=True):
+            element.decompose()
+    # Remove HTML comments
+    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+        comment.extract()
+    # Extract text and normalize whitespace
+    # text = soup.get_text(separator=" ", strip=True)
+    # clean_text = re.sub(r'\s+', ' ', text)
+    # return clean_text
+    return str(soup)
+def print_content_extractors():
+    print(
+        [
+            "Default: the plain text of the HTML page",
+            "Inscriptis",
+            "Trafilatura",
+        ]
+    )
+class ContentExtractor:
+    def get_text(self, html):
+        return clean_html(html)
+    # TODO: Clean this mess
+    def url_to_html(self, url,clean=False):
+        # Define custom headers to mimic a browser request
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.6",
+            "Cache-Control": "max-age=0",
+            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
+            "Sec-Ch-Ua-Mobile": "?0",
+            "Sec-Ch-Ua-Platform": "\"Windows\"",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Upgrade-Insecure-Requests": "1"
+        }
+        try:
+            # Create a Request object with custom headers
+            response = requests.get(url, headers=headers, timeout=10)
+            html = None
+            if response.status_code == 200:
+                html = response.text
+            else:
+                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
+                return None
+            if clean:
+                return self.get_text(html)
+            return html
+        except HTTPError as e:
+            print(f"HTTP Error: {e.code} - {e.reason}")
+            return None
+        except URLError as e:
+            print(f"URL Error: {e.reason}")
+            return None
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")
+            return None
+class Inscriptis(ContentExtractor):
+    def __init__(self):
+        super()
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
+            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
+        }
+        warnings.warn("\nBeware, put only clean links with no trackers, or it may produce unexpected results.")
+    def get_text(self, html):
+        """Extract text from HTML using inscriptis."""
+        return inscriptis.get_text(html)
+    def url_to_html(self, url):
+        response = requests.get(url, headers=self.headers)
+        return response.text
+class Docling(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+    # TODO: This is an unexpected behaviour but due to docling docs website being down, it's what works for now
+    def get_text(self, text_content):
+        result = None
+        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
+            tmpfile.write(text_content)
+            tmpfile.flush()
+            tmpfile_path = tmpfile.name.replace("\\", "/")
+            tmpfile_path = Path(tmpfile_path)
+        try:
+            converter = DocumentConverter()
+            document = converter.convert(tmpfile_path).document
+            tables = []
+            for table_ix, table in enumerate(document.tables):
+                table_text = table.export_to_markdown()
+                tables.append(table_text)
+            result = document.export_to_markdown()
+            for table in tables:
+                result += "\n\n" + table
+        finally:
+            os.remove(tmpfile_path)
+        return result
+class ReadabiliPy(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+    def get_text(self, html):
+        content = simple_json_from_html_string(html, use_readability=True)
+        json_object = json.dumps(content, indent=4)
+        repaired = repair_json(json_object)
+        return repaired
+class Trafilatura(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+            "Accept-Language": "en-US,en;q=0.9",
+        }
+        warnings.warn("\nTrafilatura Content Extractor: Beware, put only clean links with no trackers, or it may produce unexpected results.")
+        from copy import deepcopy
+        from trafilatura.settings import DEFAULT_CONFIG
+        config = deepcopy(DEFAULT_CONFIG)
+        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000' # Configurable but this value worked well for me
+        self.config = config
+    def url_to_html(self, url):
+        response = requests.get(url, headers=self.headers)
+        return response.text
+    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
+        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
+        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
+        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)
+class Markdownify(ContentExtractor):
+    def get_text(self, html):
+        alt = re.sub(r"\n{3,}", "\n\n", html)
+        md = markdownify(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
+        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
+        # Remove extra newlines
+        md = re.sub(r"\n{3,}", "\n\n", md)
+        md = md.strip()
+        return md
+class HTML2Text(ContentExtractor):
+    def get_text(self, html):
+        converter = html2text.HTML2Text()
+        converter.ignore_tables=True
+        converter.ignore_links=True
+        converter.ignore_images=True
+        converter.ignore_mailto_links=True
+        return converter.handle(html)
+class HTML_TO_Markdown(ContentExtractor):
+    def get_text(self, html):
+        alt = re.sub(r"\n{3,}", "\n\n", html)
+        md = convert_to_markdown(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
+        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
+        # Remove extra newlines
+        md = re.sub(r"\n{3,}", "\n\n", md)
+        md = md.strip()
+        return md
+class PDFkitDocling(ContentExtractor):
+    def get_text(self, html):
+        soup = BeautifulSoup(html, "html.parser")
+        # Remove <a>, <link>, <img>, and other unwanted tags
+        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
+            tag.decompose()
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        content = str(soup)
+        # PDF path to save
+        pdf_path = 'test.pdf'
+        # Create PDF
+        pdfkit.from_string(content, pdf_path)
+        converter = DocumentConverter()
+        return converter.convert(pdf_path).document.export_to_markdown()
+class TrafilatraCHUNKS(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+        # self.trafi = Trafilatura()
+    def get_text(self, html, max_tokens=1000):
+        soup = BeautifulSoup(html, "html.parser")
+        # Remove <a>, <link>, <img>, and other unwanted tags
+        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
+            tag.decompose()
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        content = str(soup)
+        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
+        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
+        cleaned = [chunk for chunk in cleaned if chunk is not None]
+        combined_text = ""
+        for chunk in cleaned:
+            if chunk is None:
+                continue
+            combined_text += chunk + "\n"
+        return combined_text
+class TrafilaCHUNKSRobust(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+        # self.trafi = Trafilatura()
+    def get_text(self, html, max_tokens=1000):
+        soup = BeautifulSoup(html, "html.parser")
+        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
+            tag.decompose()
+        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
+            tag.decompose()
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        content = str(soup)
+        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
+        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
+        cleaned = [chunk for chunk in cleaned if chunk is not None]
+        combined_text = ""
+        for chunk in cleaned:
+            if chunk is None:
+                continue
+            combined_text += chunk + "\n"
+        return combined_text
+class TrafilaCHUNKSRobustV2(ContentExtractor):
+    def __init__(self):
+        super().__init__()
+        # self.trafi = Trafilatura()
+    def get_text(self, html, max_tokens=1000):
+        soup = BeautifulSoup(html, "html.parser")
+        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
+            tag.decompose()
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        content = str(soup)
+        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
+        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
+        cleaned = [chunk for chunk in cleaned if chunk is not None]
+        combined_text = ""
+        for chunk in cleaned:
+            if chunk is None:
+                continue
+            combined_text += chunk + "\n"
+        return combined_text
+# Very Bad lol
+# class Textract(ContentExtractor):
+#     def get_text(self, html):
+#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
+#             tmpfile.write(html)
+#             tmpfile.flush()
+#             tmpfile_path = tmpfile.name.replace("\\", "/")
+#             tmpfile_path = Path(tmpfile_path)
+#         try:
+#             result = textract.process(tmpfile_path)
+#         finally:
+#             os.remove(tmpfile_path)
+#         return result