acecalisto3 committed
Commit 03fc5c6 · verified · 1 Parent(s): 4cff691

Update app2.py

Files changed (1):
  1. app2.py +267 -49
app2.py CHANGED
@@ -769,8 +769,6 @@ class EnhancedFileProcessor:
                     elif extracted_file_path.suffix.lower() in self.archive_extensions:
                         logger.info(f"Found nested archive '{member.name}', processing recursively.")
                         dataset.extend(self._process_archive(extracted_file_path, extract_to))
-                    else:
-                        logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
                 else:
                     logger.warning(f"Could not get file-like object for {member.name} from tar.")
 
@@ -1413,6 +1411,7 @@ def respond_to_chat(
                 "I'm equipped to filter your data. Try 'find entries where name contains widget'."
             ])
 
+    # --- End of main try block ---
     except Exception as e:
         logger.error(f"Chatbot runtime error: {e}")
         response = f"An internal error occurred while processing your request: {e}"
@@ -1420,6 +1419,12 @@ def respond_to_chat(
         # On unexpected error, preserve the current_filtered_df_state rather than clearing or modifying it.
         # new_filtered_df_state = current_filtered_df_state # This line is effectively already done by initialization
 
+    # --- Finally block (optional, but good practice if cleanup is needed) ---
+    # finally:
+    #     # Any cleanup code can go here
+    #     pass
+
+
     if not response: # Final safety net for response, if it's somehow still empty
         response = "I'm not sure how to respond to that. Please try rephrasing or ask for help on available commands."
 
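Note on the two hunks above: they only annotate the existing structure of respond_to_chat. The handler keeps one outer try/except, preserves the previously filtered DataFrame state when something fails, and falls back to a default reply if nothing was produced. A condensed, hypothetical illustration of that containment pattern (simplified signature and state names, not the app's exact code):

    def respond_sketch(message, history, full_data, current_filtered_df):
        response = ""
        new_filtered_df = current_filtered_df  # preserved unless a filter step succeeds
        try:
            # ... command parsing / filtering of full_data would go here ...
            response = f"You said: {message!r}"
        except Exception as e:
            # Keep the previous filtered state; surface the error in the reply.
            response = f"An internal error occurred while processing your request: {e}"
        # (an optional finally: block could release resources here, as the new comments suggest)
        if not response:  # final safety net, mirroring the diff
            response = "I'm not sure how to respond to that."
        history = (history or []) + [(message, response)]
        return history, full_data, new_filtered_df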
 
@@ -1646,49 +1651,205 @@ def create_modern_interface():
             clear_chat_btn = gr.Button("Clear Chat History")
 
         # Event handlers must be defined within the Blocks context
-        example_btn.click(load_example, inputs=[], outputs=text_input)
-        clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data])
 
-        process_btn.click(
-            process_inputs,
-            inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider, generate_qr_toggle],
-            outputs=[output_json, output_gallery, output_text, chatbot_data]
-        ).then(
-            on_qr_generation,
-            inputs=[output_gallery],
-            outputs=[qr_code_paths, enabled_qr_codes]
-        )
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
+                }
+            }
+            return json.dumps(example, indent=2)
+
+        def clear_input():
+            # Clear all input fields and the chatbot data state
+            return "", None, "", None
+
+        def update_viewport(paths, enabled_states):
+            if not paths:
+                return "<p>No QR codes generated yet.</p>"
+
+            num_qr_codes = len(paths)
+            # Determine grid columns based on the number of QRs, aiming for a roughly square layout
+            cols = math.ceil(math.sqrt(num_qr_codes))
+            cols = max(1, min(cols, 6)) # Clamp columns between 1 and 6
+
+            viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'
+
+            # Ensure enabled_states is a list of indices if it's None or doesn't match current paths
+            if enabled_states is None or len(enabled_states) != num_qr_codes:
+                enabled_states = list(range(num_qr_codes))
+
+            for i, path in enumerate(paths):
+                is_enabled = i in enabled_states
+                border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
+                opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;"
+                # Use /file= prefix for Gradio to serve local files
+                viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
+                viewport_html += f'<img src="/file={path}" style="{border} {opacity}" alt="QR Code {i+1}">'
+                # Add checkbox with data-index for JS to identify which QR it controls
+                viewport_html += f'<label><input type="checkbox" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable</label>'
+                viewport_html += '</div>'
+            viewport_html += '</div>'
+
+            return viewport_html
+
+        def on_qr_generation(qr_paths_list):
+            """Handler to initialize enabled_qr_codes state after QR generation."""
+            if qr_paths_list is None:
+                num_qrs = 0
+            else:
+                num_qrs = len(qr_paths_list)
 
-        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
+            # Initially enable all generated QR codes
+            initial_enabled_states = list(range(num_qrs))
+            # Return the paths list and the initial enabled states
+            return qr_paths_list, initial_enabled_states
 
-        send_msg_btn.click(
-            respond_to_chat,
-            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state],
-            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state]
-        ).then(
-            lambda: "",
-            inputs=None,
-            outputs=chat_input
-        )
+        def process_inputs(urls, files, text, combine, crawl_depth, generate_qr_enabled):
+            """Process all inputs and generate QR codes based on toggle"""
+            results = []
+            processing_status_messages = []
 
-        chat_input.submit(
-            respond_to_chat,
-            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], # Pass filtered_chatbot_df_state here too
-            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] # And return it
-        ).then(
-            lambda: "",
-            inputs=None,
-            outputs=chat_input
-        )
+            url_processor = EnhancedURLProcessor()
+            file_processor = EnhancedFileProcessor()
 
-        clear_chat_btn.click(
-            lambda: ([], None), # Clear chat history and filtered data state
-            inputs=None,
-            outputs=[chatbot, filtered_chatbot_df_state]
-        )
+            try:
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        results.append({
+                            'source': 'json_input',
+                            'extracted_data': json_data,
+                            'timestamp': datetime.now().isoformat(),
+                            'processing_notes': ['Parsed from direct JSON input.']
+                        })
+                        processing_status_messages.append("✅ Successfully parsed direct JSON input.")
+                    except json.JSONDecodeError as e:
+                        processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}")
+                        logger.error(f"Invalid JSON format in text input: {e}")
+                    except Exception as e:
+                        processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}")
+                        logger.error(f"Error processing direct JSON input: {e}")
+
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        processing_status_messages.append(f"🌐 Processing URL: {url} with crawl depth {crawl_depth}...")
+                        # Call fetch_content_with_depth which handles recursion
+                        content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth)
+
+                        # The result from fetch_content_with_depth is already structured
+                        # It includes the main fetch_result and linked_extractions
+                        if content_result: # Check if a result dictionary was returned
+                            results.append(content_result)
+                            # Provide status based on the fetch_result within the recursive structure
+                            main_fetch_status = content_result.get('fetch_result', {}).get('status_code')
+                            if main_fetch_status is not None and 200 <= main_fetch_status < 300:
+                                processing_status_messages.append(f"✅ Processed URL: {url} (Level 0, Status: {main_fetch_status})")
+                                if content_result.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}")
+
+                                # Count successfully processed linked pages
+                                def count_successful_fetches(crawl_result):
+                                    count = 0
+                                    if crawl_result and crawl_result.get('fetch_result') is not None:
+                                        status = crawl_result['fetch_result'].get('status_code')
+                                        if status is not None and 200 <= status < 300:
+                                            count += 1
+                                    for linked_result in crawl_result.get('linked_extractions', []):
+                                        count += count_successful_fetches(linked_result)
+                                    return count
+
+                                total_attempted_links = len(content_result.get('linked_extractions', []))
+                                total_successful_linked = count_successful_fetches({'linked_extractions': content_result.get('linked_extractions', [])}) # Wrap to match expected structure
+
+                                if total_attempted_links > 0:
+                                    processing_status_messages.append(f" Processed {total_successful_linked}/{total_attempted_links} linked pages up to depth {crawl_depth}.")
+
+                            else:
+                                processing_status_messages.append(f"❌ Failed to fetch or process URL: {url} (Status: {main_fetch_status})")
+                                if content_result.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}")
+                        else:
+                            processing_status_messages.append(f"❌ Failed to process URL: {url} (No result returned)")
+
+
+                if files:
+                    for file in files:
+                        processing_status_messages.append(f"📁 Processing file: {file.name}...")
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+                            processing_status_messages.append(f"✅ Processed file: {file.name}")
+                            for res in file_results:
+                                if res.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
+                        else:
+                            processing_status_messages.append(f"❌ Failed to process file: {file.name}")
+                            # Add a default note if process_file returned empty list without notes
+                            if not file_results and file and hasattr(file, 'name'):
+                                processing_status_messages.append(f" No results returned for file: {file.name}")
+
+
+                qr_paths = []
+                final_json_output = None
+
+                if results:
+                    final_json_output = results # Assign processed data regardless of QR generation
+                    if generate_qr_enabled:
+                        processing_status_messages.append("⚙️ Generating QR codes as requested...")
+                        # generate_qr_codes expects a List[Dict]
+                        qr_paths = generate_qr_codes(results, combine)
+                        if qr_paths:
+                            processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.")
+                        else:
+                            processing_status_messages.append("❌ Failed to generate QR codes (empty result or error). Check logs.")
+                    else:
+                        processing_status_messages.append("☑️ QR code generation was disabled. Processed data is available.")
+                        qr_paths = [] # Ensure it's empty
+                else:
+                    processing_status_messages.append("⚠️ No valid content collected from inputs.")
+                    final_json_output = [] # Ensure output_json is cleared if no results
+
+            except Exception as e:
+                logger.error(f"Overall processing error in process_inputs: {e}")
+                processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}")
+                final_json_output = [] # Clear output on unexpected error
+                qr_paths = [] # Clear qrs on unexpected error
+
+            # Return the processed data, QR paths, status messages, and update chatbot_data state
+            return (
+                final_json_output,
+                [str(path) for path in qr_paths], # Return paths as strings for Gradio Gallery
+                "\n".join(processing_status_messages),
+                final_json_output # Update chatbot_data state
+            )
 
         # --- Download Logic ---
         def download_json_data(data_df: Optional[pd.DataFrame], filename_prefix: str) -> Optional[str]:
+            """Helper function to convert DataFrame to JSON file for download."""
            if data_df is None or data_df.empty:
                logger.info(f"No data provided for download with prefix '{filename_prefix}'.")
                return None
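
Note: the bulk of this hunk moves the helper callbacks (load_example, clear_input, update_viewport, on_qr_generation, process_inputs) above the event wiring, so every function exists inside the gr.Blocks context before a .click()/.then() chain refers to it. A minimal, self-contained sketch of that ordering and of the .then() chaining used for the QR step (component names here are illustrative, not the app's actual layout):

    import gradio as gr

    def build_demo():
        with gr.Blocks() as demo:
            text_in = gr.Textbox(label="Input")
            result = gr.Textbox(label="Result")
            status = gr.Textbox(label="Status")
            run_btn = gr.Button("Process")

            # 1. Define callbacks first, inside the Blocks context.
            def process(text):
                return text.upper(), f"Processed {len(text)} characters"

            def after_process(value):
                return f"Post-processing step saw: {value!r}"

            # 2. Wire events afterwards; .then() runs a follow-up callback
            #    once the first one has returned and updated its outputs.
            run_btn.click(process, inputs=[text_in], outputs=[result, status]).then(
                after_process, inputs=[result], outputs=[status]
            )
        return demo

    if __name__ == "__main__":
        build_demo().launch()
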
@@ -1715,13 +1876,14 @@ def create_modern_interface():
                 return None
 
         def handle_download_full_json(current_chatbot_data_state: Optional[List[Dict]]) -> Optional[str]:
-            # This function receives the full processed data (List[Dict])
+            """Handler for the 'Download Full JSON' button."""
+            # This function receives the full processed data (List[Dict]) from the chatbot_data state
             if not current_chatbot_data_state:
                 logger.info("No full data available to download.")
                 return None
 
             try:
-                # Attempt to create a DataFrame from the full data state
+                # Attempt to create a DataFrame from the full data state for consistent output structure
                 # This uses the same flattening logic as the chatbot
                 flat_data = []
                 def flatten_item_for_download(d, parent_key='', sep='_'):
@@ -1743,8 +1905,10 @@ def create_modern_interface():
                 for item in current_chatbot_data_state:
                     if isinstance(item, dict):
                         flat_data.append(flatten_item_for_download(item))
+                    # Handle cases where top-level items might not be dicts, wrap them
                     elif isinstance(item, (list, str, int, float, bool, type(None))):
-                        flat_data.append({'item_value': item})
+                        flat_data.append({'item_value': item})
+
 
                 if not flat_data:
                     logger.info("Full data flattened to empty list. Nothing to download.")
@@ -1765,24 +1929,77 @@ def create_modern_interface():
 
 
         def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]:
-            # This function receives the already filtered DataFrame
+            """Handler for the 'Download Filtered JSON' button."""
+            # This function receives the already filtered DataFrame from the state
            if current_filtered_df_state is None or current_filtered_df_state.empty:
                logger.info("No filtered data available to download.")
                return None
            # Pass the DataFrame directly to the generic download function
            return download_json_data(current_filtered_df_state, "filtered_data")
 
+
+        # Connect event handlers within the Blocks context
+        example_btn.click(load_example, inputs=[], outputs=text_input)
+        clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data])
+
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider, generate_qr_toggle],
+            outputs=[output_json, output_gallery, output_text, chatbot_data]
+        ).then(
+            # This .then() is triggered after process_inputs completes and updates output_gallery
+            on_qr_generation,
+            inputs=[output_gallery], # Pass the list of QR paths from the gallery output
+            outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables
+        )
+
+        # When the viewport tab is selected, update the viewport HTML
+        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
+
+        # Chatbot send button and text input submit events
+        send_msg_btn.click(
+            respond_to_chat,
+            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state],
+            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state]
+        ).then(
+            # Clear the chat input box after sending message
+            lambda: "",
+            inputs=None,
+            outputs=chat_input
+        )
+
+        chat_input.submit( # Allow submitting by pressing Enter in the text box
+            respond_to_chat,
+            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], # Pass filtered_chatbot_df_state here too
+            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] # And return it
+        ).then(
+            # Clear the chat input box after submitting
+            lambda: "",
+            inputs=None,
+            outputs=chat_input
+        )
+
+        # Clear chat history button
+        clear_chat_btn.click(
+            # Clear chat history component and the filtered data state
+            lambda: ([], None),
+            inputs=None,
+            outputs=[chatbot, filtered_chatbot_df_state]
+        )
+
+        # Download buttons
         download_full_json_btn.click(
             fn=handle_download_full_json,
-            inputs=[chatbot_data], # chatbot_data is the gr.State holding the full dataset (List[Dict])
-            outputs=[download_file_output]
+            inputs=[chatbot_data], # chatbot_data is the gr.State holding the full dataset (List[Dict])
+            outputs=[download_file_output] # The File component acts as the download trigger
         )
         download_filtered_json_btn.click(
             fn=handle_download_filtered_json,
-            inputs=[filtered_chatbot_df_state], # This state holds the filtered DataFrame
-            outputs=[download_file_output]
+            inputs=[filtered_chatbot_df_state], # This state holds the filtered DataFrame
+            outputs=[download_file_output] # The File component acts as the download trigger
        )
 
+
        gr.Markdown("""
        ### 🚀 Features
        - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth. **(Now performs real fetching)**
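
Note: download_json_data (context above) is the common sink both handlers feed. handle_download_full_json flattens the raw list of dicts into a DataFrame first, while handle_download_filtered_json passes its DataFrame straight through, and the returned file path is what the gr.File output serves. A rough sketch of that DataFrame-to-file handoff, assuming a temporary-file approach (helper name and details are illustrative, not the app's exact implementation):

    import json
    import tempfile
    from datetime import datetime
    from typing import Optional

    import pandas as pd

    def dataframe_to_json_file(data_df: Optional[pd.DataFrame], filename_prefix: str) -> Optional[str]:
        """Serialize a DataFrame to a temporary .json file and return its path (None if empty)."""
        if data_df is None or data_df.empty:
            return None
        records = data_df.to_dict(orient="records")  # list of row dicts, like the flattened data
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        tmp = tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", suffix=".json",
            prefix=f"{filename_prefix}_{stamp}_", delete=False
        )
        with tmp as f:
            json.dump(records, f, indent=2, default=str)  # default=str guards non-JSON values
        return tmp.name  # a gr.File output can serve this path as the download
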
@@ -1818,10 +2035,10 @@ def main():
         mimetypes.init()
         interface = create_modern_interface()
         interface.launch(
-            share=False,
-            debug=False,
-            show_error=True,
-            show_api=False
+            share=False, # Set to True to create a public link (requires auth token)
+            debug=False, # Set to True for detailed debug output
+            show_error=True, # Show errors in the UI
+            show_api=False # Hide API endpoint details
         )
     except Exception as e:
         logger.error(f"Application startup error: {e}")
@@ -1829,4 +2046,5 @@ def main():
         raise
 
 if __name__ == "__main__":
+    # Ensure the script is run directly (not imported)
     main()