acecalisto3 committed
Commit 03fc5c6 · verified · 1 Parent(s): 4cff691

Update app2.py

Files changed (1):
  1. app2.py +267 -49
app2.py CHANGED
@@ -769,8 +769,6 @@ class EnhancedFileProcessor:
                     elif extracted_file_path.suffix.lower() in self.archive_extensions:
                         logger.info(f"Found nested archive '{member.name}', processing recursively.")
                         dataset.extend(self._process_archive(extracted_file_path, extract_to))
-                    else:
-                        logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
                 else:
                     logger.warning(f"Could not get file-like object for {member.name} from tar.")
 
@@ -1413,6 +1411,7 @@ def respond_to_chat(
                 "I'm equipped to filter your data. Try 'find entries where name contains widget'."
             ])
 
+    # --- End of main try block ---
     except Exception as e:
         logger.error(f"Chatbot runtime error: {e}")
         response = f"An internal error occurred while processing your request: {e}"
@@ -1420,6 +1419,12 @@ def respond_to_chat(
         # On unexpected error, preserve the current_filtered_df_state rather than clearing or modifying it.
         # new_filtered_df_state = current_filtered_df_state # This line is effectively already done by initialization
 
+    # --- Finally block (optional, but good practice if cleanup is needed) ---
+    # finally:
+    #     # Any cleanup code can go here
+    #     pass
+
+
     if not response: # Final safety net for response, if it's somehow still empty
         response = "I'm not sure how to respond to that. Please try rephrasing or ask for help on available commands."
 
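Note on the two hunks above: they only annotate the existing structure of respond_to_chat. The handler keeps one outer try/except, preserves the previously filtered DataFrame state when something fails, and falls back to a default reply if nothing was produced. A condensed, hypothetical illustration of that containment pattern (simplified signature and state names, not the app's exact code):

    def respond_sketch(message, history, full_data, current_filtered_df):
        response = ""
        new_filtered_df = current_filtered_df  # preserved unless a filter step succeeds
        try:
            # ... command parsing / filtering of full_data would go here ...
            response = f"You said: {message!r}"
        except Exception as e:
            # Keep the previous filtered state; surface the error in the reply.
            response = f"An internal error occurred while processing your request: {e}"
        # (an optional finally: block could release resources here, as the new comments suggest)
        if not response:  # final safety net, mirroring the diff
            response = "I'm not sure how to respond to that."
        history = (history or []) + [(message, response)]
        return history, full_data, new_filtered_df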
 
@@ -1646,49 +1651,205 @@ def create_modern_interface():
             clear_chat_btn = gr.Button("Clear Chat History")
 
         # Event handlers must be defined within the Blocks context
-        example_btn.click(load_example, inputs=[], outputs=text_input)
-        clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data])
 
-        process_btn.click(
-            process_inputs,
-            inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider, generate_qr_toggle],
-            outputs=[output_json, output_gallery, output_text, chatbot_data]
-        ).then(
-            on_qr_generation,
-            inputs=[output_gallery],
-            outputs=[qr_code_paths, enabled_qr_codes]
-        )
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
+                }
+            }
+            return json.dumps(example, indent=2)
+
+        def clear_input():
+            # Clear all input fields and the chatbot data state
+            return "", None, "", None
+
+        def update_viewport(paths, enabled_states):
+            if not paths:
+                return "<p>No QR codes generated yet.</p>"
+
+            num_qr_codes = len(paths)
+            # Determine grid columns based on the number of QRs, aiming for a roughly square layout
+            cols = math.ceil(math.sqrt(num_qr_codes))
+            cols = max(1, min(cols, 6)) # Clamp columns between 1 and 6
+
+            viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'
+
+            # Ensure enabled_states is a list of indices if it's None or doesn't match current paths
+            if enabled_states is None or len(enabled_states) != num_qr_codes:
+                enabled_states = list(range(num_qr_codes))
+
+            for i, path in enumerate(paths):
+                is_enabled = i in enabled_states
+                border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
+                opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;"
+                # Use /file= prefix for Gradio to serve local files
+                viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
+                viewport_html += f'<img src="/file={path}" style="{border} {opacity}" alt="QR Code {i+1}">'
+                # Add checkbox with data-index for JS to identify which QR it controls
+                viewport_html += f'<label><input type="checkbox" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable</label>'
+                viewport_html += '</div>'
+            viewport_html += '</div>'
+
+            return viewport_html
+
+        def on_qr_generation(qr_paths_list):
+            """Handler to initialize enabled_qr_codes state after QR generation."""
+            if qr_paths_list is None:
+                num_qrs = 0
+            else:
+                num_qrs = len(qr_paths_list)
 
-        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
+            # Initially enable all generated QR codes
+            initial_enabled_states = list(range(num_qrs))
+            # Return the paths list and the initial enabled states
+            return qr_paths_list, initial_enabled_states
 
-        send_msg_btn.click(
-            respond_to_chat,
-            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state],
-            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state]
-        ).then(
-            lambda: "",
-            inputs=None,
-            outputs=chat_input
-        )
+        def process_inputs(urls, files, text, combine, crawl_depth, generate_qr_enabled):
+            """Process all inputs and generate QR codes based on toggle"""
+            results = []
+            processing_status_messages = []
 
-        chat_input.submit(
-            respond_to_chat,
-            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], # Pass filtered_chatbot_df_state here too
-            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] # And return it
-        ).then(
-            lambda: "",
-            inputs=None,
-            outputs=chat_input
-        )
+            url_processor = EnhancedURLProcessor()
+            file_processor = EnhancedFileProcessor()
 
-        clear_chat_btn.click(
-            lambda: ([], None), # Clear chat history and filtered data state
-            inputs=None,
-            outputs=[chatbot, filtered_chatbot_df_state]
-        )
+            try:
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        results.append({
+                            'source': 'json_input',
+                            'extracted_data': json_data,
+                            'timestamp': datetime.now().isoformat(),
+                            'processing_notes': ['Parsed from direct JSON input.']
+                        })
+                        processing_status_messages.append("✅ Successfully parsed direct JSON input.")
+                    except json.JSONDecodeError as e:
+                        processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}")
+                        logger.error(f"Invalid JSON format in text input: {e}")
+                    except Exception as e:
+                        processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}")
+                        logger.error(f"Error processing direct JSON input: {e}")
+
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        processing_status_messages.append(f"🌐 Processing URL: {url} with crawl depth {crawl_depth}...")
+                        # Call fetch_content_with_depth which handles recursion
+                        content_result = url_processor.fetch_content_with_depth(url, max_steps=crawl_depth)
+
+                        # The result from fetch_content_with_depth is already structured
+                        # It includes the main fetch_result and linked_extractions
+                        if content_result: # Check if a result dictionary was returned
+                            results.append(content_result)
+                            # Provide status based on the fetch_result within the recursive structure
+                            main_fetch_status = content_result.get('fetch_result', {}).get('status_code')
+                            if main_fetch_status is not None and 200 <= main_fetch_status < 300:
+                                processing_status_messages.append(f"✅ Processed URL: {url} (Level 0, Status: {main_fetch_status})")
+                                if content_result.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}")
+
+                                # Count successfully processed linked pages
+                                def count_successful_fetches(crawl_result):
+                                    count = 0
+                                    if crawl_result and crawl_result.get('fetch_result') is not None:
+                                        status = crawl_result['fetch_result'].get('status_code')
+                                        if status is not None and 200 <= status < 300:
+                                            count += 1
+                                    for linked_result in crawl_result.get('linked_extractions', []):
+                                        count += count_successful_fetches(linked_result)
+                                    return count
+
+                                total_attempted_links = len(content_result.get('linked_extractions', []))
+                                total_successful_linked = count_successful_fetches({'linked_extractions': content_result.get('linked_extractions', [])}) # Wrap to match expected structure
+
+                                if total_attempted_links > 0:
+                                    processing_status_messages.append(f" Processed {total_successful_linked}/{total_attempted_links} linked pages up to depth {crawl_depth}.")
+
+                            else:
+                                processing_status_messages.append(f"❌ Failed to fetch or process URL: {url} (Status: {main_fetch_status})")
+                                if content_result.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {url}: {'; '.join(content_result['processing_notes'])}")
+                        else:
+                            processing_status_messages.append(f"❌ Failed to process URL: {url} (No result returned)")
+
+
+                if files:
+                    for file in files:
+                        processing_status_messages.append(f"📁 Processing file: {file.name}...")
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+                            processing_status_messages.append(f"✅ Processed file: {file.name}")
+                            for res in file_results:
+                                if res.get('processing_notes'):
+                                    processing_status_messages.append(f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
+                        else:
+                            processing_status_messages.append(f"❌ Failed to process file: {file.name}")
+                            # Add a default note if process_file returned empty list without notes
+                            if not file_results and file and hasattr(file, 'name'):
+                                processing_status_messages.append(f" No results returned for file: {file.name}")
+
+
+                qr_paths = []
+                final_json_output = None
+
+                if results:
+                    final_json_output = results # Assign processed data regardless of QR generation
+                    if generate_qr_enabled:
+                        processing_status_messages.append("⚙️ Generating QR codes as requested...")
+                        # generate_qr_codes expects a List[Dict]
+                        qr_paths = generate_qr_codes(results, combine)
+                        if qr_paths:
+                            processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.")
+                        else:
+                            processing_status_messages.append("❌ Failed to generate QR codes (empty result or error). Check logs.")
+                    else:
+                        processing_status_messages.append("☑️ QR code generation was disabled. Processed data is available.")
+                        qr_paths = [] # Ensure it's empty
+                else:
+                    processing_status_messages.append("⚠️ No valid content collected from inputs.")
+                    final_json_output = [] # Ensure output_json is cleared if no results
+
+            except Exception as e:
+                logger.error(f"Overall processing error in process_inputs: {e}")
+                processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}")
+                final_json_output = [] # Clear output on unexpected error
+                qr_paths = [] # Clear qrs on unexpected error
+
+            # Return the processed data, QR paths, status messages, and update chatbot_data state
+            return (
+                final_json_output,
+                [str(path) for path in qr_paths], # Return paths as strings for Gradio Gallery
+                "\n".join(processing_status_messages),
+                final_json_output # Update chatbot_data state
+            )
 
         # --- Download Logic ---
         def download_json_data(data_df: Optional[pd.DataFrame], filename_prefix: str) -> Optional[str]:
+            """Helper function to convert DataFrame to JSON file for download."""
            if data_df is None or data_df.empty:
                logger.info(f"No data provided for download with prefix '{filename_prefix}'.")
                return None
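
Note: the bulk of this hunk moves the helper callbacks (load_example, clear_input, update_viewport, on_qr_generation, process_inputs) above the event wiring, so every function exists inside the gr.Blocks context before a .click()/.then() chain refers to it. A minimal, self-contained sketch of that ordering and of the .then() chaining used for the QR step (component names here are illustrative, not the app's actual layout):

    import gradio as gr

    def build_demo():
        with gr.Blocks() as demo:
            text_in = gr.Textbox(label="Input")
            result = gr.Textbox(label="Result")
            status = gr.Textbox(label="Status")
            run_btn = gr.Button("Process")

            # 1. Define callbacks first, inside the Blocks context.
            def process(text):
                return text.upper(), f"Processed {len(text)} characters"

            def after_process(value):
                return f"Post-processing step saw: {value!r}"

            # 2. Wire events afterwards; .then() runs a follow-up callback
            #    once the first one has returned and updated its outputs.
            run_btn.click(process, inputs=[text_in], outputs=[result, status]).then(
                after_process, inputs=[result], outputs=[status]
            )
        return demo

    if __name__ == "__main__":
        build_demo().launch()
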
@@ -1715,13 +1876,14 @@ def create_modern_interface():
                 return None
 
         def handle_download_full_json(current_chatbot_data_state: Optional[List[Dict]]) -> Optional[str]:
-            # This function receives the full processed data (List[Dict])
+            """Handler for the 'Download Full JSON' button."""
+            # This function receives the full processed data (List[Dict]) from the chatbot_data state
             if not current_chatbot_data_state:
                 logger.info("No full data available to download.")
                 return None
 
             try:
-                # Attempt to create a DataFrame from the full data state
+                # Attempt to create a DataFrame from the full data state for consistent output structure
                 # This uses the same flattening logic as the chatbot
                 flat_data = []
                 def flatten_item_for_download(d, parent_key='', sep='_'):
@@ -1743,8 +1905,10 @@ def create_modern_interface():
                 for item in current_chatbot_data_state:
                     if isinstance(item, dict):
                         flat_data.append(flatten_item_for_download(item))
+                    # Handle cases where top-level items might not be dicts, wrap them
                     elif isinstance(item, (list, str, int, float, bool, type(None))):
-                        flat_data.append({'item_value': item})
+                        flat_data.append({'item_value': item})
+
 
                 if not flat_data:
                     logger.info("Full data flattened to empty list. Nothing to download.")
@@ -1765,24 +1929,77 @@ def create_modern_interface():
 
 
         def handle_download_filtered_json(current_filtered_df_state: Optional[pd.DataFrame]) -> Optional[str]:
-            # This function receives the already filtered DataFrame
+            """Handler for the 'Download Filtered JSON' button."""
+            # This function receives the already filtered DataFrame from the state
            if current_filtered_df_state is None or current_filtered_df_state.empty:
                logger.info("No filtered data available to download.")
                return None
            # Pass the DataFrame directly to the generic download function
            return download_json_data(current_filtered_df_state, "filtered_data")
 
+
+        # Connect event handlers within the Blocks context
+        example_btn.click(load_example, inputs=[], outputs=text_input)
+        clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input, chatbot_data])
+
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data, crawl_depth_slider, generate_qr_toggle],
+            outputs=[output_json, output_gallery, output_text, chatbot_data]
+        ).then(
+            # This .then() is triggered after process_inputs completes and updates output_gallery
+            on_qr_generation,
+            inputs=[output_gallery], # Pass the list of QR paths from the gallery output
+            outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables
+        )
+
+        # When the viewport tab is selected, update the viewport HTML
+        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
+
+        # Chatbot send button and text input submit events
+        send_msg_btn.click(
+            respond_to_chat,
+            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state],
+            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state]
+        ).then(
+            # Clear the chat input box after sending message
+            lambda: "",
+            inputs=None,
+            outputs=chat_input
+        )
+
+        chat_input.submit( # Allow submitting by pressing Enter in the text box
+            respond_to_chat,
+            inputs=[chat_input, chat_history, chatbot_data, filtered_chatbot_df_state], # Pass filtered_chatbot_df_state here too
+            outputs=[chatbot, chatbot_data, filtered_chatbot_df_state] # And return it
+        ).then(
+            # Clear the chat input box after submitting
+            lambda: "",
+            inputs=None,
+            outputs=chat_input
+        )
+
+        # Clear chat history button
+        clear_chat_btn.click(
+            # Clear chat history component and the filtered data state
+            lambda: ([], None),
+            inputs=None,
+            outputs=[chatbot, filtered_chatbot_df_state]
+        )
+
+        # Download buttons
         download_full_json_btn.click(
             fn=handle_download_full_json,
-            inputs=[chatbot_data], # chatbot_data is the gr.State holding the full dataset (List[Dict])
-            outputs=[download_file_output]
+            inputs=[chatbot_data], # chatbot_data is the gr.State holding the full dataset (List[Dict])
+            outputs=[download_file_output] # The File component acts as the download trigger
         )
         download_filtered_json_btn.click(
             fn=handle_download_filtered_json,
-            inputs=[filtered_chatbot_df_state], # This state holds the filtered DataFrame
-            outputs=[download_file_output]
+            inputs=[filtered_chatbot_df_state], # This state holds the filtered DataFrame
+            outputs=[download_file_output] # The File component acts as the download trigger
        )
 
+
        gr.Markdown("""
        ### 🚀 Features
        - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. Supports crawling links up to a specified depth. **(Now performs real fetching)**
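
Note: download_json_data (context above) is the common sink both handlers feed. handle_download_full_json flattens the raw list of dicts into a DataFrame first, while handle_download_filtered_json passes its DataFrame straight through, and the returned file path is what the gr.File output serves. A rough sketch of that DataFrame-to-file handoff, assuming a temporary-file approach (helper name and details are illustrative, not the app's exact implementation):

    import json
    import tempfile
    from datetime import datetime
    from typing import Optional

    import pandas as pd

    def dataframe_to_json_file(data_df: Optional[pd.DataFrame], filename_prefix: str) -> Optional[str]:
        """Serialize a DataFrame to a temporary .json file and return its path (None if empty)."""
        if data_df is None or data_df.empty:
            return None
        records = data_df.to_dict(orient="records")  # list of row dicts, like the flattened data
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        tmp = tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", suffix=".json",
            prefix=f"{filename_prefix}_{stamp}_", delete=False
        )
        with tmp as f:
            json.dump(records, f, indent=2, default=str)  # default=str guards non-JSON values
        return tmp.name  # a gr.File output can serve this path as the download
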
@@ -1818,10 +2035,10 @@ def main():
         mimetypes.init()
         interface = create_modern_interface()
         interface.launch(
-            share=False,
-            debug=False,
-            show_error=True,
-            show_api=False
+            share=False, # Set to True to create a public link (requires auth token)
+            debug=False, # Set to True for detailed debug output
+            show_error=True, # Show errors in the UI
+            show_api=False # Hide API endpoint details
         )
     except Exception as e:
         logger.error(f"Application startup error: {e}")
@@ -1829,4 +2046,5 @@ def main():
         raise
 
 if __name__ == "__main__":
+    # Ensure the script is run directly (not imported)
     main()