Spaces:

Arxived
/

search-patents-datewise

Sleeping

App Files Community

DrishtiSharma committed on Dec 27, 2024

Commit

0b7fc3c

verified ·

1 Parent(s): ee17e02

Update patentwiz/preprocess_data.py

Browse files

Files changed (1) hide show

patentwiz/preprocess_data.py +13 -15

patentwiz/preprocess_data.py CHANGED Viewed

@@ -91,9 +91,8 @@ import tempfile
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
-    Supports temporary storage for environments like Hugging Face Spaces.
     """
-    import streamlit as st  # Use Streamlit for debugging
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
@@ -117,6 +116,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
     def parse_patent(file_path):
         """
         Parses an XML patent file into a structured dictionary.
         """
         try:
             tree = ET.parse(file_path)
@@ -142,29 +142,27 @@ def filter_rf_patents(patents, keywords=None, fields=None):
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
-    # Use temporary directory
     with tempfile.TemporaryDirectory() as temp_dir:
         st.write(f"Using temporary directory: {temp_dir}")
-        # Simulate saving raw patent files (e.g., for testing)
         temp_files = []
         for i, patent in enumerate(patents):
             if isinstance(patent, str) and patent.endswith(".txt"):
-                # Save fake patent data as text files in the temp directory
                 temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
                 with open(temp_file_path, "w") as f:
-                    f.write(patent)
                 temp_files.append(temp_file_path)
-        # Display first 5 patents for inspection (before parsing)
-        st.write("Debugging: First 5 raw patents for inspection")
         for patent in temp_files[:5]:
-            st.write(patent)  # Display file paths
         filtered_patents = []
         for patent_file in temp_files:
             parsed_patent = parse_patent(patent_file)
             if not parsed_patent:
                 continue
@@ -172,8 +170,10 @@ def filter_rf_patents(patents, keywords=None, fields=None):
             matched = False
             for field in fields:
                 field_content = parsed_patent.get(field, "")
-                st.write(f"Checking field '{field}': {field_content}")
-                if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
                     st.write(f"Match found in field '{field}'")
                     filtered_patents.append(parsed_patent)
                     matched = True
@@ -181,7 +181,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
             # Global fallback if no fields match
             if not matched:
-                full_text = " ".join(parsed_patent.values())  # Combine all fields
                 if any(keyword.lower() in full_text.lower() for keyword in keywords):
                     st.write("Match found in global fallback search!")
                     filtered_patents.append(parsed_patent)
@@ -191,8 +191,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
 def extract_patents(year, month, day, logging):
     """
     This function reads a patent file in XML format, splits it into individual patents, parses each

 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
+    Includes enhanced debugging to identify issues.
     """
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
     def parse_patent(file_path):
         """
         Parses an XML patent file into a structured dictionary.
+        Includes debugging for XML structure issues.
         """
         try:
             tree = ET.parse(file_path)
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
+    # Use temporary directory for patents
     with tempfile.TemporaryDirectory() as temp_dir:
         st.write(f"Using temporary directory: {temp_dir}")
+        # Save and inspect raw patent data
         temp_files = []
         for i, patent in enumerate(patents):
             if isinstance(patent, str) and patent.endswith(".txt"):
                 temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
                 with open(temp_file_path, "w") as f:
+                    f.write(patent)  # Save raw data to temp file
                 temp_files.append(temp_file_path)
+        st.write("Display first 5 raw patents for inspection")
         for patent in temp_files[:5]:
+            st.write(patent)
         filtered_patents = []
         for patent_file in temp_files:
             parsed_patent = parse_patent(patent_file)
+            st.write("Parsed patent data:", parsed_patent)  # Log parsed data
             if not parsed_patent:
                 continue
             matched = False
             for field in fields:
                 field_content = parsed_patent.get(field, "")
+                if not field_content:
+                    st.write(f"Field '{field}' is empty for patent:", parsed_patent)
+                    continue
+                if any(keyword.lower() in field_content.lower() for keyword in keywords):
                     st.write(f"Match found in field '{field}'")
                     filtered_patents.append(parsed_patent)
                     matched = True
             # Global fallback if no fields match
             if not matched:
+                full_text = " ".join(parsed_patent.values())
                 if any(keyword.lower() in full_text.lower() for keyword in keywords):
                     st.write("Match found in global fallback search!")
                     filtered_patents.append(parsed_patent)
 def extract_patents(year, month, day, logging):
     """
     This function reads a patent file in XML format, splits it into individual patents, parses each