Spaces:

Arxived
/

search-patents-datewise

Sleeping

App Files Community

DrishtiSharma committed on Dec 27, 2024

Commit

c675721

verified ·

1 Parent(s): 5e84d1b

Update patentwiz/preprocess_data.py

Browse files

Files changed (1) hide show

patentwiz/preprocess_data.py +30 -17

patentwiz/preprocess_data.py CHANGED Viewed

@@ -84,7 +84,8 @@ def download_weekly_patents(year, month, day, logging):
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
-    Filters patents based on keywords and specified fields, with fallback for inconsistent field names.
     Parameters:
         patents (list): List of patent texts (as strings or structured data).
         keywords (list): Keywords to filter patents.
@@ -111,34 +112,46 @@ def filter_rf_patents(patents, keywords=None, fields=None):
     filtered_patents = []
     for patent in patents:
-        # Debugging: Print patent data
         print(f"Processing patent: {patent}")
-        # Normalize field names in the patent dictionary
-        if isinstance(patent, dict):
             normalized_patent = {}
             for field, content in patent.items():
                 normalized_field = FIELD_NAME_MAPPING.get(field, field)  # Map to standard field name
                 normalized_patent[normalized_field] = content
             patent = normalized_patent
-        # Field-specific match
-        matched = False
-        for field in fields:
-            field_content = patent.get(field, "")
-            if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
-                filtered_patents.append(patent)
-                matched = True
-                break
-        # Global fallback if no fields match
-        if not matched:
-            full_text = " ".join(patent.values())  # Combine all fields into one string
-            if any(keyword.lower() in full_text.lower() for keyword in keywords):
-                filtered_patents.append(patent)
     return filtered_patents
 def extract_patents(year, month, day, logging):
     """
     This function reads a patent file in XML format, splits it into individual patents, parses each

 def filter_rf_patents(patents, keywords=None, fields=None):
     """
+    Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
+    Handles both string and dictionary-type patent representations.
     Parameters:
         patents (list): List of patent texts (as strings or structured data).
         keywords (list): Keywords to filter patents.
     filtered_patents = []
     for patent in patents:
+        # Debugging: Print patent data type
         print(f"Processing patent: {patent}")
+        # Case 1: Handle string-type patents (global search)
+        if isinstance(patent, str):
+            if any(keyword.lower() in patent.lower() for keyword in keywords):
+                filtered_patents.append(patent)
+                continue
+        # Case 2: Handle dictionary-type patents
+        elif isinstance(patent, dict):
+            # Normalize field names in the patent dictionary
             normalized_patent = {}
             for field, content in patent.items():
                 normalized_field = FIELD_NAME_MAPPING.get(field, field)  # Map to standard field name
                 normalized_patent[normalized_field] = content
             patent = normalized_patent
+            # Field-specific match
+            matched = False
+            for field in fields:
+                field_content = patent.get(field, "")
+                if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
+                    filtered_patents.append(patent)
+                    matched = True
+                    break
+            # Global fallback if no fields match
+            if not matched:
+                full_text = " ".join(patent.values())  # Combine all fields into one string
+                if any(keyword.lower() in full_text.lower() for keyword in keywords):
+                    filtered_patents.append(patent)
+        else:
+            # Handle unexpected data formats gracefully
+            print(f"Unknown patent format: {type(patent)}")
     return filtered_patents
 def extract_patents(year, month, day, logging):
     """
     This function reads a patent file in XML format, splits it into individual patents, parses each