Spaces:

Arxived
/

search-patents-datewise

Sleeping

App Files Community

DrishtiSharma committed on Dec 27, 2024

Commit

dd6b5f9

verified ·

1 Parent(s): 9b50deb

Update patentwiz/preprocess_data.py

Browse files

Files changed (1) hide show

patentwiz/preprocess_data.py +62 -37

patentwiz/preprocess_data.py CHANGED Viewed

@@ -85,9 +85,9 @@ def download_weekly_patents(year, month, day, logging):
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
-    Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
-    Includes Streamlit-based debugging to display progress and results in the UI.
     """
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
@@ -103,49 +103,74 @@ def filter_rf_patents(patents, keywords=None, fields=None):
         "claims": "Claims",
         "CLAIMS": "Claims",
         "detailed description": "Detailed Description",
-        "DETAILED DESCRIPTION": "Detailed Description"
     }
     filtered_patents = []
-    # Display first 5 patents for inspection
-    st.write("Display First 5 patents for inspection")
     for patent in patents[:5]:
-        st.json(patent)
     for patent in patents:
-        if isinstance(patent, str):
-            # Global keyword search for string-type patents
-            if any(keyword.lower() in patent.lower() for keyword in keywords):
-                st.write(f"Match found in string patent!")
-                filtered_patents.append(patent)
-        elif isinstance(patent, dict):
-            # Normalize field names
-            normalized_patent = {}
-            for field, content in patent.items():
-                # Map field names to standard format
-                normalized_field = FIELD_NAME_MAPPING.get(field, field)
-                normalized_patent[normalized_field] = content
-            # Field-specific match
-            matched = False
-            for field in fields:
-                field_content = normalized_patent.get(field, "")
-                st.write(f"Checking field '{field}': {field_content}")
-                if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
-                    st.write(f"Match found in field '{field}'")
-                    filtered_patents.append(normalized_patent)
-                    matched = True
-                    break
-            # Global fallback if no fields match
-            if not matched:
-                full_text = " ".join(normalized_patent.values())  # Combine all fields
-                if any(keyword.lower() in full_text.lower() for keyword in keywords):
-                    st.write(f"Match found in global search!")
-                    filtered_patents.append(normalized_patent)
         else:
-            st.write(f"Unknown patent format: {type(patent)}")  # Handle unexpected data formats
     st.write(f"Total filtered patents: {len(filtered_patents)}")
     return filtered_patents

 def filter_rf_patents(patents, keywords=None, fields=None):
     """
+    Filters patents based on keywords and specified fields, with parsing for raw patent files.
     """
+    import streamlit as st  # Use Streamlit for debugging
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
         "claims": "Claims",
         "CLAIMS": "Claims",
         "detailed description": "Detailed Description",
+        "DETAILED DESCRIPTION": "Detailed Description",
+        "title": "Title",
+        "TITLE": "Title",
     }
+    def parse_patent(file_path):
+        """
+        Parses an XML patent file into a structured dictionary.
+        """
+        try:
+            tree = ET.parse(file_path)
+            root = tree.getroot()
+            # Extract fields from XML (adjust based on actual XML structure)
+            patent_data = {
+                "Title": root.findtext(".//title", default=""),
+                "Abstract": root.findtext(".//abstract", default=""),
+                "Summary": root.findtext(".//summary", default=""),
+                "Claims": root.findtext(".//claims", default=""),
+                "Detailed Description": root.findtext(".//detailedDescription", default=""),
+            }
+            # Normalize field names
+            normalized_patent = {}
+            for field, content in patent_data.items():
+                normalized_field = FIELD_NAME_MAPPING.get(field, field)
+                normalized_patent[normalized_field] = content.strip() if content else ""
+            return normalized_patent
+        except Exception as e:
+            st.write(f"Error parsing patent {file_path}: {e}")
+            return None
     filtered_patents = []
+    # Display first 5 patents for inspection (before parsing)
+    st.write("Debugging: First 5 raw patents for inspection")
     for patent in patents[:5]:
+        st.write(patent)  # Display raw data
     for patent in patents:
+        if isinstance(patent, str):
+            parsed_patent = parse_patent(patent)
+            if not parsed_patent:
+                continue
+        elif isinstance(patent, dict):
+            parsed_patent = patent
         else:
+            st.write(f"Unknown patent format: {type(patent)}")
+            continue
+        # Field-specific matching
+        matched = False
+        for field in fields:
+            field_content = parsed_patent.get(field, "")
+            st.write(f"Checking field '{field}': {field_content}")
+            if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
+                st.write(f"Match found in field '{field}'")
+                filtered_patents.append(parsed_patent)
+                matched = True
+                break
+        # Global fallback if no fields match
+        if not matched:
+            full_text = " ".join(parsed_patent.values())  # Combine all fields
+            if any(keyword.lower() in full_text.lower() for keyword in keywords):
+                st.write("Match found in global fallback search!")
+                filtered_patents.append(parsed_patent)
     st.write(f"Total filtered patents: {len(filtered_patents)}")
     return filtered_patents