Spaces:

Arxived
/

search-patents-datewise

Sleeping

App Files Community

DrishtiSharma committed on Dec 27, 2024

Commit

c9e8376

verified ·

1 Parent(s): 0b7fc3c

Update patentwiz/preprocess_data.py

Browse files

Files changed (1) hide show

patentwiz/preprocess_data.py +37 -50

patentwiz/preprocess_data.py CHANGED Viewed

@@ -83,16 +83,11 @@ def download_weekly_patents(year, month, day, logging):
         )
         return False
-import os
-import xml.etree.ElementTree as ET
-import tempfile
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
-    Includes enhanced debugging to identify issues.
     """
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
@@ -116,7 +111,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
     def parse_patent(file_path):
         """
         Parses an XML patent file into a structured dictionary.
-        Includes debugging for XML structure issues.
         """
         try:
             tree = ET.parse(file_path)
@@ -142,52 +136,45 @@ def filter_rf_patents(patents, keywords=None, fields=None):
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
-    # Use temporary directory for patents
-    with tempfile.TemporaryDirectory() as temp_dir:
-        st.write(f"Using temporary directory: {temp_dir}")
-        # Save and inspect raw patent data
-        temp_files = []
-        for i, patent in enumerate(patents):
-            if isinstance(patent, str) and patent.endswith(".txt"):
-                temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
-                with open(temp_file_path, "w") as f:
-                    f.write(patent)  # Save raw data to temp file
-                temp_files.append(temp_file_path)
-        st.write("Display first 5 raw patents for inspection")
-        for patent in temp_files[:5]:
-            st.write(patent)
-        filtered_patents = []
-        for patent_file in temp_files:
-            parsed_patent = parse_patent(patent_file)
-            st.write("Parsed patent data:", parsed_patent)  # Log parsed data
             if not parsed_patent:
                 continue
-            # Field-specific matching
-            matched = False
-            for field in fields:
-                field_content = parsed_patent.get(field, "")
-                if not field_content:
-                    st.write(f"Field '{field}' is empty for patent:", parsed_patent)
-                    continue
-                if any(keyword.lower() in field_content.lower() for keyword in keywords):
-                    st.write(f"Match found in field '{field}'")
-                    filtered_patents.append(parsed_patent)
-                    matched = True
-                    break
-            # Global fallback if no fields match
-            if not matched:
-                full_text = " ".join(parsed_patent.values())
-                if any(keyword.lower() in full_text.lower() for keyword in keywords):
-                    st.write("Match found in global fallback search!")
-                    filtered_patents.append(parsed_patent)
-        st.write(f"Total filtered patents: {len(filtered_patents)}")
-        return filtered_patents

         )
         return False
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
     """
+    import streamlit as st  # Use Streamlit for debugging
     if keywords is None:
         keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"]  # Default keywords
     def parse_patent(file_path):
         """
         Parses an XML patent file into a structured dictionary.
         """
         try:
             tree = ET.parse(file_path)
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
+    filtered_patents = []
+    # Display first 5 patents for inspection (before parsing)
+    st.write("Debugging: First 5 raw patents for inspection")
+    for patent in patents[:5]:
+        st.write(patent)  # Display raw data
+    for patent in patents:
+        if isinstance(patent, str):
+            parsed_patent = parse_patent(patent)
             if not parsed_patent:
                 continue
+        elif isinstance(patent, dict):
+            parsed_patent = patent
+        else:
+            st.write(f"Unknown patent format: {type(patent)}")
+            continue
+        # Field-specific matching
+        matched = False
+        for field in fields:
+            field_content = parsed_patent.get(field, "")
+            st.write(f"Checking field '{field}': {field_content}")
+            if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
+                st.write(f"Match found in field '{field}'")
+                filtered_patents.append(parsed_patent)
+                matched = True
+                break
+        # Global fallback if no fields match
+        if not matched:
+            full_text = " ".join(parsed_patent.values())  # Combine all fields
+            if any(keyword.lower() in full_text.lower() for keyword in keywords):
+                st.write("Match found in global fallback search!")
+                filtered_patents.append(parsed_patent)
+    st.write(f"Total filtered patents: {len(filtered_patents)}")
+    return filtered_patents