Spaces:

Arxived
/

search-patents-datewise

Sleeping

App Files Community

DrishtiSharma committed on Dec 27, 2024

Commit

ee17e02

verified ·

1 Parent(s): 903ffe4

Update patentwiz/preprocess_data.py

Browse files

Files changed (1) hide show

patentwiz/preprocess_data.py +51 -36

patentwiz/preprocess_data.py CHANGED Viewed

@@ -83,9 +83,15 @@ def download_weekly_patents(year, month, day, logging):
         )
         return False
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
     """
     import streamlit as st  # Use Streamlit for debugging
@@ -136,44 +142,53 @@ def filter_rf_patents(patents, keywords=None, fields=None):
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
-    filtered_patents = []
-    # Display first 5 patents for inspection (before parsing)
-    st.write("Debugging: First 5 raw patents for inspection")
-    for patent in patents[:5]:
-        st.write(patent)  # Display raw data
-    for patent in patents:
-        if isinstance(patent, str):
-            parsed_patent = parse_patent(patent)
             if not parsed_patent:
                 continue
-        elif isinstance(patent, dict):
-            parsed_patent = patent
-        else:
-            st.write(f"Unknown patent format: {type(patent)}")
-            continue
-        # Field-specific matching
-        matched = False
-        for field in fields:
-            field_content = parsed_patent.get(field, "")
-            st.write(f"Checking field '{field}': {field_content}")
-            if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
-                st.write(f"Match found in field '{field}'")
-                filtered_patents.append(parsed_patent)
-                matched = True
-                break
-        # Global fallback if no fields match
-        if not matched:
-            full_text = " ".join(parsed_patent.values())  # Combine all fields
-            if any(keyword.lower() in full_text.lower() for keyword in keywords):
-                st.write("Match found in global fallback search!")
-                filtered_patents.append(parsed_patent)
-    st.write(f"Total filtered patents: {len(filtered_patents)}")
-    return filtered_patents

         )
         return False
+import os
+import xml.etree.ElementTree as ET
+import tempfile
 def filter_rf_patents(patents, keywords=None, fields=None):
     """
     Filters patents based on keywords and specified fields, with parsing for raw patent files.
+    Supports temporary storage for environments like Hugging Face Spaces.
     """
     import streamlit as st  # Use Streamlit for debugging
             st.write(f"Error parsing patent {file_path}: {e}")
             return None
+    # Use temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        st.write(f"Using temporary directory: {temp_dir}")
+        # Simulate saving raw patent files (e.g., for testing)
+        temp_files = []
+        for i, patent in enumerate(patents):
+            if isinstance(patent, str) and patent.endswith(".txt"):
+                # Save fake patent data as text files in the temp directory
+                temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
+                with open(temp_file_path, "w") as f:
+                    f.write(patent)
+                temp_files.append(temp_file_path)
+        # Display first 5 patents for inspection (before parsing)
+        st.write("Debugging: First 5 raw patents for inspection")
+        for patent in temp_files[:5]:
+            st.write(patent)  # Display file paths
+        filtered_patents = []
+        for patent_file in temp_files:
+            parsed_patent = parse_patent(patent_file)
             if not parsed_patent:
                 continue
+            # Field-specific matching
+            matched = False
+            for field in fields:
+                field_content = parsed_patent.get(field, "")
+                st.write(f"Checking field '{field}': {field_content}")
+                if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
+                    st.write(f"Match found in field '{field}'")
+                    filtered_patents.append(parsed_patent)
+                    matched = True
+                    break
+            # Global fallback if no fields match
+            if not matched:
+                full_text = " ".join(parsed_patent.values())  # Combine all fields
+                if any(keyword.lower() in full_text.lower() for keyword in keywords):
+                    st.write("Match found in global fallback search!")
+                    filtered_patents.append(parsed_patent)
+        st.write(f"Total filtered patents: {len(filtered_patents)}")
+        return filtered_patents