Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files- patentwiz/preprocess_data.py +51 -36
patentwiz/preprocess_data.py
CHANGED
@@ -83,9 +83,15 @@ def download_weekly_patents(year, month, day, logging):
|
|
83 |
)
|
84 |
return False
|
85 |
|
|
|
|
|
|
|
|
|
|
|
86 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
87 |
"""
|
88 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
|
|
89 |
"""
|
90 |
import streamlit as st # Use Streamlit for debugging
|
91 |
|
@@ -136,44 +142,53 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
136 |
st.write(f"Error parsing patent {file_path}: {e}")
|
137 |
return None
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
if not parsed_patent:
|
150 |
continue
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
filtered_patents.append(parsed_patent)
|
174 |
-
|
175 |
-
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
176 |
-
return filtered_patents
|
177 |
|
178 |
|
179 |
|
|
|
83 |
)
|
84 |
return False
|
85 |
|
86 |
+
import os
|
87 |
+
import xml.etree.ElementTree as ET
|
88 |
+
import tempfile
|
89 |
+
|
90 |
+
|
91 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
92 |
"""
|
93 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
94 |
+
Supports temporary storage for environments like Hugging Face Spaces.
|
95 |
"""
|
96 |
import streamlit as st # Use Streamlit for debugging
|
97 |
|
|
|
142 |
st.write(f"Error parsing patent {file_path}: {e}")
|
143 |
return None
|
144 |
|
145 |
+
# Use temporary directory
|
146 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
147 |
+
st.write(f"Using temporary directory: {temp_dir}")
|
148 |
+
|
149 |
+
# Simulate saving raw patent files (e.g., for testing)
|
150 |
+
temp_files = []
|
151 |
+
for i, patent in enumerate(patents):
|
152 |
+
if isinstance(patent, str) and patent.endswith(".txt"):
|
153 |
+
# Save fake patent data as text files in the temp directory
|
154 |
+
temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
|
155 |
+
with open(temp_file_path, "w") as f:
|
156 |
+
f.write(patent)
|
157 |
+
temp_files.append(temp_file_path)
|
158 |
+
|
159 |
+
# Display first 5 patents for inspection (before parsing)
|
160 |
+
st.write("Debugging: First 5 raw patents for inspection")
|
161 |
+
for patent in temp_files[:5]:
|
162 |
+
st.write(patent) # Display file paths
|
163 |
+
|
164 |
+
filtered_patents = []
|
165 |
+
|
166 |
+
for patent_file in temp_files:
|
167 |
+
parsed_patent = parse_patent(patent_file)
|
168 |
if not parsed_patent:
|
169 |
continue
|
170 |
+
|
171 |
+
# Field-specific matching
|
172 |
+
matched = False
|
173 |
+
for field in fields:
|
174 |
+
field_content = parsed_patent.get(field, "")
|
175 |
+
st.write(f"Checking field '{field}': {field_content}")
|
176 |
+
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
177 |
+
st.write(f"Match found in field '{field}'")
|
178 |
+
filtered_patents.append(parsed_patent)
|
179 |
+
matched = True
|
180 |
+
break
|
181 |
+
|
182 |
+
# Global fallback if no fields match
|
183 |
+
if not matched:
|
184 |
+
full_text = " ".join(parsed_patent.values()) # Combine all fields
|
185 |
+
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
186 |
+
st.write("Match found in global fallback search!")
|
187 |
+
filtered_patents.append(parsed_patent)
|
188 |
+
|
189 |
+
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
190 |
+
return filtered_patents
|
191 |
+
|
|
|
|
|
|
|
|
|
192 |
|
193 |
|
194 |
|