Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files- patentwiz/preprocess_data.py +37 -50
patentwiz/preprocess_data.py
CHANGED
@@ -83,16 +83,11 @@ def download_weekly_patents(year, month, day, logging):
|
|
83 |
)
|
84 |
return False
|
85 |
|
86 |
-
import os
|
87 |
-
import xml.etree.ElementTree as ET
|
88 |
-
import tempfile
|
89 |
-
|
90 |
-
|
91 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
92 |
"""
|
93 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
94 |
-
Includes enhanced debugging to identify issues.
|
95 |
"""
|
|
|
96 |
|
97 |
if keywords is None:
|
98 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
@@ -116,7 +111,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
116 |
def parse_patent(file_path):
|
117 |
"""
|
118 |
Parses an XML patent file into a structured dictionary.
|
119 |
-
Includes debugging for XML structure issues.
|
120 |
"""
|
121 |
try:
|
122 |
tree = ET.parse(file_path)
|
@@ -142,52 +136,45 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
142 |
st.write(f"Error parsing patent {file_path}: {e}")
|
143 |
return None
|
144 |
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
f.write(patent) # Save raw data to temp file
|
156 |
-
temp_files.append(temp_file_path)
|
157 |
-
|
158 |
-
st.write("Display first 5 raw patents for inspection")
|
159 |
-
for patent in temp_files[:5]:
|
160 |
-
st.write(patent)
|
161 |
-
|
162 |
-
filtered_patents = []
|
163 |
-
for patent_file in temp_files:
|
164 |
-
parsed_patent = parse_patent(patent_file)
|
165 |
-
st.write("Parsed patent data:", parsed_patent) # Log parsed data
|
166 |
if not parsed_patent:
|
167 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
-
# Field-specific matching
|
170 |
-
matched = False
|
171 |
-
for field in fields:
|
172 |
-
field_content = parsed_patent.get(field, "")
|
173 |
-
if not field_content:
|
174 |
-
st.write(f"Field '{field}' is empty for patent:", parsed_patent)
|
175 |
-
continue
|
176 |
-
if any(keyword.lower() in field_content.lower() for keyword in keywords):
|
177 |
-
st.write(f"Match found in field '{field}'")
|
178 |
-
filtered_patents.append(parsed_patent)
|
179 |
-
matched = True
|
180 |
-
break
|
181 |
-
|
182 |
-
# Global fallback if no fields match
|
183 |
-
if not matched:
|
184 |
-
full_text = " ".join(parsed_patent.values())
|
185 |
-
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
186 |
-
st.write("Match found in global fallback search!")
|
187 |
-
filtered_patents.append(parsed_patent)
|
188 |
-
|
189 |
-
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
190 |
-
return filtered_patents
|
191 |
|
192 |
|
193 |
|
|
|
83 |
)
|
84 |
return False
|
85 |
|
|
|
|
|
|
|
|
|
|
|
86 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
87 |
"""
|
88 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
|
|
89 |
"""
|
90 |
+
import streamlit as st # Use Streamlit for debugging
|
91 |
|
92 |
if keywords is None:
|
93 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
|
|
111 |
def parse_patent(file_path):
|
112 |
"""
|
113 |
Parses an XML patent file into a structured dictionary.
|
|
|
114 |
"""
|
115 |
try:
|
116 |
tree = ET.parse(file_path)
|
|
|
136 |
st.write(f"Error parsing patent {file_path}: {e}")
|
137 |
return None
|
138 |
|
139 |
+
filtered_patents = []
|
140 |
+
|
141 |
+
# Display first 5 patents for inspection (before parsing)
|
142 |
+
st.write("Debugging: First 5 raw patents for inspection")
|
143 |
+
for patent in patents[:5]:
|
144 |
+
st.write(patent) # Display raw data
|
145 |
+
|
146 |
+
for patent in patents:
|
147 |
+
if isinstance(patent, str):
|
148 |
+
parsed_patent = parse_patent(patent)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
if not parsed_patent:
|
150 |
continue
|
151 |
+
elif isinstance(patent, dict):
|
152 |
+
parsed_patent = patent
|
153 |
+
else:
|
154 |
+
st.write(f"Unknown patent format: {type(patent)}")
|
155 |
+
continue
|
156 |
+
|
157 |
+
# Field-specific matching
|
158 |
+
matched = False
|
159 |
+
for field in fields:
|
160 |
+
field_content = parsed_patent.get(field, "")
|
161 |
+
st.write(f"Checking field '{field}': {field_content}")
|
162 |
+
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
163 |
+
st.write(f"Match found in field '{field}'")
|
164 |
+
filtered_patents.append(parsed_patent)
|
165 |
+
matched = True
|
166 |
+
break
|
167 |
+
|
168 |
+
# Global fallback if no fields match
|
169 |
+
if not matched:
|
170 |
+
full_text = " ".join(parsed_patent.values()) # Combine all fields
|
171 |
+
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
172 |
+
st.write("Match found in global fallback search!")
|
173 |
+
filtered_patents.append(parsed_patent)
|
174 |
+
|
175 |
+
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
176 |
+
return filtered_patents
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
|
180 |
|