Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files- patentwiz/preprocess_data.py +13 -15
patentwiz/preprocess_data.py
CHANGED
@@ -91,9 +91,8 @@ import tempfile
|
|
91 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
92 |
"""
|
93 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
94 |
-
|
95 |
"""
|
96 |
-
import streamlit as st # Use Streamlit for debugging
|
97 |
|
98 |
if keywords is None:
|
99 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
@@ -117,6 +116,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
117 |
def parse_patent(file_path):
|
118 |
"""
|
119 |
Parses an XML patent file into a structured dictionary.
|
|
|
120 |
"""
|
121 |
try:
|
122 |
tree = ET.parse(file_path)
|
@@ -142,29 +142,27 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
142 |
st.write(f"Error parsing patent {file_path}: {e}")
|
143 |
return None
|
144 |
|
145 |
-
# Use temporary directory
|
146 |
with tempfile.TemporaryDirectory() as temp_dir:
|
147 |
st.write(f"Using temporary directory: {temp_dir}")
|
148 |
|
149 |
-
#
|
150 |
temp_files = []
|
151 |
for i, patent in enumerate(patents):
|
152 |
if isinstance(patent, str) and patent.endswith(".txt"):
|
153 |
-
# Save fake patent data as text files in the temp directory
|
154 |
temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
|
155 |
with open(temp_file_path, "w") as f:
|
156 |
-
f.write(patent)
|
157 |
temp_files.append(temp_file_path)
|
158 |
|
159 |
-
|
160 |
-
st.write("Debugging: First 5 raw patents for inspection")
|
161 |
for patent in temp_files[:5]:
|
162 |
-
st.write(patent)
|
163 |
|
164 |
filtered_patents = []
|
165 |
-
|
166 |
for patent_file in temp_files:
|
167 |
parsed_patent = parse_patent(patent_file)
|
|
|
168 |
if not parsed_patent:
|
169 |
continue
|
170 |
|
@@ -172,8 +170,10 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
172 |
matched = False
|
173 |
for field in fields:
|
174 |
field_content = parsed_patent.get(field, "")
|
175 |
-
|
176 |
-
|
|
|
|
|
177 |
st.write(f"Match found in field '{field}'")
|
178 |
filtered_patents.append(parsed_patent)
|
179 |
matched = True
|
@@ -181,7 +181,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
181 |
|
182 |
# Global fallback if no fields match
|
183 |
if not matched:
|
184 |
-
full_text = " ".join(parsed_patent.values())
|
185 |
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
186 |
st.write("Match found in global fallback search!")
|
187 |
filtered_patents.append(parsed_patent)
|
@@ -191,8 +191,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
191 |
|
192 |
|
193 |
|
194 |
-
|
195 |
-
|
196 |
def extract_patents(year, month, day, logging):
|
197 |
"""
|
198 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|
|
|
91 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
92 |
"""
|
93 |
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
94 |
+
Includes enhanced debugging to identify issues.
|
95 |
"""
|
|
|
96 |
|
97 |
if keywords is None:
|
98 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
|
|
116 |
def parse_patent(file_path):
|
117 |
"""
|
118 |
Parses an XML patent file into a structured dictionary.
|
119 |
+
Includes debugging for XML structure issues.
|
120 |
"""
|
121 |
try:
|
122 |
tree = ET.parse(file_path)
|
|
|
142 |
st.write(f"Error parsing patent {file_path}: {e}")
|
143 |
return None
|
144 |
|
145 |
+
# Use temporary directory for patents
|
146 |
with tempfile.TemporaryDirectory() as temp_dir:
|
147 |
st.write(f"Using temporary directory: {temp_dir}")
|
148 |
|
149 |
+
# Save and inspect raw patent data
|
150 |
temp_files = []
|
151 |
for i, patent in enumerate(patents):
|
152 |
if isinstance(patent, str) and patent.endswith(".txt"):
|
|
|
153 |
temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
|
154 |
with open(temp_file_path, "w") as f:
|
155 |
+
f.write(patent) # Save raw data to temp file
|
156 |
temp_files.append(temp_file_path)
|
157 |
|
158 |
+
st.write("Display first 5 raw patents for inspection")
|
|
|
159 |
for patent in temp_files[:5]:
|
160 |
+
st.write(patent)
|
161 |
|
162 |
filtered_patents = []
|
|
|
163 |
for patent_file in temp_files:
|
164 |
parsed_patent = parse_patent(patent_file)
|
165 |
+
st.write("Parsed patent data:", parsed_patent) # Log parsed data
|
166 |
if not parsed_patent:
|
167 |
continue
|
168 |
|
|
|
170 |
matched = False
|
171 |
for field in fields:
|
172 |
field_content = parsed_patent.get(field, "")
|
173 |
+
if not field_content:
|
174 |
+
st.write(f"Field '{field}' is empty for patent:", parsed_patent)
|
175 |
+
continue
|
176 |
+
if any(keyword.lower() in field_content.lower() for keyword in keywords):
|
177 |
st.write(f"Match found in field '{field}'")
|
178 |
filtered_patents.append(parsed_patent)
|
179 |
matched = True
|
|
|
181 |
|
182 |
# Global fallback if no fields match
|
183 |
if not matched:
|
184 |
+
full_text = " ".join(parsed_patent.values())
|
185 |
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
186 |
st.write("Match found in global fallback search!")
|
187 |
filtered_patents.append(parsed_patent)
|
|
|
191 |
|
192 |
|
193 |
|
|
|
|
|
194 |
def extract_patents(year, month, day, logging):
|
195 |
"""
|
196 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|