Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files- patentwiz/preprocess_data.py +62 -37
patentwiz/preprocess_data.py
CHANGED
@@ -85,9 +85,9 @@ def download_weekly_patents(year, month, day, logging):
|
|
85 |
|
86 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
87 |
"""
|
88 |
-
Filters patents based on keywords and specified fields, with
|
89 |
-
Includes Streamlit-based debugging to display progress and results in the UI.
|
90 |
"""
|
|
|
91 |
|
92 |
if keywords is None:
|
93 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
@@ -103,49 +103,74 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
103 |
"claims": "Claims",
|
104 |
"CLAIMS": "Claims",
|
105 |
"detailed description": "Detailed Description",
|
106 |
-
"DETAILED DESCRIPTION": "Detailed Description"
|
|
|
|
|
107 |
}
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
filtered_patents = []
|
110 |
|
111 |
-
# Display first 5 patents for inspection
|
112 |
-
st.write("
|
113 |
for patent in patents[:5]:
|
114 |
-
st.
|
115 |
|
116 |
for patent in patents:
|
117 |
-
if isinstance(patent, str):
|
118 |
-
|
119 |
-
if
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
# Normalize field names
|
124 |
-
normalized_patent = {}
|
125 |
-
for field, content in patent.items():
|
126 |
-
# Map field names to standard format
|
127 |
-
normalized_field = FIELD_NAME_MAPPING.get(field, field)
|
128 |
-
normalized_patent[normalized_field] = content
|
129 |
-
|
130 |
-
# Field-specific match
|
131 |
-
matched = False
|
132 |
-
for field in fields:
|
133 |
-
field_content = normalized_patent.get(field, "")
|
134 |
-
st.write(f"Checking field '{field}': {field_content}")
|
135 |
-
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
136 |
-
st.write(f"Match found in field '{field}'")
|
137 |
-
filtered_patents.append(normalized_patent)
|
138 |
-
matched = True
|
139 |
-
break
|
140 |
-
|
141 |
-
# Global fallback if no fields match
|
142 |
-
if not matched:
|
143 |
-
full_text = " ".join(normalized_patent.values()) # Combine all fields
|
144 |
-
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
145 |
-
st.write(f"Match found in global search!")
|
146 |
-
filtered_patents.append(normalized_patent)
|
147 |
else:
|
148 |
-
st.write(f"Unknown patent format: {type(patent)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
151 |
return filtered_patents
|
|
|
85 |
|
86 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
87 |
"""
|
88 |
+
Filters patents based on keywords and specified fields, with parsing for raw patent files.
|
|
|
89 |
"""
|
90 |
+
import streamlit as st # Use Streamlit for debugging
|
91 |
|
92 |
if keywords is None:
|
93 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
|
|
103 |
"claims": "Claims",
|
104 |
"CLAIMS": "Claims",
|
105 |
"detailed description": "Detailed Description",
|
106 |
+
"DETAILED DESCRIPTION": "Detailed Description",
|
107 |
+
"title": "Title",
|
108 |
+
"TITLE": "Title",
|
109 |
}
|
110 |
|
111 |
+
def parse_patent(file_path):
    """
    Parse an XML patent file into a dictionary of text fields.

    Args:
        file_path: Path of the XML file, handed to ``ET.parse``.

    Returns:
        A dict with the keys "Title", "Abstract", "Summary", "Claims" and
        "Detailed Description" (each key is passed through
        FIELD_NAME_MAPPING). Values are whitespace-stripped strings, ""
        when the element is absent. Returns None when the file cannot be
        read or is not well-formed XML; the error is reported via st.write.
    """

    def _element_text(root, path):
        # Element.findtext() yields only the text that precedes an
        # element's first child, so sections with nested markup (e.g.
        # paragraphs or individual claims) would come back empty.
        # itertext() walks the whole subtree instead.
        node = root.find(path)
        if node is None:
            return ""
        return "".join(node.itertext()).strip()

    # Keep the try body down to the call that can actually fail, and only
    # catch the read/parse failures so unrelated bugs are not hidden.
    try:
        root = ET.parse(file_path).getroot()
    except (ET.ParseError, OSError, ValueError) as e:
        st.write(f"Error parsing patent {file_path}: {e}")
        return None

    # Extract fields from XML (adjust based on actual XML structure)
    xml_paths = {
        "Title": ".//title",
        "Abstract": ".//abstract",
        "Summary": ".//summary",
        "Claims": ".//claims",
        "Detailed Description": ".//detailedDescription",
    }

    # Normalize field names while collecting the text of each section
    return {
        FIELD_NAME_MAPPING.get(field, field): _element_text(root, path)
        for field, path in xml_paths.items()
    }
|
138 |
+
|
139 |
filtered_patents = []
|
140 |
|
141 |
+
# Display first 5 patents for inspection (before parsing)
|
142 |
+
st.write("Debugging: First 5 raw patents for inspection")
|
143 |
for patent in patents[:5]:
|
144 |
+
st.write(patent) # Display raw data
|
145 |
|
146 |
for patent in patents:
|
147 |
+
if isinstance(patent, str):
|
148 |
+
parsed_patent = parse_patent(patent)
|
149 |
+
if not parsed_patent:
|
150 |
+
continue
|
151 |
+
elif isinstance(patent, dict):
|
152 |
+
parsed_patent = patent
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
else:
|
154 |
+
st.write(f"Unknown patent format: {type(patent)}")
|
155 |
+
continue
|
156 |
+
|
157 |
+
# Field-specific matching
|
158 |
+
matched = False
|
159 |
+
for field in fields:
|
160 |
+
field_content = parsed_patent.get(field, "")
|
161 |
+
st.write(f"Checking field '{field}': {field_content}")
|
162 |
+
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
163 |
+
st.write(f"Match found in field '{field}'")
|
164 |
+
filtered_patents.append(parsed_patent)
|
165 |
+
matched = True
|
166 |
+
break
|
167 |
+
|
168 |
+
# Global fallback if no fields match
|
169 |
+
if not matched:
|
170 |
+
full_text = " ".join(parsed_patent.values()) # Combine all fields
|
171 |
+
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
172 |
+
st.write("Match found in global fallback search!")
|
173 |
+
filtered_patents.append(parsed_patent)
|
174 |
|
175 |
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
176 |
return filtered_patents
|