Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files
- patentwiz/preprocess_data.py +30 -17
patentwiz/preprocess_data.py
CHANGED
@@ -84,7 +84,8 @@ def download_weekly_patents(year, month, day, logging):
|
|
84 |
|
85 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
86 |
"""
|
87 |
-
Filters patents based on keywords and specified fields, with fallback for inconsistent field names.
|
|
|
88 |
Parameters:
|
89 |
patents (list): List of patent texts (as strings or structured data).
|
90 |
keywords (list): Keywords to filter patents.
|
@@ -111,34 +112,46 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
111 |
|
112 |
filtered_patents = []
|
113 |
for patent in patents:
|
114 |
-
# Debugging: Print patent data
|
115 |
print(f"Processing patent: {patent}")
|
116 |
|
117 |
-
#
|
118 |
-
if isinstance(patent,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
normalized_patent = {}
|
120 |
for field, content in patent.items():
|
121 |
normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
|
122 |
normalized_patent[normalized_field] = content
|
123 |
patent = normalized_patent
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
-
# Global fallback if no fields match
|
135 |
-
if not matched:
|
136 |
-
full_text = " ".join(patent.values()) # Combine all fields into one string
|
137 |
-
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
138 |
-
filtered_patents.append(patent)
|
139 |
return filtered_patents
|
140 |
|
141 |
|
|
|
142 |
def extract_patents(year, month, day, logging):
|
143 |
"""
|
144 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|
|
|
84 |
|
85 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
86 |
"""
|
87 |
+
Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
|
88 |
+
Handles both string and dictionary-type patent representations.
|
89 |
Parameters:
|
90 |
patents (list): List of patent texts (as strings or structured data).
|
91 |
keywords (list): Keywords to filter patents.
|
|
|
112 |
|
113 |
filtered_patents = []
|
114 |
for patent in patents:
|
115 |
+
# Debugging: Print patent data type
|
116 |
print(f"Processing patent: {patent}")
|
117 |
|
118 |
+
# Case 1: Handle string-type patents (global search)
|
119 |
+
if isinstance(patent, str):
|
120 |
+
if any(keyword.lower() in patent.lower() for keyword in keywords):
|
121 |
+
filtered_patents.append(patent)
|
122 |
+
continue
|
123 |
+
|
124 |
+
# Case 2: Handle dictionary-type patents
|
125 |
+
elif isinstance(patent, dict):
|
126 |
+
# Normalize field names in the patent dictionary
|
127 |
normalized_patent = {}
|
128 |
for field, content in patent.items():
|
129 |
normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
|
130 |
normalized_patent[normalized_field] = content
|
131 |
patent = normalized_patent
|
132 |
|
133 |
+
# Field-specific match
|
134 |
+
matched = False
|
135 |
+
for field in fields:
|
136 |
+
field_content = patent.get(field, "")
|
137 |
+
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
138 |
+
filtered_patents.append(patent)
|
139 |
+
matched = True
|
140 |
+
break
|
141 |
+
|
142 |
+
# Global fallback if no fields match
|
143 |
+
if not matched:
|
144 |
+
full_text = " ".join(patent.values()) # Combine all fields into one string
|
145 |
+
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
146 |
+
filtered_patents.append(patent)
|
147 |
+
else:
|
148 |
+
# Handle unexpected data formats gracefully
|
149 |
+
print(f"Unknown patent format: {type(patent)}")
|
150 |
|
|
|
|
|
|
|
|
|
|
|
151 |
return filtered_patents
|
152 |
|
153 |
|
154 |
+
|
155 |
def extract_patents(year, month, day, logging):
|
156 |
"""
|
157 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|