Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files
- patentwiz/preprocess_data.py +36 -21
patentwiz/preprocess_data.py
CHANGED
@@ -84,7 +84,7 @@ def download_weekly_patents(year, month, day, logging):
|
|
84 |
|
85 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
86 |
"""
|
87 |
-
Filters patents based on keywords and specified fields, with
|
88 |
Parameters:
|
89 |
patents (list): List of patent texts (as strings or structured data).
|
90 |
keywords (list): Keywords to filter patents.
|
@@ -97,31 +97,46 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
97 |
if fields is None:
|
98 |
fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
filtered_patents = []
|
101 |
for patent in patents:
|
102 |
-
#
|
103 |
-
|
104 |
-
if any(keyword.lower() in patent.lower() for keyword in keywords):
|
105 |
-
filtered_patents.append(patent)
|
106 |
-
continue
|
107 |
|
108 |
-
#
|
109 |
if isinstance(patent, dict):
|
110 |
-
|
111 |
-
for field in
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
|
127 |
def extract_patents(year, month, day, logging):
|
|
|
84 |
|
85 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
86 |
"""
|
87 |
+
Filters patents based on keywords and specified fields, with fallback for inconsistent field names.
|
88 |
Parameters:
|
89 |
patents (list): List of patent texts (as strings or structured data).
|
90 |
keywords (list): Keywords to filter patents.
|
|
|
97 |
if fields is None:
|
98 |
fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
|
99 |
|
100 |
+
# Standardize field names
|
101 |
+
FIELD_NAME_MAPPING = {
|
102 |
+
"abstract": "Abstract",
|
103 |
+
"ABSTRACT": "Abstract",
|
104 |
+
"summary": "Summary",
|
105 |
+
"SUMMARY": "Summary",
|
106 |
+
"claims": "Claims",
|
107 |
+
"CLAIMS": "Claims",
|
108 |
+
"detailed description": "Detailed Description",
|
109 |
+
"DETAILED DESCRIPTION": "Detailed Description"
|
110 |
+
}
|
111 |
+
|
112 |
filtered_patents = []
|
113 |
for patent in patents:
|
114 |
+
# Debugging: Print patent data
|
115 |
+
print(f"Processing patent: {patent}")
|
|
|
|
|
|
|
116 |
|
117 |
+
# Normalize field names in the patent dictionary
|
118 |
if isinstance(patent, dict):
|
119 |
+
normalized_patent = {}
|
120 |
+
for field, content in patent.items():
|
121 |
+
normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
|
122 |
+
normalized_patent[normalized_field] = content
|
123 |
+
patent = normalized_patent
|
124 |
+
|
125 |
+
# Field-specific match
|
126 |
+
matched = False
|
127 |
+
for field in fields:
|
128 |
+
field_content = patent.get(field, "")
|
129 |
+
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
130 |
+
filtered_patents.append(patent)
|
131 |
+
matched = True
|
132 |
+
break
|
133 |
|
134 |
+
# Global fallback if no fields match
|
135 |
+
if not matched:
|
136 |
+
full_text = " ".join(patent.values()) # Combine all fields into one string
|
137 |
+
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
138 |
+
filtered_patents.append(patent)
|
139 |
+
return filtered_patents
|
140 |
|
141 |
|
142 |
def extract_patents(year, month, day, logging):
|