DrishtiSharma committed on
Commit
5e84d1b
·
verified ·
1 Parent(s): 818c4cb

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +36 -21
patentwiz/preprocess_data.py CHANGED
@@ -84,7 +84,7 @@ def download_weekly_patents(year, month, day, logging):
84
 
85
  def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
- Filters patents based on keywords and specified fields, with a global fallback.
88
  Parameters:
89
  patents (list): List of patent texts (as strings or structured data).
90
  keywords (list): Keywords to filter patents.
@@ -97,31 +97,46 @@ def filter_rf_patents(patents, keywords=None, fields=None):
97
  if fields is None:
98
  fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
99
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  filtered_patents = []
101
  for patent in patents:
102
- # Global match (for string-type patents)
103
- if isinstance(patent, str):
104
- if any(keyword.lower() in patent.lower() for keyword in keywords):
105
- filtered_patents.append(patent)
106
- continue
107
 
108
- # Field-specific match (for dictionary-type patents)
109
  if isinstance(patent, dict):
110
- matched = False
111
- for field in fields:
112
- field_content = patent.get(field.lower(), "")
113
- if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
114
- filtered_patents.append(patent)
115
- matched = True
116
- break
117
- # Global fallback if no fields match
118
- if not matched:
119
- full_text = " ".join(patent.values()) # Combine all fields into one string
120
- if any(keyword.lower() in full_text.lower() for keyword in keywords):
121
- filtered_patents.append(patent)
122
- return filtered_patents
123
-
124
 
 
 
 
 
 
 
125
 
126
 
127
  def extract_patents(year, month, day, logging):
 
84
 
85
  def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
+ Filters patents based on keywords and specified fields, with fallback for inconsistent field names.
88
  Parameters:
89
  patents (list): List of patent texts (as strings or structured data).
90
  keywords (list): Keywords to filter patents.
 
97
  if fields is None:
98
  fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
99
 
100
+ # Standardize field names
101
+ FIELD_NAME_MAPPING = {
102
+ "abstract": "Abstract",
103
+ "ABSTRACT": "Abstract",
104
+ "summary": "Summary",
105
+ "SUMMARY": "Summary",
106
+ "claims": "Claims",
107
+ "CLAIMS": "Claims",
108
+ "detailed description": "Detailed Description",
109
+ "DETAILED DESCRIPTION": "Detailed Description"
110
+ }
111
+
112
  filtered_patents = []
113
  for patent in patents:
114
+ # Debugging: Print patent data
115
+ print(f"Processing patent: {patent}")
 
 
 
116
 
117
+ # Normalize field names in the patent dictionary
118
  if isinstance(patent, dict):
119
+ normalized_patent = {}
120
+ for field, content in patent.items():
121
+ normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
122
+ normalized_patent[normalized_field] = content
123
+ patent = normalized_patent
124
+
125
+ # Field-specific match
126
+ matched = False
127
+ for field in fields:
128
+ field_content = patent.get(field, "")
129
+ if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
130
+ filtered_patents.append(patent)
131
+ matched = True
132
+ break
133
 
134
+ # Global fallback if no fields match
135
+ if not matched:
136
+ full_text = " ".join(patent.values()) # Combine all fields into one string
137
+ if any(keyword.lower() in full_text.lower() for keyword in keywords):
138
+ filtered_patents.append(patent)
139
+ return filtered_patents
140
 
141
 
142
  def extract_patents(year, month, day, logging):