DrishtiSharma committed on
Commit
c675721
·
verified ·
1 Parent(s): 5e84d1b

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +30 -17
patentwiz/preprocess_data.py CHANGED
@@ -84,7 +84,8 @@ def download_weekly_patents(year, month, day, logging):
84
 
85
  def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
- Filters patents based on keywords and specified fields, with fallback for inconsistent field names.
 
88
  Parameters:
89
  patents (list): List of patent texts (as strings or structured data).
90
  keywords (list): Keywords to filter patents.
@@ -111,34 +112,46 @@ def filter_rf_patents(patents, keywords=None, fields=None):
111
 
112
  filtered_patents = []
113
  for patent in patents:
114
- # Debugging: Print patent data
115
  print(f"Processing patent: {patent}")
116
 
117
- # Normalize field names in the patent dictionary
118
- if isinstance(patent, dict):
 
 
 
 
 
 
 
119
  normalized_patent = {}
120
  for field, content in patent.items():
121
  normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
122
  normalized_patent[normalized_field] = content
123
  patent = normalized_patent
124
 
125
- # Field-specific match
126
- matched = False
127
- for field in fields:
128
- field_content = patent.get(field, "")
129
- if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
130
- filtered_patents.append(patent)
131
- matched = True
132
- break
 
 
 
 
 
 
 
 
 
133
 
134
- # Global fallback if no fields match
135
- if not matched:
136
- full_text = " ".join(patent.values()) # Combine all fields into one string
137
- if any(keyword.lower() in full_text.lower() for keyword in keywords):
138
- filtered_patents.append(patent)
139
  return filtered_patents
140
 
141
 
 
142
  def extract_patents(year, month, day, logging):
143
  """
144
  This function reads a patent file in XML format, splits it into individual patents, parses each
 
84
 
85
  def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
+ Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
88
+ Handles both string and dictionary-type patent representations.
89
  Parameters:
90
  patents (list): List of patent texts (as strings or structured data).
91
  keywords (list): Keywords to filter patents.
 
112
 
113
  filtered_patents = []
114
  for patent in patents:
115
+ # Debugging: Print patent data type
116
  print(f"Processing patent: {patent}")
117
 
118
+ # Case 1: Handle string-type patents (global search)
119
+ if isinstance(patent, str):
120
+ if any(keyword.lower() in patent.lower() for keyword in keywords):
121
+ filtered_patents.append(patent)
122
+ continue
123
+
124
+ # Case 2: Handle dictionary-type patents
125
+ elif isinstance(patent, dict):
126
+ # Normalize field names in the patent dictionary
127
  normalized_patent = {}
128
  for field, content in patent.items():
129
  normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
130
  normalized_patent[normalized_field] = content
131
  patent = normalized_patent
132
 
133
+ # Field-specific match
134
+ matched = False
135
+ for field in fields:
136
+ field_content = patent.get(field, "")
137
+ if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
138
+ filtered_patents.append(patent)
139
+ matched = True
140
+ break
141
+
142
+ # Global fallback if no fields match
143
+ if not matched:
144
+ full_text = " ".join(patent.values()) # Combine all fields into one string
145
+ if any(keyword.lower() in full_text.lower() for keyword in keywords):
146
+ filtered_patents.append(patent)
147
+ else:
148
+ # Handle unexpected data formats gracefully
149
+ print(f"Unknown patent format: {type(patent)}")
150
 
 
 
 
 
 
151
  return filtered_patents
152
 
153
 
154
+
155
  def extract_patents(year, month, day, logging):
156
  """
157
  This function reads a patent file in XML format, splits it into individual patents, parses each