DrishtiSharma committed on
Commit
dd6b5f9
·
verified ·
1 Parent(s): 9b50deb

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +62 -37
patentwiz/preprocess_data.py CHANGED
@@ -85,9 +85,9 @@ def download_weekly_patents(year, month, day, logging):
85
 
86
  def filter_rf_patents(patents, keywords=None, fields=None):
87
  """
88
- Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
89
- Includes Streamlit-based debugging to display progress and results in the UI.
90
  """
 
91
 
92
  if keywords is None:
93
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
@@ -103,49 +103,74 @@ def filter_rf_patents(patents, keywords=None, fields=None):
103
  "claims": "Claims",
104
  "CLAIMS": "Claims",
105
  "detailed description": "Detailed Description",
106
- "DETAILED DESCRIPTION": "Detailed Description"
 
 
107
  }
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  filtered_patents = []
110
 
111
- # Display first 5 patents for inspection
112
- st.write("Display First 5 patents for inspection")
113
  for patent in patents[:5]:
114
- st.json(patent)
115
 
116
  for patent in patents:
117
- if isinstance(patent, str):
118
- # Global keyword search for string-type patents
119
- if any(keyword.lower() in patent.lower() for keyword in keywords):
120
- st.write(f"Match found in string patent!")
121
- filtered_patents.append(patent)
122
- elif isinstance(patent, dict):
123
- # Normalize field names
124
- normalized_patent = {}
125
- for field, content in patent.items():
126
- # Map field names to standard format
127
- normalized_field = FIELD_NAME_MAPPING.get(field, field)
128
- normalized_patent[normalized_field] = content
129
-
130
- # Field-specific match
131
- matched = False
132
- for field in fields:
133
- field_content = normalized_patent.get(field, "")
134
- st.write(f"Checking field '{field}': {field_content}")
135
- if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
136
- st.write(f"Match found in field '{field}'")
137
- filtered_patents.append(normalized_patent)
138
- matched = True
139
- break
140
-
141
- # Global fallback if no fields match
142
- if not matched:
143
- full_text = " ".join(normalized_patent.values()) # Combine all fields
144
- if any(keyword.lower() in full_text.lower() for keyword in keywords):
145
- st.write(f"Match found in global search!")
146
- filtered_patents.append(normalized_patent)
147
  else:
148
- st.write(f"Unknown patent format: {type(patent)}") # Handle unexpected data formats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  st.write(f"Total filtered patents: {len(filtered_patents)}")
151
  return filtered_patents
 
85
 
86
  def filter_rf_patents(patents, keywords=None, fields=None):
87
  """
88
+ Filters patents based on keywords and specified fields, with parsing for raw patent files.
 
89
  """
90
+ import streamlit as st # Use Streamlit for debugging
91
 
92
  if keywords is None:
93
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
 
103
  "claims": "Claims",
104
  "CLAIMS": "Claims",
105
  "detailed description": "Detailed Description",
106
+ "DETAILED DESCRIPTION": "Detailed Description",
107
+ "title": "Title",
108
+ "TITLE": "Title",
109
  }
110
 
111
+ def parse_patent(file_path):
112
+ """
113
+ Parses an XML patent file into a structured dictionary.
114
+ """
115
+ try:
116
+ tree = ET.parse(file_path)
117
+ root = tree.getroot()
118
+
119
+ # Extract fields from XML (adjust based on actual XML structure)
120
+ patent_data = {
121
+ "Title": root.findtext(".//title", default=""),
122
+ "Abstract": root.findtext(".//abstract", default=""),
123
+ "Summary": root.findtext(".//summary", default=""),
124
+ "Claims": root.findtext(".//claims", default=""),
125
+ "Detailed Description": root.findtext(".//detailedDescription", default=""),
126
+ }
127
+
128
+ # Normalize field names
129
+ normalized_patent = {}
130
+ for field, content in patent_data.items():
131
+ normalized_field = FIELD_NAME_MAPPING.get(field, field)
132
+ normalized_patent[normalized_field] = content.strip() if content else ""
133
+
134
+ return normalized_patent
135
+ except Exception as e:
136
+ st.write(f"Error parsing patent {file_path}: {e}")
137
+ return None
138
+
139
  filtered_patents = []
140
 
141
+ # Display first 5 patents for inspection (before parsing)
142
+ st.write("Debugging: First 5 raw patents for inspection")
143
  for patent in patents[:5]:
144
+ st.write(patent) # Display raw data
145
 
146
  for patent in patents:
147
+ if isinstance(patent, str):
148
+ parsed_patent = parse_patent(patent)
149
+ if not parsed_patent:
150
+ continue
151
+ elif isinstance(patent, dict):
152
+ parsed_patent = patent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  else:
154
+ st.write(f"Unknown patent format: {type(patent)}")
155
+ continue
156
+
157
+ # Field-specific matching
158
+ matched = False
159
+ for field in fields:
160
+ field_content = parsed_patent.get(field, "")
161
+ st.write(f"Checking field '{field}': {field_content}")
162
+ if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
163
+ st.write(f"Match found in field '{field}'")
164
+ filtered_patents.append(parsed_patent)
165
+ matched = True
166
+ break
167
+
168
+ # Global fallback if no fields match
169
+ if not matched:
170
+ full_text = " ".join(parsed_patent.values()) # Combine all fields
171
+ if any(keyword.lower() in full_text.lower() for keyword in keywords):
172
+ st.write("Match found in global fallback search!")
173
+ filtered_patents.append(parsed_patent)
174
 
175
  st.write(f"Total filtered patents: {len(filtered_patents)}")
176
  return filtered_patents