DrishtiSharma committed on
Commit
c9e8376
·
verified ·
1 Parent(s): 0b7fc3c

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +37 -50
patentwiz/preprocess_data.py CHANGED
@@ -83,16 +83,11 @@ def download_weekly_patents(year, month, day, logging):
83
  )
84
  return False
85
 
86
- import os
87
- import xml.etree.ElementTree as ET
88
- import tempfile
89
-
90
-
91
  def filter_rf_patents(patents, keywords=None, fields=None):
92
  """
93
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
94
- Includes enhanced debugging to identify issues.
95
  """
 
96
 
97
  if keywords is None:
98
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
@@ -116,7 +111,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
116
  def parse_patent(file_path):
117
  """
118
  Parses an XML patent file into a structured dictionary.
119
- Includes debugging for XML structure issues.
120
  """
121
  try:
122
  tree = ET.parse(file_path)
@@ -142,52 +136,45 @@ def filter_rf_patents(patents, keywords=None, fields=None):
142
  st.write(f"Error parsing patent {file_path}: {e}")
143
  return None
144
 
145
- # Use temporary directory for patents
146
- with tempfile.TemporaryDirectory() as temp_dir:
147
- st.write(f"Using temporary directory: {temp_dir}")
148
-
149
- # Save and inspect raw patent data
150
- temp_files = []
151
- for i, patent in enumerate(patents):
152
- if isinstance(patent, str) and patent.endswith(".txt"):
153
- temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
154
- with open(temp_file_path, "w") as f:
155
- f.write(patent) # Save raw data to temp file
156
- temp_files.append(temp_file_path)
157
-
158
- st.write("Display first 5 raw patents for inspection")
159
- for patent in temp_files[:5]:
160
- st.write(patent)
161
-
162
- filtered_patents = []
163
- for patent_file in temp_files:
164
- parsed_patent = parse_patent(patent_file)
165
- st.write("Parsed patent data:", parsed_patent) # Log parsed data
166
  if not parsed_patent:
167
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
- # Field-specific matching
170
- matched = False
171
- for field in fields:
172
- field_content = parsed_patent.get(field, "")
173
- if not field_content:
174
- st.write(f"Field '{field}' is empty for patent:", parsed_patent)
175
- continue
176
- if any(keyword.lower() in field_content.lower() for keyword in keywords):
177
- st.write(f"Match found in field '{field}'")
178
- filtered_patents.append(parsed_patent)
179
- matched = True
180
- break
181
-
182
- # Global fallback if no fields match
183
- if not matched:
184
- full_text = " ".join(parsed_patent.values())
185
- if any(keyword.lower() in full_text.lower() for keyword in keywords):
186
- st.write("Match found in global fallback search!")
187
- filtered_patents.append(parsed_patent)
188
-
189
- st.write(f"Total filtered patents: {len(filtered_patents)}")
190
- return filtered_patents
191
 
192
 
193
 
 
83
  )
84
  return False
85
 
 
 
 
 
 
86
  def filter_rf_patents(patents, keywords=None, fields=None):
87
  """
88
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
 
89
  """
90
+ import streamlit as st # Use Streamlit for debugging
91
 
92
  if keywords is None:
93
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
 
111
  def parse_patent(file_path):
112
  """
113
  Parses an XML patent file into a structured dictionary.
 
114
  """
115
  try:
116
  tree = ET.parse(file_path)
 
136
  st.write(f"Error parsing patent {file_path}: {e}")
137
  return None
138
 
139
+ filtered_patents = []
140
+
141
+ # Display first 5 patents for inspection (before parsing)
142
+ st.write("Debugging: First 5 raw patents for inspection")
143
+ for patent in patents[:5]:
144
+ st.write(patent) # Display raw data
145
+
146
+ for patent in patents:
147
+ if isinstance(patent, str):
148
+ parsed_patent = parse_patent(patent)
 
 
 
 
 
 
 
 
 
 
 
149
  if not parsed_patent:
150
  continue
151
+ elif isinstance(patent, dict):
152
+ parsed_patent = patent
153
+ else:
154
+ st.write(f"Unknown patent format: {type(patent)}")
155
+ continue
156
+
157
+ # Field-specific matching
158
+ matched = False
159
+ for field in fields:
160
+ field_content = parsed_patent.get(field, "")
161
+ st.write(f"Checking field '{field}': {field_content}")
162
+ if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
163
+ st.write(f"Match found in field '{field}'")
164
+ filtered_patents.append(parsed_patent)
165
+ matched = True
166
+ break
167
+
168
+ # Global fallback if no fields match
169
+ if not matched:
170
+ full_text = " ".join(parsed_patent.values()) # Combine all fields
171
+ if any(keyword.lower() in full_text.lower() for keyword in keywords):
172
+ st.write("Match found in global fallback search!")
173
+ filtered_patents.append(parsed_patent)
174
+
175
+ st.write(f"Total filtered patents: {len(filtered_patents)}")
176
+ return filtered_patents
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
 
180