DrishtiSharma committed on
Commit
0b7fc3c
·
verified ·
1 Parent(s): ee17e02

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +13 -15
patentwiz/preprocess_data.py CHANGED
@@ -91,9 +91,8 @@ import tempfile
91
  def filter_rf_patents(patents, keywords=None, fields=None):
92
  """
93
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
94
- Supports temporary storage for environments like Hugging Face Spaces.
95
  """
96
- import streamlit as st # Use Streamlit for debugging
97
 
98
  if keywords is None:
99
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
@@ -117,6 +116,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
117
  def parse_patent(file_path):
118
  """
119
  Parses an XML patent file into a structured dictionary.
 
120
  """
121
  try:
122
  tree = ET.parse(file_path)
@@ -142,29 +142,27 @@ def filter_rf_patents(patents, keywords=None, fields=None):
142
  st.write(f"Error parsing patent {file_path}: {e}")
143
  return None
144
 
145
- # Use temporary directory
146
  with tempfile.TemporaryDirectory() as temp_dir:
147
  st.write(f"Using temporary directory: {temp_dir}")
148
 
149
- # Simulate saving raw patent files (e.g., for testing)
150
  temp_files = []
151
  for i, patent in enumerate(patents):
152
  if isinstance(patent, str) and patent.endswith(".txt"):
153
- # Save fake patent data as text files in the temp directory
154
  temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
155
  with open(temp_file_path, "w") as f:
156
- f.write(patent)
157
  temp_files.append(temp_file_path)
158
 
159
- # Display first 5 patents for inspection (before parsing)
160
- st.write("Debugging: First 5 raw patents for inspection")
161
  for patent in temp_files[:5]:
162
- st.write(patent) # Display file paths
163
 
164
  filtered_patents = []
165
-
166
  for patent_file in temp_files:
167
  parsed_patent = parse_patent(patent_file)
 
168
  if not parsed_patent:
169
  continue
170
 
@@ -172,8 +170,10 @@ def filter_rf_patents(patents, keywords=None, fields=None):
172
  matched = False
173
  for field in fields:
174
  field_content = parsed_patent.get(field, "")
175
- st.write(f"Checking field '{field}': {field_content}")
176
- if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
 
 
177
  st.write(f"Match found in field '{field}'")
178
  filtered_patents.append(parsed_patent)
179
  matched = True
@@ -181,7 +181,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
181
 
182
  # Global fallback if no fields match
183
  if not matched:
184
- full_text = " ".join(parsed_patent.values()) # Combine all fields
185
  if any(keyword.lower() in full_text.lower() for keyword in keywords):
186
  st.write("Match found in global fallback search!")
187
  filtered_patents.append(parsed_patent)
@@ -191,8 +191,6 @@ def filter_rf_patents(patents, keywords=None, fields=None):
191
 
192
 
193
 
194
-
195
-
196
  def extract_patents(year, month, day, logging):
197
  """
198
  This function reads a patent file in XML format, splits it into individual patents, parses each
 
91
  def filter_rf_patents(patents, keywords=None, fields=None):
92
  """
93
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
94
+ Includes enhanced debugging to identify issues.
95
  """
 
96
 
97
  if keywords is None:
98
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
 
116
  def parse_patent(file_path):
117
  """
118
  Parses an XML patent file into a structured dictionary.
119
+ Includes debugging for XML structure issues.
120
  """
121
  try:
122
  tree = ET.parse(file_path)
 
142
  st.write(f"Error parsing patent {file_path}: {e}")
143
  return None
144
 
145
+ # Use temporary directory for patents
146
  with tempfile.TemporaryDirectory() as temp_dir:
147
  st.write(f"Using temporary directory: {temp_dir}")
148
 
149
+ # Save and inspect raw patent data
150
  temp_files = []
151
  for i, patent in enumerate(patents):
152
  if isinstance(patent, str) and patent.endswith(".txt"):
 
153
  temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
154
  with open(temp_file_path, "w") as f:
155
+ f.write(patent) # Save raw data to temp file
156
  temp_files.append(temp_file_path)
157
 
158
+ st.write("Display first 5 raw patents for inspection")
 
159
  for patent in temp_files[:5]:
160
+ st.write(patent)
161
 
162
  filtered_patents = []
 
163
  for patent_file in temp_files:
164
  parsed_patent = parse_patent(patent_file)
165
+ st.write("Parsed patent data:", parsed_patent) # Log parsed data
166
  if not parsed_patent:
167
  continue
168
 
 
170
  matched = False
171
  for field in fields:
172
  field_content = parsed_patent.get(field, "")
173
+ if not field_content:
174
+ st.write(f"Field '{field}' is empty for patent:", parsed_patent)
175
+ continue
176
+ if any(keyword.lower() in field_content.lower() for keyword in keywords):
177
  st.write(f"Match found in field '{field}'")
178
  filtered_patents.append(parsed_patent)
179
  matched = True
 
181
 
182
  # Global fallback if no fields match
183
  if not matched:
184
+ full_text = " ".join(parsed_patent.values())
185
  if any(keyword.lower() in full_text.lower() for keyword in keywords):
186
  st.write("Match found in global fallback search!")
187
  filtered_patents.append(parsed_patent)
 
191
 
192
 
193
 
 
 
194
  def extract_patents(year, month, day, logging):
195
  """
196
  This function reads a patent file in XML format, splits it into individual patents, parses each