DrishtiSharma committed on
Commit
ee17e02
·
verified ·
1 Parent(s): 903ffe4

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +51 -36
patentwiz/preprocess_data.py CHANGED
@@ -83,9 +83,15 @@ def download_weekly_patents(year, month, day, logging):
83
  )
84
  return False
85
 
 
 
 
 
 
86
  def filter_rf_patents(patents, keywords=None, fields=None):
87
  """
88
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
 
89
  """
90
  import streamlit as st # Use Streamlit for debugging
91
 
@@ -136,44 +142,53 @@ def filter_rf_patents(patents, keywords=None, fields=None):
136
  st.write(f"Error parsing patent {file_path}: {e}")
137
  return None
138
 
139
- filtered_patents = []
140
-
141
- # Display first 5 patents for inspection (before parsing)
142
- st.write("Debugging: First 5 raw patents for inspection")
143
- for patent in patents[:5]:
144
- st.write(patent) # Display raw data
145
-
146
- for patent in patents:
147
- if isinstance(patent, str):
148
- parsed_patent = parse_patent(patent)
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  if not parsed_patent:
150
  continue
151
- elif isinstance(patent, dict):
152
- parsed_patent = patent
153
- else:
154
- st.write(f"Unknown patent format: {type(patent)}")
155
- continue
156
-
157
- # Field-specific matching
158
- matched = False
159
- for field in fields:
160
- field_content = parsed_patent.get(field, "")
161
- st.write(f"Checking field '{field}': {field_content}")
162
- if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
163
- st.write(f"Match found in field '{field}'")
164
- filtered_patents.append(parsed_patent)
165
- matched = True
166
- break
167
-
168
- # Global fallback if no fields match
169
- if not matched:
170
- full_text = " ".join(parsed_patent.values()) # Combine all fields
171
- if any(keyword.lower() in full_text.lower() for keyword in keywords):
172
- st.write("Match found in global fallback search!")
173
- filtered_patents.append(parsed_patent)
174
-
175
- st.write(f"Total filtered patents: {len(filtered_patents)}")
176
- return filtered_patents
177
 
178
 
179
 
 
83
  )
84
  return False
85
 
86
+ import os
87
+ import xml.etree.ElementTree as ET
88
+ import tempfile
89
+
90
+
91
  def filter_rf_patents(patents, keywords=None, fields=None):
92
  """
93
  Filters patents based on keywords and specified fields, with parsing for raw patent files.
94
+ Supports temporary storage for environments like Hugging Face Spaces.
95
  """
96
  import streamlit as st # Use Streamlit for debugging
97
 
 
142
  st.write(f"Error parsing patent {file_path}: {e}")
143
  return None
144
 
145
+ # Use temporary directory
146
+ with tempfile.TemporaryDirectory() as temp_dir:
147
+ st.write(f"Using temporary directory: {temp_dir}")
148
+
149
+ # Simulate saving raw patent files (e.g., for testing)
150
+ temp_files = []
151
+ for i, patent in enumerate(patents):
152
+ if isinstance(patent, str) and patent.endswith(".txt"):
153
+ # Save fake patent data as text files in the temp directory
154
+ temp_file_path = os.path.join(temp_dir, f"patent_{i}.txt")
155
+ with open(temp_file_path, "w") as f:
156
+ f.write(patent)
157
+ temp_files.append(temp_file_path)
158
+
159
+ # Display first 5 patents for inspection (before parsing)
160
+ st.write("Debugging: First 5 raw patents for inspection")
161
+ for patent in temp_files[:5]:
162
+ st.write(patent) # Display file paths
163
+
164
+ filtered_patents = []
165
+
166
+ for patent_file in temp_files:
167
+ parsed_patent = parse_patent(patent_file)
168
  if not parsed_patent:
169
  continue
170
+
171
+ # Field-specific matching
172
+ matched = False
173
+ for field in fields:
174
+ field_content = parsed_patent.get(field, "")
175
+ st.write(f"Checking field '{field}': {field_content}")
176
+ if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
177
+ st.write(f"Match found in field '{field}'")
178
+ filtered_patents.append(parsed_patent)
179
+ matched = True
180
+ break
181
+
182
+ # Global fallback if no fields match
183
+ if not matched:
184
+ full_text = " ".join(parsed_patent.values()) # Combine all fields
185
+ if any(keyword.lower() in full_text.lower() for keyword in keywords):
186
+ st.write("Match found in global fallback search!")
187
+ filtered_patents.append(parsed_patent)
188
+
189
+ st.write(f"Total filtered patents: {len(filtered_patents)}")
190
+ return filtered_patents
191
+
 
 
 
 
192
 
193
 
194