DrishtiSharma committed on
Commit
9b50deb
·
verified ·
1 Parent(s): c675721

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +25 -24
patentwiz/preprocess_data.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import requests
3
  import zipfile
 
4
  import xml.etree.ElementTree as ET
5
  from datetime import datetime, timedelta
6
  import tempfile
@@ -85,18 +86,13 @@ def download_weekly_patents(year, month, day, logging):
85
  def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
  Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
88
- Handles both string and dictionary-type patent representations.
89
- Parameters:
90
- patents (list): List of patent texts (as strings or structured data).
91
- keywords (list): Keywords to filter patents.
92
- fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
93
- Returns:
94
- list: Filtered patents.
95
  """
 
96
  if keywords is None:
97
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
98
  if fields is None:
99
- fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
100
 
101
  # Standardize field names
102
  FIELD_NAME_MAPPING = {
@@ -111,47 +107,52 @@ def filter_rf_patents(patents, keywords=None, fields=None):
111
  }
112
 
113
  filtered_patents = []
114
- for patent in patents:
115
- # Debugging: Print patent data type
116
- print(f"Processing patent: {patent}")
117
 
118
- # Case 1: Handle string-type patents (global search)
 
 
 
 
 
119
  if isinstance(patent, str):
 
120
  if any(keyword.lower() in patent.lower() for keyword in keywords):
 
121
  filtered_patents.append(patent)
122
- continue
123
-
124
- # Case 2: Handle dictionary-type patents
125
  elif isinstance(patent, dict):
126
- # Normalize field names in the patent dictionary
127
  normalized_patent = {}
128
  for field, content in patent.items():
129
- normalized_field = FIELD_NAME_MAPPING.get(field, field) # Map to standard field name
 
130
  normalized_patent[normalized_field] = content
131
- patent = normalized_patent
132
 
133
  # Field-specific match
134
  matched = False
135
  for field in fields:
136
- field_content = patent.get(field, "")
 
137
  if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
138
- filtered_patents.append(patent)
 
139
  matched = True
140
  break
141
 
142
  # Global fallback if no fields match
143
  if not matched:
144
- full_text = " ".join(patent.values()) # Combine all fields into one string
145
  if any(keyword.lower() in full_text.lower() for keyword in keywords):
146
- filtered_patents.append(patent)
 
147
  else:
148
- # Handle unexpected data formats gracefully
149
- print(f"Unknown patent format: {type(patent)}")
150
 
 
151
  return filtered_patents
152
 
153
 
154
 
 
155
  def extract_patents(year, month, day, logging):
156
  """
157
  This function reads a patent file in XML format, splits it into individual patents, parses each
 
1
  import os
2
  import requests
3
  import zipfile
4
+ import streamlit as st
5
  import xml.etree.ElementTree as ET
6
  from datetime import datetime, timedelta
7
  import tempfile
 
86
  def filter_rf_patents(patents, keywords=None, fields=None):
87
  """
88
  Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
89
+ Includes Streamlit-based debugging to display progress and results in the UI.
 
 
 
 
 
 
90
  """
91
+
92
  if keywords is None:
93
  keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
94
  if fields is None:
95
+ fields = ["Title", "Abstract", "Summary", "Claims", "Detailed Description"] # Default fields
96
 
97
  # Standardize field names
98
  FIELD_NAME_MAPPING = {
 
107
  }
108
 
109
  filtered_patents = []
 
 
 
110
 
111
+ # Display first 5 patents for inspection
112
+ st.write("Display First 5 patents for inspection")
113
+ for patent in patents[:5]:
114
+ st.json(patent)
115
+
116
+ for patent in patents:
117
  if isinstance(patent, str):
118
+ # Global keyword search for string-type patents
119
  if any(keyword.lower() in patent.lower() for keyword in keywords):
120
+ st.write(f"Match found in string patent!")
121
  filtered_patents.append(patent)
 
 
 
122
  elif isinstance(patent, dict):
123
+ # Normalize field names
124
  normalized_patent = {}
125
  for field, content in patent.items():
126
+ # Map field names to standard format
127
+ normalized_field = FIELD_NAME_MAPPING.get(field, field)
128
  normalized_patent[normalized_field] = content
 
129
 
130
  # Field-specific match
131
  matched = False
132
  for field in fields:
133
+ field_content = normalized_patent.get(field, "")
134
+ st.write(f"Checking field '{field}': {field_content}")
135
  if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
136
+ st.write(f"Match found in field '{field}'")
137
+ filtered_patents.append(normalized_patent)
138
  matched = True
139
  break
140
 
141
  # Global fallback if no fields match
142
  if not matched:
143
+ full_text = " ".join(normalized_patent.values()) # Combine all fields
144
  if any(keyword.lower() in full_text.lower() for keyword in keywords):
145
+ st.write(f"Match found in global search!")
146
+ filtered_patents.append(normalized_patent)
147
  else:
148
+ st.write(f"Unknown patent format: {type(patent)}") # Handle unexpected data formats
 
149
 
150
+ st.write(f"Total filtered patents: {len(filtered_patents)}")
151
  return filtered_patents
152
 
153
 
154
 
155
+
156
  def extract_patents(year, month, day, logging):
157
  """
158
  This function reads a patent file in XML format, splits it into individual patents, parses each