Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files
- patentwiz/preprocess_data.py +25 -24
patentwiz/preprocess_data.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import requests
|
3 |
import zipfile
|
|
|
4 |
import xml.etree.ElementTree as ET
|
5 |
from datetime import datetime, timedelta
|
6 |
import tempfile
|
@@ -85,18 +86,13 @@ def download_weekly_patents(year, month, day, logging):
|
|
85 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
86 |
"""
|
87 |
Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
|
88 |
-
|
89 |
-
Parameters:
|
90 |
-
patents (list): List of patent texts (as strings or structured data).
|
91 |
-
keywords (list): Keywords to filter patents.
|
92 |
-
fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
|
93 |
-
Returns:
|
94 |
-
list: Filtered patents.
|
95 |
"""
|
|
|
96 |
if keywords is None:
|
97 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
98 |
if fields is None:
|
99 |
-
fields = ["Title", "Abstract", "Summary", "Claims"] # Default fields
|
100 |
|
101 |
# Standardize field names
|
102 |
FIELD_NAME_MAPPING = {
|
@@ -111,47 +107,52 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
111 |
}
|
112 |
|
113 |
filtered_patents = []
|
114 |
-
for patent in patents:
|
115 |
-
# Debugging: Print patent data type
|
116 |
-
print(f"Processing patent: {patent}")
|
117 |
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
119 |
if isinstance(patent, str):
|
|
|
120 |
if any(keyword.lower() in patent.lower() for keyword in keywords):
|
|
|
121 |
filtered_patents.append(patent)
|
122 |
-
continue
|
123 |
-
|
124 |
-
# Case 2: Handle dictionary-type patents
|
125 |
elif isinstance(patent, dict):
|
126 |
-
# Normalize field names
|
127 |
normalized_patent = {}
|
128 |
for field, content in patent.items():
|
129 |
-
|
|
|
130 |
normalized_patent[normalized_field] = content
|
131 |
-
patent = normalized_patent
|
132 |
|
133 |
# Field-specific match
|
134 |
matched = False
|
135 |
for field in fields:
|
136 |
-
field_content =
|
|
|
137 |
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
138 |
-
|
|
|
139 |
matched = True
|
140 |
break
|
141 |
|
142 |
# Global fallback if no fields match
|
143 |
if not matched:
|
144 |
-
full_text = " ".join(
|
145 |
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
146 |
-
|
|
|
147 |
else:
|
148 |
-
# Handle unexpected data formats
|
149 |
-
print(f"Unknown patent format: {type(patent)}")
|
150 |
|
|
|
151 |
return filtered_patents
|
152 |
|
153 |
|
154 |
|
|
|
155 |
def extract_patents(year, month, day, logging):
|
156 |
"""
|
157 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|
|
|
1 |
import os
|
2 |
import requests
|
3 |
import zipfile
|
4 |
+
import streamlit as st
|
5 |
import xml.etree.ElementTree as ET
|
6 |
from datetime import datetime, timedelta
|
7 |
import tempfile
|
|
|
86 |
def filter_rf_patents(patents, keywords=None, fields=None):
|
87 |
"""
|
88 |
Filters patents based on keywords and specified fields, with a fallback for inconsistent field names.
|
89 |
+
Includes Streamlit-based debugging to display progress and results in the UI.
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
"""
|
91 |
+
|
92 |
if keywords is None:
|
93 |
keywords = ["Radio Frequency", "Antenna", "UAV", "Wireless Charging"] # Default keywords
|
94 |
if fields is None:
|
95 |
+
fields = ["Title", "Abstract", "Summary", "Claims", "Detailed Description"] # Default fields
|
96 |
|
97 |
# Standardize field names
|
98 |
FIELD_NAME_MAPPING = {
|
|
|
107 |
}
|
108 |
|
109 |
filtered_patents = []
|
|
|
|
|
|
|
110 |
|
111 |
+
# Display first 5 patents for inspection
|
112 |
+
st.write("Display First 5 patents for inspection")
|
113 |
+
for patent in patents[:5]:
|
114 |
+
st.json(patent)
|
115 |
+
|
116 |
+
for patent in patents:
|
117 |
if isinstance(patent, str):
|
118 |
+
# Global keyword search for string-type patents
|
119 |
if any(keyword.lower() in patent.lower() for keyword in keywords):
|
120 |
+
st.write(f"Match found in string patent!")
|
121 |
filtered_patents.append(patent)
|
|
|
|
|
|
|
122 |
elif isinstance(patent, dict):
|
123 |
+
# Normalize field names
|
124 |
normalized_patent = {}
|
125 |
for field, content in patent.items():
|
126 |
+
# Map field names to standard format
|
127 |
+
normalized_field = FIELD_NAME_MAPPING.get(field, field)
|
128 |
normalized_patent[normalized_field] = content
|
|
|
129 |
|
130 |
# Field-specific match
|
131 |
matched = False
|
132 |
for field in fields:
|
133 |
+
field_content = normalized_patent.get(field, "")
|
134 |
+
st.write(f"Checking field '{field}': {field_content}")
|
135 |
if field_content and any(keyword.lower() in field_content.lower() for keyword in keywords):
|
136 |
+
st.write(f"Match found in field '{field}'")
|
137 |
+
filtered_patents.append(normalized_patent)
|
138 |
matched = True
|
139 |
break
|
140 |
|
141 |
# Global fallback if no fields match
|
142 |
if not matched:
|
143 |
+
full_text = " ".join(normalized_patent.values()) # Combine all fields
|
144 |
if any(keyword.lower() in full_text.lower() for keyword in keywords):
|
145 |
+
st.write(f"Match found in global search!")
|
146 |
+
filtered_patents.append(normalized_patent)
|
147 |
else:
|
148 |
+
st.write(f"Unknown patent format: {type(patent)}") # Handle unexpected data formats
|
|
|
149 |
|
150 |
+
st.write(f"Total filtered patents: {len(filtered_patents)}")
|
151 |
return filtered_patents
|
152 |
|
153 |
|
154 |
|
155 |
+
|
156 |
def extract_patents(year, month, day, logging):
|
157 |
"""
|
158 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|