Update app2.py
app2.py CHANGED
@@ -185,7 +185,7 @@ class EnhancedURLProcessor:
                'url': url,
                'raw_content': None,
                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(),
-
+                            'status_code': getattr(e.response, 'status_code', None)},
                'extracted_data': None,
                'processing_notes': [f"Failed to fetch content: {str(e)}"]
            }
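
For reference, a minimal sketch of what the added getattr(e.response, 'status_code', None) fallback yields when a fetch fails; the helper name and URLs below are hypothetical, and the sketch assumes the requests exception caught in this handler:

```python
import requests

def fetch_status_code(url: str):
    """Return the HTTP status code of a failed request, or None when no response exists."""
    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        return resp.status_code
    except requests.RequestException as e:
        # e.response is None for connection errors and timeouts,
        # so getattr avoids an AttributeError and records None instead.
        return getattr(e.response, 'status_code', None)

# fetch_status_code("https://example.com/missing-page")  -> 404 (server responded)
# fetch_status_code("https://no-such-host.invalid/")     -> None (no response object)
```
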
@@ -261,7 +261,9 @@ class EnhancedURLProcessor:
            'title': None,
            'meta_description': None,
            'full_text': "",
-           'links': []
+           'links': [],
+           'images': [],
+           'media': []
        }
        try:
            soup = BeautifulSoup(content, 'html.parser')
@@ -270,6 +272,8 @@ class EnhancedURLProcessor:
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                extracted['meta_description'] = meta_desc['content'].strip()
+
+           # Extract links
            unique_links = set()
            for a_tag in soup.find_all('a', href=True):
                href = a_tag['href'].strip()
@@ -287,6 +291,27 @@ class EnhancedURLProcessor:
                elif urlparse(href).netloc and href not in unique_links:
                    extracted['links'].append({'text': text, 'url': href})
                    unique_links.add(href)
+
+           # Extract images
+           unique_images = set()
+           for img_tag in soup.find_all('img', src=True):
+               src = img_tag['src'].strip()
+               alt = img_tag.get('alt', '').strip()
+               if src and src not in unique_images:
+                   absolute_url = urljoin(base_url, src)
+                   extracted['images'].append({'src': absolute_url, 'alt': alt})
+                   unique_images.add(src)
+
+           # Extract media (audio/video)
+           unique_media = set()
+           for media_tag in soup.find_all(['audio', 'video'], src=True):
+               src = media_tag['src'].strip()
+               if src and src not in unique_media:
+                   absolute_url = urljoin(base_url, src)
+                   extracted['media'].append({'src': absolute_url, 'type': media_tag.name})
+                   unique_media.add(src)
+
+           # Extract text content
            soup_copy = BeautifulSoup(content, 'html.parser')
            for script_or_style in soup_copy(["script", "style"]):
                script_or_style.extract()
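
A standalone sketch of the URL-resolution pattern the new image loop relies on, using a toy HTML snippet and a hypothetical base_url; urljoin resolves relative src values and leaves absolute ones untouched:

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<img src="/static/logo.png" alt="Logo"><img src="https://cdn.example.com/a.jpg">'
base_url = "https://example.com/articles/post.html"  # hypothetical page URL

soup = BeautifulSoup(html, 'html.parser')
images, seen = [], set()
for img in soup.find_all('img', src=True):
    src = img['src'].strip()
    if src and src not in seen:
        # urljoin resolves relative paths against the page URL and leaves absolute URLs as-is
        images.append({'src': urljoin(base_url, src), 'alt': img.get('alt', '').strip()})
        seen.add(src)

# images == [{'src': 'https://example.com/static/logo.png', 'alt': 'Logo'},
#            {'src': 'https://cdn.example.com/a.jpg', 'alt': ''}]
```
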
@@ -701,7 +726,7 @@ class EnhancedFileProcessor:
        elif archive_extension in ('.tar', '.gz', '.tgz'):
            try:
                mode = 'r'
-               if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz'
+               if archive_extension in ('.tar.gz', '.tgz', '.gz'): mode = 'r:gz'
                with tarfile.open(archive_path, mode) as tar_ref:
                    for member in tar_ref.getmembers():
                        if member.isfile():
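
A small hypothetical helper illustrating the tarfile read modes this branch switches between; 'r:gz' forces gzip decompression, while bare 'r' reads with transparent compression detection:

```python
import tarfile
from pathlib import Path

def open_tar(archive_path: Path) -> tarfile.TarFile:
    # Hypothetical helper: choose the read mode from the file name.
    # 'r:gz' opens a gzip-compressed tar; bare 'r' is shorthand for 'r:*',
    # which lets tarfile auto-detect the compression itself.
    name = archive_path.name.lower()
    mode = 'r:gz' if name.endswith(('.tar.gz', '.tgz')) else 'r'
    return tarfile.open(archive_path, mode)
```
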
@@ -739,30 +764,30 @@ class EnhancedFileProcessor:
                                    f"Failed to clean up extracted file {extracted_file_path}: {e}")
            except tarfile.TarError as e:
                logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
-       elif archive_extension == '.gz':
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+       elif archive_extension == '.gz': # This case is handled by tarfile, but added for single .gz files
+           extracted_name = archive_path.stem
+           extracted_path = extract_to / extracted_name
+           try:
+               with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
+                   outfile.write(gz_file.read())
+               if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(
+                       extracted_path):
+                   dataset.extend(self._process_single_file(extracted_path))
+               elif extracted_path.suffix.lower() in self.archive_extensions:
+                   logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
+                   dataset.extend(self._process_archive(extracted_path, extract_to))
+               else:
+                   logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
+           except gzip.GzipFile as e:
+               logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
+           except Exception as e:
+               logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
+           finally:
+               if extracted_path.exists():
+                   try:
+                       extracted_path.unlink()
+                   except OSError as e:
+                       logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
        elif archive_extension in ('.bz2', '.7z', '.rar'):
            logger.warning(
                f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
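
A self-contained sketch of single-file .gz handling along the lines of the new branch; the helper name is hypothetical, and shutil.copyfileobj is shown as a streaming alternative to outfile.write(gz_file.read()). Corrupt input raises gzip.BadGzipFile, which is a subclass of OSError:

```python
import gzip
import shutil
from pathlib import Path

def decompress_gz(archive_path: Path, extract_to: Path) -> Path:
    """Decompress a single-file .gz archive: 'data.json.gz' becomes 'data.json'."""
    extracted_path = extract_to / archive_path.stem  # Path.stem drops only the final '.gz'
    with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
        shutil.copyfileobj(gz_file, outfile)  # stream the decompressed bytes to disk
    return extracted_path
```
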
@@ -1041,9 +1066,9 @@ def respond_to_chat(
        filter_match = re.search(
            r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
            r'(?:where|by|for|with|if)\s+'
-           r'(\w+)\s+'
+           r'([\w\._-]+)\s+' # Allow underscores, periods, and hyphens in column names
            r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
-           r'([\'"]?[\w\s
+           r'([\'"]?[\w\s\.-]+[\'"]?)',
            lower_message
        )
        if filter_match:
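
The rebuilt filter pattern, compiled on its own against two made-up chat messages to show what the three capture groups (column, operator, value) return:

```python
import re

FILTER_RE = re.compile(
    r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
    r'(?:where|by|for|with|if)\s+'
    r'([\w\._-]+)\s+'                                                 # column (may contain _ . -)
    r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'  # operator
    r'([\'"]?[\w\s\.-]+[\'"]?)'                                       # value, optionally quoted
)

print(FILTER_RE.search("show me items where file_size >= 1024".lower()).groups())
# ('file_size', '>=', '1024')
print(FILTER_RE.search("filter where author.name contains smith".lower()).groups())
# ('author.name', 'contains', 'smith')
```
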
@@ -1056,57 +1081,57 @@ def respond_to_chat(
                response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}"
                new_filtered_df_state = None
            else:
-
+               df_to_filter = df.copy() # Always filter from the full dataframe
                try:
-                  target_value: Any
-                  col_dtype =
-
-
-
+                   target_value: Any = None
+                   col_dtype = df_to_filter[column_name].dtype
+
+                   is_numeric_op = operator in ['>', '>=', '<', '<=', '==', '!=']
+                   is_numeric_col = pd.api.types.is_numeric_dtype(col_dtype)
+
+                   if is_numeric_op and is_numeric_col:
                        try:
                            target_value = float(value_str)
                            col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
                        except ValueError:
                            response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
-                           target_value = None
                    elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
                        target_value = value_str.lower() == 'true'
                        col_series = df_to_filter[column_name].astype(bool, errors='ignore')
-                  else:
+                   else: # Treat as string
                        target_value = str(value_str)
                        col_series = df_to_filter[column_name].astype(str).str.lower()
                        value_str_lower = target_value.lower()
-
+
+                   if not response: # No error so far
+                       condition = None
                    if operator in ['is', 'equals', '==']:
-                      if
-                      col_dtype):
+                       if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                            condition = col_series == target_value
                        else:
                            condition = col_series == value_str_lower
                    elif operator == '!=':
-                      if
-                      col_dtype):
+                       if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                            condition = col_series != target_value
                        else:
                            condition = col_series != value_str_lower
-                  elif operator == '>' and
+                   elif operator == '>' and is_numeric_col:
                        condition = col_series > target_value
-                  elif operator == '>=' and
+                   elif operator == '>=' and is_numeric_col:
                        condition = col_series >= target_value
-                  elif operator == '<' and
+                   elif operator == '<' and is_numeric_col:
                        condition = col_series < target_value
-                  elif operator == '<=' and
+                   elif operator == '<=' and is_numeric_col:
                        condition = col_series <= target_value
-                  elif operator in ['contains', 'contain']
-                      condition =
-                  elif operator == 'starts with'
-                      condition =
-                  elif operator == 'ends with'
-                      condition =
+                   elif operator in ['contains', 'contain']:
+                       condition = df_to_filter[column_name].astype(str).str.contains(value_str, case=False, na=False)
+                   elif operator == 'starts with':
+                       condition = df_to_filter[column_name].astype(str).str.startswith(value_str, case=False, na=False)
+                   elif operator == 'ends with':
+                       condition = df_to_filter[column_name].astype(str).str.endswith(value_str, case=False, na=False)
                    else:
                        response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-
-                  if response: new_filtered_df_state = None
+
                    if condition is not None:
                        filtered_results_df = df_to_filter[condition]
                        if not filtered_results_df.empty:
@@ -1121,11 +1146,8 @@ def respond_to_chat(
                                f"Here's a preview:\n```\n{preview_str}\n```\n"
                                f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
                        else:
-                           new_filtered_df_state = pd.DataFrame()
+                           new_filtered_df_state = pd.DataFrame() # Empty dataframe
                            response = f"No items found where '{column_name}' {operator} '{value_str}'."
-                  elif not response:
-                      response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-                      new_filtered_df_state = None
                except ValueError as ve:
                    response = f"Invalid value '{value_str}' for numeric column '{column_name}'. {ve}"
                    new_filtered_df_state = None
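
A toy-dataframe sketch of the case-insensitive matching performed by the new contains branch; because pandas Series.str.startswith and endswith take no case keyword, a lowercase-both-sides variant is shown for prefix matching:

```python
import pandas as pd

df = pd.DataFrame({'author': ['Smith, J.', 'Jones, A.', 'smithers, B.']})
value_str = 'smith'

# Case-insensitive substring match; na=False keeps missing values out of the boolean mask
contains_mask = df['author'].astype(str).str.contains(value_str, case=False, na=False)

# Series.str.startswith/endswith accept no `case` keyword, so lowercase both sides instead
starts_mask = df['author'].astype(str).str.lower().str.startswith(value_str.lower())

print(df[contains_mask]['author'].tolist())  # ['Smith, J.', 'smithers, B.']
print(df[starts_mask]['author'].tolist())    # ['Smith, J.', 'smithers, B.']
```
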
@@ -1563,17 +1585,17 @@ def create_modern_interface():
                        processing_status_messages.append(f"β Processed URL: {url} (Level 0)")
                        if content_result.get('processing_notes'):
                            processing_status_messages.append(
-                               f"
+                               f" Notes: {'; '.join(content_result['processing_notes'])}")
                        if content_result.get('linked_extractions'):
                            num_linked_processed = len([r for r in content_result['linked_extractions'] if
                                                        r and r.get('fetch_result') is not None])
                            processing_status_messages.append(
-                               f"
+                               f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
                    else:
                        processing_status_messages.append(f"β Failed to process URL: {url}")
                        if content_result.get('processing_notes'):
                            processing_status_messages.append(
-                               f"
+                               f" Notes: {'; '.join(content_result['processing_notes'])}")
                else:
                    processing_status_messages.append(
                        f"β Failed to process URL: {url} (No result returned)")
@@ -1587,7 +1609,7 @@ def create_modern_interface():
                    for res in file_results:
                        if res.get('processing_notes'):
                            processing_status_messages.append(
-                               f"
+                               f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
                else:
                    processing_status_messages.append(f"β Failed to process file: {file.name}")
            qr_paths = []