acecalisto3 committed
Commit a8cd52c · verified · 1 Parent(s): fd61735

Update app2.py

Files changed (1):
  1. app2.py +84 -62
app2.py CHANGED
@@ -185,7 +185,7 @@ class EnhancedURLProcessor:
                 'url': url,
                 'raw_content': None,
                 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(),
-                             'status_code': getattr(e.response, 'status_code', None)},
+                             'status_code': getattr(e.response, 'status_code', None)},
                 'extracted_data': None,
                 'processing_notes': [f"Failed to fetch content: {str(e)}"]
             }
@@ -261,7 +261,9 @@ class EnhancedURLProcessor:
             'title': None,
             'meta_description': None,
             'full_text': "",
-            'links': []
+            'links': [],
+            'images': [],
+            'media': []
         }
         try:
             soup = BeautifulSoup(content, 'html.parser')
@@ -270,6 +272,8 @@ class EnhancedURLProcessor:
         meta_desc = soup.find('meta', attrs={'name': 'description'})
         if meta_desc and meta_desc.get('content'):
             extracted['meta_description'] = meta_desc['content'].strip()
+
+        # Extract links
         unique_links = set()
         for a_tag in soup.find_all('a', href=True):
             href = a_tag['href'].strip()
@@ -287,6 +291,27 @@ class EnhancedURLProcessor:
             elif urlparse(href).netloc and href not in unique_links:
                 extracted['links'].append({'text': text, 'url': href})
                 unique_links.add(href)
+
+        # Extract images
+        unique_images = set()
+        for img_tag in soup.find_all('img', src=True):
+            src = img_tag['src'].strip()
+            alt = img_tag.get('alt', '').strip()
+            if src and src not in unique_images:
+                absolute_url = urljoin(base_url, src)
+                extracted['images'].append({'src': absolute_url, 'alt': alt})
+                unique_images.add(src)
+
+        # Extract media (audio/video)
+        unique_media = set()
+        for media_tag in soup.find_all(['audio', 'video'], src=True):
+            src = media_tag['src'].strip()
+            if src and src not in unique_media:
+                absolute_url = urljoin(base_url, src)
+                extracted['media'].append({'src': absolute_url, 'type': media_tag.name})
+                unique_media.add(src)
+
+        # Extract text content
         soup_copy = BeautifulSoup(content, 'html.parser')
         for script_or_style in soup_copy(["script", "style"]):
             script_or_style.extract()
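
For context on the hunk above, here is a minimal sketch of what the new image/media branches do, run against a toy fixture (the `html` snippet and URLs are hypothetical, not from app2.py). Note the added code assumes `urljoin` is imported and `base_url` is in scope for this method; neither appears elsewhere in this diff.

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Hypothetical fixture; base_url stands in for the page the HTML came from.
base_url = "https://example.com/docs/"
html = '<img src="/logo.png" alt="Logo"><video src="clips/intro.mp4"></video>'

soup = BeautifulSoup(html, "html.parser")
# Same pattern as the new code: dedupe on the raw src, store the absolute URL.
images = [{"src": urljoin(base_url, t["src"].strip()), "alt": t.get("alt", "").strip()}
          for t in soup.find_all("img", src=True)]
media = [{"src": urljoin(base_url, t["src"].strip()), "type": t.name}
         for t in soup.find_all(["audio", "video"], src=True)]
print(images)  # [{'src': 'https://example.com/logo.png', 'alt': 'Logo'}]
print(media)   # [{'src': 'https://example.com/docs/clips/intro.mp4', 'type': 'video'}]
```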
@@ -701,7 +726,7 @@ class EnhancedFileProcessor:
         elif archive_extension in ('.tar', '.gz', '.tgz'):
             try:
                 mode = 'r'
-                if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz'
+                if archive_extension in ('.tar.gz', '.tgz', '.gz'): mode = 'r:gz'
                 with tarfile.open(archive_path, mode) as tar_ref:
                     for member in tar_ref.getmembers():
                         if member.isfile():
@@ -739,30 +764,30 @@ class EnhancedFileProcessor:
                             f"Failed to clean up extracted file {extracted_file_path}: {e}")
             except tarfile.TarError as e:
                 logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
-        elif archive_extension == '.gz':
+        elif archive_extension == '.gz':  # This case is handled by tarfile, but added for single .gz files
             extracted_name = archive_path.stem
             extracted_path = extract_to / extracted_name
             try:
                 with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
                     outfile.write(gz_file.read())
                 if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(
                         extracted_path):
                     dataset.extend(self._process_single_file(extracted_path))
                 elif extracted_path.suffix.lower() in self.archive_extensions:
                     logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
                     dataset.extend(self._process_archive(extracted_path, extract_to))
                 else:
                     logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
             except gzip.GzipFile as e:
                 logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
             except Exception as e:
                 logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
             finally:
                 if extracted_path.exists():
                     try:
                         extracted_path.unlink()
                     except OSError as e:
                         logger.warning(f"Failed to clean up extracted file {extracted_path}: {e}")
         elif archive_extension in ('.bz2', '.7z', '.rar'):
             logger.warning(
                 f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")
@@ -1041,9 +1066,9 @@ def respond_to_chat(
         filter_match = re.search(
             r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
             r'(?:where|by|for|with|if)\s+'
-            r'(\w+)\s+'
+            r'([\w\._-]+)\s+'  # Allow underscores, periods, and hyphens in column names
             r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
-            r'([\'"]?[\w\s.-]+[\'"]?)',
+            r'([\'"]?[\w\s\.-]+[\'"]?)',
             lower_message
         )
         if filter_match:
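
A quick check of what the widened capture group buys (a sketch; the query string is made up): the old `(\w+)` group could not span a dotted column name, while the new one can.

```python
import re

# The commit's pattern, reassembled verbatim for testing.
pattern = re.compile(
    r'(?:filter|show items|show me items|find entries|select items|get items)\s+'
    r'(?:where|by|for|with|if)\s+'
    r'([\w\._-]+)\s+'
    r'(is|equals?|==|!=|>=?|<=?|contains?|starts with|ends with)\s+'
    r'([\'"]?[\w\s\.-]+[\'"]?)'
)

m = pattern.search("filter where user.name contains smith")
print(m.groups())  # ('user.name', 'contains', 'smith') -- no match at all with the old (\w+) group
```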
@@ -1056,57 +1081,57 @@ def respond_to_chat(
             response = f"I couldn't find a column named '{column_name}'. Available columns are: {', '.join(df.columns)}"
             new_filtered_df_state = None
         else:
-            active_df_to_filter = df.copy()
+            df_to_filter = df.copy()  # Always filter from the full dataframe
             try:
-                target_value: Any
-                col_dtype = df[column_name].dtype
-                df_to_filter = current_filtered_df_state if current_filtered_df_state is not None and not current_filtered_df_state.empty else df.copy()
-                if pd.api.types.is_numeric_dtype(col_dtype) and operator in ['>', '>=', '<', '<=', '==',
-                                                                             '!=']:
+                target_value: Any = None
+                col_dtype = df_to_filter[column_name].dtype
+
+                is_numeric_op = operator in ['>', '>=', '<', '<=', '==', '!=']
+                is_numeric_col = pd.api.types.is_numeric_dtype(col_dtype)
+
+                if is_numeric_op and is_numeric_col:
                     try:
                         target_value = float(value_str)
                         col_series = pd.to_numeric(df_to_filter[column_name], errors='coerce')
                     except ValueError:
                         response = f"For numeric column '{column_name}', '{value_str}' is not a valid number."
-                        target_value = None
                 elif pd.api.types.is_bool_dtype(col_dtype) or value_str.lower() in ['true', 'false']:
                     target_value = value_str.lower() == 'true'
                     col_series = df_to_filter[column_name].astype(bool, errors='ignore')
-                else:
+                else:  # Treat as string
                     target_value = str(value_str)
                     col_series = df_to_filter[column_name].astype(str).str.lower()
                     value_str_lower = target_value.lower()
-                if 'response' not in locals():
+
+                if not response:  # No error so far
+                    condition = None
                     if operator in ['is', 'equals', '==']:
-                        if pd.api.types.is_numeric_dtype(col_dtype) or pd.api.types.is_bool_dtype(
-                                col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series == target_value
                         else:
                             condition = col_series == value_str_lower
                     elif operator == '!=':
-                        if pd.api.types.is_numeric_dtype(col_dtype) or pd.api.types.is_bool_dtype(
-                                col_dtype):
+                        if is_numeric_col or pd.api.types.is_bool_dtype(col_dtype):
                             condition = col_series != target_value
                         else:
                             condition = col_series != value_str_lower
-                    elif operator == '>' and pd.api.types.is_numeric_dtype(col_dtype):
+                    elif operator == '>' and is_numeric_col:
                         condition = col_series > target_value
-                    elif operator == '>=' and pd.api.types.is_numeric_dtype(col_dtype):
+                    elif operator == '>=' and is_numeric_col:
                         condition = col_series >= target_value
-                    elif operator == '<' and pd.api.types.is_numeric_dtype(col_dtype):
+                    elif operator == '<' and is_numeric_col:
                         condition = col_series < target_value
-                    elif operator == '<=' and pd.api.types.is_numeric_dtype(col_dtype):
+                    elif operator == '<=' and is_numeric_col:
                         condition = col_series <= target_value
-                    elif operator in ['contains', 'contain'] and pd.api.types.is_string_dtype(col_series):
-                        condition = col_series.str.contains(value_str_lower, case=False, na=False)
-                    elif operator == 'starts with' and pd.api.types.is_string_dtype(col_series):
-                        condition = col_series.str.startswith(value_str_lower, na=False)
-                    elif operator == 'ends with' and pd.api.types.is_string_dtype(col_series):
-                        condition = col_series.str.endswith(value_str_lower, na=False)
+                    elif operator in ['contains', 'contain']:
+                        condition = df_to_filter[column_name].astype(str).str.contains(value_str, case=False, na=False)
+                    elif operator == 'starts with':
+                        condition = df_to_filter[column_name].astype(str).str.startswith(value_str, case=False, na=False)
+                    elif operator == 'ends with':
+                        condition = df_to_filter[column_name].astype(str).str.endswith(value_str, case=False, na=False)
                     else:
                         response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-                        condition = None
-                if response: new_filtered_df_state = None
+
                 if condition is not None:
                     filtered_results_df = df_to_filter[condition]
                     if not filtered_results_df.empty:
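
One thing worth flagging in the new 'starts with'/'ends with' branches: unlike `Series.str.contains`, pandas' `str.startswith` and `str.endswith` take no `case` keyword (their signature is just `(pat, na=...)`), so as written those two lines would raise `TypeError` the first time they run. A case-insensitive equivalent has to lowercase explicitly, e.g.:

```python
import pandas as pd

s = pd.Series(["Alpha", "beta", None])

# str.contains supports case=False; startswith/endswith do not,
# so lowercase the column by hand for a case-insensitive prefix test.
condition = s.astype(str).str.lower().str.startswith("al", na=False)
print(condition.tolist())  # [True, False, False]
```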
@@ -1121,11 +1146,8 @@ def respond_to_chat(
                             f"Here's a preview:\n```\n{preview_str}\n```\n"
                             f"The full filtered dataset is now available for download using the 'Download Filtered JSON' button.")
                     else:
-                        new_filtered_df_state = pd.DataFrame()
+                        new_filtered_df_state = pd.DataFrame()  # Empty dataframe
                         response = f"No items found where '{column_name}' {operator} '{value_str}'."
-                elif not response:
-                    response = f"Unsupported operator '{operator}' for column '{column_name}' (type: {col_dtype})."
-                    new_filtered_df_state = None
             except ValueError as ve:
                 response = f"Invalid value '{value_str}' for numeric column '{column_name}'. {ve}"
                 new_filtered_df_state = None
@@ -1563,17 +1585,17 @@ def create_modern_interface():
                         processing_status_messages.append(f"✅ Processed URL: {url} (Level 0)")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f" Notes: {'; '.join(content_result['processing_notes'])}")
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                         if content_result.get('linked_extractions'):
                             num_linked_processed = len([r for r in content_result['linked_extractions'] if
                                                         r and r.get('fetch_result') is not None])
                             processing_status_messages.append(
-                                f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
+                                f" Found and processed {num_linked_processed}/{len(content_result['linked_extractions'])} direct links.")
                     else:
                         processing_status_messages.append(f"❌ Failed to process URL: {url}")
                         if content_result.get('processing_notes'):
                             processing_status_messages.append(
-                                f" Notes: {'; '.join(content_result['processing_notes'])}")
+                                f" Notes: {'; '.join(content_result['processing_notes'])}")
                 else:
                     processing_status_messages.append(
                         f"❌ Failed to process URL: {url} (No result returned)")
@@ -1587,7 +1609,7 @@ def create_modern_interface():
                     for res in file_results:
                         if res.get('processing_notes'):
                             processing_status_messages.append(
-                                f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
+                                f" Notes for {res.get('filename', 'item')}: {'; '.join(res['processing_notes'])}")
                 else:
                     processing_status_messages.append(f"❌ Failed to process file: {file.name}")
             qr_paths = []