acecalisto3 committed on
Commit
23ab328
·
verified ·
1 Parent(s): f5f3613

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +255 -151
app2.py CHANGED
@@ -555,7 +555,7 @@ class FileProcessor:
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
- img = qrcode.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
@@ -686,7 +686,6 @@ def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: s
686
  else:
687
  return "Invalid mode selected."
688
 
689
- # Replace the create_interface function with this version
690
  def create_interface():
691
  """Create a comprehensive Gradio interface with advanced features"""
692
  css = """
@@ -696,22 +695,260 @@ def create_interface():
696
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
697
  """
698
 
699
- # Use Interface instead of Blocks
700
- interface = gr.Interface(
701
- fn=datachat_interface,
702
- inputs=[
703
- gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
704
- gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
705
- gr.Textbox(lines=8, label="JSON Data"),
706
- gr.Image(label="QR Code Image", type="filepath"),
707
- gr.Textbox(label="Query")
708
- ],
709
- outputs=gr.Textbox(label="Response"),
710
- title="Advanced Data Processor & QR Code Generator",
711
- description="# 🌐 Advanced Data Processing & QR Code Generator",
712
- css=css
713
- )
714
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  return interface
716
 
717
  def main():
@@ -730,136 +967,3 @@ def main():
730
  if __name__ == "__main__":
731
  main()
732
 
733
-
734
- def create_download_zip(results):
735
- if not results or (isinstance(results, dict) and 'error' in results):
736
- return None
737
-
738
- try:
739
- # Create a temporary zip file
740
- with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
741
- with zipfile.ZipFile(tmp.name, 'w') as zipf:
742
- # Add JSON data
743
- zipf.writestr('extracted_data.json', json.dumps(results, indent=2))
744
-
745
- # Add individual files for each URL
746
- for idx, item in enumerate(results):
747
- if 'html' in item:
748
- zipf.writestr(f'content_{idx}_full.html', item['html'])
749
-
750
- if 'content' in item:
751
- zipf.writestr(f'content_{idx}_text.txt', item['content'])
752
-
753
- # Download and include images
754
- if 'images' in item and item['images']:
755
- img_dir = f'content_{idx}_images'
756
- for img_idx, img in enumerate(item['images']):
757
- try:
758
- img_url = img['src']
759
- if validators.url(img_url):
760
- img_response = requests.get(img_url, timeout=10)
761
- if img_response.status_code == 200:
762
- # Extract file extension from URL or content type
763
- content_type = img_response.headers.get('Content-Type', '')
764
- ext = '.jpg' # Default extension
765
- if 'png' in content_type:
766
- ext = '.png'
767
- elif 'gif' in content_type:
768
- ext = '.gif'
769
- elif 'svg' in content_type:
770
- ext = '.svg'
771
-
772
- zipf.writestr(f'{img_dir}/image_{img_idx}{ext}', img_response.content)
773
- except Exception as e:
774
- logger.error(f"Error downloading image {img_url}: {e}")
775
-
776
- # Include scripts
777
- if 'scripts' in item and item['scripts']:
778
- scripts_dir = f'content_{idx}_scripts'
779
- for script_idx, script in enumerate(item['scripts']):
780
- if script:
781
- zipf.writestr(f'{scripts_dir}/script_{script_idx}.js', script)
782
-
783
- # Include styles
784
- if 'styles' in item and item['styles']:
785
- styles_dir = f'content_{idx}_styles'
786
- for style_idx, style in enumerate(item['styles']):
787
- if style:
788
- zipf.writestr(f'{styles_dir}/style_{style_idx}.css', style)
789
-
790
- # Include links as a separate file
791
- if 'links' in item and item['links']:
792
- links_content = "URL,Text\n"
793
- for link in item['links']:
794
- links_content += f"\"{link['url']}\",\"{link['text']}\"\n"
795
- zipf.writestr(f'content_{idx}_links.csv', links_content)
796
-
797
- # Include tables as CSV files
798
- if 'tables' in item and item['tables']:
799
- tables_dir = f'content_{idx}_tables'
800
- for table_idx, table in enumerate(item['tables']):
801
- table_content = ""
802
- for row in table:
803
- table_content += ",".join([f"\"{cell}\"" for cell in row]) + "\n"
804
- zipf.writestr(f'{tables_dir}/table_{table_idx}.csv', table_content)
805
-
806
- # Create an index.html file for easy navigation
807
- index_html = """
808
- <!DOCTYPE html>
809
- <html>
810
- <head>
811
- <title>Extracted Content</title>
812
- <style>
813
- body { font-family: Arial, sans-serif; margin: 20px; }
814
- h1 { color: #333; }
815
- .url-item { margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; }
816
- .url-title { font-weight: bold; }
817
- .resource-list { margin-left: 20px; }
818
- </style>
819
- </head>
820
- <body>
821
- <h1>Extracted Content</h1>
822
- """
823
-
824
- for idx, item in enumerate(results):
825
- index_html += f"""
826
- <div class="url-item">
827
- <div class="url-title">{idx + 1}. {item.get('url', 'Unknown URL')}</div>
828
- <div>Title: {item.get('title', 'No title')}</div>
829
- <div>Timestamp: {item.get('timestamp', '')}</div>
830
- <div class="resource-list">
831
- <p><a href="content_{idx}_full.html">Full HTML</a></p>
832
- <p><a href="content_{idx}_text.txt">Text Content</a></p>
833
- """
834
-
835
- if 'images' in item and item['images']:
836
- index_html += f"""
837
- <p>Images: {len(item['images'])} found</p>
838
- """
839
-
840
- if 'links' in item and item['links']:
841
- index_html += f"""
842
- <p>Links: <a href="content_{idx}_links.csv">{len(item['links'])} found</a></p>
843
- """
844
-
845
- if 'tables' in item and item['tables']:
846
- index_html += f"""
847
- <p>Tables: {len(item['tables'])} found</p>
848
- """
849
-
850
- index_html += """
851
- </div>
852
- </div>
853
- """
854
-
855
- index_html += """
856
- </body>
857
- </html>
858
- """
859
-
860
- zipf.writestr('index.html', index_html)
861
-
862
- return tmp.name
863
- except Exception as e:
864
- logger.error(f"Error creating ZIP file: {e}")
865
- return None
 
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
+ img = qr.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
 
686
  else:
687
  return "Invalid mode selected."
688
 
 
689
  def create_interface():
690
  """Create a comprehensive Gradio interface with advanced features"""
691
  css = """
 
695
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
696
  """
697
 
698
+ with gr.Blocks(css=css, title="Advanced Data Processor & QR Code Generator") as interface:
699
+ gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
700
+
701
+ # URL Extraction Tab
702
+ with gr.Tab("URL Extraction"):
703
+ url_input = gr.Textbox(label="URL to Process", placeholder="https://example.com")
704
+ depth_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth (Higher values may affect performance)")
705
+ respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
706
+ extract_btn = gr.Button("Extract Content")
707
+ url_output = gr.JSON(label="Extracted Data")
708
+ download_btn = gr.Button("Download Results as ZIP")
709
+ download_output = gr.File(label="Download")
710
+
711
+ # Warning about depth
712
+ gr.Markdown("""
713
+ <div class="warning">
714
+ ⚠️ <strong>Warning:</strong> Higher depth values (>2) may significantly increase processing time and resource usage.
715
+ </div>
716
+ """)
717
+
718
+ # URL processor instance
719
+ url_processor = URLProcessor()
720
+
721
def process_url(url, depth, respect_robots):
    """Crawl ``url`` breadth-first and collect the content of each fetched page.

    Args:
        url: Starting URL; checked with ``url_processor.validate_url`` first.
        depth: Maximum link depth to follow from the start page. The value
            comes from a UI slider, so it is coerced to ``int``.
        respect_robots: Stored on the shared ``url_processor`` before crawling.

    Returns:
        A list of ``{"url", "content", "content_type", "timestamp"}`` dicts,
        one per page for which ``fetch_content`` returned something, or
        ``{"error": message}`` if validation fails or any exception is raised.
    """
    from collections import deque
    from urllib.parse import urldefrag, urljoin

    url_processor.respect_robots = respect_robots
    results = []
    try:
        validation = url_processor.validate_url(url)
        if not validation['is_valid']:
            return {"error": validation['message']}

        max_depth = int(depth)
        processed_urls = set()
        # deque gives O(1) pops from the front of the FIFO crawl queue.
        urls_to_process = deque([(url, 0)])  # (url, current_depth)

        while urls_to_process:
            current_url, current_depth = urls_to_process.popleft()

            if current_url in processed_urls:
                continue
            processed_urls.add(current_url)

            content = url_processor.fetch_content(current_url)
            if not content:
                continue

            page_body = content.get('content', '')
            results.append({
                "url": current_url,
                "content": page_body,
                "content_type": content.get('content_type', ''),
                "timestamp": datetime.now().isoformat()
            })

            # Only pages above the depth limit contribute further links.
            if current_depth >= max_depth:
                continue

            soup = BeautifulSoup(page_body, 'html.parser')
            for link in soup.find_all('a', href=True):
                # Resolve every href against the page it came from so that
                # document-relative links ("page.html", "../x") are followed
                # as well as root-relative ones, and drop the fragment so
                # "#section" anchors do not re-queue the same page.
                next_url, _fragment = urldefrag(urljoin(current_url, link['href']))

                if validators.url(next_url) and next_url not in processed_urls:
                    urls_to_process.append((next_url, current_depth + 1))

        return results
    except Exception as e:
        logger.error(f"URL processing error: {e}")
        return {"error": str(e)}
770
+
771
def create_download_zip(results):
    """Bundle extraction results into a temporary ZIP archive.

    The archive contains ``extracted_data.json`` (the full ``results`` dumped
    as JSON) plus one ``content_<idx>_<unix-time>.txt`` file for every result
    dict that has a ``'content'`` key.

    Args:
        results: The extraction output, normally a list of result dicts.

    Returns:
        The path of the created ZIP file, or ``None`` when ``results`` is
        empty, is an ``{"error": ...}`` dict, or the archive cannot be built.
    """
    import os

    if not results or (isinstance(results, dict) and 'error' in results):
        return None

    zip_path = None
    try:
        # delete=False: the file must outlive this function so it can be served.
        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
            zip_path = tmp.name
            # Write through the already-open handle instead of reopening the
            # file by name; a second open of a live NamedTemporaryFile fails
            # on Windows.
            with zipfile.ZipFile(tmp, 'w') as zipf:
                zipf.writestr('extracted_data.json', json.dumps(results, indent=2))

                for idx, item in enumerate(results):
                    # Skip anything that is not a result dict (e.g. the keys
                    # of a plain dict) rather than failing the whole archive.
                    if isinstance(item, dict) and 'content' in item:
                        zipf.writestr(f'content_{idx}_{int(time.time())}.txt', item['content'])

        return zip_path
    except Exception as e:
        logger.error(f"Error creating ZIP file: {e}")
        # Do not leave a half-written archive behind on failure.
        if zip_path is not None:
            try:
                os.unlink(zip_path)
            except OSError:
                pass
        return None
791
+
792
+ extract_btn.click(process_url, [url_input, depth_slider, respect_robots], url_output)
793
+ download_btn.click(create_download_zip, [url_output], download_output)
794
+
795
+ # ZIP File Extractor Tab
796
+ with gr.Tab("ZIP File Extractor"):
797
+ zip_file_input = gr.File(label="Upload ZIP File")
798
+ extract_zip_btn = gr.Button("Extract and Process")
799
+ zip_output = gr.JSON(label="Extracted Data")
800
+ zip_qr_btn = gr.Button("Generate QR Code")
801
+ zip_qr_output = gr.Image(label="QR Code")
802
+
803
+ file_processor = FileProcessor()
804
+
805
def process_zip_file(file):
    """Run the uploaded file through ``file_processor.process_file``.

    Returns whatever ``process_file`` returns, ``{"error": "No file uploaded"}``
    when ``file`` is falsy, or ``{"error": <message>}`` if processing raises.
    """
    if file:
        try:
            return file_processor.process_file(file)
        except Exception as e:
            logger.error(f"ZIP processing error: {e}")
            return {"error": str(e)}

    return {"error": "No file uploaded"}
815
+
816
def generate_zip_qr(data):
    """Return the first QR code produced for ``data`` in combined mode.

    Yields ``None`` when ``data`` is empty, is an ``{"error": ...}`` dict, or
    when QR generation raises (the failure is logged).
    """
    is_error_payload = isinstance(data, dict) and 'error' in data
    if is_error_payload or not data:
        return None

    try:
        qr_paths = file_processor.generate_qr_code(data, combined=True)
        return qr_paths[0]
    except Exception as e:
        logger.error(f"QR generation error: {e}")
        return None
825
+
826
+ extract_zip_btn.click(process_zip_file, [zip_file_input], zip_output)
827
+ zip_qr_btn.click(generate_zip_qr, [zip_output], zip_qr_output)
828
+
829
+ # Raw Text to JSON Tab
830
+ with gr.Tab("Text to JSON"):
831
+ text_input = gr.Textbox(lines=10, label="Raw Text Input")
832
+ json_structure = gr.Dropdown(
833
+ choices=["Simple", "Structured", "Key-Value Pairs"],
834
+ label="JSON Structure",
835
+ value="Simple"
836
+ )
837
+ convert_btn = gr.Button("Convert to JSON")
838
+ json_output = gr.JSON(label="JSON Output")
839
+ combine_json_btn = gr.Button("Combine with Previous JSON")
840
+ previous_json = gr.Textbox(lines=5, label="Previous JSON (Optional)")
841
+ combined_output = gr.JSON(label="Combined JSON")
842
+ text_qr_btn = gr.Button("Generate QR Code")
843
+ text_qr_output = gr.Image(label="QR Code")
844
+
845
def convert_text_to_json(text, structure):
    """Convert raw text into a JSON-serialisable dict.

    Args:
        text: The raw input text. ``None``, empty and whitespace-only values
            are all treated as "no text".
        structure: One of ``"Simple"``, ``"Structured"`` or
            ``"Key-Value Pairs"``.

    Returns:
        * ``"Simple"``: ``{"text", "timestamp"}``.
        * ``"Structured"``: ``{"title", "paragraphs", "timestamp"}`` where
          blank lines separate paragraphs, the lines of a paragraph are joined
          with single spaces, and the first paragraph becomes the title.
        * ``"Key-Value Pairs"``: one entry per line containing ``:`` (split on
          the first colon, both sides stripped) plus a ``"timestamp"`` entry.
        * ``{"error": message}`` for missing text, an unknown structure, or
          any exception during conversion.
    """
    # Guard against None before calling .strip(); the original check ran
    # outside the try block and raised AttributeError for a None input.
    if not text or not text.strip():
        return {"error": "No text provided"}

    try:
        if structure == "Simple":
            return {
                "text": text,
                "timestamp": datetime.now().isoformat()
            }

        if structure == "Structured":
            paragraphs = []
            current_para = []

            for line in text.split('\n'):
                if line.strip():
                    current_para.append(line)
                elif current_para:
                    # A blank line closes the paragraph being collected.
                    paragraphs.append(' '.join(current_para))
                    current_para = []

            if current_para:
                paragraphs.append(' '.join(current_para))

            return {
                "title": paragraphs[0] if paragraphs else "",
                "paragraphs": paragraphs[1:],
                "timestamp": datetime.now().isoformat()
            }

        if structure == "Key-Value Pairs":
            pairs = {}
            for line in text.split('\n'):
                if ':' in line:
                    key, value = line.split(':', 1)
                    pairs[key.strip()] = value.strip()

            pairs["timestamp"] = datetime.now().isoformat()
            return pairs

        return {"error": "Invalid structure selected"}
    except Exception as e:
        logger.error(f"Text to JSON conversion error: {e}")
        return {"error": str(e)}
891
+
892
def combine_json_data(current, previous):
    """Combine the current JSON value with a previously saved JSON string.

    Args:
        current: The already-parsed current JSON value (dict or list).
        previous: JSON text to combine with, or ``None``/blank for "nothing".

    Returns:
        * ``current`` unchanged when ``previous`` is ``None`` or blank.
        * Two lists concatenated (previous first); a list and a non-list
          combined into one list, keeping previous-before-current order.
        * Two objects merged into one dict (keys of ``current`` win) with an
          added ``"combined_timestamp"``.
        * ``[previous, current]`` when the values are neither lists nor both
          objects (e.g. previous is a bare number or string).
        * ``{"error": message}`` when ``current`` is empty or an error dict,
          when ``previous`` is not valid JSON, or on any other exception.
    """
    if not current or (isinstance(current, dict) and 'error' in current):
        return {"error": "No valid current JSON"}

    try:
        # A missing previous value means there is nothing to combine with;
        # check for None explicitly so it is not reported as an error.
        if previous is None or not previous.strip():
            return current

        prev_json = json.loads(previous)

        prev_is_list = isinstance(prev_json, list)
        current_is_list = isinstance(current, list)

        if prev_is_list and current_is_list:
            return prev_json + current
        if prev_is_list:
            return prev_json + [current]
        if current_is_list:
            return [prev_json] + current

        if isinstance(prev_json, dict) and isinstance(current, dict):
            # Both are objects: merge them, current values taking precedence.
            combined = {**prev_json, **current}
            combined["combined_timestamp"] = datetime.now().isoformat()
            return combined

        # Scalars cannot be merged key-wise; keep both values side by side
        # instead of failing on the dict unpacking.
        return [prev_json, current]
    except json.JSONDecodeError:
        return {"error": "Previous JSON is invalid"}
    except Exception as e:
        logger.error(f"JSON combination error: {e}")
        return {"error": str(e)}
920
+
921
+ convert_btn.click(convert_text_to_json, [text_input, json_structure], json_output)
922
+ combine_json_btn.click(combine_json_data, [json_output, previous_json], combined_output)
923
+ text_qr_btn.click(generate_zip_qr, [json_output], text_qr_output)
924
+
925
+ # DataChat Tab (existing)
926
+ with gr.Tab("DataChat"):
927
+ mode = gr.Radio(["Trained with Data", "Chat about Data"], label="Mode")
928
+ data_source = gr.Radio(["JSON Input", "QR Code"], label="Data Source")
929
+ json_input = gr.Textbox(lines=8, label="JSON Data")
930
+ qr_image = gr.Image(label="QR Code Image", type="filepath")
931
+ query = gr.Textbox(label="Query")
932
+
933
+ submit_btn = gr.Button("Submit")
934
+ output = gr.Textbox(label="Response")
935
+
936
+ submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
937
+
938
+ # QR Generator Tab (existing)
939
+ with gr.Tab("QR Generator"):
940
+ qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
941
+ generate_btn = gr.Button("Generate QR")
942
+ qr_output = gr.Image(label="Generated QR Code")
943
+
944
def generate_qr(json_data):
    """Generate a QR code image for the JSON text entered in the UI.

    Args:
        json_data: Raw JSON text; passed through ``file_processor.clean_json``.

    Returns:
        A single value suitable for an image output: the first entry when
        ``generate_qr_code`` returns a list or tuple, otherwise its result
        as-is. ``None`` when the cleaned data is empty or no QR code was
        produced.
    """
    data = file_processor.clean_json(json_data)
    if not data:
        return None

    qr_result = file_processor.generate_qr_code(data)
    # NOTE(review): the visible combined branch of generate_qr_code returns a
    # list of file paths, and the image output this feeds takes one image,
    # so unwrap the first entry. Confirm the non-combined return shape.
    if isinstance(qr_result, (list, tuple)):
        return qr_result[0] if qr_result else None
    return qr_result
949
+
950
+ generate_btn.click(generate_qr, qr_input, qr_output)
951
+
952
  return interface
953
 
954
  def main():
 
967
  if __name__ == "__main__":
968
  main()
969