acecalisto3 commited on
Commit
f5f3613
·
verified ·
1 Parent(s): 8b33e0b

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +152 -255
app2.py CHANGED
@@ -555,7 +555,7 @@ class FileProcessor:
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
- img = qr.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
@@ -686,6 +686,7 @@ def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: s
686
  else:
687
  return "Invalid mode selected."
688
 
 
689
  def create_interface():
690
  """Create a comprehensive Gradio interface with advanced features"""
691
  css = """
@@ -695,260 +696,22 @@ def create_interface():
695
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
696
  """
697
 
698
- with gr.Blocks(css=css, title="Advanced Data Processor & QR Code Generator") as interface:
699
- gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
700
-
701
- # URL Extraction Tab
702
- with gr.Tab("URL Extraction"):
703
- url_input = gr.Textbox(label="URL to Process", placeholder="https://example.com")
704
- depth_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth (Higher values may affect performance)")
705
- respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
706
- extract_btn = gr.Button("Extract Content")
707
- url_output = gr.JSON(label="Extracted Data")
708
- download_btn = gr.Button("Download Results as ZIP")
709
- download_output = gr.File(label="Download")
710
-
711
- # Warning about depth
712
- gr.Markdown("""
713
- <div class="warning">
714
- ⚠️ <strong>Warning:</strong> Higher depth values (>2) may significantly increase processing time and resource usage.
715
- </div>
716
- """)
717
-
718
- # URL processor instance
719
- url_processor = URLProcessor()
720
-
721
- def process_url(url, depth, respect_robots):
722
- url_processor.respect_robots = respect_robots
723
- results = []
724
- try:
725
- # Validate URL
726
- validation = url_processor.validate_url(url)
727
- if not validation['is_valid']:
728
- return {"error": validation['message']}
729
-
730
- # Process with depth
731
- processed_urls = set()
732
- urls_to_process = [(url, 0)] # (url, current_depth)
733
-
734
- while urls_to_process:
735
- current_url, current_depth = urls_to_process.pop(0)
736
-
737
- if current_url in processed_urls:
738
- continue
739
-
740
- processed_urls.add(current_url)
741
- content = url_processor.fetch_content(current_url)
742
-
743
- if content:
744
- results.append({
745
- "url": current_url,
746
- "content": content.get('content', ''),
747
- "content_type": content.get('content_type', ''),
748
- "timestamp": datetime.now().isoformat()
749
- })
750
-
751
- # If we haven't reached max depth, extract and queue more URLs
752
- if current_depth < depth:
753
- soup = BeautifulSoup(content.get('content', ''), 'html.parser')
754
- for link in soup.find_all('a', href=True):
755
- next_url = link['href']
756
- if next_url.startswith('/'):
757
- # Convert relative URL to absolute
758
- from urllib.parse import urlparse, urljoin
759
- parsed_url = urlparse(current_url)
760
- base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
761
- next_url = urljoin(base_url, next_url)
762
-
763
- if validators.url(next_url) and next_url not in processed_urls:
764
- urls_to_process.append((next_url, current_depth + 1))
765
-
766
- return results
767
- except Exception as e:
768
- logger.error(f"URL processing error: {e}")
769
- return {"error": str(e)}
770
-
771
- def create_download_zip(results):
772
- if not results or (isinstance(results, dict) and 'error' in results):
773
- return None
774
-
775
- try:
776
- # Create a temporary zip file
777
- with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
778
- with zipfile.ZipFile(tmp.name, 'w') as zipf:
779
- # Add JSON data
780
- zipf.writestr('extracted_data.json', json.dumps(results, indent=2))
781
-
782
- # Add individual text files for each URL
783
- for idx, item in enumerate(results):
784
- if 'content' in item:
785
- zipf.writestr(f'content_{idx}_{int(time.time())}.txt', item['content'])
786
-
787
- return tmp.name
788
- except Exception as e:
789
- logger.error(f"Error creating ZIP file: {e}")
790
- return None
791
-
792
- extract_btn.click(process_url, [url_input, depth_slider, respect_robots], url_output)
793
- download_btn.click(create_download_zip, [url_output], download_output)
794
-
795
- # ZIP File Extractor Tab
796
- with gr.Tab("ZIP File Extractor"):
797
- zip_file_input = gr.File(label="Upload ZIP File")
798
- extract_zip_btn = gr.Button("Extract and Process")
799
- zip_output = gr.JSON(label="Extracted Data")
800
- zip_qr_btn = gr.Button("Generate QR Code")
801
- zip_qr_output = gr.Image(label="QR Code")
802
-
803
- file_processor = FileProcessor()
804
-
805
- def process_zip_file(file):
806
- if not file:
807
- return {"error": "No file uploaded"}
808
-
809
- try:
810
- results = file_processor.process_file(file)
811
- return results
812
- except Exception as e:
813
- logger.error(f"ZIP processing error: {e}")
814
- return {"error": str(e)}
815
-
816
- def generate_zip_qr(data):
817
- if not data or (isinstance(data, dict) and 'error' in data):
818
- return None
819
-
820
- try:
821
- return file_processor.generate_qr_code(data, combined=True)[0]
822
- except Exception as e:
823
- logger.error(f"QR generation error: {e}")
824
- return None
825
-
826
- extract_zip_btn.click(process_zip_file, [zip_file_input], zip_output)
827
- zip_qr_btn.click(generate_zip_qr, [zip_output], zip_qr_output)
828
-
829
- # Raw Text to JSON Tab
830
- with gr.Tab("Text to JSON"):
831
- text_input = gr.Textbox(lines=10, label="Raw Text Input")
832
- json_structure = gr.Dropdown(
833
- choices=["Simple", "Structured", "Key-Value Pairs"],
834
- label="JSON Structure",
835
- value="Simple"
836
- )
837
- convert_btn = gr.Button("Convert to JSON")
838
- json_output = gr.JSON(label="JSON Output")
839
- combine_json_btn = gr.Button("Combine with Previous JSON")
840
- previous_json = gr.Textbox(lines=5, label="Previous JSON (Optional)")
841
- combined_output = gr.JSON(label="Combined JSON")
842
- text_qr_btn = gr.Button("Generate QR Code")
843
- text_qr_output = gr.Image(label="QR Code")
844
-
845
- def convert_text_to_json(text, structure):
846
- if not text.strip():
847
- return {"error": "No text provided"}
848
-
849
- try:
850
- if structure == "Simple":
851
- return {
852
- "text": text,
853
- "timestamp": datetime.now().isoformat()
854
- }
855
- elif structure == "Structured":
856
- lines = text.split('\n')
857
- paragraphs = []
858
- current_para = []
859
-
860
- for line in lines:
861
- if line.strip():
862
- current_para.append(line)
863
- elif current_para:
864
- paragraphs.append(' '.join(current_para))
865
- current_para = []
866
-
867
- if current_para:
868
- paragraphs.append(' '.join(current_para))
869
-
870
- return {
871
- "title": paragraphs[0] if paragraphs else "",
872
- "paragraphs": paragraphs[1:] if len(paragraphs) > 1 else [],
873
- "timestamp": datetime.now().isoformat()
874
- }
875
- elif structure == "Key-Value Pairs":
876
- pairs = {}
877
- lines = text.split('\n')
878
-
879
- for line in lines:
880
- if ':' in line:
881
- key, value = line.split(':', 1)
882
- pairs[key.strip()] = value.strip()
883
-
884
- pairs["timestamp"] = datetime.now().isoformat()
885
- return pairs
886
-
887
- return {"error": "Invalid structure selected"}
888
- except Exception as e:
889
- logger.error(f"Text to JSON conversion error: {e}")
890
- return {"error": str(e)}
891
-
892
- def combine_json_data(current, previous):
893
- if not current or (isinstance(current, dict) and 'error' in current):
894
- return {"error": "No valid current JSON"}
895
-
896
- try:
897
- if not previous.strip():
898
- return current
899
-
900
- prev_json = json.loads(previous)
901
-
902
- # Determine how to combine based on types
903
- if isinstance(prev_json, list) and isinstance(current, list):
904
- return prev_json + current
905
- elif isinstance(prev_json, list):
906
- return prev_json + [current]
907
- elif isinstance(current, list):
908
- return [prev_json] + current
909
- else:
910
- # Both are objects, merge them
911
- combined = {**prev_json, **current}
912
- # Add a combined timestamp
913
- combined["combined_timestamp"] = datetime.now().isoformat()
914
- return combined
915
- except json.JSONDecodeError:
916
- return {"error": "Previous JSON is invalid"}
917
- except Exception as e:
918
- logger.error(f"JSON combination error: {e}")
919
- return {"error": str(e)}
920
-
921
- convert_btn.click(convert_text_to_json, [text_input, json_structure], json_output)
922
- combine_json_btn.click(combine_json_data, [json_output, previous_json], combined_output)
923
- text_qr_btn.click(generate_zip_qr, [json_output], text_qr_output)
924
-
925
- # DataChat Tab (existing)
926
- with gr.Tab("DataChat"):
927
- mode = gr.Radio(["Trained with Data", "Chat about Data"], label="Mode")
928
- data_source = gr.Radio(["JSON Input", "QR Code"], label="Data Source")
929
- json_input = gr.Textbox(lines=8, label="JSON Data")
930
- qr_image = gr.Image(label="QR Code Image", type="filepath")
931
- query = gr.Textbox(label="Query")
932
-
933
- submit_btn = gr.Button("Submit")
934
- output = gr.Textbox(label="Response")
935
-
936
- submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
937
-
938
- # QR Generator Tab (existing)
939
- with gr.Tab("QR Generator"):
940
- qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
941
- generate_btn = gr.Button("Generate QR")
942
- qr_output = gr.Image(label="Generated QR Code")
943
-
944
- def generate_qr(json_data):
945
- data = file_processor.clean_json(json_data)
946
- if data:
947
- return file_processor.generate_qr_code(data)
948
- return None
949
-
950
- generate_btn.click(generate_qr, qr_input, qr_output)
951
-
952
  return interface
953
 
954
  def main():
@@ -966,3 +729,137 @@ def main():
966
 
967
  if __name__ == "__main__":
968
  main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
+ img = qr.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
 
686
  else:
687
  return "Invalid mode selected."
688
 
689
+ # Simplified UI: a single gr.Interface wrapping datachat_interface (replaces the previous multi-tab gr.Blocks layout)
690
  def create_interface():
691
  """Create a comprehensive Gradio interface with advanced features"""
692
  css = """
 
696
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
697
  """
698
 
699
+ # Use Interface instead of Blocks
700
+ interface = gr.Interface(
701
+ fn=datachat_interface,
702
+ inputs=[
703
+ gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
704
+ gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
705
+ gr.Textbox(lines=8, label="JSON Data"),
706
+ gr.Image(label="QR Code Image", type="filepath"),
707
+ gr.Textbox(label="Query")
708
+ ],
709
+ outputs=gr.Textbox(label="Response"),
710
+ title="Advanced Data Processor & QR Code Generator",
711
+ description="# 🌐 Advanced Data Processing & QR Code Generator",
712
+ css=css
713
+ )
714
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  return interface
716
 
717
  def main():
 
729
 
730
  if __name__ == "__main__":
731
  main()
732
+
733
+
734
+ def create_download_zip(results):
735
+ if not results or (isinstance(results, dict) and 'error' in results):
736
+ return None
737
+
738
+ try:
739
+ # Create a temporary zip file
740
+ with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
741
+ with zipfile.ZipFile(tmp.name, 'w') as zipf:
742
+ # Add JSON data
743
+ zipf.writestr('extracted_data.json', json.dumps(results, indent=2))
744
+
745
+ # Add individual files for each URL
746
+ for idx, item in enumerate(results):
747
+ if 'html' in item:
748
+ zipf.writestr(f'content_{idx}_full.html', item['html'])
749
+
750
+ if 'content' in item:
751
+ zipf.writestr(f'content_{idx}_text.txt', item['content'])
752
+
753
+ # Download and include images
754
+ if 'images' in item and item['images']:
755
+ img_dir = f'content_{idx}_images'
756
+ for img_idx, img in enumerate(item['images']):
757
+ try:
758
+ img_url = img['src']
759
+ if validators.url(img_url):
760
+ img_response = requests.get(img_url, timeout=10)
761
+ if img_response.status_code == 200:
762
+ # Extract file extension from URL or content type
763
+ content_type = img_response.headers.get('Content-Type', '')
764
+ ext = '.jpg' # Default extension
765
+ if 'png' in content_type:
766
+ ext = '.png'
767
+ elif 'gif' in content_type:
768
+ ext = '.gif'
769
+ elif 'svg' in content_type:
770
+ ext = '.svg'
771
+
772
+ zipf.writestr(f'{img_dir}/image_{img_idx}{ext}', img_response.content)
773
+ except Exception as e:
774
+ logger.error(f"Error downloading image {img_url}: {e}")
775
+
776
+ # Include scripts
777
+ if 'scripts' in item and item['scripts']:
778
+ scripts_dir = f'content_{idx}_scripts'
779
+ for script_idx, script in enumerate(item['scripts']):
780
+ if script:
781
+ zipf.writestr(f'{scripts_dir}/script_{script_idx}.js', script)
782
+
783
+ # Include styles
784
+ if 'styles' in item and item['styles']:
785
+ styles_dir = f'content_{idx}_styles'
786
+ for style_idx, style in enumerate(item['styles']):
787
+ if style:
788
+ zipf.writestr(f'{styles_dir}/style_{style_idx}.css', style)
789
+
790
+ # Include links as a separate file
791
+ if 'links' in item and item['links']:
792
+ links_content = "URL,Text\n"
793
+ for link in item['links']:
794
+ links_content += f"\"{link['url']}\",\"{link['text']}\"\n"
795
+ zipf.writestr(f'content_{idx}_links.csv', links_content)
796
+
797
+ # Include tables as CSV files
798
+ if 'tables' in item and item['tables']:
799
+ tables_dir = f'content_{idx}_tables'
800
+ for table_idx, table in enumerate(item['tables']):
801
+ table_content = ""
802
+ for row in table:
803
+ table_content += ",".join([f"\"{cell}\"" for cell in row]) + "\n"
804
+ zipf.writestr(f'{tables_dir}/table_{table_idx}.csv', table_content)
805
+
806
+ # Create an index.html file for easy navigation
807
+ index_html = """
808
+ <!DOCTYPE html>
809
+ <html>
810
+ <head>
811
+ <title>Extracted Content</title>
812
+ <style>
813
+ body { font-family: Arial, sans-serif; margin: 20px; }
814
+ h1 { color: #333; }
815
+ .url-item { margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; }
816
+ .url-title { font-weight: bold; }
817
+ .resource-list { margin-left: 20px; }
818
+ </style>
819
+ </head>
820
+ <body>
821
+ <h1>Extracted Content</h1>
822
+ """
823
+
824
+ for idx, item in enumerate(results):
825
+ index_html += f"""
826
+ <div class="url-item">
827
+ <div class="url-title">{idx + 1}. {item.get('url', 'Unknown URL')}</div>
828
+ <div>Title: {item.get('title', 'No title')}</div>
829
+ <div>Timestamp: {item.get('timestamp', '')}</div>
830
+ <div class="resource-list">
831
+ <p><a href="content_{idx}_full.html">Full HTML</a></p>
832
+ <p><a href="content_{idx}_text.txt">Text Content</a></p>
833
+ """
834
+
835
+ if 'images' in item and item['images']:
836
+ index_html += f"""
837
+ <p>Images: {len(item['images'])} found</p>
838
+ """
839
+
840
+ if 'links' in item and item['links']:
841
+ index_html += f"""
842
+ <p>Links: <a href="content_{idx}_links.csv">{len(item['links'])} found</a></p>
843
+ """
844
+
845
+ if 'tables' in item and item['tables']:
846
+ index_html += f"""
847
+ <p>Tables: {len(item['tables'])} found</p>
848
+ """
849
+
850
+ index_html += """
851
+ </div>
852
+ </div>
853
+ """
854
+
855
+ index_html += """
856
+ </body>
857
+ </html>
858
+ """
859
+
860
+ zipf.writestr('index.html', index_html)
861
+
862
+ return tmp.name
863
+ except Exception as e:
864
+ logger.error(f"Error creating ZIP file: {e}")
865
+ return None