Spaces:
Sleeping
Sleeping
| import http.client as http_client | |
| import json | |
| import logging | |
| import os | |
| import pprint | |
| import re | |
| import string | |
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| import requests | |
# Module-level pretty-printer (2-space indent); not referenced in the visible
# part of this file.
pp = pprint.PrettyPrinter(indent=2)

# Page title and wide layout for the Streamlit app.
st.set_page_config(page_title="Gaia Search", layout="wide")

# Write a Streamlit config file next to the working directory that selects
# the light base theme. The directory is created first if it is missing.
streamlit_config_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(streamlit_config_dir, exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as config_file:
    config_file.write('[theme]\nbase="light"')
# Maps the language names shown in the UI to the language codes sent to the
# search backend. Insertion order is the order used for the dropdown.
LANG_MAPPING = dict(
    [
        ('Arabic', 'ar'),
        ('Catalan', 'ca'),
        ('Code', 'code'),
        ('English', 'en'),
        ('Spanish', 'es'),
        ('French', 'fr'),
        ('Indonesian', 'id'),
        ('Indic', 'indic'),
        ('Niger-Congo', 'nigercongo'),
        ('Portuguese', 'pt'),
        ('Vietnamese', 'vi'),
        ('Chinese', 'zh'),
        ('Detect Language', 'detect_language'),
        ('All', 'all'),
    ]
)
# Sidebar title block: a centred, bold, 50px heading plus a one-line tagline.
_sidebar_title_html = """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 50px;
}
</style>
<p class="aligncenter">Gaia Search 🌖🌏</p>
<p style="text-align: center;"> A search engine for the LAION large scale image caption corpora</p>
"""

# Sidebar link block: GitHub / Project Report links and a Colab badge.
# The href targets are empty in this file.
_sidebar_links_html = """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="" >GitHub</a> | <a href="" >Project Report</a>
</p>
<p class="aligncenter">
<a href="" target="_blank">
<img src="https://colab.research.google.com/assets/colab-badge.svg"/>
</a>
</p>
"""

# Both blocks are raw HTML, so Streamlit's HTML escaping is turned off.
for _sidebar_block in (_sidebar_title_html, _sidebar_links_html):
    st.sidebar.markdown(_sidebar_block, unsafe_allow_html=True)
# Free-text search query; starts out empty.
query = st.sidebar.text_input(label='Search query', value='')

# The dropdown options are taken from LANG_MAPPING so the two cannot drift
# apart: every selectable name is guaranteed to be a valid LANG_MAPPING key
# when the selection is looked up later. Dict iteration follows insertion
# order, so the options keep the same order as before and 'English' is still
# the default.
_language_options = tuple(LANG_MAPPING)
language = st.sidebar.selectbox(
    'Language',
    _language_options,
    index=_language_options.index('English'),
)

# Upper bound on the number of documents requested from the backend.
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
# Footer markup: a white bar fixed to the bottom of the viewport, with
# credits linking to Hugging Face and Pyserini.
footer = """<style>
.footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
background-color: white;
color: black;
text-align: center;
}
</style>
<div class="footer">
<p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
</div>
"""

# Raw HTML, so Streamlit's HTML escaping is turned off for this call.
st.sidebar.markdown(
    footer,
    unsafe_allow_html=True,
)
def scisearch(query, language, num_results=10):
    """Send a search request to the backend and return its hits.

    The backend URL is read from the ``address`` environment variable. The
    request body is JSON with ``query`` and ``k`` (number of results), plus
    ``lang`` unless ``language`` is ``"detect_language"``.

    Args:
        query: Search string typed by the user. ``None``, empty and
            whitespace-only values produce no request.
        language: Backend language code, or ``"detect_language"`` to omit
            the ``lang`` field from the request.
        num_results: Number of results to request (sent as ``k``).

    Returns:
        A ``(results, highlight_terms)`` tuple taken from the ``results`` and
        ``highlight_terms`` keys of the response. On an empty query, an
        unsupported detected language, or any failure, ``([], [])`` is
        returned so callers can always unpack two values; in the latter two
        cases an explanatory message is rendered with ``st.markdown``.
    """
    # Check for None before calling .strip(); the previous order raised
    # AttributeError on None before the check could run.
    if query is None:
        return [], []
    query = query.strip()
    if not query:
        return [], []

    post_data = {"query": query, "k": num_results}
    if language != "detect_language":
        post_data["lang"] = language

    try:
        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)

        if "err" in payload:
            if payload["err"]["type"] == "unsupported_lang":
                detected_lang = payload["err"]["meta"]["detected_lang"]
                st.markdown(
                    f"""
<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
Detected language <b>{detected_lang}</b> is not supported.<br>
Please choose a language from the dropdown or type another query.
</p><br><hr><br>""",
                    unsafe_allow_html=True,
                )
                return [], []

        # Any other error payload has no "results" key; the KeyError is
        # reported by the handler below.
        return payload["results"], payload["highlight_terms"]
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        # Previously this message was built but never shown, and the function
        # then failed on the unbound `results` name.
        logging.exception("Search request failed")
        st.markdown(
            f"""
<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
Raised {type(e).__name__}</p>
<p style='font-size:14px; font-family: Arial; '>
Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
</p>
""",
            unsafe_allow_html=True,
        )
        return [], []
# Tag names that may follow PII_PREFIX in the corpus text (e.g. "PI:EMAIL").
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in ``text`` with a highlighted
    ``REDACTED <TAG>`` HTML snippet and return the result."""
    for tag in PII_TAGS:
        redacted_markup = f"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {tag}</mark></b>"""
        text = text.replace(PII_PREFIX + tag, redacted_markup)
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap whole-word, case-insensitive matches of the terms in ``<b>`` tags.

    Args:
        paragraph: Text to decorate.
        highlight_terms: Terms to highlight; empty terms are ignored.

    Returns:
        ``paragraph`` with matches wrapped in ``<b>…</b>`` (the matched text
        keeps its original casing) and PII markers replaced by
        ``process_pii``.
    """
    terms = {str(term) for term in highlight_terms}
    terms.discard("")  # an empty term would match at every word boundary
    if terms:
        # Terms are escaped so regex metacharacters (e.g. "c++") are literal.
        # Longest first so a longer term wins over a shorter one that is its
        # prefix.
        alternatives = "|".join(
            re.escape(term) for term in sorted(terms, key=lambda t: (-len(t), t))
        )
        # A single pass over the text means markup inserted for one term can
        # never be re-matched by another term (e.g. the term "b" hitting a
        # freshly inserted <b> tag). The first lookbehind leaves PI:<TAG>
        # markers intact so process_pii can still redact them. Lookarounds
        # stand in for \b so terms that start or end with punctuation still
        # match as whole words.
        pattern = re.compile(
            rf"(?<!{re.escape(PII_PREFIX)})(?<!\w)(?i:{alternatives})(?!\w)"
        )
        paragraph = pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)
    return process_pii(paragraph)
def process_results(hits: list, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment.

    Each hit becomes a ``<div class="searchresult">`` holding its position,
    document id, language and rounded score, then one entry per item in
    ``hit['meta']['docs']`` (a "Load Image" button, a hidden image, the URL
    and the highlighted ``TEXT``), then the hit's own highlighted ``text``.

    Values from the hits are HTML-escaped before insertion because they come
    from the search backend rather than from this program.

    Args:
        hits: Hit dictionaries with ``docid``, ``lang``, ``score``, ``text``
            and ``meta.docs`` entries carrying ``_id``, ``URL`` and ``TEXT``.
        highlight_terms: Terms passed through to ``highlight_string``.

    Returns:
        The per-hit fragments joined by single spaces; ``""`` for no hits.
    """
    # Local import so this function does not depend on edits elsewhere.
    from html import escape

    rendered_hits = []
    for position, hit in enumerate(hits, start=1):
        doc_id = escape(str(hit['docid']))
        lang = escape(str(hit['lang']))
        score = round(hit['score'], 2)
        # <strong> replaces the previous <string>, which is not an HTML tag.
        parts = [
            f"""
<div class="searchresult">
<h2>{position}. Document ID: {doc_id}</h2>
<p>Language: <strong>{lang}</strong>, Score: {score}</p>
"""
        ]
        for subhit in hit['meta']['docs']:
            image_id = str(subhit['_id'])
            # The id is passed to load_image() as a quoted JS string so that
            # non-numeric ids work too; json.dumps produces the JS literal and
            # escape() makes it safe inside the double-quoted attribute.
            image_id_js = escape(json.dumps(image_id))
            image_id_attr = escape(image_id)
            url = escape(str(subhit['URL']))
            # Escape first, then highlight, so the <b>/<mark> markup added by
            # highlight_string is not itself escaped.
            caption = highlight_string(
                escape(str(subhit['TEXT']), quote=False), highlight_terms
            )
            parts.append(
                f"""
<button onclick="load_image({image_id_js})">Load Image</button><br>
<p><img id='{image_id_attr}' src='{url}' style="width:400px;height:auto;display:none;"></p>
<a href='{url}'>{url}</a>
<p>{caption}</p>
"""
            )
        body_text = highlight_string(
            escape(str(hit['text']), quote=False), highlight_terms
        )
        parts.append(
            f"""
<p>{body_text}</p>
</div>
<hr>
"""
        )
        rendered_hits.append("".join(parts))
    return " ".join(rendered_hits)
# Runs one search and renders the results when the sidebar button is clicked.
if st.sidebar.button("Search"):
    # Local import keeps this block self-contained.
    from html import escape

    search_output = scisearch(query, LANG_MAPPING[language], max_results)

    # scisearch may hand back None (empty query) or an HTML message string
    # (unsupported language) instead of a (hits, highlight_terms) pair, so the
    # result is normalised here rather than unpacked blindly.
    notice_html = ""
    if search_output is None:
        hits, highlight_terms = [], []
    elif isinstance(search_output, str):
        hits, highlight_terms = [], []
        notice_html = search_output
    else:
        hits, highlight_terms = search_output

    html_results = process_results(hits, highlight_terms)
    # The count shown is the number of hits actually returned, not the slider
    # maximum that was requested.
    rendered_results = f"""
<div id="searchresultsarea">
<br>
{notice_html}
<p id="searchresultsnumber">About {len(hits)} results</p>
{html_results}
</div>
"""

    # Bootstrap stylesheet. The integrity hash is the one published for
    # bootstrap@5.0.2, which is the version the URL must name.
    st.markdown(
        """
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
""",
        unsafe_allow_html=True,
    )
    # Font Awesome stylesheet (used for the search icon below).
    st.markdown(
        """
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
""",
        unsafe_allow_html=True,
    )
    # Header row echoing the query. The query is escaped because it is placed
    # inside a double-quoted HTML attribute.
    st.markdown(
        f"""
<div class="row no-gutters mt-3 align-items-center">
Gaia Search 🌖🌏
<div class="col col-md-4">
<input class="form-control border-secondary rounded-pill pr-5" type="search" value="{escape(query)}" id="example-search-input2">
</div>
<div class="col-auto">
<button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
<i class="fa fa-search"></i>
</button>
</div>
</div>
""",
        unsafe_allow_html=True,
    )
    # Results area: styles, the load_image()/dark-mode scripts used by the
    # generated markup, and the rendered hits, in a scrollable component.
    components.html(
        """
<style>
#searchresultsarea {
font-family: 'Arial';
}
#searchresultsnumber {
font-size: 0.8rem;
color: gray;
}
.searchresult h2 {
font-size: 19px;
line-height: 18px;
font-weight: normal;
color: rgb(7, 111, 222);
margin-bottom: 0px;
margin-top: 25px;
}
.searchresult a {
font-size: 12px;
line-height: 12px;
color: green;
margin-bottom: 0px;
}
.dark-mode {
color: white;
}
</style>
<script>
function load_image(id){
console.log(id)
var x = document.getElementById(id);
console.log(x)
if (x.style.display === "none") {
x.style.display = "block";
} else {
x.style.display = "none";
}
};
function myFunction() {
var element = document.body;
element.classList.toggle("dark-mode");
}
</script>
<button onclick="myFunction()">Toggle dark mode</button>
""" + rendered_results,
        height=800,
        scrolling=True,
    )