stefanjwojcik commited on
Commit
9ff0a35
·
verified ·
1 Parent(s): 56f40bd

first commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the OstreaCultura search API: Julia 1.10 plus a Python virtualenv
# that the Julia side reaches through PyCall.
#
# NOTE(review): `COPY . /home/user/app` below copies the whole build context.
# Add a .dockerignore so credential files and local startup scripts are not
# baked into the image.
FROM julia:1.10.4

# Install Python as root; drop the apt package lists in the same layer so they
# do not bloat the image.
RUN apt-get update && \
    apt-get install -y python3 python3-pip python3-venv && \
    rm -rf /var/lib/apt/lists/*

# Create a non-root user and hand it ownership of its own home only
# (not of every directory under /home).
RUN useradd --create-home --shell /bin/bash user && \
    mkdir -p /home/user/app && \
    chown -R user:user /home/user
WORKDIR /home/user/app
USER user

# Put the virtualenv first on PATH for every process (including the
# non-interactive ENTRYPOINT, which never reads ~/.bashrc), tell PyCall which
# interpreter to build against, and pin the Julia depot location.
ENV PATH="/home/user/venv/bin:${PATH}" \
    PYTHON="/home/user/venv/bin/python" \
    JULIA_DEPOT_PATH="/home/user/.julia"

# Also record the interpreter in the Julia startup file.
RUN mkdir -p /home/user/.julia/config && \
    echo 'ENV["PYTHON"] = "/home/user/venv/bin/python"' >> /home/user/.julia/config/startup.jl

# Copy only the requirements file first to leverage the Docker layer cache.
COPY --chown=user requirements.txt /home/user/app/requirements.txt

# Create the virtualenv, upgrade pip first, then install pinecone and the other
# Python dependencies as the non-root user.
RUN python3 -m venv /home/user/venv && \
    /home/user/venv/bin/pip install --upgrade pip && \
    /home/user/venv/bin/pip install -r /home/user/app/requirements.txt

# Copy the rest of the application code.
COPY --chown=user . /home/user/app

# Copy the data to the container.
COPY --chown=user data /home/user/data

# server.jl listens on port 8000.
EXPOSE 8000

# Resolve, install and precompile the Julia dependencies at build time so the
# container does not have to do it on every start.
RUN julia --project=. -e 'using Pkg; Pkg.instantiate(); Pkg.precompile()'

# Exec form keeps julia as PID 1 so it receives stop signals directly.
# Pkg.instantiate() is kept as a cheap safety net if the project is replaced
# (for example by a bind mount) after the image was built.
ENTRYPOINT ["julia", "--project", "-e", "using Pkg; Pkg.instantiate(); include(\"server.jl\")"]
How to Run the OstreaCultura Server.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to Run the OstreaCultura Server Image
2
+
3
+ This is a guide on how to run the OstreaCultura server image. The server image is a Docker image that contains the OstreaCultura server code and all the necessary dependencies to run the server.
4
+
5
+
6
+ ## Prerequisites
7
+ - Docker installed on your machine
8
+ - Internet access
9
+ - Port 8080 is not being used by another process
10
+ - Bash shell
11
+
12
+
13
+ ### Step 1: Pull the Docker Image
14
+ ```bash
15
+ docker pull public.ecr.aws/a8o9b6o4/ostreacultura/api:latest
16
+ ```
17
+
18
+ ### Step 2: Run the Docker Image
19
+ ```bash
20
+ docker run -p 8080:8080 -it public.ecr.aws/a8o9b6o4/ostreacultura/api:latest
21
+ ```
22
+
23
+ ### Step 3: Run the Server
24
+ ```bash
25
+ cd OstreaCultura && julia --project=. server.jl
26
+ ```
27
+
28
+ ### Step 4: Access the Server
29
+ Open your browser and navigate to `http://localhost:8080/docs` to access the documentation for the server.
Project.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name = "OstreaCultura"
2
+ uuid = "720c7e0a-31c4-4ba0-bde6-d3e3af4c503a"
3
+ authors = ["stefanjwojcik and contributors"]
4
+ version = "1.0.0-DEV"
5
+
6
+ [deps]
7
+ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
8
+ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
9
+ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
10
+ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
11
+ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
12
+ JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
13
+ LocalRegistry = "89398ba2-070a-4b16-a995-9893c55d93cf"
14
+ Oxygen = "df9a0d86-3283-4920-82dc-4555fc0d1d8b"
15
+ Pandas = "eadc2687-ae89-51f9-a5d9-86b5a6373a9c"
16
+ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
17
+ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
18
+ PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
19
+ RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
20
+ Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
21
+ Sqids = "5846b9ac-096c-425b-b363-8d1a03210e20"
22
+ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
23
+ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
24
+
25
+ [compat]
26
+ CSV = "0.10.15"
27
+ RDatasets = "0.7.7"
README.md CHANGED
@@ -1,9 +1,10 @@
1
  ---
2
- title: Misinfo Detection App
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
 
1
  ---
2
+ title: Misinformation Search
3
+ emoji: 🌖
4
+ colorFrom: pink
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 8000
8
  pinned: false
9
  ---
10
 
ReadMe.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## How to run some of the code in this repository
2
+
3
+ ### 1. Make sure Docker is installed on your machine
4
+ ### 2. Clone the repository
5
+ ### 3. Change directory (`cd`) into the repository
6
+ ### 4. Run the following command to build the docker image
7
+ ```bash
8
+ docker compose build
9
+ ```
10
+ ### 5. Run the following commands to start the container and open a shell inside it
11
+ ```bash
12
+ docker compose up -d ostrea_cultura
13
+ docker exec -it my_julia_container /bin/bash
14
+ ```
15
+
16
+
17
+ ## Prototype TODOs
18
+
19
+ ## Data
20
+ - [X] Process all misinfo claims and generate embeddings for a library namespace
21
+ - [X] Upsert claims into pinecone
22
+ - [X] Upsert 300k into namespace
23
+ - [ ] Update claim format to be similar to: https://www.kaggle.com/datasets/shivkumarganesh/politifact-factcheck-data/data
24
+
25
+ ## Functions
26
+ - [X] Upsert vector
27
+ - [X] Batch upsert
28
+ - [X] Query against metadata
29
+
30
+ - [ ] Generate working Dockerfile for project reproducibility
31
+ - [ ] Load data into a database
32
+ - [ ] Test precision/recall of embeddings
33
+ - [ ] Generate working version of climate demo
34
+
35
+
36
+ Embedding pricing:
37
+
38
+ 1 token is approximately 0.75 words (so 1k tokens ≈ 750 words), and embeddings are billed at $0.0001 per 1,000 tokens.
39
+ At roughly 4 characters per token, 1k tokens correspond to about 4 KB of text, so embedding 4 KB of text costs about $0.0001.
40
+ Using that as your basis you can approximate the cost of your embedding by :
41
+ Cost in $ = Size of Data in Kilobytes * 0.000025
42
+
43
+ $0.100 / 1M tokens
44
+
45
+ Credentials for running google cloud queries: see ostreacultura-credentials.json
docker-compose.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose definition for running the OstreaCultura API locally.
# The image's ENTRYPOINT already launches server.jl, so no `command:` is
# needed here.
version: '3.8'

services:
  ostrea_cultura:
    container_name: my_julia_container # Custom container name
    build: .
    volumes:
      # Live-mount the checkout over the image's WORKDIR (/home/user/app) so
      # code edits are picked up on restart without rebuilding.
      - .:/home/user/app
    ports:
      # server.jl binds 0.0.0.0:8000 inside the container.
      - "8000:8000"
    # PATH is intentionally not overridden: a list-form `PATH="...$PATH..."`
    # entry would store the quotes literally and substitute the *host's* PATH.
    restart: unless-stopped
ostreacultura-credentials.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "ostreacultura",
4
+ "private_key_id": "8cf6698a8b2cf0fe9191917f1a344933995f86b3",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC4kYnebW4F0BbF\nmR5YgiqRY2nVC2oMfpRgWsjGuyeC1f1R+f8cKOG5SxWOAEjBwIKQyutO+B27uyiv\nV70vR5Z3EH+kEce+VJStnVi/2cpK1883dr9Mf7lVEoH3P5KwgYfxp0GBEZY9VO++\nBLUCaT834Al0sSGL0F8oztvV7pQ/dMDqNZ6J3OOcpg50spNuvE/Gxr5KNs3RlUiJ\nLsdJmaN7VzYZJ12u0NE2bR39m6q8BsP47O9AQfYk8UWL+oE8OY0k6EONLJEnMc+W\nYii9mmSYDvE2mP8rMpJi3dy1m8nFoz5BGiB7FGCEIaa3kqFCinss+Q1XzOKcB5PW\nIN54VoCBAgMBAAECggEABs5FdJbTNfjcjhnwtrf+B9eczgbnHy5sgqBDKdWgeSbM\nE4GR0m3NFpimyWvhqVTtVTEWVWNSyieP/xjPvtzEUJmii0ZyKua4umCRJp0XF4Sd\nShQUrVmpJNjs3LtSyZozjONrYYebfNSswaeWYRz7M5bZ4mkRrc0kmFjTkTAd9h6X\nVroxVOs2TzsELBO7LOM1TBAG0Bu5co5WlTltw8si7kEYMBJA3JQpi7BlvzMqxj3x\nlHbaMweW5fY2zpAJkWkGFxkoVg5f845nl54CsP8LWdbVH9EjLJWEqDc1GwUDCjwc\nZMjhjyeztPYzR8OhHMqu5yiknql6WtpR3+ul6khlUQKBgQDqkFj+t3quGjq5VQC1\nkQ7Vp2FpDOIisIVyzb/ETwj7G2HwW8VM630M30seEgmjO/WubcuIo3JKwt2px6Nx\nPHPp4QVR6lGUN1e7Rt1hghlCtP/RDRzErY8xHLtgzCIcKfQDQFNJLh4ti0IHqsQJ\n7nYtXpyRsvRPXIvttXLgA7sMxQKBgQDJb4sMhPkZLZ3MlhNc24oeL4btXh0D9PPV\nGbwcjIKZA/DyI+w7TXvW1LvwaWhr0ZopDkCJ5qtnwpIN91pq19CJzpH23iqrtszt\ng+nOekQ51yH27WSbLKcKeotC1PmpR/WhlFBJIihtKmAtpBGue73PNegmCcT/Lkrn\n4x54ioYYjQKBgHqy63rCq6/rFJDiAC76JFJq7i+vh5Mx13zIV+nulfoUxtSejMTB\n4nYbUAX41YXH1apa0L83EJA/AwjO35ZaUyAij0/cTLRFhuT2GtPo5CHa4H6QZ14S\nk/bS2sm1vpoQ8A3LVkXPlAWq9O1hzI6NaNAuz6P/goEJHkFIS0EXcn8pAoGADODi\nxPQjcOanthLKPh+THwbu23Iu7LfWdwNkMXcSSnIkD3cYfVog1sVXFrHGiU4nSwSs\nP6TNxJ599AKidz5BCO56Cox9sejIBU9vvVG31vLYKw9WZU1fiJ35FVbEd3wwShpP\nX72dVA1nhMN+lec3ZW4O34+0VpRBGb49jntcw2kCgYEA5VMAHskd23+mgsemIAkX\nOZS8dAjlJ2X1lFhOn9tCK3QI1NnQpyIxGK+iYjQjre/aFvT+5XpqUmvZ/deDax8b\nd1sCR5hF/NcDuqkcmBNkkZrt+cdwqyd/LHkPEtMVVcLHjoXh7r/VfwY38IbXpcUB\n30YPr+31bqumyYzssUc98Cw=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "[email protected]",
7
+ "client_id": "117283783846337502690",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/ocservice%40ostreacultura.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ sentence-transformers
4
+ pinecone[grpc]
5
+ pinecone_text
6
+ sqids
server.jl ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ using Oxygen
2
+ using HTTP
3
+ import OstreaCultura as OC
4
+
5
# Load the fasttext embeddings and the fasttext model once, at startup.
# Each loader result is destructured into two module-level constants; the CSV
# paths are resolved relative to the process working directory.
const fc_embed, fc = OC.load_fasttext_embeddings("data/filtered_fact_check_latest_embed.csv")
const nar_embed, nar = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed.csv")
8
+
9
# Trivial route that replies with a fixed greeting string.
@get "/greet" function(req::HTTP.Request)
    "hello world!"
end
12
+
13
+ ## Send a query to the test-index index in the test-namespace namespace
14
+ #@get "/query" function(req::HTTP.Request, querytext::String, indexname::String, namespace::String, top_k::Int64=5, include_values::Bool=false)
15
+ # OC.query(querytext, indexname, namespace; top_k=top_k, include_values=include_values)
16
+ #end
17
+
18
+ ## Send a query to look for matches within an organic dataset
19
+ #@get "/queryclaims" function(req::HTTP.Request, claim::String, counterclaim::String, indexname::String, namespace::String, top_k::Int64=5000, include_values::Bool=false)
20
+ # OC.query_claims(claim, counterclaim, indexname, namespace; top_k=top_k, include_values=include_values)
21
+ #end
22
+
23
+ ## Classify a claim within the misinformation library
24
+ #@get "/classify" function(req::HTTP.Request, claim::String, counterclaim::String, indexname::String, namespace::String, top_k::Int64=10, include_values::Bool=false)
25
+ # OC.classify_claim(claim, counterclaim, indexname, namespace; top_k=top_k, include_values=include_values)
26
+ #end
27
+
28
# Search route. `model` is passed to OC.search in the position the OC library
# calls the namespace; the index name is fixed to "oc-hybrid-library-index".
@get "/search" function(req::HTTP.Request, claim::String, model::String="narratives", top_k::Int64=5)
    # Spell out percent signs before handing the text to the search call.
    sanitized = replace(claim, "%" => "percent")
    return OC.search(sanitized, "oc-hybrid-library-index", model; top_k=top_k, include_values=false)
end
34
+
35
# Top-k lookup against the embedding tables loaded at startup.
#
# Parameters:
#   claim  - text to search for; "%" is replaced by the word "percent".
#   model  - "narratives" or "factchecks"; defaults to "narratives", matching
#            the default of the /search route.
#   top_k  - number of results requested from OC.fast_topk (default 5).
#
# Returns the JSON-encoded result of OC.fast_topk, or an HTTP 400 response
# with the body "Model not found" when `model` is not one of the two known
# names (so clients can tell a failure apart from a successful result).
@get "/fastfactsearch" function(req::HTTP.Request, claim::String, model::String="narratives", top_k::Int64=5)
    # remove % signs from claim, replace with percent
    claim = replace(claim, "%" => "percent")
    if model == "narratives"
        return json(OC.fast_topk(nar_embed, nar, claim, top_k))
    elseif model == "factchecks"
        return json(OC.fast_topk(fc_embed, fc, claim, top_k))
    end
    return HTTP.Response(400, "Model not found")
end
46
+
47
+ #@get "/searchplot" function(req::HTTP.Request, claim::String, model::String, top_k::Int64=5, include_values::Bool=false)
48
+ # OC.searchplot(claim, "oc-hybrid-library-index", model; top_k=top_k, include_values=include_values)
49
+ #end
50
+
51
# Start the web server, listening on every network interface at port 8000.
serve(; host="0.0.0.0", port=8000)
startup.jl ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Session-wide environment configuration applied when Julia starts.
ENV["PYTHON"] = "/venv/bin/python"
# NOTE(review): this is set after the Julia process has already started, so it
# presumably only influences child Julia processes, not the current session's
# thread count; confirm whether `julia --threads` should be used instead.
ENV["JULIA_NUM_THREADS"] = "8"
ENV["JULIA_PKG_DEVDIR"] = "/home/workspace"

# API credentials are deliberately NOT stored in this file. Provide them through
# the surrounding environment (shell export, `docker run -e`, a secrets
# manager, ...). Any key that was previously committed here must be treated as
# compromised and rotated.
for key in ("OPENAI_API_KEY", "PINECONE_API_KEY")
    haskey(ENV, key) || @warn "$key is not set; export it before starting Julia"
end
7
+
8
+ using Revise
9
+
10
# Replace the current Julia session with a fresh interactive one.
#
# A small bootstrap script is built that restores the active project, the home
# project and the working directory of the current session. An exit hook then
# launches a new `julia -ie <script>` process, and the current process exits
# with status 0 (which triggers the hook).
function restart()
    bootstrap = """
    Base.ACTIVE_PROJECT[]=$(repr(Base.ACTIVE_PROJECT[]))
    Base.HOME_PROJECT[]=$(repr(Base.HOME_PROJECT[]))
    cd($(repr(pwd())))
    """
    relaunch = `$(Base.julia_cmd()) -ie $bootstrap`
    # Run the replacement session only once this process is shutting down.
    atexit() do
        run(relaunch)
    end
    exit(0)
end