first commit
Browse files- Dockerfile +45 -0
- How to Run the OstreaCultura Server.md +29 -0
- Project.toml +27 -0
- README.md +5 -4
- ReadMe.md +45 -0
- docker-compose.yaml +14 -0
- ostreacultura-credentials.json +13 -0
- requirements.txt +6 -0
- server.jl +52 -0
- startup.jl +19 -0
Dockerfile
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM julia:1.10.4
|
2 |
+
|
3 |
+
# Install Python, pip, and venv support (must run as root)
|
4 |
+
# Remove the apt package lists in the same layer so they are not baked into the image.
RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
    rm -rf /var/lib/apt/lists/*
|
5 |
+
|
6 |
+
# Create a non-root user
|
7 |
+
RUN useradd --create-home --shell /bin/bash user
|
8 |
+
RUN mkdir /home/user/app
|
9 |
+
WORKDIR /home/user/app
|
10 |
+
RUN chown -R user:user /home/
|
11 |
+
USER user
|
12 |
+
|
13 |
+
# Copy only the requirements file to leverage Docker cache
|
14 |
+
COPY --chown=user requirements.txt /home/user/app/requirements.txt
|
15 |
+
|
16 |
+
# Install pinecone and other Python dependencies as non-root user
|
17 |
+
RUN python3 -m venv /home/user/venv && \
|
18 |
+
/home/user/venv/bin/pip install -r /home/user/app/requirements.txt
|
19 |
+
|
20 |
+
# Copy the rest of the application code
|
21 |
+
COPY --chown=user . /home/user/app
|
22 |
+
|
23 |
+
# Copy the data to the container
|
24 |
+
COPY --chown=user data /home/user/data
|
25 |
+
|
26 |
+
# Put the virtual environment on PATH for interactive bash shells
|
27 |
+
RUN echo 'export PATH="/home/user/venv/bin:$PATH"' >> /home/user/.bashrc
|
28 |
+
|
29 |
+
RUN mkdir -p /home/user/.julia/config && \
|
30 |
+
echo 'ENV["PYTHON"] = "/home/user/venv/bin/python"' >> /home/user/.julia/config/startup.jl
|
31 |
+
|
32 |
+
RUN mkdir -p /home/user/.julia/config
|
33 |
+
#COPY startup.jl /home/user/.julia/config/startup.jl
|
34 |
+
|
35 |
+
# Upgrade pip inside the virtual environment
|
36 |
+
RUN /home/user/venv/bin/pip install --upgrade pip
|
37 |
+
|
38 |
+
# Expose the port
|
39 |
+
EXPOSE 8000
|
40 |
+
EXPOSE 80
|
41 |
+
# Use the key=value form; the whitespace-separated `ENV key value` syntax is legacy.
ENV JULIA_DEPOT_PATH="/home/user/.julia"
|
42 |
+
|
43 |
+
RUN julia -e 'using Pkg; Pkg.activate("."); Pkg.precompile()'
|
44 |
+
|
45 |
+
# Exec form: julia runs as PID 1 (no /bin/sh wrapper), so it receives SIGTERM from
# `docker stop` directly instead of being killed after the grace period.
ENTRYPOINT ["julia", "--project", "-e", "using Pkg; Pkg.instantiate(); include(\"server.jl\")"]
|
How to Run the OstreaCultura Server.md
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# How to Run the OstreaCultura Server Image
|
2 |
+
|
3 |
+
This is a guide on how to run the OstreaCultura server image. The server image is a Docker image that contains the OstreaCultura server code and all the necessary dependencies to run the server.
|
4 |
+
|
5 |
+
|
6 |
+
## Prerequisites
|
7 |
+
- Docker installed on your machine
|
8 |
+
- Access to Internet
|
9 |
+
- Port 8080 is not being used by another process
|
10 |
+
- Bash shell
|
11 |
+
|
12 |
+
|
13 |
+
### Step 1: Pull the Docker Image
|
14 |
+
```bash
|
15 |
+
docker pull public.ecr.aws/a8o9b6o4/ostreacultura/api:latest
|
16 |
+
```
|
17 |
+
|
18 |
+
### Step 2: Run the Docker Image
|
19 |
+
```bash
|
20 |
+
docker run -p 8080:8000 -it public.ecr.aws/a8o9b6o4/ostreacultura/api:latest
|
21 |
+
```
|
22 |
+
|
23 |
+
### Step 3: Run the Server
|
24 |
+
```bash
|
25 |
+
cd OstreaCultura && julia --project=. server.jl
|
26 |
+
```
|
27 |
+
|
28 |
+
### Step 4: Access the Server
|
29 |
+
Open your browser and navigate to `http://localhost:8080/docs` to access the documentation for the server.
|
Project.toml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name = "OstreaCultura"
|
2 |
+
uuid = "720c7e0a-31c4-4ba0-bde6-d3e3af4c503a"
|
3 |
+
authors = ["stefanjwojcik and contributors"]
|
4 |
+
version = "1.0.0-DEV"
|
5 |
+
|
6 |
+
[deps]
|
7 |
+
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
|
8 |
+
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
9 |
+
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
10 |
+
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
11 |
+
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
|
12 |
+
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
|
13 |
+
LocalRegistry = "89398ba2-070a-4b16-a995-9893c55d93cf"
|
14 |
+
Oxygen = "df9a0d86-3283-4920-82dc-4555fc0d1d8b"
|
15 |
+
Pandas = "eadc2687-ae89-51f9-a5d9-86b5a6373a9c"
|
16 |
+
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
|
17 |
+
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
|
18 |
+
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
|
19 |
+
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
|
20 |
+
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
|
21 |
+
Sqids = "5846b9ac-096c-425b-b363-8d1a03210e20"
|
22 |
+
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
|
23 |
+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
24 |
+
|
25 |
+
[compat]
|
26 |
+
CSV = "0.10.15"
|
27 |
+
RDatasets = "0.7.7"
|
README.md
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
|
|
1 |
---
|
2 |
+
title: Misinformation Search
|
3 |
+
emoji: 🌖
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: purple
|
6 |
sdk: docker
|
7 |
+
app_port: 8000
|
8 |
pinned: false
|
9 |
---
|
10 |
|
ReadMe.md
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## How to run some of the code in this repository
|
2 |
+
|
3 |
+
### 1. Make sure Docker is installed on your machine
|
4 |
+
### 2. Clone the repository
|
5 |
+
### 3. CD into the repository
|
6 |
+
### 4. Run the following command to build the docker image
|
7 |
+
```bash
|
8 |
+
docker compose build
|
9 |
+
```
|
10 |
+
### 5. Run the following command to run the docker image
|
11 |
+
```bash
|
12 |
+
docker compose up -d ostrea_cultura
|
13 |
+
docker exec -it my_julia_container /bin/bash
|
14 |
+
```
|
15 |
+
|
16 |
+
|
17 |
+
## Prototype TODOs
|
18 |
+
|
19 |
+
## Data
|
20 |
+
- [X] Process all misinfo claims and generate embeddings for a library namespace
|
21 |
+
- [X] Upsert claims into pinecone
|
22 |
+
- [X] Upsert 300k into namespace
|
23 |
+
- [ ] Update claim format to be similar to: https://www.kaggle.com/datasets/shivkumarganesh/politifact-factcheck-data/data
|
24 |
+
|
25 |
+
## Functions
|
26 |
+
- [X] Upsert vector
|
27 |
+
- [X] Batch upsert
|
28 |
+
- [X] Query against metadata
|
29 |
+
|
30 |
+
- [ ] Generate working Dockerfile for project reproducibility
|
31 |
+
- [ ] Load data into a database
|
32 |
+
- [ ] Test precision/recall of embeddings
|
33 |
+
- [ ] Generate working version of climate demo
|
34 |
+
|
35 |
+
|
36 |
+
Embedding pricing:
|
37 |
+
|
38 |
+
1 token is approximately 0.75 words (so 1k tokens ≈ 750 words), and embedding is billed at $0.0001 per 1,000 tokens.
|
39 |
+
From this, one token is about 4 characters, so 1k tokens covers roughly 4 KB of text and costs $0.0001.
|
40 |
+
On that basis, you can approximate the cost of embedding as:
|
41 |
+
Cost in $ = Size of Data in Kilobytes * 0.000025
|
42 |
+
|
43 |
+
$0.100 / 1M tokens
|
44 |
+
|
45 |
+
Credentials for running google cloud queries: see ostreacultura-credentials.json
|
docker-compose.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3.8'
|
2 |
+
|
3 |
+
services:
|
4 |
+
ostrea_cultura:
|
5 |
+
container_name: my_julia_container # Custom container name
|
6 |
+
build: .
|
7 |
+
volumes:
|
8 |
+
- .:/home/juliauser/OstreaCultura
|
9 |
+
ports:
|
10 |
+
# server.jl listens on port 8000 inside the container; publish it on host port 8080.
- "8080:8000"
|
11 |
+
environment:
|
12 |
+
# In list form the value is taken literally: surrounding double quotes would become part
# of PATH, and $PATH would be interpolated from the *host* shell by Compose. Spell the
# container search path out explicitly instead.
- PATH=/venv/bin:/usr/local/julia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
13 |
+
command: /bin/bash -c "cd OstreaCultura && julia --project=. server.jl"
|
14 |
+
restart: unless-stopped
|
ostreacultura-credentials.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"type": "service_account",
|
3 |
+
"project_id": "ostreacultura",
|
4 |
+
"private_key_id": "8cf6698a8b2cf0fe9191917f1a344933995f86b3",
|
5 |
+
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC4kYnebW4F0BbF\nmR5YgiqRY2nVC2oMfpRgWsjGuyeC1f1R+f8cKOG5SxWOAEjBwIKQyutO+B27uyiv\nV70vR5Z3EH+kEce+VJStnVi/2cpK1883dr9Mf7lVEoH3P5KwgYfxp0GBEZY9VO++\nBLUCaT834Al0sSGL0F8oztvV7pQ/dMDqNZ6J3OOcpg50spNuvE/Gxr5KNs3RlUiJ\nLsdJmaN7VzYZJ12u0NE2bR39m6q8BsP47O9AQfYk8UWL+oE8OY0k6EONLJEnMc+W\nYii9mmSYDvE2mP8rMpJi3dy1m8nFoz5BGiB7FGCEIaa3kqFCinss+Q1XzOKcB5PW\nIN54VoCBAgMBAAECggEABs5FdJbTNfjcjhnwtrf+B9eczgbnHy5sgqBDKdWgeSbM\nE4GR0m3NFpimyWvhqVTtVTEWVWNSyieP/xjPvtzEUJmii0ZyKua4umCRJp0XF4Sd\nShQUrVmpJNjs3LtSyZozjONrYYebfNSswaeWYRz7M5bZ4mkRrc0kmFjTkTAd9h6X\nVroxVOs2TzsELBO7LOM1TBAG0Bu5co5WlTltw8si7kEYMBJA3JQpi7BlvzMqxj3x\nlHbaMweW5fY2zpAJkWkGFxkoVg5f845nl54CsP8LWdbVH9EjLJWEqDc1GwUDCjwc\nZMjhjyeztPYzR8OhHMqu5yiknql6WtpR3+ul6khlUQKBgQDqkFj+t3quGjq5VQC1\nkQ7Vp2FpDOIisIVyzb/ETwj7G2HwW8VM630M30seEgmjO/WubcuIo3JKwt2px6Nx\nPHPp4QVR6lGUN1e7Rt1hghlCtP/RDRzErY8xHLtgzCIcKfQDQFNJLh4ti0IHqsQJ\n7nYtXpyRsvRPXIvttXLgA7sMxQKBgQDJb4sMhPkZLZ3MlhNc24oeL4btXh0D9PPV\nGbwcjIKZA/DyI+w7TXvW1LvwaWhr0ZopDkCJ5qtnwpIN91pq19CJzpH23iqrtszt\ng+nOekQ51yH27WSbLKcKeotC1PmpR/WhlFBJIihtKmAtpBGue73PNegmCcT/Lkrn\n4x54ioYYjQKBgHqy63rCq6/rFJDiAC76JFJq7i+vh5Mx13zIV+nulfoUxtSejMTB\n4nYbUAX41YXH1apa0L83EJA/AwjO35ZaUyAij0/cTLRFhuT2GtPo5CHa4H6QZ14S\nk/bS2sm1vpoQ8A3LVkXPlAWq9O1hzI6NaNAuz6P/goEJHkFIS0EXcn8pAoGADODi\nxPQjcOanthLKPh+THwbu23Iu7LfWdwNkMXcSSnIkD3cYfVog1sVXFrHGiU4nSwSs\nP6TNxJ599AKidz5BCO56Cox9sejIBU9vvVG31vLYKw9WZU1fiJ35FVbEd3wwShpP\nX72dVA1nhMN+lec3ZW4O34+0VpRBGb49jntcw2kCgYEA5VMAHskd23+mgsemIAkX\nOZS8dAjlJ2X1lFhOn9tCK3QI1NnQpyIxGK+iYjQjre/aFvT+5XpqUmvZ/deDax8b\nd1sCR5hF/NcDuqkcmBNkkZrt+cdwqyd/LHkPEtMVVcLHjoXh7r/VfwY38IbXpcUB\n30YPr+31bqumyYzssUc98Cw=\n-----END PRIVATE KEY-----\n",
|
6 |
+
"client_email": "[email protected]",
|
7 |
+
"client_id": "117283783846337502690",
|
8 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
9 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
10 |
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
11 |
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/ocservice%40ostreacultura.iam.gserviceaccount.com",
|
12 |
+
"universe_domain": "googleapis.com"
|
13 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
numpy
|
3 |
+
sentence-transformers
|
4 |
+
pinecone[grpc]
|
5 |
+
pinecone_text
|
6 |
+
sqids
|
server.jl
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
using Oxygen
|
2 |
+
using HTTP
|
3 |
+
import OstreaCultura as OC
|
4 |
+
|
5 |
+
# Load the fasttext embeddings and the fasttext model
|
6 |
+
# Fact-check corpus, loaded once at startup; used by /fastfactsearch when
# `model == "factchecks"`.
const (fc_embed, fc) = OC.load_fasttext_embeddings("data/filtered_fact_check_latest_embed.csv")
|
7 |
+
# Narrative/claims library, loaded once at startup; used by /fastfactsearch when
# `model == "narratives"`.
const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed.csv")
|
8 |
+
|
9 |
+
# GET /greet — fixed greeting; handy as a quick "is the server up?" check.
@get "/greet" function(req::HTTP.Request)
    "hello world!"
end
|
12 |
+
|
13 |
+
## Send a query to the test-index index in the test-namespace namespace
|
14 |
+
#@get "/query" function(req::HTTP.Request, querytext::String, indexname::String, namespace::String, top_k::Int64=5, include_values::Bool=false)
|
15 |
+
# OC.query(querytext, indexname, namespace; top_k=top_k, include_values=include_values)
|
16 |
+
#end
|
17 |
+
|
18 |
+
## Send a query to look for matches within an organic dataset
|
19 |
+
#@get "/queryclaims" function(req::HTTP.Request, claim::String, counterclaim::String, indexname::String, namespace::String, top_k::Int64=5000, include_values::Bool=false)
|
20 |
+
# OC.query_claims(claim, counterclaim, indexname, namespace; top_k=top_k, include_values=include_values)
|
21 |
+
#end
|
22 |
+
|
23 |
+
## Classify a claim within the misinformation library
|
24 |
+
#@get "/classify" function(req::HTTP.Request, claim::String, counterclaim::String, indexname::String, namespace::String, top_k::Int64=10, include_values::Bool=false)
|
25 |
+
# OC.classify_claim(claim, counterclaim, indexname, namespace; top_k=top_k, include_values=include_values)
|
26 |
+
#end
|
27 |
+
|
28 |
+
# Model is really 'namespace' in the OC library
|
29 |
+
# GET /search — look up `claim` in the "oc-hybrid-library-index" index.
#   claim : text to search for
#   model : forwarded to OC.search as its third positional argument (default "narratives")
#   top_k : forwarded as the `top_k` keyword (default 5)
# Values are never requested (`include_values=false`).
@get "/search" function(req::HTTP.Request, claim::String, model::String="narratives", top_k::Int64=5)
    # Spell out every '%' as the word "percent" before handing the text to the search.
    sanitized = replace(claim, "%" => "percent")
    return OC.search(sanitized, "oc-hybrid-library-index", model; top_k=top_k, include_values=false)
end
|
34 |
+
|
35 |
+
# GET /fastfactsearch — top-k lookup against the embeddings loaded at startup.
#   claim : text to search for
#   model : "narratives" (uses nar_embed/nar) or "factchecks" (uses fc_embed/fc)
#   top_k : number of results passed to OC.fast_topk (default 5)
# Returns the OC.fast_topk result rendered with `json`, or HTTP 400 with the body
# "Model not found" when `model` is not one of the two supported names.
@get "/fastfactsearch" function(req::HTTP.Request, claim::String, model::String, top_k::Int64=5)
    # Spell out every '%' as the word "percent" before searching.
    claim = replace(claim, "%" => "percent")
    if model == "narratives"
        return json(OC.fast_topk(nar_embed, nar, claim, top_k))
    elseif model == "factchecks"
        return json(OC.fast_topk(fc_embed, fc, claim, top_k))
    end
    # An unsupported model is a client error; report it with a 4xx status rather than
    # a 200 response whose body happens to contain an error message.
    return HTTP.Response(400, "Model not found")
end
|
46 |
+
|
47 |
+
#@get "/searchplot" function(req::HTTP.Request, claim::String, model::String, top_k::Int64=5, include_values::Bool=false)
|
48 |
+
# OC.searchplot(claim, "oc-hybrid-library-index", model; top_k=top_k, include_values=include_values)
|
49 |
+
#end
|
50 |
+
|
51 |
+
# start the web server
|
52 |
+
# Bind to all interfaces so the server is reachable from outside the container.
# Port 8000 matches the Dockerfile's `EXPOSE 8000` and the README's `app_port: 8000`.
serve(host="0.0.0.0", port=8000)
|
startup.jl
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python interpreter path exported for packages that read ENV["PYTHON"].
ENV["PYTHON"] = "/venv/bin/python"
# NOTE(review): this is assigned after Julia has already started, so it presumably only
# affects Julia processes launched from this session — confirm that is the intent.
ENV["JULIA_NUM_THREADS"] = 8
ENV["JULIA_PKG_DEVDIR"] = "/home/workspace"

# SECURITY: the OpenAI and Pinecone API keys used to be hard-coded on these lines and
# were committed to the repository. Treat those keys as leaked and rotate them.
# Secrets must be supplied through the process environment instead
# (e.g. `docker run -e OPENAI_API_KEY=... -e PINECONE_API_KEY=...`).
for secret in ("OPENAI_API_KEY", "PINECONE_API_KEY")
    haskey(ENV, secret) || @warn "$secret is not set in the environment"
end
|
7 |
+
|
8 |
+
using Revise
|
9 |
+
|
10 |
+
"""
    restart()

Exit the current Julia session and start a fresh interactive one in its place.

The new session is launched from an `atexit` hook with the same `julia` command line as
the current process (`Base.julia_cmd()`), and its startup expression restores the active
project, the home project, and the working directory captured at the time of the call.
"""
function restart()
    # Capture the state to restore, as Julia source literals.
    active_project = repr(Base.ACTIVE_PROJECT[])
    home_project = repr(Base.HOME_PROJECT[])
    workdir = repr(pwd())

    # Code the replacement session evaluates before dropping into the REPL.
    bootstrap = string(
        "Base.ACTIVE_PROJECT[]=", active_project, "\n",
        "Base.HOME_PROJECT[]=", home_project, "\n",
        "cd(", workdir, ")\n",
    )

    relaunch = `$(Base.julia_cmd()) -ie $bootstrap`
    # The hook fires while this process is exiting, handing the terminal to the new one.
    atexit(() -> run(relaunch))
    exit(0)
end
|