=
committed on
Commit
·
fd342b4
1
Parent(s):
0d00a0e
Add compression and decompression functions for fact check data; update dependencies and remove obsolete files
Browse files- .gitattributes +1 -0
- Manifest.toml +1 -1
- Project.toml +1 -0
- data/fc_latest_maxi_compr +3 -0
- scripts/UpdateHuggingFaceAPI.jl +4 -0
- server.jl +6 -1
- src/OstreaCultura.jl +3 -0
- src/compress.jl +39 -0
.gitattributes
CHANGED
@@ -37,3 +37,4 @@ data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text
|
|
37 |
data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
39 |
*.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
37 |
data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
39 |
*.json filter=lfs diff=lfs merge=lfs -text
|
40 |
+
data/fc_latest_maxi_compr filter=lfs diff=lfs merge=lfs -text
|
Manifest.toml
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
|
3 |
julia_version = "1.10.4"
|
4 |
manifest_format = "2.0"
|
5 |
-
project_hash = "
|
6 |
|
7 |
[[deps.AbstractTrees]]
|
8 |
git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
|
|
|
2 |
|
3 |
julia_version = "1.10.4"
|
4 |
manifest_format = "2.0"
|
5 |
+
project_hash = "071291b10413261c56b71962d94f340814c6f62c"
|
6 |
|
7 |
[[deps.AbstractTrees]]
|
8 |
git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
|
Project.toml
CHANGED
@@ -5,6 +5,7 @@ version = "1.0.0-DEV"
|
|
5 |
|
6 |
[deps]
|
7 |
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
|
|
|
8 |
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
9 |
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
10 |
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
|
|
5 |
|
6 |
[deps]
|
7 |
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
|
8 |
+
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
|
9 |
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
10 |
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
11 |
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
data/fc_latest_maxi_compr
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfec2c2ec6ed5b0d4df6bc91838a72cd4a87db7d66c6f89245d6534557973e27
|
3 |
+
size 341251717
|
scripts/UpdateHuggingFaceAPI.jl
CHANGED
@@ -33,3 +33,7 @@ narrs = narrs[.!ismissing.(narrs.text), :]
|
|
33 |
narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
|
34 |
narrs.Embeddings = narratives_embed
|
35 |
CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
|
|
|
|
|
|
|
|
|
|
33 |
narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
|
34 |
narrs.Embeddings = narratives_embed
|
35 |
CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
|
36 |
+
# Compress the fact check data
|
37 |
+
OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
|
38 |
+
# Delete the original
|
39 |
+
rm("data/fact_check_latest_embed_maxi.csv")
|
server.jl
CHANGED
@@ -3,7 +3,12 @@ using HTTP
|
|
3 |
import OstreaCultura as OC
|
4 |
|
5 |
# Load the fasttext embeddings and the fasttext model
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
7 |
const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
|
8 |
|
9 |
@get "/greet" function(req::HTTP.Request)
|
|
|
3 |
import OstreaCultura as OC
|
4 |
|
5 |
# Load the fasttext embeddings and the fasttext model
|
6 |
+
tmp_destination = tempname()
|
7 |
+
# Decompress the fact check data
|
8 |
+
OC.decompress_csv("data/fc_latest_maxi_compr", tmp_destination)
|
9 |
+
|
10 |
+
#####
|
11 |
+
const (fc_embed, fc) = OC.load_fasttext_embeddings(tmp_destination)
|
12 |
const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
|
13 |
|
14 |
@get "/greet" function(req::HTTP.Request)
|
src/OstreaCultura.jl
CHANGED
@@ -5,6 +5,8 @@ module OstreaCultura
|
|
5 |
|
6 |
using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
|
7 |
|
|
|
|
|
8 |
import Pandas.DataFrame as pdataframe
|
9 |
|
10 |
export MiniEncoder
|
@@ -20,6 +22,7 @@ export MiniEncoder
|
|
20 |
include("py_init.jl")
|
21 |
include("Embeddings.jl")
|
22 |
include("PyPineCone.jl")
|
|
|
23 |
#include("Models.jl")
|
24 |
|
25 |
end
|
|
|
5 |
|
6 |
using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
|
7 |
|
8 |
+
using CodecZlib
|
9 |
+
|
10 |
import Pandas.DataFrame as pdataframe
|
11 |
|
12 |
export MiniEncoder
|
|
|
22 |
include("py_init.jl")
|
23 |
include("Embeddings.jl")
|
24 |
include("PyPineCone.jl")
|
25 |
+
include("compress.jl")
|
26 |
#include("Models.jl")
|
27 |
|
28 |
end
|
src/compress.jl
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#using CodecZlib
|
2 |
+
#using CSV
|
3 |
+
#using DataFrames
|
4 |
+
|
5 |
+
"""
    compress_csv(input_path, output_path=input_path * ".gz")

Gzip-compress the file at `input_path` into `output_path` and return `output_path`.

The input is copied through the compressor in fixed-size chunks, so memory use stays
bounded regardless of the file size. A short size summary is printed when done.

# Arguments
- `input_path`: path of the file to compress.
- `output_path`: destination of the gzip data; defaults to `input_path * ".gz"`.

# Throws
- `ArgumentError` if both paths resolve to the same file (opening the destination
  for writing would truncate the source before it is read).
"""
function compress_csv(
    input_path::AbstractString, output_path::AbstractString=input_path * ".gz"
)
    if abspath(input_path) == abspath(output_path)
        throw(ArgumentError("input_path and output_path must differ, got $input_path"))
    end

    println("Compressing $input_path to $output_path...")

    chunk_bytes = 1 << 20  # copy 1 MiB at a time instead of loading the whole file

    open(input_path, "r") do input_io
        open(output_path, "w") do output_io
            stream = GzipCompressorStream(output_io)
            try
                while !eof(input_io)
                    write(stream, read(input_io, chunk_bytes))
                end
            finally
                # Closing flushes the remaining compressed data and the gzip trailer;
                # doing it in `finally` also releases the codec if the copy fails.
                close(stream)
            end
        end
    end

    # Calculate compression ratio
    original_size = filesize(input_path)
    compressed_size = filesize(output_path)
    # An empty input would otherwise divide by zero and report "-Inf%".
    ratio = original_size == 0 ? 0.0 : (1 - compressed_size / original_size) * 100

    original_mb = round(original_size / 1024^2, digits=2)
    compressed_mb = round(compressed_size / 1024^2, digits=2)
    reduction = round(ratio, digits=1)
    println("Compression complete: $(original_mb) MB → $(compressed_mb) MB ($(reduction)% reduction)")
    return output_path
end
|
24 |
+
|
25 |
+
"""
    decompress_csv(input_path, output_path)

Decompress the gzip file at `input_path` into `output_path` and return `output_path`.

The data is copied through the decompressor in fixed-size chunks, so memory use stays
bounded regardless of the decompressed size.

# Arguments
- `input_path`: path of the gzip-compressed file.
- `output_path`: destination of the decompressed data.

# Throws
- `ArgumentError` if both paths resolve to the same file (opening the destination
  for writing would truncate the compressed source before it is read).
"""
function decompress_csv(input_path::AbstractString, output_path::AbstractString)
    if abspath(input_path) == abspath(output_path)
        throw(ArgumentError("input_path and output_path must differ, got $input_path"))
    end

    println("Decompressing $input_path to $output_path...")

    chunk_bytes = 1 << 20  # copy 1 MiB at a time instead of loading the whole file

    open(input_path, "r") do input_io
        open(output_path, "w") do output_io
            stream = GzipDecompressorStream(input_io)
            try
                while !eof(stream)
                    write(output_io, read(stream, chunk_bytes))
                end
            finally
                # Release the codec even when the input is corrupt or the write fails.
                close(stream)
            end
        end
    end

    println("Decompression complete!")
    return output_path
end
|
39 |
+
|