= committed on
Commit
fd342b4
·
1 Parent(s): 0d00a0e

Add compression and decompression functions for fact check data; update dependencies and remove obsolete files

Browse files
.gitattributes CHANGED
@@ -37,3 +37,4 @@ data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text
37
  data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
38
  *.csv filter=lfs diff=lfs merge=lfs -text
39
  *.json filter=lfs diff=lfs merge=lfs -text
 
 
37
  data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
38
  *.csv filter=lfs diff=lfs merge=lfs -text
39
  *.json filter=lfs diff=lfs merge=lfs -text
40
+ data/fc_latest_maxi_compr filter=lfs diff=lfs merge=lfs -text
Manifest.toml CHANGED
@@ -2,7 +2,7 @@
2
 
3
  julia_version = "1.10.4"
4
  manifest_format = "2.0"
5
- project_hash = "1ce95d4f8f4617f58a3df72191590f2e35b92b89"
6
 
7
  [[deps.AbstractTrees]]
8
  git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
 
2
 
3
  julia_version = "1.10.4"
4
  manifest_format = "2.0"
5
+ project_hash = "071291b10413261c56b71962d94f340814c6f62c"
6
 
7
  [[deps.AbstractTrees]]
8
  git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
Project.toml CHANGED
@@ -5,6 +5,7 @@ version = "1.0.0-DEV"
5
 
6
  [deps]
7
  CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 
8
  DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
9
  Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
10
  Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 
5
 
6
  [deps]
7
  CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
8
+ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
9
  DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
10
  Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
11
  Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
data/fc_latest_maxi_compr ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfec2c2ec6ed5b0d4df6bc91838a72cd4a87db7d66c6f89245d6534557973e27
3
+ size 341251717
scripts/UpdateHuggingFaceAPI.jl CHANGED
@@ -33,3 +33,7 @@ narrs = narrs[.!ismissing.(narrs.text), :]
33
  narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
34
  narrs.Embeddings = narratives_embed
35
  CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
 
 
 
 
 
33
  narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
34
  narrs.Embeddings = narratives_embed
35
  CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
36
+ # Compress the fact check data
37
+ OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
38
+ # Delete the original
39
+ rm("data/fact_check_latest_embed_maxi.csv")
server.jl CHANGED
@@ -3,7 +3,12 @@ using HTTP
3
  import OstreaCultura as OC
4
 
5
  # Load the fasttext embeddings and the fasttext model
6
- const (fc_embed, fc) = OC.load_fasttext_embeddings("data/fact_check_latest_embed_maxi.csv")
 
 
 
 
 
7
  const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
8
 
9
  @get "/greet" function(req::HTTP.Request)
 
3
  import OstreaCultura as OC
4
 
5
  # Load the fasttext embeddings and the fasttext model
6
+ tmp_destination = tempname()
7
+ # Decompress the fact check data
8
+ OC.decompress_csv("data/fc_latest_maxi_compr", tmp_destination)
9
+
10
+ #####
11
+ const (fc_embed, fc) = OC.load_fasttext_embeddings(tmp_destination)
12
  const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
13
 
14
  @get "/greet" function(req::HTTP.Request)
src/OstreaCultura.jl CHANGED
@@ -5,6 +5,8 @@ module OstreaCultura
5
 
6
  using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
7
 
 
 
8
  import Pandas.DataFrame as pdataframe
9
 
10
  export MiniEncoder
@@ -20,6 +22,7 @@ export MiniEncoder
20
  include("py_init.jl")
21
  include("Embeddings.jl")
22
  include("PyPineCone.jl")
 
23
  #include("Models.jl")
24
 
25
  end
 
5
 
6
  using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
7
 
8
+ using CodecZlib
9
+
10
  import Pandas.DataFrame as pdataframe
11
 
12
  export MiniEncoder
 
22
  include("py_init.jl")
23
  include("Embeddings.jl")
24
  include("PyPineCone.jl")
25
+ include("compress.jl")
26
  #include("Models.jl")
27
 
28
  end
src/compress.jl ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #using CodecZlib
2
+ #using CSV
3
+ #using DataFrames
4
+
5
"""
    compress_csv(input_path, output_path=input_path * ".gz")

Gzip-compress the file at `input_path` into `output_path` and return `output_path`.

The data is streamed from the input file into the gzip stream instead of being read
into memory in one piece, so files larger than the available RAM can be compressed.
A short size report is printed to standard output once the archive is written.

# Arguments
- `input_path`: path of the file to compress (opened read-only).
- `output_path`: path of the gzip file to create; it is overwritten if it exists.
  Defaults to `input_path` with a `.gz` suffix.

# Throws
- `ArgumentError` if both paths resolve to the same location, because opening the
  output for writing would truncate the input before it has been read.
- Whatever `open` or the gzip stream raise on I/O failure.
"""
function compress_csv(
    input_path::AbstractString, output_path::AbstractString=input_path * ".gz"
)
    if abspath(input_path) == abspath(output_path)
        throw(ArgumentError("input and output paths must differ, got $input_path for both"))
    end

    println("Compressing $input_path to $output_path...")

    open(input_path, "r") do input_io
        open(output_path, "w") do output_io
            stream = GzipCompressorStream(output_io)
            try
                # IO-to-IO `write` copies chunk by chunk, so the whole file is never
                # held in memory at once.
                write(stream, input_io)
            finally
                # Closing the stream writes the gzip trailer; do it even when the
                # copy fails so the stream is not leaked.
                close(stream)
            end
        end
    end

    # Calculate compression ratio
    original_size = filesize(input_path)
    compressed_size = filesize(output_path)
    # An empty input would otherwise divide by zero and report "-Inf%".
    ratio = original_size == 0 ? 0.0 : (1 - compressed_size / original_size) * 100

    original_mb = round(original_size / 1024^2, digits=2)
    compressed_mb = round(compressed_size / 1024^2, digits=2)
    println(
        "Compression complete: ", original_mb, " MB → ", compressed_mb, " MB (",
        round(ratio, digits=1), "% reduction)",
    )
    return output_path
end
24
+
25
"""
    decompress_csv(input_path, output_path)

Decompress the gzip file at `input_path` into `output_path` and return `output_path`.

The decompressed bytes are streamed straight to the output file instead of being
collected in memory first, so the peak memory use does not grow with the file size.
Progress messages are printed to standard output.

# Arguments
- `input_path`: path of the gzip-compressed file (opened read-only).
- `output_path`: path of the file to create; it is overwritten if it exists.

# Throws
- `ArgumentError` if both paths resolve to the same location, because opening the
  output for writing would truncate the archive before it has been read.
- Whatever `open` or the gzip stream raise on I/O failure or malformed input.
"""
function decompress_csv(input_path::AbstractString, output_path::AbstractString)
    if abspath(input_path) == abspath(output_path)
        throw(ArgumentError("input and output paths must differ, got $input_path for both"))
    end

    println("Decompressing $input_path to $output_path...")

    open(input_path, "r") do input_io
        open(output_path, "w") do output_io
            stream = GzipDecompressorStream(input_io)
            try
                # IO-to-IO `write` copies chunk by chunk, so the decompressed data
                # is never held in memory as a whole.
                write(output_io, stream)
            finally
                # Release the decompressor even when the copy fails.
                close(stream)
            end
        end
    end

    println("Decompression complete!")
    return output_path
end
39
+