<!-- livebook:{"app_settings":{"show_source":true,"slug":"Summarine"}} -->

# Summarine

```elixir
Mix.install(
  [
    {:kino_bumblebee, "~> 0.3.0"},
    {:exla, "~> 0.5.1"},
    {:req, "~> 0.3.11"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)
```

## Intro

Record or upload a piece of audio, transcribe it to text with Whisper (via Bumblebee), then ask a locally running Ollama model to summarize the transcript.

## Set up the Ollama module

```elixir
defmodule Ollama do
  @moduledoc """
  Minimal client for a locally running Ollama server.

  `/api/generate` streams its answer as newline-delimited JSON objects;
  `generate/1` collects the stream and joins the `"response"` fragments
  into a single string.
  """

  @api_endpoint "http://localhost:11434/api/generate"
  @model "llama2-uncensored"

  def generate(prompt) do
    payload = %{
      model: @model,
      prompt: prompt
    }

    # Generation can take a while, so allow a generous receive timeout.
    {:ok, response} = Req.post(@api_endpoint, json: payload, receive_timeout: 120_000)

    process_response(response)
  end

  defp process_response(response) do
    response.body
    |> String.split("\n")
    |> Enum.map(&process_chunk/1)
    |> Enum.reject(&is_nil/1)
    |> Enum.map(&get_content/1)
    |> Enum.join("")
  end

  # Blank lines between JSON chunks carry no data.
  defp process_chunk(""), do: nil

  defp process_chunk(json_string) do
    {:ok, data} = Jason.decode(json_string)
    data
  end

  # Each streamed chunk carries its text under the "response" key;
  # the final "done" chunk may not, so fall back to an empty string.
  defp get_content(%{"response" => response}), do: response
  defp get_content(_), do: ""
end
```
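
A quick way to sanity-check the module, assuming an Ollama server is running locally and the `llama2-uncensored` model has already been pulled (`ollama pull llama2-uncensored`):

```elixir
# Returns the model's full answer as a single string if the local
# Ollama server at localhost:11434 is reachable.
Ollama.generate("Reply with a one-sentence greeting.")
```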

## App

```elixir
model_name = "openai/whisper-base"

{:ok, model_info} = Bumblebee.load_model({:hf, model_name})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, model_name})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, model_name})

# Cap the length of the generated transcript.
generation_config = Bumblebee.configure(generation_config, max_new_tokens: 100)

# Speech-to-text serving compiled with EXLA; batch_size controls how many
# requests are batched per run.
serving =
  Bumblebee.Audio.speech_to_text(model_info, featurizer, tokenizer, generation_config,
    compile: [batch_size: 4],
    defn_options: [compiler: EXLA]
  )
```
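
Before wiring the serving into the form below, it can be smoke-tested directly. This is an optional check, not part of the original app: it transcribes one second of silence just to confirm the serving compiles and runs, and its result has the same shape the form handler pattern-matches on.

```elixir
# One second of silence at the featurizer's sampling rate (f32 samples).
silence = Nx.broadcast(0.0, {featurizer.sampling_rate})

# Expect a map like %{results: [%{text: text}]}.
Nx.Serving.run(serving, silence)
```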

```elixir
audio_input = Kino.Input.audio("Audio", sampling_rate: featurizer.sampling_rate)
form = Kino.Control.form([audio: audio_input], submit: "Summarize the audio")
audio_frame = Kino.Frame.new(placeholder: false)
summary_frame = Kino.Frame.new(placeholder: false)

Kino.listen(form, fn %{data: %{audio: audio}} ->
  if audio do
    Kino.Frame.render(audio_frame, Kino.Text.new("Running..."))

    # Convert the raw PCM binary to an f32 tensor and average the
    # channels down to mono, which is what Whisper expects.
    audio =
      audio.data
      |> Nx.from_binary(:f32)
      |> Nx.reshape({:auto, audio.num_channels})
      |> Nx.mean(axes: [1])

    # Transcribe the audio with the Whisper serving.
    %{results: [%{text: generated_text}]} = Nx.Serving.run(serving, audio)
    Kino.Frame.render(audio_frame, Kino.Markdown.new("**Audio Content**: #{generated_text}"))

    Kino.Frame.render(summary_frame, Kino.Markdown.new("Running..."))

    # Summarize the transcript with the local Ollama model.
    result = Ollama.generate("Please summarize the following text: #{generated_text}")

    Kino.Frame.render(summary_frame, Kino.Markdown.new("**Summary**: #{result}"))
  end
end)

Kino.Layout.grid([form, audio_frame, summary_frame], boxed: true, gap: 16)
```