forrestbao committed
Commit 81cb431 · 1 Parent(s): 120684a

fix user permission problem

Files changed (5):
  1. Dockerfile +6 -10
  2. app/app.py +0 -3
  3. app/app_utils.py +5 -8
  4. app/requirements.txt +0 -1
  5. app/results.json +0 -860
Dockerfile CHANGED
@@ -6,21 +6,17 @@ COPY ./app/vectara_theme.py /app/vectara_theme.py
 COPY ./app/requirements.txt /app/requirements.txt
 COPY ./app/app.py /app/app.py
 COPY ./app/app_utils.py /app/app_utils.py
-COPY ./app/results.json /app/results.json
+# COPY ./app/results.json /app/results.json
 
 RUN apt-get update && apt-get install -y git-lfs
 
-RUN mkdir -p /app/results
-
 RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
 
-# RUN useradd -m -u 1000 user
-# USER user
-# ENV HOME=/home/user \
-#     PATH=/home/user/.local/bin:$PATH
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
 
-# WORKDIR $HOME/app
-
-# COPY --chown=user . $HOME/app
+RUN mkdir -p /app/results
+RUN chown -R user /app/results
 
 CMD ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"]
app/app.py CHANGED
@@ -5,9 +5,6 @@ import pandas as pd
 import matplotlib.figure
 from IPython.display import Markdown
 
-import dotenv
-dotenv.load_dotenv() # load HF_TOKEN
-
 from funix import funix, import_theme
 from vectara_theme import vectara_theme
 import_theme(vectara_theme)
app/app_utils.py CHANGED
@@ -7,14 +7,14 @@ import matplotlib.pyplot as plt
 import matplotlib.figure
 from sklearn.preprocessing import MinMaxScaler
 
-import dotenv
-dotenv.load_dotenv()
+# import dotenv
+# dotenv.load_dotenv()
 
 min_max_scaler = MinMaxScaler()
 
 # %%
 def pull_results(results_dir: str):
-    repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset", token=os.getenv("HF_TOKEN"))
+    repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset")
     repo.git_pull()
 
 def extract_info_from_result_file(result_file):
@@ -108,14 +108,11 @@ def load_results(
     results_df = pd.DataFrame(results)
     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
 
-    # replace any value TBD with 0
-    results_df = results_df.replace("TBD", -1)
+    # replace any value TBD with -1
+    results_df = results_df.replace("TBD", 100)
 
     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))
-
-    # replace any value -1 with string "TBD"
-    results_df = results_df.replace(-1, "TBD")
 
     return results_df
 
app/requirements.txt CHANGED
@@ -1,6 +1,5 @@
 funix==0.6.1
 pandas
-dotenv
 huggingface_hub
 matplotlib
 scikit-learn
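With dotenv dropped from requirements.txt and the token argument removed from pull_results(), the clone presumably relies on vectara/results being publicly readable, which lines up with the HF_TOKEN loading being deleted from app.py as well. A minimal usage sketch under that assumption, tying the call to the /app/results directory prepared in the Dockerfile; the __main__ wrapper is illustrative, not part of the app.

# Sketch of the token-less pull path (assumes vectara/results is a public dataset repo).
from huggingface_hub import Repository

def pull_results(results_dir: str) -> None:
    # Clones into results_dir on the first call; afterwards the existing
    # checkout is reused and git_pull() fast-forwards it.
    repo = Repository(local_dir=results_dir,
                      clone_from="vectara/results",
                      repo_type="dataset")
    repo.git_pull()

if __name__ == "__main__":
    pull_results("/app/results")  # directory chown'ed to `user` in the Dockerfile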
app/results.json DELETED
@@ -1,860 +0,0 @@
-[
-  {"LLM": "gemini-2.0-flash-exp", "Hallucination %": 1.3, "Answer %": 99.9, "Avg Summary Words": 60.0},
-  {"LLM": "deepseek/deepseek-r1", "Hallucination %": 14.3, "Answer %": 100.0, "Avg Summary Words": 77.1},
-  {"LLM": "deepseek/deepseek-v3", "Hallucination %": 3.9, "Answer %": 100.0, "Avg Summary Words": 88.2},
-  {"LLM": "deepseek/deepseek-chat", "Hallucination %": 2.4, "Answer %": 100.0, "Avg Summary Words": 83.2},
-  {"LLM": "deepseek/deepseek-v3-0324", "Hallucination %": 8.0, "Answer %": 100.0, "Avg Summary Words": 78.9},
-  {"LLM": "openai/chatgpt-4o-latest", "Hallucination %": 3.5, "Answer %": 100.0, "Avg Summary Words": 63.5},
-  {"LLM": "openai/GPT-4", "Hallucination %": 1.8050541516245486, "Answer %": 100.0, "Avg Summary Words": 81.1},
-  {"LLM": "openai/o3-mini-high-reasoning", "Hallucination %": 0.7952286282306176, "Answer %": 100.0, "Avg Summary Words": 79.51888667992047},
-  {"LLM": "openai/gpt-4.1-mini", "Hallucination %": 2.2, "Answer %": 100.0, "Avg Summary Words": 79.6},
-  {"LLM": "openai/o1-pro", "Hallucination %": 2.4, "Answer %": 100.0, "Avg Summary Words": 81.0},
-  {"LLM": "openai/gpt-4.1-nano", "Hallucination %": 2.0, "Answer %": 100.0, "Avg Summary Words": 70.2},
-  {"LLM": "openai/o1-mini", "Hallucination %": 1.4, "Answer %": 100.0, "Avg Summary Words": 78.3},
-  {"LLM": "openai/GPT-4-Turbo", "Hallucination %": 1.6898608349900597, "Answer %": 100.0, "Avg Summary Words": 86.2},
-  {"LLM": "openai/o3", "Hallucination %": 6.8, "Answer %": 100.0, "Avg Summary Words": 77.7},
-  {"LLM": "openai/GPT-3.5-Turbo", "Hallucination %": 1.9, "Answer %": 99.6, "Avg Summary Words": 84.1},
-  {"LLM": "openai/o1", "Hallucination %": 2.4, "Answer %": 99.9, "Avg Summary Words": 73.0},
-  {"LLM": "openai/GPT-4o", "Hallucination %": 1.4910536779324055, "Answer %": 100.0, "Avg Summary Words": 77.8},
-  {"LLM": "openai/GPT-4o-mini", "Hallucination %": 1.7, "Answer %": 100.0, "Avg Summary Words": 76.3},
-  {"LLM": "openai/o1-preview", "Hallucination %": 3.3, "Answer %": 100.0, "Avg Summary Words": 119.3},
-  {"LLM": "openai/o4-mini", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 82.0},
-  {"LLM": "openai/gpt-4.5-preview", "Hallucination %": 1.2, "Answer %": 100.0, "Avg Summary Words": 77.0},
-  {"LLM": "openai/gpt-4.1", "Hallucination %": 2.0, "Answer %": 100.0, "Avg Summary Words": 71.9},
-  {"LLM": "Qwen/Qwen2-VL-2B-Instruct", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 81.8},
-  {"LLM": "Qwen/Qwen2.5-14B-Instruct", "Hallucination %": 4.2, "Answer %": 100.0, "Avg Summary Words": 74.8},
-  {"LLM": "Qwen/Qwen3-32B", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 82.4},
-  {"LLM": "Qwen/Qwen2.5-32B-Instruct", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 67.9},
-  {"LLM": "Qwen/QwQ-32B-Preview", "Hallucination %": 12.9, "Answer %": 100.0, "Avg Summary Words": 140.2},
-  {"LLM": "Qwen/Qwen3-0.6B", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 65.3},
-  {"LLM": "Qwen/Qwen3-14B", "Hallucination %": 2.2, "Answer %": 100.0, "Avg Summary Words": 82.4},
-  {"LLM": "Qwen/Qwen2.5-3B-Instruct", "Hallucination %": 7.0, "Answer %": 100.0, "Avg Summary Words": 70.4},
-  {"LLM": "Qwen/Qwen2.5-1.5B-Instruct", "Hallucination %": 15.8, "Answer %": 100.0, "Avg Summary Words": 70.7},
-  {"LLM": "Qwen/Qwen2-VL-7B-Instruct", "Hallucination %": 4.2, "Answer %": 100.0, "Avg Summary Words": 73.9},
-  {"LLM": "Qwen/Qwen2.5-0.5B-Instruct", "Hallucination %": 25.2, "Answer %": 100.0, "Avg Summary Words": 72.6},
-  {"LLM": "Qwen/Qwen3-4B", "Hallucination %": 2.7, "Answer %": 100.0, "Avg Summary Words": 87.7},
-  {"LLM": "Qwen/Qwen2.5-72B-Instruct", "Hallucination %": 4.3, "Answer %": 100.0, "Avg Summary Words": 80.8},
-  {"LLM": "Qwen/Qwen3-8B", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 78.2},
-  {"LLM": "Qwen/Qwen3-1.7B", "Hallucination %": 4.4, "Answer %": 100.0, "Avg Summary Words": 69.0},
-  {"LLM": "Qwen/Qwen2-72B-Instruct", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 100.1},
-  {"LLM": "Qwen/Qwen2.5-7B-Instruct", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 71.0},
-  {"LLM": "allenai/OLMo-2-1124-7B-Instruct", "Hallucination %": 11.1, "Answer %": 100.0, "Avg Summary Words": 112.6},
-  {"LLM": "allenai/OLMo-2-1124-13B-Instruct", "Hallucination %": 10.8, "Answer %": 100.0, "Avg Summary Words": 82.0},
-  {"LLM": "allenai/olmo-2-0325-32b-instruct", "Hallucination %": 4.9, "Answer %": 99.9, "Avg Summary Words": 100.0},
-  {"LLM": "amazon/Titan-Express", "Hallucination %": 13.5, "Answer %": 99.5, "Avg Summary Words": 98.4},
-  {"LLM": "amazon/nova-lite-v1", "Hallucination %": 1.8, "Answer %": 99.9, "Avg Summary Words": 80.7},
-  {"LLM": "amazon/nova-pro-v1", "Hallucination %": 1.8, "Answer %": 100.0, "Avg Summary Words": 85.5},
-  {"LLM": "amazon/nova-micro-v1", "Hallucination %": 1.6, "Answer %": 100.0, "Avg Summary Words": 90.0},
-  {"LLM": "google/gemini-2.5-pro-exp-03-25", "Hallucination %": 1.1, "Answer %": 95.1, "Avg Summary Words": 72.9},
-  {"LLM": "google/PaLM-2", "Hallucination %": 14.1, "Answer %": 99.8, "Avg Summary Words": 86.6},
-  {"LLM": "google/gemma-1.1-2b-it", "Hallucination %": 27.8, "Answer %": 100.0, "Avg Summary Words": 66.8},
-  {"LLM": "google/gemini-2.0-flash-thinking-exp", "Hallucination %": 1.8, "Answer %": 99.3, "Avg Summary Words": 73.2},
-  {"LLM": "google/gemma-3-1b-it", "Hallucination %": 5.3, "Answer %": 99.9, "Avg Summary Words": 57.9},
-  {"LLM": "google/gemma-2-2b-it", "Hallucination %": 7.0, "Answer %": 100.0, "Avg Summary Words": 62.2},
-  {"LLM": "google/flan-t5-large", "Hallucination %": 18.3, "Answer %": 99.3, "Avg Summary Words": 20.9},
-  {"LLM": "google/gemini-2.5-flash-preview-04-17", "Hallucination %": 1.3, "Answer %": 91.2, "Avg Summary Words": 71.1},
-  {"LLM": "google/Gemini-Pro", "Hallucination %": 7.6767676767676765, "Answer %": 98.4, "Avg Summary Words": 89.5},
-  {"LLM": "google/gemini-1.5-pro-001", "Hallucination %": 9.1, "Answer %": 99.8, "Avg Summary Words": 61.6},
-  {"LLM": "google/gemma-2-9b-it", "Hallucination %": 10.139165009940358, "Answer %": 100.0, "Avg Summary Words": 70.2},
-  {"LLM": "google/gemma-1.1-7b-it", "Hallucination %": 17.0, "Answer %": 100.0, "Avg Summary Words": 64.3},
-  {"LLM": "google/gemma-3-4b-it", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 63.7},
-  {"LLM": "google/gemini-2.0-pro-exp-02-05", "Hallucination %": 0.8, "Answer %": 99.7, "Avg Summary Words": 61.5},
-  {"LLM": "google/gemini-1.5-pro-002", "Hallucination %": 6.6, "Answer %": 99.9, "Avg Summary Words": 62.0},
-  {"LLM": "google/gemma-3-12b-it", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 69.6},
-  {"LLM": "google/gemini-2.0-flash-001", "Hallucination %": 0.7, "Answer %": 100.0, "Avg Summary Words": 65.2},
-  {"LLM": "google/gemini-1.5-flash-002", "Hallucination %": 3.4, "Answer %": 99.9, "Avg Summary Words": 59.4},
-  {"LLM": "google/gemma-7b-it", "Hallucination %": 14.81113320079523, "Answer %": 100.0, "Avg Summary Words": 113.0},
-  {"LLM": "google/gemini-2.0-flash-lite-preview-02-05", "Hallucination %": 1.2, "Answer %": 99.5, "Avg Summary Words": 60.9},
-  {"LLM": "google/gemini-1.5-flash-001", "Hallucination %": 6.6, "Answer %": 99.9, "Avg Summary Words": 63.3},
-  {"LLM": "google/gemma-3-27b-it", "Hallucination %": 5.9, "Answer %": 98.5, "Avg Summary Words": 64.3},
-  {"LLM": "snowflake/snowflake-arctic-instruct", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 68.7},
-  {"LLM": "01-ai/Yi-1.5-9B-Chat", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 85.7},
-  {"LLM": "01-ai/Yi-1.5-6B-Chat", "Hallucination %": 7.9, "Answer %": 100.0, "Avg Summary Words": 98.9},
-  {"LLM": "01-ai/Yi-1.5-34B-Chat", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 83.7},
-  {"LLM": "ai21labs/AI21-Jamba-1.5-Mini", "Hallucination %": 2.9, "Answer %": 95.6, "Avg Summary Words": 74.5},
-  {"LLM": "cohere/c4ai-aya-expanse-32b", "Hallucination %": 8.5, "Answer %": 99.9, "Avg Summary Words": 81.9},
-  {"LLM": "cohere/command-r-plus-08-2024", "Hallucination %": 5.4, "Answer %": 100.0, "Avg Summary Words": 68.4},
-  {"LLM": "cohere/c4ai-aya-expanse-8b", "Hallucination %": 12.2, "Answer %": 99.9, "Avg Summary Words": 83.9},
-  {"LLM": "cohere/command-a-03-2025", "Hallucination %": 4.5, "Answer %": 100.0, "Avg Summary Words": 77.3},
-  {"LLM": "cohere/command-r-08-2024", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 68.7},
-  {"LLM": "Intel/neural-chat-7b-v3-3", "Hallucination %": 2.6, "Answer %": 100.0, "Avg Summary Words": 60.7},
-  {"LLM": "mistralai/pixtral-large-latest", "Hallucination %": 6.6, "Answer %": 100.0, "Avg Summary Words": 76.4},
-  {"LLM": "mistralai/Mixtral-8x22B-Instruct-v0.1", "Hallucination %": 4.7, "Answer %": 99.9, "Avg Summary Words": 92.0},
-  {"LLM": "mistralai/mistral-small-latest", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 74.2},
-  {"LLM": "mistralai/mistral-large-latest", "Hallucination %": 5.864811133200803, "Answer %": 100.0, "Avg Summary Words": 79.55367793240556},
-  {"LLM": "mistralai/Mixtral-8x7B-Instruct-v0.1", "Hallucination %": 20.09950248756219, "Answer %": 99.9, "Avg Summary Words": 90.7},
-  {"LLM": "mistralai/Mistral-Nemo-Instruct-2407", "Hallucination %": 11.2, "Answer %": 100.0, "Avg Summary Words": 69.9},
-  {"LLM": "mistralai/Mistral-Large2", "Hallucination %": 4.1, "Answer %": 100.0, "Avg Summary Words": 77.4},
-  {"LLM": "mistralai/Mistral-7B-Instruct-v0.3", "Hallucination %": 9.5, "Answer %": 100.0, "Avg Summary Words": 98.4},
-  {"LLM": "mistralai/ministral-3b-latest", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 73.2},
-  {"LLM": "mistralai/ministral-8b-latest", "Hallucination %": 7.5, "Answer %": 100.0, "Avg Summary Words": 62.7},
-  {"LLM": "mistralai/Mistral-Small-24B-Instruct-2501", "Hallucination %": 3.1, "Answer %": 100.0, "Avg Summary Words": 74.9},
-  {"LLM": "mistralai/mistral-small-3.1-24b-instruct", "Hallucination %": 5.6, "Answer %": 100.0, "Avg Summary Words": 73.1},
-  {"LLM": "anthropic/Claude-3-5-Sonnet", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 103.0},
-  {"LLM": "anthropic/claude-3-7-sonnet-latest", "Hallucination %": 4.4, "Answer %": 100.0, "Avg Summary Words": 97.8},
-  {"LLM": "anthropic/Claude-3-opus", "Hallucination %": 10.092687950566425, "Answer %": 95.5, "Avg Summary Words": 92.1},
-  {"LLM": "anthropic/Claude-2", "Hallucination %": 17.448856799037305, "Answer %": 99.3, "Avg Summary Words": 87.5},
-  {"LLM": "anthropic/claude-3-5-haiku-20241022", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 92.2},
-  {"LLM": "anthropic/Claude-3-sonnet", "Hallucination %": 16.302186878727635, "Answer %": 100.0, "Avg Summary Words": 108.5},
-  {"LLM": "anthropic/claude-3-7-sonnet-latest-think", "Hallucination %": 4.5, "Answer %": 99.8, "Avg Summary Words": 99.9},
-  {"LLM": "ai21/jamba-1.6-mini", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 82.3},
-  {"LLM": "ai21/jamba-1.6-large", "Hallucination %": 2.3, "Answer %": 99.9, "Avg Summary Words": 85.6},
-  {"LLM": "qwen/qwen3-235b-a22b", "Hallucination %": 13.0, "Answer %": 99.2, "Avg Summary Words": 86.6},
-  {"LLM": "qwen/qwen-max", "Hallucination %": 2.9, "Answer %": 88.4, "Avg Summary Words": 90.4},
-  {"LLM": "qwen/qwen3-30b-a3b", "Hallucination %": 7.6, "Answer %": 99.9, "Avg Summary Words": 69.9},
-  {"LLM": "x-ai/grok-2-1212", "Hallucination %": 1.9, "Answer %": 100.0, "Avg Summary Words": 86.5},
-  {"LLM": "x-ai/grok-2-vision-1212", "Hallucination %": 2.9, "Answer %": 100.0, "Avg Summary Words": 79.8},
-  {"LLM": "databricks/dbrx-instruct", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 85.9},
-  {"LLM": "xai/grok-3-mini-latest", "Hallucination %": 3.3, "Answer %": 100.0, "Avg Summary Words": 90.2},
-  {"LLM": "xai/grok-beta", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 91.0},
-  {"LLM": "xai/grok-3-latest", "Hallucination %": 2.1, "Answer %": 100.0, "Avg Summary Words": 97.7},
-  {"LLM": "apple/OpenELM-3B-Instruct", "Hallucination %": 24.776119402985074, "Answer %": 99.3, "Avg Summary Words": 47.2},
-  {"LLM": "meta-llama/Llama-3.2-3B-Instruct-Turbo", "Hallucination %": 7.9, "Answer %": 100.0, "Avg Summary Words": 72.2},
-  {"LLM": "meta-llama/Llama-2-70b-chat-hf", "Hallucination %": 5.896510228640193, "Answer %": 99.9, "Avg Summary Words": 84.9},
-  {"LLM": "meta-llama/Meta-Llama-3.1-405B-Instruct", "Hallucination %": 3.9, "Answer %": 99.6, "Avg Summary Words": 85.7},
-  {"LLM": "meta-llama/Llama-3.3-70B-Instruct", "Hallucination %": 4.0, "Answer %": 100.0, "Avg Summary Words": 85.3},
-  {"LLM": "meta-llama/Meta-Llama-3.1-8B-Instruct", "Hallucination %": 5.4, "Answer %": 100.0, "Avg Summary Words": 71.0},
-  {"LLM": "meta-llama/Meta-Llama-3.1-70B-Instruct", "Hallucination %": 5.0, "Answer %": 100.0, "Avg Summary Words": 79.6},
-  {"LLM": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", "Hallucination %": 8.9, "Answer %": 100.0, "Avg Summary Words": 73.1},
-  {"LLM": "meta-llama/Llama-3.2-1B-Instruct", "Hallucination %": 20.7, "Answer %": 100.0, "Avg Summary Words": 71.5},
-  {"LLM": "meta-llama/Llama-3-70B-chat-hf", "Hallucination %": 4.1, "Answer %": 99.2, "Avg Summary Words": 68.5},
-  {"LLM": "meta-llama/Llama-3-8B-chat-hf", "Hallucination %": 7.370517928286853, "Answer %": 99.8, "Avg Summary Words": 79.7},
-  {"LLM": "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", "Hallucination %": 4.3, "Answer %": 100.0, "Avg Summary Words": 79.8},
-  {"LLM": "meta-llama/llama-4-scout", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 80.7},
-  {"LLM": "meta-llama/Llama-2-7b-chat-hf", "Hallucination %": 11.3, "Answer %": 99.6, "Avg Summary Words": 119.9},
-  {"LLM": "meta-llama/Llama-2-13b-chat-hf", "Hallucination %": 10.5, "Answer %": 99.8, "Avg Summary Words": 82.1},
-  {"LLM": "meta-llama/llama-4-maverick", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 84.8},
-  {"LLM": "microsoft/Orca-2-13b", "Hallucination %": 2.5, "Answer %": 100.0, "Avg Summary Words": 66.2},
-  {"LLM": "microsoft/Phi-3.5-MoE-instruct", "Hallucination %": 2.5, "Answer %": 96.3, "Avg Summary Words": 69.7},
-  {"LLM": "microsoft/Phi-3-mini-4k-instruct", "Hallucination %": 3.9761431411530817, "Answer %": 100.0, "Avg Summary Words": 86.8},
-  {"LLM": "microsoft/phi-4", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 100.3},
-  {"LLM": "microsoft/Phi-3.5-mini-instruct", "Hallucination %": 4.1, "Answer %": 100.0, "Avg Summary Words": 75.0},
-  {"LLM": "microsoft/Phi-3-mini-128k-instruct", "Hallucination %": 3.1, "Answer %": 100.0, "Avg Summary Words": 60.1},
-  {"LLM": "microsoft/Phi-4-mini-instruct", "Hallucination %": 3.4, "Answer %": 100.0, "Avg Summary Words": 69.7},
-  {"LLM": "microsoft/WizardLM-2-8x22B", "Hallucination %": 11.741293532338307, "Answer %": 99.9, "Avg Summary Words": 140.8},
-  {"LLM": "microsoft/phi-2", "Hallucination %": 6.666666666666667, "Answer %": 91.5, "Avg Summary Words": 80.8},
-  {"LLM": "THUDM/glm-4-9b-chat", "Hallucination %": 1.3, "Answer %": 100.0, "Avg Summary Words": 58.1},
-  {"LLM": "internlm/internlm3-8b-instruct", "Hallucination %": 4.0, "Answer %": 100.0, "Avg Summary Words": 97.5},
-  {"LLM": "ibm-granite/granite-3.1-8b-instruct", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 107.4},
-  {"LLM": "ibm-granite/granite-3.2-2b-instruct", "Hallucination %": 16.5, "Answer %": 100.0, "Avg Summary Words": 117.3},
-  {"LLM": "ibm-granite/granite-3.1-2b-instruct", "Hallucination %": 15.7, "Answer %": 100.0, "Avg Summary Words": 107.7},
-  {"LLM": "ibm-granite/granite-3.0-2b-instruct", "Hallucination %": 8.8, "Answer %": 100.0, "Avg Summary Words": 81.6},
-  {"LLM": "ibm-granite/granite-3.0-8b-instruct", "Hallucination %": 6.5, "Answer %": 100.0, "Avg Summary Words": 74.2},
-  {"LLM": "ibm-granite/granite-3.2-8b-instruct", "Hallucination %": 8.7, "Answer %": 100.0, "Avg Summary Words": 120.1},
-  {"LLM": "tiiuae/falcon-7b-instruct", "Hallucination %": 29.92047713717694, "Answer %": 90.0, "Avg Summary Words": 75.5}
-]