fazeel007 committed
Commit 742d179 · 1 Parent(s): e57738d

Add seed documents to begin with

Files changed (1)
  1. server/seed-documents.ts +279 -7
server/seed-documents.ts CHANGED
@@ -5,6 +5,7 @@ import { storage } from './storage';
  import { type InsertDocument } from '@shared/schema';

  const defaultPapers: Omit<InsertDocument, 'id' | 'createdAt'>[] = [
  {
  title: "Attention Is All You Need",
  content: `The Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.
@@ -20,7 +21,8 @@ The attention mechanism allows the model to make connections between distant ele
  year: 2017,
  venue: "NIPS",
  citations: 85000,
- keywords: ["attention mechanism", "transformer", "neural networks", "machine translation", "deep learning"]
  },
  embedding: null
  },
@@ -39,7 +41,8 @@ The model demonstrates remarkable capabilities across diverse domains, from crea
  year: 2023,
  venue: "arXiv",
  citations: 15000,
- keywords: ["GPT-4", "large language model", "multimodal", "AI safety", "alignment"]
  },
  embedding: null
  },
@@ -58,7 +61,8 @@ We find this approach can train a non-evasive and non-manipulative AI assistant
  year: 2022,
  venue: "arXiv",
  citations: 8000,
- keywords: ["constitutional AI", "AI safety", "harmlessness", "AI feedback", "alignment"]
  },
  embedding: null
  },
@@ -77,7 +81,8 @@ We introduce RAG models where the parametric memory is a pre-trained seq2seq mod
  year: 2020,
  venue: "NeurIPS",
  citations: 12000,
- keywords: ["retrieval augmented generation", "RAG", "knowledge-intensive", "question answering", "information retrieval"]
  },
  embedding: null
  },
@@ -96,7 +101,8 @@ The framework enables several key capabilities: connecting LLMs to other data so
  year: 2022,
  venue: "Open Source",
  citations: 5000,
- keywords: ["LangChain", "LLM framework", "agents", "chains", "composability", "tools"]
  },
  embedding: null
  },
@@ -115,7 +121,8 @@ We then collect a dataset of rankings of model outputs, which we use to further
  year: 2022,
  venue: "NeurIPS",
  citations: 18000,
- keywords: ["RLHF", "instruction following", "human feedback", "alignment", "InstructGPT"]
  },
  embedding: null
  },
@@ -134,7 +141,272 @@ We survey over 100 papers and find that emergent abilities appear in various dom
  year: 2022,
  venue: "arXiv",
  citations: 7500,
- keywords: ["emergent abilities", "scaling", "large language models", "few-shot learning", "reasoning"]
  },
  embedding: null
  }
 
5
  import { type InsertDocument } from '@shared/schema';
6
 
7
  const defaultPapers: Omit<InsertDocument, 'id' | 'createdAt'>[] = [
8
+ // 🧠 Foundation & Scaling Laws
9
  {
10
  title: "Attention Is All You Need",
11
  content: `The Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.
 
21
  year: 2017,
22
  venue: "NIPS",
23
  citations: 85000,
24
+ keywords: ["attention mechanism", "transformer", "neural networks", "machine translation", "deep learning"],
25
+ theme: "Foundation & Scaling Laws"
26
  },
27
  embedding: null
28
  },
 
41
  year: 2023,
42
  venue: "arXiv",
43
  citations: 15000,
44
+ keywords: ["GPT-4", "large language model", "multimodal", "AI safety", "alignment"],
45
+ theme: "Foundation & Scaling Laws"
46
  },
47
  embedding: null
48
  },
 
61
  year: 2022,
62
  venue: "arXiv",
63
  citations: 8000,
64
+ keywords: ["constitutional AI", "AI safety", "harmlessness", "AI feedback", "alignment"],
65
+ theme: "Alignment & Safety"
66
  },
67
  embedding: null
68
  },
 
81
  year: 2020,
82
  venue: "NeurIPS",
83
  citations: 12000,
84
+ keywords: ["retrieval augmented generation", "RAG", "knowledge-intensive", "question answering", "information retrieval"],
85
+ theme: "Tool Use & Reasoning & Agents"
86
  },
87
  embedding: null
88
  },
 
101
  year: 2022,
102
  venue: "Open Source",
103
  citations: 5000,
104
+ keywords: ["LangChain", "LLM framework", "agents", "chains", "composability", "tools"],
105
+ theme: "Tool Use & Reasoning & Agents"
106
  },
107
  embedding: null
108
  },
 
121
  year: 2022,
122
  venue: "NeurIPS",
123
  citations: 18000,
124
+ keywords: ["RLHF", "instruction following", "human feedback", "alignment", "InstructGPT"],
125
+ theme: "Alignment & Safety"
126
  },
127
  embedding: null
128
  },
 
141
  year: 2022,
142
  venue: "arXiv",
143
  citations: 7500,
144
+ keywords: ["emergent abilities", "scaling", "large language models", "few-shot learning", "reasoning"],
145
+ theme: "Foundation & Scaling Laws"
146
+ },
147
+ embedding: null
148
+ },
149
+ // 🧠 Foundation & Scaling Laws - Additional Papers
150
+ {
151
+ title: "Training Compute-Optimal Large Language Models",
152
+ content: `We investigate the optimal model size and number of tokens for training a transformer language model under a given compute budget. We find that current large language models are significantly undertrained. For compute-optimal training, the model size and the number of training tokens should be scaled equally: for every doubling of model size the number of training tokens should also be doubled.
153
+
154
+ For training compute-optimal language models, we predict that Chinchilla, a 70-billion parameter model, should outperform Gopher (280B), GPT-3 (175B), Jurassic-1 (178B), and Megatron-Turing NLG (530B), which are all considerably larger. We test this hypothesis by training Chinchilla on the same dataset as Gopher but with four times fewer parameters and four times more data.
155
+
156
+ Chinchilla uniformly and significantly outperforms Gopher, GPT-3, Jurassic-1, and Megatron-Turing NLG on a large range of downstream evaluation tasks. This result has profound implications for choosing LLM training strategies going forward.`,
157
+ source: "DeepMind, Hoffmann et al.",
158
+ sourceType: "research",
159
+ url: "https://arxiv.org/abs/2203.15556",
160
+ metadata: {
161
+ authors: ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch", "Elena Buchatskaya", "Trevor Cai", "Eliza Rutherford", "Diego de Las Casas", "Lisa Anne Hendricks", "Johannes Welbl", "Aidan Clark", "Tom Hennigan", "Eric Noland", "Katie Millican", "George van den Driessche", "Bogdan Damoc", "Aurelia Guy", "Simon Osindero", "Karen Simonyan", "Erich Elsen", "Jack W. Rae", "Oriol Vinyals", "Laurent Sifre"],
162
+ year: 2022,
163
+ venue: "arXiv",
164
+ citations: 12000,
165
+ keywords: ["Chinchilla", "compute-optimal", "scaling laws", "training efficiency", "language models"],
166
+ theme: "Foundation & Scaling Laws"
167
+ },
168
+ embedding: null
169
+ },
170
+ {
171
+ title: "LLaMA: Open and Efficient Foundation Language Models",
172
+ content: `We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train LLaMA on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets.
173
+
174
+ In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.
175
+
176
+ Unlike Chinchilla, PaLM, or GPT-3, LLaMA only uses publicly available data, making our work compatible with open-source, while most existing models rely on data that is either not publicly available or undocumented (e.g. "Books - 2TB" or "Social media conversations"). There exist some exceptions, notably OPT and GLM, but none that are competitive with PaLM-62B or Chinchilla.`,
177
+ source: "Meta AI, Touvron et al.",
178
+ sourceType: "research",
179
+ url: "https://arxiv.org/abs/2302.13971",
180
+ metadata: {
181
+ authors: ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard", "Xavier Martinet", "Marie-Anne Lachaux", "Timothée Lacroix", "Baptiste Rozière", "Naman Goyal", "Eric Hambro", "Faisal Azhar", "Aurelien Rodriguez", "Armand Joulin", "Edouard Grave", "Guillaume Lample"],
182
+ year: 2023,
183
+ venue: "arXiv",
184
+ citations: 9000,
185
+ keywords: ["LLaMA", "open models", "foundation models", "efficiency", "democratization"],
186
+ theme: "Open Models & Democratization"
187
+ },
188
+ embedding: null
189
+ },
190
+ {
191
+ title: "DeepSeek-Coder: When the Large Language Model Meets Programming",
192
+ content: `We present DeepSeek-Coder, a series of code language models trained from scratch on 2 trillion tokens sourced from a high-quality programming corpus. We provide models with 1.3B, 5.7B, 6.7B, and 33B parameters. Our evaluation demonstrates that DeepSeek-Coder achieves state-of-the-art performance among open-source code models.
193
+
194
+ Notably, DeepSeek-Coder-Base-33B achieves 79.3% pass@1 on HumanEval, while DeepSeek-Coder-Instruct-33B reaches 78.6%. When DeepSeek-Coder-Base models are employed as base models for code completion in practical development environments, they demonstrate remarkable effectiveness.
195
+
196
+ DeepSeek-Coder comprises a range of models, each meticulously trained to excel in programming tasks. Beyond conventional code generation and comprehension, these models are optimized for practical software development scenarios, offering advanced code completion and repository-level understanding capabilities.`,
197
+ source: "DeepSeek AI, Guo et al.",
198
+ sourceType: "research",
199
+ url: "https://arxiv.org/abs/2401.14196",
200
+ metadata: {
201
+ authors: ["Daya Guo", "Qihao Zhu", "Dejian Yang", "Zhenda Xie", "Kai Dong", "Wentao Zhang", "Guanting Chen", "Xiao Bi", "Y. Wu", "YK Li", "Fuli Luo", "Yingfei Xiong", "Wenfeng Liang"],
202
+ year: 2024,
203
+ venue: "arXiv",
204
+ citations: 2500,
205
+ keywords: ["DeepSeek-Coder", "code generation", "programming", "software development", "open models"],
206
+ theme: "Open Models & Democratization"
207
+ },
208
+ embedding: null
209
+ },
210
+ {
211
+ title: "Mixtral of Experts",
212
+ content: `We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs.
213
+
214
+ Despite being a mixture of experts model with 45B total parameters, Mixtral only uses 12B parameters during inference, leading to better throughput at the same batch size and sequence length as Mistral 7B. Mixtral matches or outperforms Llama 2 70B on most benchmarks we tested.
215
+
216
+ On the HellaSwag, Arc, MMLU, TruthfulQA, Winogrande, and GSM8k benchmarks, Mixtral significantly outperforms Llama 2 13B and Llama 1 34B on all benchmarks. It either matches or outperforms Llama 2 70B on MMLU, GSM8k, and most other benchmarks.`,
217
+ source: "Mistral AI, Jiang et al.",
218
+ sourceType: "research",
219
+ url: "https://arxiv.org/abs/2401.04088",
220
+ metadata: {
221
+ authors: ["Albert Q. Jiang", "Alexandre Sablayrolles", "Antoine Roux", "Arthur Mensch", "Blanche Savary", "Chris Bamford", "Devendra Singh Chaplot", "Diego de las Casas", "Emma Bou Hanna", "Florian Bressand", "Gianna Lengyel", "Guillaume Bour", "Guillaume Lample", "Lélio Renard Lavaud", "Lucile Saulnier", "Marie-Anne Lachaux", "Pierre Stock", "Sandeep Subramanian", "Sophia Yang", "Szymon Antoniak", "Teven Le Scao", "Théophile Gervet", "Thibaut Lavril", "Thomas Wang", "Timothée Lacroix", "William El Sayed"],
222
+ year: 2024,
223
+ venue: "arXiv",
224
+ citations: 1800,
225
+ keywords: ["Mixtral", "mixture of experts", "efficiency", "sparse models", "Mistral"],
226
+ theme: "Open Models & Democratization"
227
+ },
228
+ embedding: null
229
+ },
230
+ // 🔐 Alignment & Safety - Additional Papers
231
+ {
232
+ title: "Teaching language models to support answers with verified quotes",
233
+ content: `We present GopherCite, a system for training language models to support their responses with evidence from reliable sources. Our approach involves fine-tuning a pre-trained language model to cite specific quotes from a corpus of reliable sources when answering questions.
234
+
235
+ We demonstrate that our approach significantly improves the factual accuracy of model responses while maintaining readability. When citations are provided, users can verify the information and better assess the trustworthiness of the model's claims. We establish new benchmarks for evaluating attribution quality.
236
+
237
+ Our work addresses critical challenges in deploying language models safely, particularly around misinformation and hallucination. By requiring models to ground their responses in verifiable sources, we reduce the risk of generating false or misleading information, which is essential for real-world applications.`,
238
+ source: "DeepMind, Menick et al.",
239
+ sourceType: "research",
240
+ url: "https://arxiv.org/abs/2203.11147",
241
+ metadata: {
242
+ authors: ["Jacob Menick", "Maja Trebacz", "Vladimir Mikulik", "John Aslanides", "Francis Song", "Martin Chadwick", "Mia Glaese", "Susannah Young", "Lucy Campbell-Gillingham", "Geoffrey Irving", "Nat McAleese"],
243
+ year: 2022,
244
+ venue: "arXiv",
245
+ citations: 3500,
246
+ keywords: ["Sparrow", "attribution", "factual accuracy", "citations", "safety"],
247
+ theme: "Alignment & Safety"
248
+ },
249
+ embedding: null
250
+ },
251
+ {
252
+ title: "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
253
+ content: `Chain-of-thought prompting has demonstrated remarkable performance on various reasoning tasks. However, it can still struggle with problems that require solving sub-problems sequentially. We introduce least-to-most prompting, a novel prompting strategy that enables large language models to break down complex problems into simpler subproblems and solve them sequentially.
254
+
255
+ Our approach consists of two stages: first, we prompt the model to decompose a complex problem into a series of simpler subproblems; then we solve these subproblems sequentially, using answers from previously solved subproblems to help solve the next subproblem.
256
+
257
+ We demonstrate that least-to-most prompting can significantly improve performance on tasks that require generalization to harder problems than those seen in the context. In particular, we achieve new state-of-the-art results on the tasks of symbolic manipulation, compositional generalization, and math word problems.`,
258
+ source: "Google Research, Zhou et al.",
259
+ sourceType: "research",
260
+ url: "https://arxiv.org/abs/2205.10625",
261
+ metadata: {
262
+ authors: ["Denny Zhou", "Nathanael Schärli", "Le Hou", "Jason Wei", "Nathan Scales", "Xuezhi Wang", "Dale Schuurmans", "Claire Cui", "Olivier Bousquet", "Quoc Le", "Ed Chi"],
263
+ year: 2022,
264
+ venue: "ICLR 2023",
265
+ citations: 4200,
266
+ keywords: ["task decomposition", "prompting", "complex reasoning", "problem solving", "compositional generalization"],
267
+ theme: "Alignment & Safety"
268
+ },
269
+ embedding: null
270
+ },
271
+ // 🎨 Multimodality & Vision-Language Models
272
+ {
273
+ title: "Learning Transferable Visual Models From Natural Language Supervision",
274
+ content: `State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision.
275
+
276
+ We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks.
277
+
278
+ We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without needing any dataset-specific training.`,
279
+ source: "OpenAI, Radford et al.",
280
+ sourceType: "research",
281
+ url: "https://arxiv.org/abs/2103.00020",
282
+ metadata: {
283
+ authors: ["Alec Radford", "Jong Wook Kim", "Chris Hallacy", "Aditya Ramesh", "Gabriel Goh", "Sandhini Agarwal", "Girish Sastry", "Amanda Askell", "Pamela Mishkin", "Jack Clark", "Gretchen Krueger", "Ilya Sutskever"],
284
+ year: 2021,
285
+ venue: "ICML",
286
+ citations: 25000,
287
+ keywords: ["CLIP", "multimodal", "vision-language", "zero-shot transfer", "contrastive learning"],
288
+ theme: "Multimodality & Vision-Language Models"
289
+ },
290
+ embedding: null
291
+ },
292
+ {
293
+ title: "Multimodal Neurons in Artificial Neural Networks",
294
+ content: `We document the presence of multimodal neurons in CLIP that respond to the same concept whether presented literally, symbolically, or conceptually. This offers one step toward understanding the associations and conceptual reasoning displayed by multimodal systems like CLIP.
295
+
296
+ By studying multimodal neurons, we can begin to understand how CLIP performs its remarkable zero-shot capabilities. We show examples of neurons that respond to concepts like "Spiderman" whether the image shows the character, text spelling the name, or even a spider, demonstrating the rich associations learned during training.
297
+
298
+ We also uncover neurons that can be exploited to adversarially attack the model. By understanding these failure modes through the lens of multimodal neurons, we provide insights into potential vulnerabilities and how they might be addressed. This work demonstrates the importance of interpretability research for understanding and improving multimodal AI systems.`,
299
+ source: "OpenAI, Goh et al.",
300
+ sourceType: "research",
301
+ url: "https://distill.pub/2021/multimodal-neurons/",
302
+ metadata: {
303
+ authors: ["Gabriel Goh", "Nick Cammarata", "Chelsea Voss", "Shan Carter", "Michael Petrov", "Ludwig Schubert", "Alec Radford", "Chris Olah"],
304
+ year: 2021,
305
+ venue: "Distill",
306
+ citations: 1200,
307
+ keywords: ["multimodal neurons", "CLIP", "interpretability", "concept learning", "adversarial examples"],
308
+ theme: "Multimodality & Vision-Language Models"
309
+ },
310
+ embedding: null
311
+ },
312
+ {
313
+ title: "DeepSeek-VL: Towards Real-World Vision-Language Understanding",
314
+ content: `We present DeepSeek-VL, an open-source Vision-Language (VL) Model designed for real-world vision and language understanding applications. DeepSeek-VL possesses general multimodal understanding capabilities, experiencing significant improvements when fine-tuned for specific tasks.
315
+
316
+ DeepSeek-VL family consists of 1.3B and 7B models, both trained from scratch with a carefully designed data curriculum that includes both text-only and vision-text data. For the vision encoder, we explore different approaches and introduce an efficient hybrid vision encoder that balances performance and efficiency.
317
+
318
+ In most benchmarks, DeepSeek-VL shows superior or competitive performance compared to existing open-source dense and MoE vision-language models with similar model sizes, and even surpasses some larger models. We also provide comprehensive analysis on training strategies, model architecture choices, and scaling effects to facilitate future research in this direction.`,
319
+ source: "DeepSeek AI, Lu et al.",
320
+ sourceType: "research",
321
+ url: "https://arxiv.org/abs/2403.05525",
322
+ metadata: {
323
+ authors: ["Haoyu Lu", "Wen Liu", "Bo Zhang", "Bingxuan Wang", "Kai Dong", "Bo Liu", "Jingxiang Sun", "Tongzheng Ren", "Zhuoshu Li", "Hao Yang", "Yaofeng Sun", "Chengqi Deng", "Hanwei Xu", "Zhenda Xie", "Chong Ruan"],
324
+ year: 2024,
325
+ venue: "arXiv",
326
+ citations: 800,
327
+ keywords: ["DeepSeek-VL", "vision-language", "multimodal understanding", "open-source", "real-world applications"],
328
+ theme: "Multimodality & Vision-Language Models"
329
+ },
330
+ embedding: null
331
+ },
332
+ {
333
+ title: "Gemini: A Family of Highly Capable Multimodal Models",
334
+ content: `This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. Gemini models are trained on a diverse dataset of text, code, and multimodal data, using both supervised and reinforcement learning from human feedback (RLHF).
335
+
336
+ Gemini Ultra's performance exceeds current state-of-the-art results on 30 of 32 widely-used academic benchmarks used in large language model (LLM) research. We believe that the new capabilities of Gemini models in cross-modal reasoning and language understanding will enable a wide variety of use cases and we discuss our approach toward deploying them responsibly to users.
337
+
338
+ The largest model, Gemini Ultra, achieves new state-of-the-art performance on challenging benchmarks like MMLU, where it becomes the first model to exceed human expert level performance. Gemini models also demonstrate strong performance on multimodal reasoning tasks that require understanding and reasoning over images, videos, and audio in combination with text.`,
339
+ source: "Google DeepMind, Team et al.",
340
+ sourceType: "research",
341
+ url: "https://arxiv.org/abs/2312.11805",
342
+ metadata: {
343
+ authors: ["Gemini Team", "Rohan Anil", "Sebastian Borgeaud", "Yonghui Wu", "Jean-Baptiste Alayrac", "Jiahui Yu", "Radu Soricut", "Johan Schalkwyk", "Andrew M. Dai", "Anja Hauth", "Katie Millican", "David Silver", "Slav Petrov", "Melvin Johnson", "Ioannis Antonoglou", "Julian Schrittwieser", "Amelia Glaese", "Jilin Chen", "Emily Pitler", "Timothy Lillicrap", "Angeliki Lazaridou", "Orhan Firat", "James Molloy", "Michael Isard", "Paul R. Barham", "Tom Hennigan", "Benjamin Lee", "Fabio Viola", "Malcolm Reynolds", "Yuanzhong Xu", "Ryan Doherty", "Eli Collins", "Clemens Meyer", "Eliza Rutherford", "Erica Moreira", "Kareem Ayoub", "Megha Goel", "George Tucker", "Enrique Piqueras", "Maxim Krikun", "Iain Barr", "Nikolay Savinov", "Ivo Danihelka", "Becca Roelofs", "Anaïs White", "Anders Andreassen", "Tamara von Glehn", "Lakshman Yagati", "Mehran Kazemi", "Lucas Gonzalez", "Misha Khalman", "Jakub Sygnowski", "Alexandre Frechette", "Charlotte Smith", "Laura Culp", "Lev Proleev", "Yi Luan", "Xi Chen", "James Lottes", "Nathan Schucher", "Federico Lebron", "Alban Rrustemi", "Natasha Clay", "Phil Crone", "Tomas Kocisky", "Jeffrey Zhao", "Bartek Perz", "Dian Yu", "Heidi Howard", "Adam Bloniarz", "Jack W. Rae", "Han Lu", "Laurent Sifre", "Marcello Maggioni", "Fred Alcober", "Dan Garrette", "Megan Barnes", "Shantanu Thakoor", "Jacob Austin", "Gabriel Barth-Maron", "William Wong", "Rishabh Joshi", "Rahma Chaabouni", "Deeni Fatiha", "Arun Ahuja", "Ruibo Liu", "Yunxuan Li", "Sarah Cogan", "Jeremy Chen", "Chao Jia", "Chenjie Gu", "Qiao Zhang", "Jordan Grimstad", "Ale Jakse Hartman", "Martin Chadwick", "Gaurav Singh Tomar", "Xavier Garcia", "Evan Senter", "Emanuel Taropa", "Thanumalayan Sankaranarayana Pillai", "Jacob Devlin", "Michael Laskin", "Diego de Las Casas", "Dasha Valter", "Connie Tao", "Lorenzo Blanco", "Adrià Puigdomènech Badia", "David Reitter", "Mianna Chen", "Jenny Brennan", "Clara Rivera", "Sergey Brin", "Shariq Hashme", "Mario Garrido", "Justin Gilmer", "Carl Saroufim", "James Molloy", "Cosmo Du", "Eli Bixby", "Orhan Firat", "Matthew Kelcey", "Sushant Prakash", "Huaixiu Steven Zheng", "Bradley Green", "Ewa Olecka", "Petko Georgiev", "Nate Attaluri", "Matthew Lamm", "Luyu Wang", "Chenkai Kuang", "Jason Riesa", "Abhanshu Sharma", "Nick Fernando", "Behnam Neyshabur", "Noah Fiedel", "Erica Oliveira", "Lem Asaba", "Alexander Chen", "Aakanksha Chowdhery", "Marie Pellat", "Jacob Eisenstein", "Adam Roberts", "Hyung Won Chung", "Henryk Michalewski", "Charlie Chen", "Ankesh Anand", "Shibo Wang", "Anton Ruiz", "Honglei Liu", "Libin Bai", "Andre Saraiva", "Andrew Dai", "Diogo de Freitas Amorim", "Ben Hutchinson", "Reiner Pope", "James Bradbury", "Jacob Austin", "Michael Isard", "Guy Gur-Ari", "Pengcheng Yin", "Toju Duke", "Anselm Levskaya", "Sanjay Ghemawat", "Sunipa Dev", "Henryk Michalewski", "Xavier Garcia", "Vedant Misra", "Kevin Robinson", "Liam Fedus", "Denny Zhou", "Daphne Ippolito", "David Luan", "Hyeontaek Lim", "Barret Zoph", "Alexander Spiridonov", "Ryan Sepassi", "David Dohan", "Shivani Agrawal", "Mark Omernick", "Andrew M. 
Dai", "Thanumalayan Sankaranarayana Pillai", "Marie Pellat", "Aida Nematzadeh", "Dmitry Lepikhin", "Henryk Michalewski", "Aakanksha Chowdhery", "Sharan Narang", "Jacob Menick", "Sebastian Borgeaud", "Andy Brock", "Aidan Clark", "Karen Simonyan", "Melvin Johnson", "Ioannis Antonoglou", "Rohan Anil", "Tom Hennigan", "Jacob Menick", "Sharan Narang", "Arthur Mensch", "Saffron Huang", "Liam Fedus", "Adam Roberts", "Jascha Sohl-Dickstein", "Dani Yogatama", "James Bradbury", "Ioannis Antonoglou", "Tom Hennigan", "Omar Shaikh", "Shivani Agrawal", "Ryan Sepassi", "Alexander Ratner", "Tom Hennigan", "Peter J. Liu", "Sharan Narang", "Hyung Won Chung", "Michael Fink", "Noah Constant", "Adam Roberts", "Colin Raffel"],
344
+ year: 2023,
345
+ venue: "arXiv",
346
+ citations: 6000,
347
+ keywords: ["Gemini", "multimodal", "state-of-the-art", "MMLU", "cross-modal reasoning"],
348
+ theme: "Multimodality & Vision-Language Models"
349
+ },
350
+ embedding: null
351
+ },
352
+ // 🤖 Tool Use, Reasoning & Agents - Additional Papers
353
+ {
354
+ title: "Toolformer: Language Models Can Teach Themselves to Use Tools",
355
+ content: `Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and more reliable alternatives exist. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds.
356
+
357
+ We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q&A system, two different search engines, a translation system, and a calendar.
358
+
359
+ Toolformer achieves substantially improved zero-shot performance across a variety of downstream tasks, often competitive with much larger models, without sacrificing its core language modeling abilities. Our approach represents an important step toward LMs that can use external tools in a more sophisticated and autonomous way.`,
360
+ source: "Meta AI, Schick et al.",
361
+ sourceType: "research",
362
+ url: "https://arxiv.org/abs/2302.04761",
363
+ metadata: {
364
+ authors: ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"],
365
+ year: 2023,
366
+ venue: "arXiv",
367
+ citations: 3800,
368
+ keywords: ["Toolformer", "tool use", "API integration", "language models", "self-supervised learning"],
369
+ theme: "Tool Use & Reasoning & Agents"
370
+ },
371
+ embedding: null
372
+ },
373
+ {
374
+ title: "ReAct: Synergizing Reasoning and Acting in Language Models",
375
+ content: `While large language models (LLMs) have demonstrated impressive capabilities across tasks in language understanding and interactive decision making, their abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action plan generation) have primarily been studied as separate topics. In this paper, we explore the use of LLMs to generate both reasoning traces and task-specific actions in an interleaved manner.
376
+
377
+ We present ReAct, a general paradigm that combines reasoning and acting with language models. ReAct prompts LLMs to generate verbal reasoning traces and actions for a task, which allows for greater synergy between the two: reasoning traces help the model induce, track, and update action plans as well as handle exceptions, while actions allow it to interface with external sources, such as knowledge bases or environments, to gather additional information.
378
+
379
+ We apply ReAct to a diverse set of language and decision making tasks and demonstrate its effectiveness over state-of-the-art baselines, as well as improved human interpretability and trustworthiness over methods without reasoning or acting components. Concretely, on question answering (HotpotQA) and fact verification (Fever), ReAct overcomes issues of hallucination and error propagation prevalent in chain-of-thought reasoning by interacting with a simple Wikipedia API, and generates human-like task-solving trajectories that are more interpretable than baselines without reasoning traces.`,
380
+ source: "Princeton University, Yao et al.",
381
+ sourceType: "research",
382
+ url: "https://arxiv.org/abs/2210.03629",
383
+ metadata: {
384
+ authors: ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
385
+ year: 2022,
386
+ venue: "ICLR 2023",
387
+ citations: 5200,
388
+ keywords: ["ReAct", "reasoning", "acting", "language models", "decision making"],
389
+ theme: "Tool Use & Reasoning & Agents"
390
+ },
391
+ embedding: null
392
+ },
393
+ {
394
+ title: "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
395
+ content: `Large "instruction-tuned" language models (i.e., finetuned to respond to instructions) have demonstrated a remarkable ability to generalize zero-shot to new tasks. Nevertheless, they depend heavily on human-written instruction data that is often limited in quantity, diversity, and creativity, therefore hindering the generality of the tuned model. We introduce Self-Instruct, a framework for improving the instruction-following capabilities of pretrained language models by bootstrapping off their own generations.
396
+
397
+ Our pipeline generates instructions, input, and output samples from a language model, then filters invalid or similar ones before using them to finetune the original model. Applying our method to the vanilla GPT3, we demonstrate a 33% absolute improvement over the original model on Super-NaturalInstructions, on par with the performance of InstructGPT_{001}, which was trained with private user data and human annotations.
398
+
399
+ For further evaluation, we curate a set of expert-written instructions for novel tasks, and show through human evaluation that tuning GPT3 with Self-Instruct outperforms using existing public instruction datasets by a large margin, leaving only a 5% absolute gap behind InstructGPT_{001}. Self-Instruct provides an almost annotation-free method for aligning language models with instructions, and we release our large synthetic dataset to facilitate future work on instruction tuning.`,
400
+ source: "University of Washington, Wang et al.",
401
+ sourceType: "research",
402
+ url: "https://arxiv.org/abs/2212.10560",
403
+ metadata: {
404
+ authors: ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra", "Alisa Liu", "Noah A. Smith", "Daniel Khashabi", "Hannaneh Hajishirzi"],
405
+ year: 2022,
406
+ venue: "ACL 2023",
407
+ citations: 4500,
408
+ keywords: ["Self-Instruct", "instruction tuning", "bootstrapping", "synthetic data", "alignment"],
409
+ theme: "Tool Use & Reasoning & Agents"
410
  },
411
  embedding: null
412
  }
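
The last hunk ends at the final added entry, so the closing of the defaultPapers array and the routine that actually writes these records are not visible in this diff. For orientation only, here is a minimal sketch of how such a seeding step could use the storage module imported at the top of the file; getDocuments() and createDocument() are assumed method names for illustration, not APIs confirmed by this commit.

// Hedged sketch, not the committed implementation: the real API surface of
// `storage` is not shown in this diff, so the method names below are assumptions.
import { storage } from './storage';
import { type InsertDocument } from '@shared/schema';

type SeedPaper = Omit<InsertDocument, 'id' | 'createdAt'>;

export async function seedDefaultDocuments(papers: SeedPaper[]): Promise<void> {
  // Skip seeding if documents already exist (assumed accessor).
  const existing = await storage.getDocuments();
  if (existing.length > 0) return;

  for (const paper of papers) {
    // Each seed record already carries title, content, source, url and the
    // metadata block (authors, year, venue, citations, keywords, theme);
    // embedding stays null until it is computed elsewhere.
    await storage.createDocument(paper); // assumed mutator
  }
}

Grouping the seeded papers by the newly added theme field (e.g. "Foundation & Scaling Laws", "Alignment & Safety") then only requires reading metadata.theme from each stored document.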