push
This view is limited to 50 files because it contains too many changes.
- app.py +161 -0
- model/paraphrase-MiniLM-L6-v2/.gitattributes +17 -0
- model/paraphrase-MiniLM-L6-v2/1_Pooling/config.json +7 -0
- model/paraphrase-MiniLM-L6-v2/README.md +108 -0
- model/paraphrase-MiniLM-L6-v2/config.json +24 -0
- model/paraphrase-MiniLM-L6-v2/config_sentence_transformers.json +7 -0
- model/paraphrase-MiniLM-L6-v2/model.safetensors +3 -0
- model/paraphrase-MiniLM-L6-v2/modules.json +14 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_O1.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_O2.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_O3.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_O4.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx +3 -0
- model/paraphrase-MiniLM-L6-v2/openvino/openvino_model.bin +3 -0
- model/paraphrase-MiniLM-L6-v2/openvino/openvino_model.xml +0 -0
- model/paraphrase-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin +3 -0
- model/paraphrase-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml +0 -0
- model/paraphrase-MiniLM-L6-v2/pytorch_model.bin +3 -0
- model/paraphrase-MiniLM-L6-v2/sentence_bert_config.json +4 -0
- model/paraphrase-MiniLM-L6-v2/special_tokens_map.json +1 -0
- model/paraphrase-MiniLM-L6-v2/tf_model.h5 +3 -0
- model/paraphrase-MiniLM-L6-v2/tokenizer.json +0 -0
- model/paraphrase-MiniLM-L6-v2/tokenizer_config.json +1 -0
- model/paraphrase-MiniLM-L6-v2/vocab.txt +0 -0
- requirements.txt +22 -0
- src/DeepThink/__pycache__/__init__.cpython-311.pyc +0 -0
- src/DeepThink/__pycache__/engine.cpython-311.pyc +0 -0
- src/DeepThink/engine.py +285 -0
- src/DeepThink/modules/__pycache__/__init__.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/article_generation.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/article_generation.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/article_polish.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/article_polish.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/interface.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/interface.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/mindmap.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/mindmap.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/outline_generation.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/outline_generation.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/retriever.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/storm_dataclass.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/storm_dataclass.cpython-311.pyc +0 -0
- src/DeepThink/modules/__pycache__/utils.cpython-310.pyc +0 -0
- src/DeepThink/modules/__pycache__/utils.cpython-311.pyc +0 -0
- src/DeepThink/modules/article_generation.py +523 -0
- src/DeepThink/modules/article_polish.py +417 -0
app.py
ADDED
@@ -0,0 +1,161 @@
import time
import json

import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from http import HTTPStatus

from src.lm import QwenModel
from src.rm import GoogleSearchAli_new
import sys
sys.path.append('./src/DeepThink/modules')
from mindmap import MindMap
from storm_dataclass import Article

from article_generation import ArticleGenerationModule
from article_polish import ArticlePolishingModule
from outline_generation import OutlineGenerationModule

import os

import subprocess
# Upgrade pip in a background process.
bash_command = "pip install --upgrade pip"
process = subprocess.Popen(bash_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

# Load environment variables and API keys
# load_dotenv()

openai_kwargs = {
    'api_key': os.getenv("OPENAI_API_KEY"),
    'api_provider': os.getenv('OPENAI_API_TYPE'),
    'temperature': 1.0,
    'top_p': 0.9,
    'api_base': os.getenv('AZURE_API_BASE'),
    'api_version': os.getenv('AZURE_API_VERSION'),
}

lm = QwenModel(model='qwen-plus', max_tokens=1000, **openai_kwargs)
lm4outline = QwenModel(model='qwen-plus', max_tokens=1000, **openai_kwargs)
lm4gensection = QwenModel(model='qwen-plus', max_tokens=2000, **openai_kwargs)
lm4polish = QwenModel(model='qwen-plus', max_tokens=4000, **openai_kwargs)

rm = GoogleSearchAli_new(k=5)

st.set_page_config(page_title='OmniThink', layout="wide")

st.warning("Announcement: Due to the recent high volume of visitors and search API quota limitations, you may encounter the error: "
           "'ValueError: Expected 2D array, got 1D array instead: array=[]. "
           "Reshape your data either using array.reshape(-1, 1) if your data has a single feature "
           "or array.reshape(1, -1) if it contains a single sample.' "
           "If this error occurs, please try again in a few hours.")

st.title('🤔 OmniThink')
st.markdown('_OmniThink is a tool that helps you think deeply about a topic, generate an outline, and write an article._')

# Sidebar for configuration and examples
with st.sidebar:
    st.header('Configuration')
    MAX_ROUNDS = st.number_input('Retrieval depth', min_value=0, max_value=10, value=2, step=1)
    models = ['Qwen-Plus', 'Coming Soon']
    selected_model = st.selectbox('LLM', models)
    searchers = ['GoogleSearch', 'Coming Soon']
    selected_searcher = st.selectbox('Search engine', searchers)

    n_max_doc = st.number_input('Number of web pages retrieved in a single search', min_value=1, max_value=50, value=10, step=5)
    st.header('Examples')
    examples = ['AlphaFold', '2024 Hualien City Earthquake', 'Taylor Swift', 'Yoon Seok-youl']
    selected_example = st.selectbox('Case', examples)
    status_placeholder = st.empty()

mind_map = MindMap(
    retriever=rm,
    gen_concept_lm=lm4outline,
    gen_concept_lm2=lm4outline,
    search_top_k=n_max_doc,
    depth=MAX_ROUNDS
)

def Think(input_topic):
    generator = mind_map.build_map(input_topic)

    st.markdown(f'Performing an in-depth search on the content related to {input_topic}...')

    for idx, layer in enumerate(generator):
        print(layer)
        st.markdown(f'Deep thinking retrieval at level {idx + 1}...')
        status_placeholder.text(f"Currently conducting level {idx + 1} of deep thinking retrieval, estimated to take {(idx + 1) * 3} minutes.")
        for node in layer:
            category = node.category
            print(f'category: {category}')
            with st.expander(f'{category}'):
                st.markdown(f'### The concept of {node.category}')
                print(node.concept)
                for concept in node.concept:
                    st.markdown(f'* {concept}')
                st.markdown(f'### The web pages of {node.category}')
                for info_idx, info in enumerate(node.info):
                    st.markdown(f'{info_idx + 1}. {info["title"]} \n {info["snippets"]}')

    st.markdown(f'Constructing an index table for the {mind_map.get_web_number()} retrieved web pages...')
    mind_map.prepare_table_for_retrieval()
    return '__finish__', '__finish__'

def GenOutline(input_topic):
    status_placeholder.text("Outline writing is in progress and is expected to take 1 minute.")
    ogm = OutlineGenerationModule(lm)
    outline = ogm.generate_outline(topic=input_topic, mindmap=mind_map)

    return outline

def GenArticle(input_topic, outline):
    status_placeholder.text("Article writing is in progress and is expected to take 3 minutes.")

    article_with_outline = Article.from_outline_str(topic=input_topic, outline_str=outline)
    ag = ArticleGenerationModule(retriever=rm, article_gen_lm=lm, retrieve_top_k=3, max_thread_num=10)
    article = ag.generate_article(topic=input_topic, mindmap=mind_map, article_with_outline=article_with_outline)
    ap = ArticlePolishingModule(article_gen_lm=lm, article_polish_lm=lm)
    article = ap.polish_article(topic=input_topic, draft_article=article)
    return article.to_string()

with st.form('my_form'):
    topic = st.text_input('Please enter the topic you are interested in.', value=selected_example, placeholder='Please enter the topic you are interested in.')
    submit_button = st.form_submit_button('Generate!')

if submit_button:
    if topic:
        st.markdown('### Thought process')
        summary, news_timeline = Think(topic)
        st.session_state.summary = summary
        st.session_state.news_timeline = news_timeline

        st.markdown('### Outline generation')
        with st.expander("Outline generation", expanded=True):
            outline = GenOutline(topic)
            st.text(outline)

        st.markdown('### Article generation')
        with st.expander("Article generation", expanded=True):
            article = GenArticle(topic, outline)
            st.markdown(article)
    else:
        st.error('Please enter a topic.')
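A minimal headless sketch of the same Think → GenOutline → GenArticle flow, for smoke-testing outside Streamlit. It only reuses names and call signatures that appear in `app.py` above and assumes the same environment variables are set; it is not part of the committed code.

```python
# Hypothetical smoke test mirroring app.py without the Streamlit UI.
# Assumes OPENAI_API_KEY / OPENAI_API_TYPE / AZURE_API_BASE / AZURE_API_VERSION are set.
import os
import sys

sys.path.append('./src/DeepThink/modules')

from src.lm import QwenModel
from src.rm import GoogleSearchAli_new
from mindmap import MindMap
from storm_dataclass import Article
from article_generation import ArticleGenerationModule
from article_polish import ArticlePolishingModule
from outline_generation import OutlineGenerationModule

openai_kwargs = {
    'api_key': os.getenv("OPENAI_API_KEY"),
    'api_provider': os.getenv('OPENAI_API_TYPE'),
    'temperature': 1.0,
    'top_p': 0.9,
    'api_base': os.getenv('AZURE_API_BASE'),
    'api_version': os.getenv('AZURE_API_VERSION'),
}
lm = QwenModel(model='qwen-plus', max_tokens=1000, **openai_kwargs)
rm = GoogleSearchAli_new(k=5)

mind_map = MindMap(retriever=rm, gen_concept_lm=lm, gen_concept_lm2=lm,
                   search_top_k=10, depth=2)

topic = 'AlphaFold'
for layer in mind_map.build_map(topic):   # deep-thinking retrieval, level by level
    pass
mind_map.prepare_table_for_retrieval()

outline = OutlineGenerationModule(lm).generate_outline(topic=topic, mindmap=mind_map)
draft = ArticleGenerationModule(retriever=rm, article_gen_lm=lm,
                                retrieve_top_k=3, max_thread_num=10).generate_article(
    topic=topic, mindmap=mind_map,
    article_with_outline=Article.from_outline_str(topic=topic, outline_str=outline))
article = ArticlePolishingModule(article_gen_lm=lm, article_polish_lm=lm).polish_article(
    topic=topic, draft_article=draft)
print(article.to_string())
```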
model/paraphrase-MiniLM-L6-v2/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
model.safetensors filter=lfs diff=lfs merge=lfs -text
model/paraphrase-MiniLM-L6-v2/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 384,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
model/paraphrase-MiniLM-L6-v2/README.md
ADDED
@@ -0,0 +1,108 @@
---
license: apache-2.0
library_name: sentence-transformers
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers
pipeline_tag: sentence-similarity
---

# sentence-transformers/paraphrase-MiniLM-L6-v2

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
embeddings = model.encode(sentences)
print(embeddings)
```
+
|
38 |
+
|
39 |
+
## Usage (HuggingFace Transformers)
|
40 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
41 |
+
|
42 |
+
```python
|
43 |
+
from transformers import AutoTokenizer, AutoModel
|
44 |
+
import torch
|
45 |
+
|
46 |
+
|
47 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
48 |
+
def mean_pooling(model_output, attention_mask):
|
49 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
50 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
51 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
52 |
+
|
53 |
+
|
54 |
+
# Sentences we want sentence embeddings for
|
55 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
56 |
+
|
57 |
+
# Load model from HuggingFace Hub
|
58 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
59 |
+
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
60 |
+
|
61 |
+
# Tokenize sentences
|
62 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
63 |
+
|
64 |
+
# Compute token embeddings
|
65 |
+
with torch.no_grad():
|
66 |
+
model_output = model(**encoded_input)
|
67 |
+
|
68 |
+
# Perform pooling. In this case, max pooling.
|
69 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
70 |
+
|
71 |
+
print("Sentence embeddings:")
|
72 |
+
print(sentence_embeddings)
|
73 |
+
```
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
+
## Evaluation Results
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/paraphrase-MiniLM-L6-v2)
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
## Full Model Architecture
|
86 |
+
```
|
87 |
+
SentenceTransformer(
|
88 |
+
(0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
|
89 |
+
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
|
90 |
+
)
|
91 |
+
```
|
92 |
+
|
93 |
+
## Citing & Authors
|
94 |
+
|
95 |
+
This model was trained by [sentence-transformers](https://www.sbert.net/).
|
96 |
+
|
97 |
+
If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
|
98 |
+
```bibtex
|
99 |
+
@inproceedings{reimers-2019-sentence-bert,
|
100 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
101 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
102 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
103 |
+
month = "11",
|
104 |
+
year = "2019",
|
105 |
+
publisher = "Association for Computational Linguistics",
|
106 |
+
url = "http://arxiv.org/abs/1908.10084",
|
107 |
+
}
|
108 |
+
```
|
model/paraphrase-MiniLM-L6-v2/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "_name_or_path": "old_models/paraphrase-MiniLM-L6-v2/0_Transformer",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.7.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
model/paraphrase-MiniLM-L6-v2/config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.0.0",
    "transformers": "4.7.0",
    "pytorch": "1.9.0+cu102"
  }
}
model/paraphrase-MiniLM-L6-v2/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2ce4480dc3b2f8edeee50c43765c72768e79fc0113d3f73773dded4887cca298
size 90868373
model/paraphrase-MiniLM-L6-v2/modules.json
ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
model/paraphrase-MiniLM-L6-v2/onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:441a5dc61ff3b889892feeb7aa0400518cc9908603209c45861ba3abef3006bc
size 90405214
model/paraphrase-MiniLM-L6-v2/onnx/model_O1.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e4b50dcc09ca71accf34b7c7d843ad157499bb2e2c7f7a9b9bc1bbb720147ce6
size 90360328
model/paraphrase-MiniLM-L6-v2/onnx/model_O2.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d99151ccdfb700fb278610e060f3debf615b59da275ba5784385d49c8b8e8e9c
size 90326566
model/paraphrase-MiniLM-L6-v2/onnx/model_O3.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:053f244528154eef2db50c676536cdf0ab1e9cba20693ad8c9d83cb592126072
size 90326497
model/paraphrase-MiniLM-L6-v2/onnx/model_O4.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:891a315753191d6dfb32c193615187456e33ee52d6425e0ad8dac2d086350f81
size 45212349
model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ccc4bb68331e8410226d021ed709d6f2db3b0b25a43504828fa1d54fc6f7b3b3
size 23026053
model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ccc4bb68331e8410226d021ed709d6f2db3b0b25a43504828fa1d54fc6f7b3b3
size 23026053
model/paraphrase-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ccc4bb68331e8410226d021ed709d6f2db3b0b25a43504828fa1d54fc6f7b3b3
size 23026053
model/paraphrase-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d2e9f4180455601b4ebb64602ba667f551c87f16e791550479346851b6e4787
size 23046789
model/paraphrase-MiniLM-L6-v2/openvino/openvino_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c6005063ac5c88df685065089e887719f43956959a2080c7b9467bc17924645d
size 90265744
model/paraphrase-MiniLM-L6-v2/openvino/openvino_model.xml
ADDED
The diff for this file is too large to render.
model/paraphrase-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f036c75118e1df8040b4be3d5b7589ae1f1bb0c1f0f5d666b9bd317a2c8014d5
size 22933664
model/paraphrase-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml
ADDED
The diff for this file is too large to render.
model/paraphrase-MiniLM-L6-v2/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d716de760acbdc09e79a11e718c5606e0812b6aeb76c6664cba876d174e3ecd
size 90895153
model/paraphrase-MiniLM-L6-v2/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 128,
  "do_lower_case": false
}
model/paraphrase-MiniLM-L6-v2/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
model/paraphrase-MiniLM-L6-v2/tf_model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee09134d6f68fddf22606d3bc296855df94e527bbfe1555151e4b9613564a218
size 91005696
model/paraphrase-MiniLM-L6-v2/tokenizer.json
ADDED
The diff for this file is too large to render.
model/paraphrase-MiniLM-L6-v2/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "model_max_length": 512}
model/paraphrase-MiniLM-L6-v2/vocab.txt
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,22 @@
dspy_ai==2.4.9
wikipedia==1.4.0
sentence-transformers
toml
langchain-text-splitters
trafilatura
langchain-huggingface
qdrant-client
langchain-qdrant
numpy==1.26.4
dashscope
beautifulsoup4
streamlit==1.37.1
python-dotenv
streamlit-vis-timeline==0.3.0
tilse
jsonlines
rank-bm25
transformers
litellm
lxml
lxml_html_clean
src/DeepThink/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (223 Bytes).
src/DeepThink/__pycache__/engine.cpython-311.pyc
ADDED
Binary file (18.2 kB).
src/DeepThink/engine.py
ADDED
@@ -0,0 +1,285 @@
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Union, Literal, Optional

import dspy
from interface import Engine, LMConfigs
from lm import OpenAIModel

class LMConfigs():
    """Configurations for LLM used in different parts of STORM.

    Given that different parts in STORM framework have different complexity, we use different LLM configurations
    to achieve a balance between quality and efficiency. If no specific configuration is provided, we use the default
    setup in the paper.
    """

    def __init__(self):
        self.conv_simulator_lm = None  # LLM used in conversation simulator except for question asking.
        self.question_asker_lm = None  # LLM used in question asking.
        self.outline_gen_lm = None  # LLM used in outline generation.
        self.article_gen_lm = None  # LLM used in article generation.
        self.article_polish_lm = None  # LLM used in article polishing.

    def set_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.lm = model

    def set_conv_simulator_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.conv_simulator_lm = model

    def set_question_asker_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.question_asker_lm = model

    def set_outline_gen_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.outline_gen_lm = model

    def set_article_gen_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.article_gen_lm = model

    def set_article_polish_lm(self, model: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.article_polish_lm = model


@dataclass
class RunnerArguments:
    """Arguments for controlling the STORM Wiki pipeline."""
    output_dir: str = field(
        metadata={"help": "Output directory for the results."},
    )
    max_conv_turn: int = field(
        default=3,
        metadata={"help": "Maximum number of questions in conversational question asking."},
    )
    max_perspective: int = field(
        default=3,
        metadata={"help": "Maximum number of perspectives to consider in perspective-guided question asking."},
    )
    max_search_queries_per_turn: int = field(
        default=3,
        metadata={"help": "Maximum number of search queries to consider in each turn."},
    )
    disable_perspective: bool = field(
        default=False,
        metadata={"help": "If True, disable perspective-guided question asking."},
    )
    search_top_k: int = field(
        default=3,
        metadata={"help": "Top k search results to consider for each search query."},
    )
    retrieve_top_k: int = field(
        default=3,
        metadata={"help": "Top k collected references for each section title."},
    )
    max_thread_num: int = field(
        default=10,
        metadata={"help": "Maximum number of threads to use. "
                          "Consider reducing it if keep getting 'Exceed rate limit' error when calling LM API."},
    )

class Runner():
    """STORM Wiki pipeline runner."""

    def __init__(self,
                 args: RunnerArguments,
                 lm_configs: LMConfigs,
                 rm):
        super().__init__(lm_configs=lm_configs)
        self.args = args
        self.lm_configs = lm_configs

        self.retriever = StormRetriever(rm=rm, k=self.args.retrieve_top_k)
        storm_persona_generator = StormPersonaGenerator(self.lm_configs.question_asker_lm)
        self.storm_knowledge_curation_module = StormKnowledgeCurationModule(
            retriever=self.retriever,
            persona_generator=storm_persona_generator,
            conv_simulator_lm=self.lm_configs.conv_simulator_lm,
            question_asker_lm=self.lm_configs.question_asker_lm,
            max_search_queries_per_turn=self.args.max_search_queries_per_turn,
            search_top_k=self.args.search_top_k,
            max_conv_turn=self.args.max_conv_turn,
            max_thread_num=self.args.max_thread_num
        )

        self.storm_outline_generation_module = StormOutlineGenerationModule(
            outline_gen_lm=self.lm_configs.outline_gen_lm
        )

        self.storm_article_generation = StormArticleGenerationModule(
            article_gen_lm=self.lm_configs.article_gen_lm,
            retrieve_top_k=self.args.retrieve_top_k,
            max_thread_num=self.args.max_thread_num,
            retriever=self.retriever
        )

        self.storm_article_polishing_module = StormArticlePolishingModule(
            article_gen_lm=self.lm_configs.article_gen_lm,
            article_polish_lm=self.lm_configs.article_polish_lm
        )

        self.lm_configs.init_check()
        self.apply_decorators()

    def run_knowledge_curation_module(self,
                                      ground_truth_url: str = "None",
                                      ) -> StormInformationTable:
        # First entry point; the topic here is still the original topic. information_table holds both
        # all the conversation dialogues and the dict mapping every URL to its snippets.
        information_table, conversation_log = self.storm_knowledge_curation_module.research(
            topic=self.topic,
            ground_truth_url=ground_truth_url,
            callback_handler=callback_handler,
            max_perspective=self.args.max_perspective,
            disable_perspective=False,
            return_conversation_log=True
        )

        FileIOHelper.dump_json(conversation_log, os.path.join(self.article_output_dir, 'conversation_log.json'))
        information_table.dump_url_to_info(os.path.join(self.article_output_dir, 'raw_search_results.json'))

        return information_table

    def run_outline_generation_module(self,
                                      information_table: StormInformationTable,
                                      callback_handler: BaseCallbackHandler = None) -> StormArticle:

        outline, draft_outline = self.storm_outline_generation_module.generate_outline(
            topic=self.topic,
            information_table=information_table,
            return_draft_outline=True,
            callback_handler=callback_handler
        )
        outline.dump_outline_to_file(os.path.join(self.article_output_dir, 'storm_gen_outline.txt'))
        draft_outline.dump_outline_to_file(os.path.join(self.article_output_dir, "direct_gen_outline.txt"))
        return outline

    def run_article_generation_module(self,
                                      outline: StormArticle,
                                      information_table=StormInformationTable,
                                      callback_handler: BaseCallbackHandler = None) -> StormArticle:

        draft_article = self.storm_article_generation.generate_article(
            topic=self.topic,
            information_table=information_table,
            article_with_outline=outline,
            callback_handler=callback_handler
        )
        draft_article.dump_article_as_plain_text(os.path.join(self.article_output_dir, 'storm_gen_article.txt'))
        draft_article.dump_reference_to_file(os.path.join(self.article_output_dir, 'url_to_info.json'))
        return draft_article

    def run_article_polishing_module(self,
                                     draft_article: StormArticle,
                                     remove_duplicate: bool = False) -> StormArticle:

        polished_article = self.storm_article_polishing_module.polish_article(
            topic=self.topic,
            draft_article=draft_article,
            remove_duplicate=remove_duplicate
        )
        FileIOHelper.write_str(polished_article.to_string(),
                               os.path.join(self.article_output_dir, 'storm_gen_article_polished.txt'))
        return polished_article

    def post_run(self):
        """
        Post-run operations, including:
        1. Dumping the run configuration.
        2. Dumping the LLM call history.
        """
        config_log = self.lm_configs.log()
        FileIOHelper.dump_json(config_log, os.path.join(self.article_output_dir, 'run_config.json'))

        llm_call_history = self.lm_configs.collect_and_reset_lm_history()
        with open(os.path.join(self.article_output_dir, 'llm_call_history.jsonl'), 'w') as f:
            for call in llm_call_history:
                if 'kwargs' in call:
                    call.pop('kwargs')  # All kwargs are dumped together to run_config.json.
                f.write(json.dumps(call) + '\n')

    def _load_information_table_from_local_fs(self, information_table_local_path):
        assert os.path.exists(information_table_local_path), makeStringRed(f"{information_table_local_path} not exists. Please set --do-research argument to prepare the conversation_log.json for this topic.")
        return StormInformationTable.from_conversation_log_file(information_table_local_path)

    def _load_outline_from_local_fs(self, topic, outline_local_path):
        assert os.path.exists(outline_local_path), makeStringRed(f"{outline_local_path} not exists. Please set --do-generate-outline argument to prepare the storm_gen_outline.txt for this topic.")
        return StormArticle.from_outline_file(topic=topic, file_path=outline_local_path)

    def _load_draft_article_from_local_fs(self, topic, draft_article_path, url_to_info_path):
        assert os.path.exists(draft_article_path), makeStringRed(f"{draft_article_path} not exists. Please set --do-generate-article argument to prepare the storm_gen_article.txt for this topic.")
        assert os.path.exists(url_to_info_path), makeStringRed(f"{url_to_info_path} not exists. Please set --do-generate-article argument to prepare the url_to_info.json for this topic.")
        article_text = FileIOHelper.load_str(draft_article_path)
        references = FileIOHelper.load_json(url_to_info_path)
        return StormArticle.from_string(topic_name=topic, article_text=article_text, references=references)

    def run(self,
            topic: str,
            ground_truth_url: str = '',
            do_research: bool = True,
            do_generate_outline: bool = True,
            do_generate_article: bool = True,
            do_polish_article: bool = True,
            remove_duplicate: bool = False,
            callback_handler: BaseCallbackHandler = BaseCallbackHandler()):
        """
        Run the STORM pipeline.

        Args:
            topic: The topic to research.
            ground_truth_url: A ground truth URL including a curated article about the topic. The URL will be excluded.
            do_research: If True, research the topic through information-seeking conversation;
                if False, expect conversation_log.json and raw_search_results.json to exist in the output directory.
            do_generate_outline: If True, generate an outline for the topic;
                if False, expect storm_gen_outline.txt to exist in the output directory.
            do_generate_article: If True, generate a curated article for the topic;
                if False, expect storm_gen_article.txt to exist in the output directory.
            do_polish_article: If True, polish the article by adding a summarization section and (optionally) removing
                duplicated content.
            remove_duplicate: If True, remove duplicated content.
            callback_handler: A callback handler to handle the intermediate results.
        """
        assert do_research or do_generate_outline or do_generate_article or do_polish_article, \
            makeStringRed("No action is specified. Please set at least one of --do-research, --do-generate-outline, --do-generate-article, --do-polish-article")

        self.topic = topic
        self.article_dir_name = topic.replace(' ', '_').replace('/', '_')
        self.article_output_dir = os.path.join(self.args.output_dir, self.article_dir_name)
        os.makedirs(self.article_output_dir, exist_ok=True)

        # Research module: first generate queries to collect some URLs, then read the URLs to create
        # several different personas, and converse with each persona to gather useful information.
        information_table: StormInformationTable = None
        if do_research:
            information_table = self.run_knowledge_curation_module(ground_truth_url=ground_truth_url,
                                                                   callback_handler=callback_handler)

        # Outline generation module: generate an outline; optionally condition on the earlier
        # conversations, which yields a more detailed outline.
        outline: StormArticle = None
        if do_generate_outline:
            # load information table if it's not initialized
            if information_table is None:
                information_table = self._load_information_table_from_local_fs(os.path.join(self.article_output_dir, '.json'))
            outline = self.run_outline_generation_module(information_table=information_table,
                                                         callback_handler=callback_handler)

        # article generation module
        draft_article: StormArticle = None
        if do_generate_article:
            if information_table is None:
                information_table = self._load_information_table_from_local_fs(os.path.join(self.article_output_dir, 'conversation_log.json'))
            if outline is None:
                outline = self._load_outline_from_local_fs(topic=topic, outline_local_path=os.path.join(self.article_output_dir, 'storm_gen_outline.txt'))

            draft_article = self.run_article_generation_module(outline=outline,
                                                               information_table=information_table,
                                                               callback_handler=callback_handler)

        # article polishing module
        if do_polish_article:
            if draft_article is None:
                draft_article_path = os.path.join(self.article_output_dir, 'storm_gen_article.txt')
                url_to_info_path = os.path.join(self.article_output_dir, 'url_to_info.json')
                draft_article = self._load_draft_article_from_local_fs(topic=topic, draft_article_path=draft_article_path, url_to_info_path=url_to_info_path)
            self.run_article_polishing_module(draft_article=draft_article, remove_duplicate=remove_duplicate)

        post_polish(self.article_output_dir)
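A hypothetical usage sketch for the Runner defined above. RunnerArguments and the LMConfigs setters come from this file; the `lm` and `rm` objects, and the STORM* modules the constructor wires together, are assumed to be provided by the surrounding package.

```python
# Hypothetical driver, assuming dspy-compatible `lm` and `rm` objects are available.
lm_configs = LMConfigs()
lm_configs.set_conv_simulator_lm(lm)
lm_configs.set_question_asker_lm(lm)
lm_configs.set_outline_gen_lm(lm)
lm_configs.set_article_gen_lm(lm)
lm_configs.set_article_polish_lm(lm)

args = RunnerArguments(output_dir='./results', max_conv_turn=3, max_perspective=3,
                       search_top_k=3, retrieve_top_k=3, max_thread_num=10)
runner = Runner(args, lm_configs, rm)

# Runs research -> outline -> article -> polish and writes the intermediate files
# (conversation_log.json, storm_gen_outline.txt, ...) into ./results/<topic>.
runner.run(topic='AlphaFold', do_research=True, do_generate_outline=True,
           do_generate_article=True, do_polish_article=True)
runner.post_run()
```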
src/DeepThink/modules/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (242 Bytes).
src/DeepThink/modules/__pycache__/article_generation.cpython-310.pyc
ADDED
Binary file (6.43 kB).
src/DeepThink/modules/__pycache__/article_generation.cpython-311.pyc
ADDED
Binary file (10.5 kB).
src/DeepThink/modules/__pycache__/article_polish.cpython-310.pyc
ADDED
Binary file (3.4 kB).
src/DeepThink/modules/__pycache__/article_polish.cpython-311.pyc
ADDED
Binary file (5.13 kB).
src/DeepThink/modules/__pycache__/interface.cpython-310.pyc
ADDED
Binary file (17.2 kB).
src/DeepThink/modules/__pycache__/interface.cpython-311.pyc
ADDED
Binary file (24.2 kB).
src/DeepThink/modules/__pycache__/mindmap.cpython-310.pyc
ADDED
Binary file (14.1 kB).
src/DeepThink/modules/__pycache__/mindmap.cpython-311.pyc
ADDED
Binary file (25.4 kB).
src/DeepThink/modules/__pycache__/outline_generation.cpython-310.pyc
ADDED
Binary file (4.84 kB).
src/DeepThink/modules/__pycache__/outline_generation.cpython-311.pyc
ADDED
Binary file (7.47 kB).
src/DeepThink/modules/__pycache__/retriever.cpython-311.pyc
ADDED
Binary file (3.81 kB).
src/DeepThink/modules/__pycache__/storm_dataclass.cpython-310.pyc
ADDED
Binary file (12.6 kB).
src/DeepThink/modules/__pycache__/storm_dataclass.cpython-311.pyc
ADDED
Binary file (21.6 kB).
src/DeepThink/modules/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (12.5 kB).
src/DeepThink/modules/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (20.4 kB).
src/DeepThink/modules/article_generation.py
ADDED
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import concurrent.futures
|
2 |
+
import copy
|
3 |
+
import logging
|
4 |
+
from concurrent.futures import as_completed
|
5 |
+
from typing import List, Union
|
6 |
+
import random
|
7 |
+
import dspy
|
8 |
+
import sys
|
9 |
+
|
10 |
+
|
11 |
+
import concurrent.futures
|
12 |
+
import json
|
13 |
+
import os
|
14 |
+
import pickle
|
15 |
+
import re
|
16 |
+
import sys
|
17 |
+
from typing import List, Dict
|
18 |
+
|
19 |
+
import httpx
|
20 |
+
import toml
|
21 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
22 |
+
from trafilatura import extract
|
23 |
+
|
24 |
+
|
25 |
+
class ArticleTextProcessing:
|
26 |
+
@staticmethod
|
27 |
+
def limit_word_count_preserve_newline(input_string, max_word_count):
|
28 |
+
"""
|
29 |
+
Limit the word count of an input string to a specified maximum, while preserving the integrity of complete lines.
|
30 |
+
|
31 |
+
The function truncates the input string at the nearest word that does not exceed the maximum word count,
|
32 |
+
ensuring that no partial lines are included in the output. Words are defined as text separated by spaces,
|
33 |
+
and lines are defined as text separated by newline characters.
|
34 |
+
|
35 |
+
Args:
|
36 |
+
input_string (str): The string to be truncated. This string may contain multiple lines.
|
37 |
+
max_word_count (int): The maximum number of words allowed in the truncated string.
|
38 |
+
|
39 |
+
Returns:
|
40 |
+
str: The truncated string with word count limited to `max_word_count`, preserving complete lines.
|
41 |
+
"""
|
42 |
+
|
43 |
+
word_count = 0
|
44 |
+
limited_string = ''
|
45 |
+
|
46 |
+
for word in input_string.split('\n'):
|
47 |
+
line_words = word.split()
|
48 |
+
for lw in line_words:
|
49 |
+
if word_count < max_word_count:
|
50 |
+
limited_string += lw + ' '
|
51 |
+
word_count += 1
|
52 |
+
else:
|
53 |
+
break
|
54 |
+
if word_count >= max_word_count:
|
55 |
+
break
|
56 |
+
limited_string = limited_string.strip() + '\n'
|
57 |
+
|
58 |
+
return limited_string.strip()
|
59 |
+
|
60 |
+
@staticmethod
|
61 |
+
def remove_citations(s):
|
62 |
+
"""
|
63 |
+
Removes all citations from a given string. Citations are assumed to be in the format
|
64 |
+
of numbers enclosed in square brackets, such as [1], [2], or [1, 2], etc. This function searches
|
65 |
+
for all occurrences of such patterns and removes them, returning the cleaned string.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
s (str): The string from which citations are to be removed.
|
69 |
+
|
70 |
+
Returns:
|
71 |
+
str: The string with all citation patterns removed.
|
72 |
+
"""
|
73 |
+
|
74 |
+
return re.sub(r'\[\d+(?:,\s*\d+)*\]', '', s)
|
75 |
+
|
76 |
+
@staticmethod
|
77 |
+
def get_first_section_dict_and_list(s):
|
78 |
+
"""
|
79 |
+
"""
|
80 |
+
text = s
|
81 |
+
sections = text.strip().split('\n# ')
|
82 |
+
titles = []
|
83 |
+
content_dict = {}
|
84 |
+
|
85 |
+
for section in sections:
|
86 |
+
if section:
|
87 |
+
lines = section.split('\n', 1)
|
88 |
+
title = lines[0].strip()
|
89 |
+
content = lines[1].strip() if len(lines) > 1 else ""
|
90 |
+
|
91 |
+
titles.append(title)
|
92 |
+
content_dict[title] = content
|
93 |
+
return content_dict, titles
|
94 |
+
|
95 |
+
@staticmethod
|
96 |
+
def parse_citation_indices(s):
|
97 |
+
"""
|
98 |
+
Extracts citation indexes from the provided content string and returns them as a list of integers.
|
99 |
+
|
100 |
+
Args:
|
101 |
+
content (str): The content string containing citations in the format [number].
|
102 |
+
|
103 |
+
Returns:
|
104 |
+
List[int]: A list of unique citation indexes extracted from the content, in the order they appear.
|
105 |
+
"""
|
106 |
+
matches = re.findall(r'\[\d+\]', s)
|
107 |
+
return [int(index[1:-1]) for index in matches]
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def remove_uncompleted_sentences_with_citations(text):
|
111 |
+
"""
|
112 |
+
Removes uncompleted sentences and standalone citations from the input text. Sentences are identified
|
113 |
+
by their ending punctuation (.!?), optionally followed by a citation in square brackets (e.g., "[1]").
|
114 |
+
Grouped citations (e.g., "[1, 2]") are split into individual ones (e.g., "[1] [2]"). Only text up to
|
115 |
+
and including the last complete sentence and its citation is retained.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
text (str): The input text from which uncompleted sentences and their citations are to be removed.
|
119 |
+
|
120 |
+
Returns:
|
121 |
+
str: The processed string with uncompleted sentences and standalone citations removed, leaving only
|
122 |
+
complete sentences and their associated citations if present.
|
123 |
+
"""
|
124 |
+
|
125 |
+
# Convert citations like [1, 2, 3] to [1][2][3].
|
126 |
+
def replace_with_individual_brackets(match):
|
127 |
+
numbers = match.group(1).split(', ')
|
128 |
+
return ' '.join(f'[{n}]' for n in numbers)
|
129 |
+
|
130 |
+
# Deduplicate and sort individual groups of citations.
|
131 |
+
def deduplicate_group(match):
|
132 |
+
citations = match.group(0)
|
133 |
+
unique_citations = list(set(re.findall(r'\[\d+\]', citations)))
|
134 |
+
sorted_citations = sorted(unique_citations, key=lambda x: int(x.strip('[]')))
|
135 |
+
# Return the sorted unique citations as a string
|
136 |
+
return ''.join(sorted_citations)
|
137 |
+
|
138 |
+
text = re.sub(r'\[([0-9, ]+)\]', replace_with_individual_brackets, text)
|
139 |
+
text = re.sub(r'(\[\d+\])+', deduplicate_group, text)
|
140 |
+
|
141 |
+
# Deprecated: Remove sentence without proper ending punctuation and citations.
|
142 |
+
# Split the text into sentences (including citations).
|
143 |
+
# sentences_with_trailing = re.findall(r'([^.!?]*[.!?].*?)(?=[^.!?]*[.!?]|$)', text)
|
144 |
+
|
145 |
+
# Filter sentences to ensure they end with a punctuation mark and properly formatted citations
|
146 |
+
# complete_sentences = []
|
147 |
+
# for sentence in sentences_with_trailing:
|
148 |
+
# # Check if the sentence ends with properly formatted citations
|
149 |
+
# if re.search(r'[.!?]( \[\d+\])*$|^[^.!?]*[.!?]$', sentence.strip()):
|
150 |
+
# complete_sentences.append(sentence.strip())
|
151 |
+
|
152 |
+
# combined_sentences = ' '.join(complete_sentences)
|
153 |
+
|
154 |
+
# Check for and append any complete citations that follow the last sentence
|
155 |
+
# trailing_citations = re.findall(r'(\[\d+\]) ', text[text.rfind(combined_sentences) + len(combined_sentences):])
|
156 |
+
# if trailing_citations:
|
157 |
+
# combined_sentences += ' '.join(trailing_citations)
|
158 |
+
|
159 |
+
# Regex pattern to match sentence endings, including optional citation markers.
|
160 |
+
eos_pattern = r'([.!?])\s*(\[\d+\])?\s*'
|
161 |
+
matches = list(re.finditer(eos_pattern, text))
|
162 |
+
if matches:
|
163 |
+
last_match = matches[-1]
|
164 |
+
text = text[:last_match.end()].strip()
|
165 |
+
|
166 |
+
return text
|
167 |
+
|
168 |
+
@staticmethod
|
169 |
+
def clean_up_citation(conv):
|
170 |
+
for turn in conv.dlg_history:
|
171 |
+
turn.agent_utterance = turn.agent_utterance[:turn.agent_utterance.find('References:')]
|
172 |
+
turn.agent_utterance = turn.agent_utterance[:turn.agent_utterance.find('Sources:')]
|
173 |
+
turn.agent_utterance = turn.agent_utterance.replace('Answer:', '').strip()
|
174 |
+
try:
|
175 |
+
max_ref_num = max([int(x) for x in re.findall(r'\[(\d+)\]', turn.agent_utterance)])
|
176 |
+
except Exception as e:
|
177 |
+
max_ref_num = 0
|
178 |
+
if max_ref_num > len(turn.search_results):
|
179 |
+
for i in range(len(turn.search_results), max_ref_num + 1):
|
180 |
+
turn.agent_utterance = turn.agent_utterance.replace(f'[{i}]', '')
|
181 |
+
turn.agent_utterance = ArticleTextProcessing.remove_uncompleted_sentences_with_citations(
|
182 |
+
turn.agent_utterance)
|
183 |
+
|
184 |
+
return conv
|
185 |
+
|
186 |
+
@staticmethod
|
187 |
+
def clean_up_outline(outline, topic=""):
|
188 |
+
output_lines = []
|
189 |
+
current_level = 0 # To track the current section level
|
190 |
+
|
191 |
+
for line in outline.split('\n'):
|
192 |
+
stripped_line = line.strip()
|
193 |
+
|
194 |
+
if topic != "" and f"# {topic.lower()}" in stripped_line.lower():
|
195 |
+
output_lines = []
|
196 |
+
|
197 |
+
# Check if the line is a section header
|
198 |
+
if stripped_line.startswith('#') and stripped_line != '#':
|
199 |
+
current_level = stripped_line.count('#')
|
200 |
+
output_lines.append(stripped_line)
|
201 |
+
# Check if the line is a bullet point
|
202 |
+
# elif stripped_line.startswith('-'):
|
203 |
+
# subsection_header = '#' * (current_level + 1) + ' ' + stripped_line[1:].strip()
|
204 |
+
# output_lines.append(subsection_header)
|
205 |
+
# Preserve lines with @
|
206 |
+
elif stripped_line.startswith('@'):
|
207 |
+
output_lines.append(stripped_line)
|
208 |
+
|
209 |
+
outline = '\n'.join(output_lines)
|
210 |
+
|
211 |
+
# Remove references.
|
212 |
+
outline = re.sub(r"#[#]? See also.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
213 |
+
outline = re.sub(r"#[#]? See Also.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
214 |
+
outline = re.sub(r"#[#]? Notes.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
215 |
+
outline = re.sub(r"#[#]? References.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
216 |
+
outline = re.sub(r"#[#]? External links.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
217 |
+
outline = re.sub(r"#[#]? External Links.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
218 |
+
outline = re.sub(r"#[#]? Bibliography.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
219 |
+
outline = re.sub(r"#[#]? Further reading*?(?=##|$)", '', outline, flags=re.DOTALL)
|
220 |
+
outline = re.sub(r"#[#]? Further Reading*?(?=##|$)", '', outline, flags=re.DOTALL)
|
221 |
+
outline = re.sub(r"#[#]? Summary.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
222 |
+
outline = re.sub(r"#[#]? Appendices.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
223 |
+
outline = re.sub(r"#[#]? Appendix.*?(?=##|$)", '', outline, flags=re.DOTALL)
|
224 |
+
|
225 |
+
return outline
|
226 |
+
|
227 |
+
|
228 |
+
@staticmethod
|
229 |
+
def clean_up_section(text):
|
230 |
+
"""Clean up a section:
|
231 |
+
1. Remove uncompleted sentences (usually due to output token limitation).
|
232 |
+
2. Deduplicate individual groups of citations.
|
233 |
+
3. Remove unnecessary summary."""
|
234 |
+
|
235 |
+
paragraphs = text.split('\n')
|
236 |
+
output_paragraphs = []
|
237 |
+
summary_sec_flag = False
|
238 |
+
for p in paragraphs:
|
239 |
+
p = p.strip()
|
240 |
+
if len(p) == 0:
|
241 |
+
continue
|
242 |
+
if not p.startswith('#'):
|
243 |
+
p = ArticleTextProcessing.remove_uncompleted_sentences_with_citations(p)
|
244 |
+
if summary_sec_flag:
|
245 |
+
if p.startswith('#'):
|
246 |
+
summary_sec_flag = False
|
247 |
+
else:
|
248 |
+
continue
|
249 |
+
if p.startswith('Overall') or p.startswith('In summary') or p.startswith('In conclusion'):
|
250 |
+
continue
|
251 |
+
if "# Summary" in p or '# Conclusion' in p:
|
252 |
+
summary_sec_flag = True
|
253 |
+
continue
|
254 |
+
output_paragraphs.append(p)
|
255 |
+
|
256 |
+
return '\n\n'.join(output_paragraphs) # Join with '\n\n' for markdown format.
|
257 |
+
|
258 |
+
@staticmethod
|
259 |
+
def update_citation_index(s, citation_map):
|
260 |
+
"""Update citation index in the string based on the citation map."""
|
261 |
+
for original_citation in citation_map:
|
262 |
+
s = s.replace(f"[{original_citation}]", f"__PLACEHOLDER_{original_citation}__")
|
263 |
+
for original_citation, unify_citation in citation_map.items():
|
264 |
+
s = s.replace(f"__PLACEHOLDER_{original_citation}__", f"[{unify_citation}]")
|
265 |
+
|
266 |
+
return s
|
267 |
+
|
268 |
+
    @staticmethod
    def parse_article_into_dict(input_string):
        """
        Parses a structured text into a nested dictionary. The structure of the text
        is defined by markdown-like headers (using '#' symbols) to denote sections
        and subsections. Each section can contain content and further nested subsections.

        The resulting dictionary captures the hierarchical structure of sections, where
        each section is represented as a key (the section's title) mapping to a value
        that is another dictionary. This dictionary contains two keys:
        - 'content': the text content of the section
        - 'subsections': a dictionary of nested subsections keyed by their titles, each
          following the same structure.

        Args:
            input_string (str): A string containing the structured text to parse.

        Returns:
            A dictionary mapping each top-level section title to another dictionary with
            the 'content' and 'subsections' keys described above.
        """
        lines = input_string.split('\n')
        lines = [line for line in lines if line.strip()]
        root = {'content': '', 'subsections': {}}
        current_path = [(root, -1)]  # (current_dict, level)

        for line in lines:
            if line.startswith('#'):
                level = line.count('#')
                title = line.strip('# ').strip()
                new_section = {'content': '', 'subsections': {}}

                # Pop from the stack until the parent level is found.
                while current_path and current_path[-1][1] >= level:
                    current_path.pop()

                # Attach the new section to the nearest upper level's subsections.
                current_path[-1][0]['subsections'][title] = new_section
                current_path.append((new_section, level))
            else:
                current_path[-1][0]['content'] += line + '\n'

        return root['subsections']
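
    # Illustrative sketch: headers become nested keys and body lines accumulate under 'content', e.g.
    #
    #   >>> ArticleTextProcessing.parse_article_into_dict("# Career\nHe drummed for years.\n## Foo Fighters\nJoined in 1997.")
    #   {'Career': {'content': 'He drummed for years.\n',
    #               'subsections': {'Foo Fighters': {'content': 'Joined in 1997.\n', 'subsections': {}}}}}
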
class FileIOHelper:
    @staticmethod
    def dump_json(obj, file_name, encoding="utf-8"):
        with open(file_name, 'w', encoding=encoding) as fw:
            json.dump(obj, fw, default=FileIOHelper.handle_non_serializable, ensure_ascii=False)

    @staticmethod
    def handle_non_serializable(obj):
        return "non-serializable contents"  # mark the non-serializable part

    @staticmethod
    def load_json(file_name, encoding="utf-8"):
        with open(file_name, 'r', encoding=encoding) as fr:
            return json.load(fr)

    @staticmethod
    def write_str(s, path):
        with open(path, 'w') as f:
            f.write(s)

    @staticmethod
    def load_str(path):
        with open(path, 'r') as f:
            # f.read() avoids the doubled newlines that '\n'.join(f.readlines()) would produce.
            return f.read()

    @staticmethod
    def dump_pickle(obj, path):
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    @staticmethod
    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
class ArticleGenerationModule():
    """
    The interface for the article generation stage. Given the topic, the information
    collected during the knowledge curation stage, and the outline produced by the
    outline generation stage, this module writes the full article section by section.
    """

    def __init__(self,
                 retriever,
                 article_gen_lm: Union[dspy.dsp.LM, dspy.dsp.HFModel],
                 retrieve_top_k: int = 10,
                 max_thread_num: int = 10,
                 ):
        super().__init__()
        self.retrieve_top_k = retrieve_top_k
        self.article_gen_lm = article_gen_lm
        self.max_thread_num = max_thread_num
        self.retriever = retriever
        self.section_gen = ConvToSection(engine=self.article_gen_lm)

    def generate_section(self, topic, section_name, mindmap, section_query, section_outline):
        collected_info = mindmap.retrieve_information(queries=section_query,
                                                      search_top_k=self.retrieve_top_k)
        output = self.section_gen(
            topic=topic,
            outline=section_outline,
            section=section_name,
            collected_info=collected_info,
        )

        return {"section_name": section_name, "section_content": output.section, "collected_info": collected_info}

    def generate_article(self,
                         topic: str,
                         mindmap,
                         article_with_outline,
                         ):
        """
        Generate the article for the topic based on the information table and article outline.
        """
        mindmap.prepare_table_for_retrieval()

        sections_to_write = article_with_outline.get_first_level_section_names()
        section_output_dict_collection = []

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_thread_num) as executor:
            future_to_sec_title = {}
            for section_title in sections_to_write:
                section_query = article_with_outline.get_outline_as_list(
                    root_section_name=section_title, add_hashtags=False
                )
                queries_with_hashtags = article_with_outline.get_outline_as_list(
                    root_section_name=section_title, add_hashtags=True
                )
                section_outline = "\n".join(queries_with_hashtags)

                future_to_sec_title[
                    executor.submit(self.generate_section,
                                    topic, section_title, mindmap, section_query, section_outline)
                ] = section_title

            for future in concurrent.futures.as_completed(future_to_sec_title):
                section_output_dict_collection.append(future.result())

        article = copy.deepcopy(article_with_outline)
        for section_output_dict in section_output_dict_collection:
            article.update_section(parent_section_name=topic,
                                   current_section_content=section_output_dict["section_content"],
                                   current_section_info_list=section_output_dict["collected_info"],
                                   )

        article.post_processing()

        return article
class ConvToSection(dspy.Module):
    """Use the information collected from the information-seeking conversation to write a section."""
    # Note: the snippets passed in cover all sections' URLs, but the goal here is to generate one
    # section at a time; this still needs refinement because the outline is not fully used yet.

    def __init__(self, engine: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        super().__init__()
        self.write_section = dspy.Predict(WriteSection)
        self.engine = engine

    def forward(self, topic: str, outline: str, section: str, collected_info: List):
        all_info = ''
        for idx, info in enumerate(collected_info):
            all_info += f'[{idx + 1}]\n' + '\n'.join(info['snippets'])
            all_info += '\n\n'

        all_info = ArticleTextProcessing.limit_word_count_preserve_newline(all_info, 1500)

        with dspy.settings.context(lm=self.engine):
            section = ArticleTextProcessing.clean_up_section(
                self.write_section(topic=topic, info=all_info, section=section).output)

        section = section.replace('\\[', '[').replace('\\]', ']')
        return dspy.Prediction(section=section)
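
# Illustrative sketch of the prompt context built in ConvToSection.forward, assuming each
# retrieved item exposes a 'snippets' list as the loop above expects:
#
#   collected_info = [{'snippets': ['snippet a', 'snippet b']}, {'snippets': ['snippet c']}]
#   # all_info (before the word-count limit) becomes:
#   # [1]
#   # snippet a
#   # snippet b
#   #
#   # [2]
#   # snippet c
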
class WriteSection(dspy.Signature):
    """Write a Wikipedia section based on the collected information.

    Here is the format of your writing:
    1. Use "# Title" to indicate section title, "## Title" to indicate subsection title, "### Title" to indicate subsubsection title, and so on.
    2. Use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3]."). You DO NOT need to include a References or Sources section to list the sources at the end.
    3. The language style should resemble that of Wikipedia: concise yet informative, formal yet accessible.
    """
    # Alternative prompt kept for reference:
    # """
    # Write a detailed, Wikipedia-style report section based on the collected information.
    #
    # Here is the format of your writing:
    # 1. Use "# Title" to indicate section title, "## Title" to indicate subsection title, "### Title" to indicate subsubsection title, and so on.
    # 2. Use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3]."). You DO NOT need to include a References or Sources section to list the sources at the end.
    # 3. The language style should resemble that of Wikipedia: concise yet informative, formal yet accessible.
    # """

    info = dspy.InputField(prefix="The Collected information:\n", format=str)
    topic = dspy.InputField(prefix="The topic of the page: ", format=str)
    section = dspy.InputField(prefix="The section you need to write: ", format=str)
    output = dspy.OutputField(
        prefix="Write the section with proper inline citations (Start your writing with # section title. Don't include the page title or try to write other sections):\n",
        format=str)
if __name__ == "__main__":
|
477 |
+
import sys
|
478 |
+
from mindmap import MindMap
|
479 |
+
from outline_generation import OutlineGenerationModule
|
480 |
+
sys.path.append('/mnt/nas-alinlp/xizekun/project/DeepThink/src')
|
481 |
+
from storm_dataclass import Article
|
482 |
+
|
483 |
+
from lm import OpenAIModel, OpenAIModel_New
|
484 |
+
from rm import BingSearch, BingSearchAli
|
485 |
+
from utils import load_api_key
|
486 |
+
import os
|
487 |
+
load_api_key(toml_file_path='/mnt/nas-alinlp/xizekun/project/DeepThink/secrets.toml')
|
488 |
+
openai_kwargs = {
|
489 |
+
'api_key': os.getenv("OPENAI_API_KEY"),
|
490 |
+
'api_provider': os.getenv('OPENAI_API_TYPE'),
|
491 |
+
'temperature': 1.0,
|
492 |
+
'top_p': 0.9,
|
493 |
+
'api_base': os.getenv('AZURE_API_BASE'),
|
494 |
+
'api_version': os.getenv('AZURE_API_VERSION'),
|
495 |
+
}
|
496 |
+
|
497 |
+
lm = OpenAIModel(model='gpt-4-1106-preview', max_tokens=5000, **openai_kwargs)
|
498 |
+
rm = BingSearchAli(ydc_api_key=os.getenv('BING_SEARCH_ALI_API_KEY'), k=3)
|
499 |
+
|
500 |
+
retriever = rm
|
501 |
+
gen_concept_lm = lm
|
502 |
+
|
503 |
+
mind_map = MindMap(
|
504 |
+
retriever=retriever,
|
505 |
+
gen_concept_lm=lm,
|
506 |
+
search_top_k=3,
|
507 |
+
deepth = 3
|
508 |
+
)
|
509 |
+
a = mind_map.load_map('/mnt/nas-alinlp/xizekun/project/DeepThink/src/DeepThink/modules/Taylor.json')
|
510 |
+
ag = ArticleGenerationModule(
|
511 |
+
retriever = retriever,
|
512 |
+
article_gen_lm = lm,
|
513 |
+
retrieve_top_k = 5,
|
514 |
+
max_thread_num = 10)
|
515 |
+
|
516 |
+
module = OutlineGenerationModule(lm)
|
517 |
+
outline = module.generate_outline(topic= 'Taylor Hawkins',mindmap = mind_map)
|
518 |
+
print(outline)
|
519 |
+
print('~~~~~~')
|
520 |
+
|
521 |
+
article_with_outline = Article.from_outline_str(topic='Taylor Hawkins', outline_str=outline)
|
522 |
+
a = ag.generate_article(topic = 'Taylor Hawkins', mindmap = mind_map, article_with_outline = article_with_outline)
|
523 |
+
print(a.to_string())
|
src/DeepThink/modules/article_polish.py
ADDED
@@ -0,0 +1,417 @@
import copy
from typing import Union

import dspy
# from storm_wiki.modules.storm_dataclass import StormArticle
import concurrent.futures
import json
import os
import pickle
import re
import sys
from typing import List, Dict

import httpx
import toml
from langchain_text_splitters import RecursiveCharacterTextSplitter
from trafilatura import extract
class ArticleTextProcessing:
    @staticmethod
    def limit_word_count_preserve_newline(input_string, max_word_count):
        """
        Limit the word count of an input string to a specified maximum, while preserving the integrity of complete lines.

        The function truncates the input string at the nearest word that does not exceed the maximum word count,
        ensuring that no partial lines are included in the output. Words are defined as text separated by spaces,
        and lines are defined as text separated by newline characters.

        Args:
            input_string (str): The string to be truncated. This string may contain multiple lines.
            max_word_count (int): The maximum number of words allowed in the truncated string.

        Returns:
            str: The truncated string with word count limited to `max_word_count`, preserving complete lines.
        """

        word_count = 0
        limited_string = ''

        for word in input_string.split('\n'):
            line_words = word.split()
            for lw in line_words:
                if word_count < max_word_count:
                    limited_string += lw + ' '
                    word_count += 1
                else:
                    break
            if word_count >= max_word_count:
                break
            limited_string = limited_string.strip() + '\n'

        return limited_string.strip()
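
    # Illustrative sketch: truncation happens at the word limit and a newline is re-inserted
    # after each fully processed line, e.g.
    #
    #   >>> ArticleTextProcessing.limit_word_count_preserve_newline("one two three\nfour five", 4)
    #   'one two three\nfour'
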
    @staticmethod
    def remove_citations(s):
        """
        Removes all citations from a given string. Citations are assumed to be in the format
        of numbers enclosed in square brackets, such as [1], [2], or [1, 2], etc. This function searches
        for all occurrences of such patterns and removes them, returning the cleaned string.

        Args:
            s (str): The string from which citations are to be removed.

        Returns:
            str: The string with all citation patterns removed.
        """

        return re.sub(r'\[\d+(?:,\s*\d+)*\]', '', s)
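
    # Illustrative sketch:
    #
    #   >>> ArticleTextProcessing.remove_citations("Washington, D.C. is the capital.[1][2, 5]")
    #   'Washington, D.C. is the capital.'
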
    @staticmethod
    def get_first_section_dict_and_list(s):
        """
        Split an article into its top-level ('# ') sections, returning a dict that maps each
        section title to its content and a list of the titles in their original order.
        Note that the text is split on a newline followed by '# ', so the very first title
        keeps its leading '# ' prefix.
        """
        text = s
        sections = text.strip().split('\n# ')
        titles = []
        content_dict = {}

        for section in sections:
            if section:
                lines = section.split('\n', 1)
                title = lines[0].strip()
                content = lines[1].strip() if len(lines) > 1 else ""

                titles.append(title)
                content_dict[title] = content
        return content_dict, titles
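
    # Illustrative sketch (note the first title keeps its '# ' prefix, see docstring above):
    #
    #   >>> ArticleTextProcessing.get_first_section_dict_and_list("# Intro\ntext a\n# Career\ntext b")
    #   ({'# Intro': 'text a', 'Career': 'text b'}, ['# Intro', 'Career'])
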
    @staticmethod
    def parse_citation_indices(s):
        """
        Extracts citation indexes from the provided content string and returns them as a list of integers.

        Args:
            s (str): The content string containing citations in the format [number].

        Returns:
            List[int]: A list of citation indexes extracted from the content, in the order they appear.
        """
        matches = re.findall(r'\[\d+\]', s)
        return [int(index[1:-1]) for index in matches]
    @staticmethod
    def remove_uncompleted_sentences_with_citations(text):
        """
        Removes uncompleted sentences and standalone citations from the input text. Sentences are identified
        by their ending punctuation (.!?), optionally followed by a citation in square brackets (e.g., "[1]").
        Grouped citations (e.g., "[1, 2]") are split into individual ones (e.g., "[1] [2]"). Only text up to
        and including the last complete sentence and its citation is retained.

        Args:
            text (str): The input text from which uncompleted sentences and their citations are to be removed.

        Returns:
            str: The processed string with uncompleted sentences and standalone citations removed, leaving only
                 complete sentences and their associated citations if present.
        """

        # Convert citations like [1, 2, 3] to [1][2][3].
        def replace_with_individual_brackets(match):
            numbers = match.group(1).split(', ')
            return ' '.join(f'[{n}]' for n in numbers)

        # Deduplicate and sort individual groups of citations.
        def deduplicate_group(match):
            citations = match.group(0)
            unique_citations = list(set(re.findall(r'\[\d+\]', citations)))
            sorted_citations = sorted(unique_citations, key=lambda x: int(x.strip('[]')))
            # Return the sorted unique citations as a string
            return ''.join(sorted_citations)

        text = re.sub(r'\[([0-9, ]+)\]', replace_with_individual_brackets, text)
        text = re.sub(r'(\[\d+\])+', deduplicate_group, text)

        # Deprecated: Remove sentence without proper ending punctuation and citations.
        # Split the text into sentences (including citations).
        # sentences_with_trailing = re.findall(r'([^.!?]*[.!?].*?)(?=[^.!?]*[.!?]|$)', text)

        # Filter sentences to ensure they end with a punctuation mark and properly formatted citations
        # complete_sentences = []
        # for sentence in sentences_with_trailing:
        #     # Check if the sentence ends with properly formatted citations
        #     if re.search(r'[.!?]( \[\d+\])*$|^[^.!?]*[.!?]$', sentence.strip()):
        #         complete_sentences.append(sentence.strip())

        # combined_sentences = ' '.join(complete_sentences)

        # Check for and append any complete citations that follow the last sentence
        # trailing_citations = re.findall(r'(\[\d+\]) ', text[text.rfind(combined_sentences) + len(combined_sentences):])
        # if trailing_citations:
        #     combined_sentences += ' '.join(trailing_citations)

        # Regex pattern to match sentence endings, including optional citation markers.
        eos_pattern = r'([.!?])\s*(\[\d+\])?\s*'
        matches = list(re.finditer(eos_pattern, text))
        if matches:
            last_match = matches[-1]
            text = text[:last_match.end()].strip()

        return text
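
    # Illustrative sketch: grouped citations are split, duplicates collapsed, and the
    # unfinished trailing clause dropped, e.g.
    #
    #   >>> ArticleTextProcessing.remove_uncompleted_sentences_with_citations(
    #   ...     "Fact one [1, 2]. Fact two [2][2][1]. Incomplete trailing")
    #   'Fact one [1] [2]. Fact two [1][2].'
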
    @staticmethod
    def clean_up_citation(conv):
        for turn in conv.dlg_history:
            # Only truncate when the marker is present (str.find returns -1 otherwise, which
            # would silently chop the last character).
            if 'References:' in turn.agent_utterance:
                turn.agent_utterance = turn.agent_utterance[:turn.agent_utterance.find('References:')]
            if 'Sources:' in turn.agent_utterance:
                turn.agent_utterance = turn.agent_utterance[:turn.agent_utterance.find('Sources:')]
            turn.agent_utterance = turn.agent_utterance.replace('Answer:', '').strip()
            try:
                max_ref_num = max([int(x) for x in re.findall(r'\[(\d+)\]', turn.agent_utterance)])
            except Exception as e:
                max_ref_num = 0
            if max_ref_num > len(turn.search_results):
                for i in range(len(turn.search_results), max_ref_num + 1):
                    turn.agent_utterance = turn.agent_utterance.replace(f'[{i}]', '')
            turn.agent_utterance = ArticleTextProcessing.remove_uncompleted_sentences_with_citations(
                turn.agent_utterance)

        return conv
    @staticmethod
    def clean_up_outline(outline, topic=""):
        output_lines = []
        current_level = 0  # To track the current section level

        for line in outline.split('\n'):
            stripped_line = line.strip()

            if topic != "" and f"# {topic.lower()}" in stripped_line.lower():
                output_lines = []

            # Check if the line is a section header
            if stripped_line.startswith('#') and stripped_line != '#':
                current_level = stripped_line.count('#')
                output_lines.append(stripped_line)
            # Check if the line is a bullet point
            # elif stripped_line.startswith('-'):
            #     subsection_header = '#' * (current_level + 1) + ' ' + stripped_line[1:].strip()
            #     output_lines.append(subsection_header)
            # Preserve lines with @
            elif stripped_line.startswith('@'):
                output_lines.append(stripped_line)

        outline = '\n'.join(output_lines)

        # Remove references.
        outline = re.sub(r"#[#]? See also.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? See Also.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Notes.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? References.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? External links.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? External Links.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Bibliography.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Further reading.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Further Reading.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Summary.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Appendices.*?(?=##|$)", '', outline, flags=re.DOTALL)
        outline = re.sub(r"#[#]? Appendix.*?(?=##|$)", '', outline, flags=re.DOTALL)

        return outline
    @staticmethod
    def clean_up_section(text):
        """Clean up a section:
        1. Remove uncompleted sentences (usually due to output token limitation).
        2. Deduplicate individual groups of citations.
        3. Remove unnecessary summary."""

        paragraphs = text.split('\n')
        output_paragraphs = []
        summary_sec_flag = False
        for p in paragraphs:
            p = p.strip()
            if len(p) == 0:
                continue
            if not p.startswith('#'):
                p = ArticleTextProcessing.remove_uncompleted_sentences_with_citations(p)
            if summary_sec_flag:
                if p.startswith('#'):
                    summary_sec_flag = False
                else:
                    continue
            if p.startswith('Overall') or p.startswith('In summary') or p.startswith('In conclusion'):
                continue
            if "# Summary" in p or '# Conclusion' in p:
                summary_sec_flag = True
                continue
            output_paragraphs.append(p)

        return '\n\n'.join(output_paragraphs)  # Join with '\n\n' for markdown format.
    @staticmethod
    def update_citation_index(s, citation_map):
        """Update citation index in the string based on the citation map."""
        for original_citation in citation_map:
            s = s.replace(f"[{original_citation}]", f"__PLACEHOLDER_{original_citation}__")
        for original_citation, unify_citation in citation_map.items():
            s = s.replace(f"__PLACEHOLDER_{original_citation}__", f"[{unify_citation}]")

        return s
    @staticmethod
    def parse_article_into_dict(input_string):
        """
        Parses a structured text into a nested dictionary. The structure of the text
        is defined by markdown-like headers (using '#' symbols) to denote sections
        and subsections. Each section can contain content and further nested subsections.

        The resulting dictionary captures the hierarchical structure of sections, where
        each section is represented as a key (the section's title) mapping to a value
        that is another dictionary. This dictionary contains two keys:
        - 'content': the text content of the section
        - 'subsections': a dictionary of nested subsections keyed by their titles, each
          following the same structure.

        Args:
            input_string (str): A string containing the structured text to parse.

        Returns:
            A dictionary mapping each top-level section title to another dictionary with
            the 'content' and 'subsections' keys described above.
        """
        lines = input_string.split('\n')
        lines = [line for line in lines if line.strip()]
        root = {'content': '', 'subsections': {}}
        current_path = [(root, -1)]  # (current_dict, level)

        for line in lines:
            if line.startswith('#'):
                level = line.count('#')
                title = line.strip('# ').strip()
                new_section = {'content': '', 'subsections': {}}

                # Pop from the stack until the parent level is found.
                while current_path and current_path[-1][1] >= level:
                    current_path.pop()

                # Attach the new section to the nearest upper level's subsections.
                current_path[-1][0]['subsections'][title] = new_section
                current_path.append((new_section, level))
            else:
                current_path[-1][0]['content'] += line + '\n'

        return root['subsections']
class FileIOHelper:
    @staticmethod
    def dump_json(obj, file_name, encoding="utf-8"):
        with open(file_name, 'w', encoding=encoding) as fw:
            json.dump(obj, fw, default=FileIOHelper.handle_non_serializable, ensure_ascii=False)

    @staticmethod
    def handle_non_serializable(obj):
        return "non-serializable contents"  # mark the non-serializable part

    @staticmethod
    def load_json(file_name, encoding="utf-8"):
        with open(file_name, 'r', encoding=encoding) as fr:
            return json.load(fr)

    @staticmethod
    def write_str(s, path):
        with open(path, 'w') as f:
            f.write(s)

    @staticmethod
    def load_str(path):
        with open(path, 'r') as f:
            # f.read() avoids the doubled newlines that '\n'.join(f.readlines()) would produce.
            return f.read()

    @staticmethod
    def dump_pickle(obj, path):
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    @staticmethod
    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
class ArticlePolishingModule():
    """
    The interface for the article polishing stage. Given the topic and the draft article
    produced by the article generation stage, this module removes duplicated content and
    post-processes the article.
    """

    def __init__(self,
                 article_gen_lm: Union[dspy.dsp.LM, dspy.dsp.HFModel],
                 article_polish_lm: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        self.article_gen_lm = article_gen_lm
        self.article_polish_lm = article_polish_lm

        self.polish_page = PolishPageModule(
            write_lead_engine=self.article_gen_lm,
            polish_engine=self.article_polish_lm
        )

    def polish_article(self,
                       topic: str,
                       draft_article,
                       remove_duplicate: bool = False):
        """
        Polish article.

        Args:
            topic (str): The topic of the article.
            draft_article (StormArticle): The draft article.
            remove_duplicate (bool): Whether to use one additional LM call to remove duplicates from the article.
        """

        article_text = draft_article.to_string()
        polish_result = self.polish_page(topic=topic, draft_page=article_text, polish_whole_page=remove_duplicate)

        polished_article = polish_result.page

        polished_article_dict = ArticleTextProcessing.parse_article_into_dict(polished_article)
        polished_article = copy.deepcopy(draft_article)
        polished_article.insert_or_create_section(article_dict=polished_article_dict)
        polished_article.post_processing()
        return polished_article
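
# Illustrative usage sketch, assuming `lm` is a configured dspy LM and `draft_article` is the
# article returned by ArticleGenerationModule.generate_article (both names are placeholders):
#
#   polisher = ArticlePolishingModule(article_gen_lm=lm, article_polish_lm=lm)
#   polished = polisher.polish_article(topic='Taylor Hawkins', draft_article=draft_article)
#   print(polished.to_string())
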
class PolishPage(dspy.Signature):
    """
    You are a faithful text editor that is good at finding repeated information in the article and deleting it to make sure there is no repetition in the article.
    You won't delete any non-repeated part in the article.
    You will keep the inline citations and article structure (indicated by "#", "##", etc.) appropriately.
    In the article, do not include references.
    Do your job for the following article.
    """

    article = dspy.InputField(prefix="The article you need to polish:\n", format=str)
    page = dspy.OutputField(
        prefix="Your revised article:\n",
        format=str)
class PolishPageModule(dspy.Module):
    def __init__(self, write_lead_engine: Union[dspy.dsp.LM, dspy.dsp.HFModel],
                 polish_engine: Union[dspy.dsp.LM, dspy.dsp.HFModel]):
        super().__init__()
        self.write_lead_engine = write_lead_engine
        self.polish_engine = polish_engine
        self.polish_page = dspy.Predict(PolishPage)

    def forward(self, topic: str, draft_page: str, polish_whole_page: bool = True):
        with dspy.settings.context(lm=self.polish_engine):
            page = self.polish_page(article=draft_page).page

        return dspy.Prediction(page=page)
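
# Illustrative sketch: PolishPageModule can also be driven directly with a raw markdown draft;
# `polish_lm` and `draft_text` are placeholders for any dspy-compatible LM and a draft string.
#
#   polish = PolishPageModule(write_lead_engine=polish_lm, polish_engine=polish_lm)
#   result = polish(topic='Taylor Hawkins', draft_page=draft_text, polish_whole_page=True)
#   print(result.page)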