remove logging
- src/extract_questions.py +1 -4
- src/podcast_data.py +4 -7
- src/summarize.py +4 -10
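
A plausible motivation for this change (the commit message does not say, so treat it as an assumption): calls on a logger from logging.getLogger(__name__) produce no output unless logging is configured, because Python's handler of last resort only emits WARNING and above, while print always reaches stdout:

    import logging

    logger = logging.getLogger(__name__)
    logger.info("invisible")  # dropped: no handler configured; last-resort level is WARNING

    logging.basicConfig(level=logging.INFO)
    logger.info("now visible")  # INFO:__main__:now visible

    print("always visible")  # print needs no logging configuration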
src/extract_questions.py CHANGED

@@ -86,10 +86,7 @@ if __name__ == "__main__":
         questions.append(episode_questions)
 
     print("*" * 25)
-    print(f"Total prompt tokens: {cb.prompt_tokens}")
-    print(f"Total completion tokens: {cb.completion_tokens}")
-    print(f"Total tokens: {cb.total_tokens}")
-    print(f"Total cost (USD): ${cb.total_cost}")
+    print(cb)
     print("*" * 25)
 
     wandb.log(
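
The cb attributes here (completion_tokens, total_cost) match LangChain's OpenAI callback handler, whose string form already reports all of these totals, which is why four separate prints collapse into a single print(cb). A minimal sketch of that usage, assuming cb comes from get_openai_callback (the diff does not show where cb is created):

    # Minimal sketch, assuming cb is obtained from LangChain's
    # get_openai_callback(); the hunk above does not show its creation.
    from langchain.callbacks import get_openai_callback
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    with get_openai_callback() as cb:
        llm.predict("Say hello.")  # all OpenAI calls in this block are tallied

    # str(cb) includes prompt, completion, and total token counts plus cost,
    # so one print(cb) replaces the four removed prints.
    print("*" * 25)
    print(cb)
    print("*" * 25)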
src/podcast_data.py CHANGED

@@ -1,4 +1,3 @@
-import logging
 import time
 from dataclasses import asdict
 
@@ -10,8 +9,6 @@ from tqdm import tqdm
 import wandb
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def retry_access_yt_object(url, max_retries=5, interval_secs=5):
     """
@@ -28,7 +25,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
             return yt  # Return the YouTube object if successful.
         except Exception as err:
             last_exception = err  # Keep track of the last exception raised.
-            logger.info(
+            print(
                 f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
             )
             time.sleep(interval_secs)  # Wait for the specified interval before retrying.
@@ -43,7 +40,7 @@ if __name__ == "__main__":
     playlist = Playlist(config.playlist_url)
     playlist_video_urls = playlist.video_urls
 
-    logger.info(f"There are total {len(playlist_video_urls)} videos in the playlist.")
+    print(f"There are total {len(playlist_video_urls)} videos in the playlist.")
 
     video_data = []
     for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
@@ -61,9 +58,9 @@ if __name__ == "__main__":
             curr_video_data["total_words"] = len(transcript.split())
             video_data.append(curr_video_data)
         except:
-            logger.info(f"Failed to scrape {video}")
+            print(f"Failed to scrape {video}")
 
-    logger.info(f"Total podcast episodes scraped: {len(video_data)}")
+    print(f"Total podcast episodes scraped: {len(video_data)}")
 
     df = pd.DataFrame(video_data)
    df.to_csv(config.yt_scraped_data_path, index=False)
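
For reference, the retry helper touched above follows a standard bounded-retry pattern around pytube: construct the YouTube object, touch .title to force the metadata request that tends to fail transiently, and sleep between attempts. A sketch of the full function under those assumptions (its tail is not shown in the diff, so the final re-raise is an assumption):

    import time

    from pytube import YouTube


    def retry_access_yt_object(url, max_retries=5, interval_secs=5):
        last_exception = None
        for i in range(max_retries):
            try:
                yt = YouTube(url)
                _ = yt.title  # Accessing title forces the metadata fetch.
                return yt  # Return the YouTube object if successful.
            except Exception as err:
                last_exception = err  # Keep track of the last exception raised.
                print(
                    f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
                )
                time.sleep(interval_secs)  # Wait before retrying.
        # Assumption: exhausting retries re-raises the last error (not visible above).
        raise last_exception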
src/summarize.py CHANGED

@@ -1,4 +1,3 @@
-import logging
 import os
 from dataclasses import asdict
 
@@ -15,8 +14,6 @@ from wandb.integration.langchain import WandbTracer
 
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def get_data(
     artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
@@ -38,7 +35,7 @@ def summarize_episode(episode_df: pd.DataFrame):
     # split the documents
     text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
     docs = text_splitter.split_documents(data)
-    logger.info(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
+    print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
 
     # initialize LLM
     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
@@ -98,12 +95,9 @@ if __name__ == "__main__":
         summary = summarize_episode(episode_data)
         summaries.append(summary["output_text"])
 
-    logger.info("*" * 25)
-    logger.info(f"Total prompt tokens: {cb.prompt_tokens}")
-    logger.info(f"Total completion tokens: {cb.completion_tokens}")
-    logger.info(f"Total tokens: {cb.total_tokens}")
-    logger.info(f"Total cost (USD): ${cb.total_cost}")
-    logger.info("*" * 25)
+    print("*" * 25)
+    print(cb)
+    print("*" * 25)
 
     wandb.log(
         {
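
The summarize_episode hunk shows only the splitting step. A minimal sketch of how those pieces plausibly fit together, assuming the transcript is wrapped in a LangChain Document and summarized with a map_reduce load_summarize_chain (the chain construction is not visible in the diff, though summary["output_text"] in the __main__ hunk matches that chain's output key):

    from langchain.chains.summarize import load_summarize_chain
    from langchain.chat_models import ChatOpenAI
    from langchain.docstore.document import Document
    from langchain.text_splitter import TokenTextSplitter

    # Hypothetical input standing in for the episode transcript.
    data = [Document(page_content="<episode transcript>", metadata={"title": "Example episode"})]

    # split the documents into ~1000-token chunks
    text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(data)
    print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")

    # initialize LLM and summarize chunk-by-chunk, then combine (assumption: map_reduce)
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain({"input_documents": docs})
    print(summary["output_text"])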