In [None]:
!pip install praw
!pip install pinecone
!pip install semantic-router
!pip install datasets

In [None]:
!pip install numpy==1.26.0

In [None]:
!pip install semantic-router[local]

Collecting Data

In [None]:
import praw
from google.colab import userdata

# Maps each praw.Reddit keyword argument to the name of the Colab secret that
# holds its value.
REDDIT_SECRET_NAMES = {
    'client_id': 'REDDIT_CLIENT_ID',
    'client_secret': 'REDDIT_CLIENT_SECRET',
    'user_agent': 'REDDIT_USER_AGENT',
}

# Build the Reddit client; every credential is looked up through `userdata`
# so no secret value is written into the notebook itself.
reddit = praw.Reddit(**{
    keyword: userdata.get(secret_name)
    for keyword, secret_name in REDDIT_SECRET_NAMES.items()
})

In [None]:
def submissionToDict(submission):
    """Flatten a Reddit submission into a record with its id, title and comment text.

    Args:
        submission: Object exposing ``id``, ``title`` and ``comments.list()``
            (the notebook passes the items yielded by ``subreddit.hot``).

    Returns:
        dict: ``{'id': ..., 'metadata': {'title': ..., 'content': ...}}``.
        ``content`` is the ``body`` of every ``praw.models.Comment`` found in
        ``submission.comments.list()``, joined with newlines; it is an empty
        string when there are no such comments.
    """
    return {
        'id': submission.id,
        'metadata': {
            'title': submission.title,
            'content': '\n'.join(
                entry.body
                for entry in submission.comments.list()
                # Skip anything in the listing that is not an actual Comment.
                if isinstance(entry, praw.models.Comment)
            ),
        },
    }

In [None]:
from IPython.display import clear_output

# Fetch up to ten "hot" posts from r/AskNYC and flatten each one into a record
# with submissionToDict (an ordinary synchronous call).
subreddit = reddit.subreddit("AskNYC")
data = []
data.extend(submissionToDict(post) for post in subreddit.hot(limit=10))

# Clear whatever this cell has displayed so far.
clear_output()

In [None]:
from datasets import Dataset

# Convert the list of post records into a Dataset object.
# submissionToDict already emits records shaped as
# {"id": ..., "metadata": {"title": ..., "content": ...}}, so no extra
# mapping pass or column removal is needed before using the Dataset.
data = Dataset.from_list(data)

# Show the resulting columns and row count.
print(data)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'metadata'],
    num_rows: 10
})


Connect to Pinecone

In [None]:
from semantic_router.encoders import HuggingFaceEncoder

# Load the embedding model used for every post in this notebook.
encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")

# Embed one throwaway sentence purely to measure the vector length; `dims` is
# later passed as the dimension of the Pinecone index.
probe_vectors = encoder(["this is a test"])
dims = len(probe_vectors[0])

In [None]:
import os
import getpass
from pinecone import Pinecone

# Read the Pinecone API key through Colab's `userdata` helper rather than
# hardcoding it (keys are issued at app.pinecone.io).
api_key = userdata.get('PINECONE_API_KEY')

# Build the Pinecone client; the following cells reach the service through `pc`.
pc = Pinecone(api_key=api_key)


In [None]:
from pinecone import ServerlessSpec

# Deployment target (cloud provider and region) for the serverless index.
# The Pinecone client `pc` is already built in the previous cell, so it is not
# constructed a second time here.
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)


In [None]:
import time

index_name = "groq-llama-3-rag"

# Names of the indexes that already exist for this API key.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# On the first run the index is missing: create it with the encoder's vector
# length and cosine similarity, then poll once per second until it is ready.
if index_name not in existing_indexes:
    pc.create_index(index_name, dimension=dims, metric='cosine', spec=spec)
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index and pause briefly before querying its stats.
index = pc.Index(index_name)
time.sleep(1)
# Last expression of the cell, so the notebook displays the index statistics.
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm

batch_size = 128  # how many embeddings we create and insert at once
max_embed_chars = 1000     # characters of comment text given to the encoder
max_metadata_chars = 2000  # characters of comment text stored with each vector

for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch
    i_end = min(len(data), i + batch_size)
    # slicing the Dataset gives a dict of columns: batch["id"], batch["metadata"]
    batch = data[i:i_end]
    # Embed "<title>: <truncated comment text>". The joined comments of a busy
    # thread can be very long, so only the leading characters are encoded.
    chunks = [
        f'{x["title"]}: {x["content"][:max_embed_chars]}'
        for x in batch["metadata"]
    ]
    embeds = encoder(chunks)
    assert len(embeds) == (i_end - i)
    # Store a bounded snippet instead of the full comment text: Pinecone caps
    # the metadata size per record, and the other upsert loops in this
    # notebook write the same {'title', 'content_snippet'} schema.
    metadata_to_upsert = [
        {'title': x['title'], 'content_snippet': x['content'][:max_metadata_chars]}
        for x in batch['metadata']
    ]
    to_upsert = list(zip(batch["id"], embeds, metadata_to_upsert))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

Now apply the same pipeline to more posts, starting with a larger AskNYC sample

In [None]:

# Re-collect from r/AskNYC with a much larger cap (up to 1000 "hot" posts);
# this replaces the small sample previously held in `data`.
data = []
subreddit = reddit.subreddit("AskNYC")
for submission in subreddit.hot(limit=1000):
    data.append(submissionToDict(submission))  # ordinary synchronous call

# The records already have the {"id", "metadata": {"title", "content"}}
# layout, so they are loaded into a Dataset as-is (no mapping pass needed).
data = Dataset.from_list(data)


In [None]:
from tqdm.auto import tqdm

batch_size = 128  # posts embedded and written per request

for start in tqdm(range(0, len(data), batch_size)):
    stop = min(start + batch_size, len(data))
    rows = data[start:stop]  # column-oriented slice: rows["id"], rows["metadata"]
    posts = rows["metadata"]

    # Text to embed: the title followed by the first 1000 characters of the
    # joined comment text.
    texts = [f'{post["title"]}: {post["content"][:1000]}' for post in posts]
    vectors = encoder(texts)
    assert len(vectors) == stop - start

    # Keep the stored metadata small: title plus a 2000-character snippet
    # rather than the full comment text.
    snippets = [
        {'title': post['title'], 'content_snippet': post['content'][:2000]}
        for post in posts
    ]

    # Each upserted item is an (id, vector, metadata) triple.
    index.upsert(vectors=list(zip(rows["id"], vectors, snippets)))

  0%|          | 0/7 [00:00<?, ?it/s]

Other subreddits: the five NYC boroughs

In [None]:
# Index the "hot" posts of each borough subreddit using the same
# fetch -> embed -> upsert steps applied to r/AskNYC.
batch_size = 128  # how many embeddings we create and insert at once

for item in ['Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'StatenIsland']:
    subreddit = reddit.subreddit(item)

    # Fetch up to 256 posts and flatten them. submissionToDict already returns
    # records shaped {"id", "metadata": {"title", "content"}}, so they go
    # straight into a Dataset without an extra mapping pass.
    data = Dataset.from_list(
        [submissionToDict(submission) for submission in subreddit.hot(limit=256)]
    )

    # Label the progress bar so the five runs can be told apart.
    for i in tqdm(range(0, len(data), batch_size), desc=f"r/{item}"):
        # find end of batch
        i_end = min(len(data), i + batch_size)
        # slicing the Dataset gives a dict of columns
        batch = data[i:i_end]
        # Embed the title plus the first 1000 characters of the comment text.
        chunks = [f'{x["title"]}: {x["content"][:1000]}' for x in batch["metadata"]]
        embeds = encoder(chunks)
        assert len(embeds) == (i_end - i)
        # Reduce metadata size before upserting: keep a 2000-character snippet.
        metadata_to_upsert = [
            {'title': x['title'], 'content_snippet': x['content'][:2000]}
            for x in batch['metadata']
        ]
        to_upsert = list(zip(batch["id"], embeds, metadata_to_upsert))
        # upsert to Pinecone
        index.upsert(vectors=to_upsert)
