Upload 8 files

- .env +6 -0
- .pre-commit-config.yaml +44 -0
- Transcript.pdf +0 -0
- constants.py +15 -0
- ingest.py +31 -0
- new.py +156 -0
- requirements.txt +12 -0
- sample.py +9 -0
.env
ADDED
@@ -0,0 +1,6 @@
+PERSIST_DIRECTORY=db
+
+EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
+MODEL_N_CTX=2048
+MODEL_N_BATCH=8
+TARGET_SOURCE_CHUNKS=4
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,44 @@
+---
+files: ^(.*\.(py|json|md|sh|yaml|cfg|txt))$
+exclude: ^(\.[^/]*cache/.*|.*/_user.py|source_documents/)$
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      # - id: no-commit-to-branch
+      #   args: [--branch, main]
+      - id: check-yaml
+        args: [--unsafe]
+      # - id: debug-statements
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+        exclude: \.md$
+      - id: check-json
+      - id: mixed-line-ending
+      # - id: check-builtin-literals
+      # - id: check-ast
+      - id: check-merge-conflict
+      - id: check-executables-have-shebangs
+      - id: check-shebang-scripts-are-executable
+      - id: check-docstring-first
+      - id: fix-byte-order-marker
+      - id: check-case-conflict
+      # - id: check-toml
+  - repo: https://github.com/adrienverge/yamllint.git
+    rev: v1.29.0
+    hooks:
+      - id: yamllint
+        args:
+          - --no-warnings
+          - -d
+          - '{extends: relaxed, rules: {line-length: {max: 90}}}'
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        args:
+          # - --builtin=clear,rare,informal,usage,code,names,en-GB_to_en-US
+          - --builtin=clear,rare,informal,usage,code,names
+          - --ignore-words-list=hass,master
+          - --skip="./.*"
+          - --quiet-level=2
Transcript.pdf
ADDED
Binary file (57.2 kB)
constants.py
ADDED
@@ -0,0 +1,15 @@
+import os
+from dotenv import load_dotenv
+from chromadb.config import Settings
+
+load_dotenv()
+
+# Define the folder for storing the database
+PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
+
+# Define the Chroma settings
+CHROMA_SETTINGS = Settings(
+    chroma_db_impl='duckdb+parquet',
+    persist_directory=PERSIST_DIRECTORY,
+    anonymized_telemetry=False
+)
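For context, a minimal sketch of how CHROMA_SETTINGS might be consumed under the pinned chromadb==0.3.23 API; the collection name and document are illustrative and not part of this commit:

import chromadb
from constants import CHROMA_SETTINGS

client = chromadb.Client(CHROMA_SETTINGS)  # duckdb+parquet backend from the settings above
collection = client.get_or_create_collection("demo")  # "demo" is an illustrative name
collection.add(documents=["hello world"], ids=["doc_0"])  # embedded with Chroma's default sentence-transformers model
client.persist()  # flush the parquet files to PERSIST_DIRECTORY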
ingest.py
ADDED
@@ -0,0 +1,31 @@
+# ingest.py
+from sentence_transformers import SentenceTransformer
+import whisper
+import os
+import chromadb
+from chromadb.config import Settings
+from chromadb.utils import embedding_functions
+
+
+# Initialize models
+whisper_model = whisper.load_model("base")  # or "medium"/"large", depending on system resources
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Set up Chroma DB
+client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="chroma_db"))
+collection = client.get_or_create_collection(name="meeting_notes",
+    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2"))  # expects a model name, not a model object
+
+def transcribe_and_ingest(video_path, meeting_id):
+    print(f"Transcribing {video_path}...")
+    result = whisper_model.transcribe(video_path)
+    transcription = result["text"]
+
+    print("Splitting transcription...")
+    chunks = [transcription[i:i+500] for i in range(0, len(transcription), 500)]
+    ids = [f"{meeting_id}_{i}" for i in range(len(chunks))]
+
+    print("Storing embeddings in Chroma DB...")
+    collection.add(documents=chunks, ids=ids, metadatas=[{"meeting_id": meeting_id}] * len(chunks))
+
+    return transcription  # return the full transcription for PDF export or future use
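ingest.py only defines the pipeline; a hypothetical driver (the file path and meeting id below are illustrative) would call it like this:

from ingest import transcribe_and_ingest  # note: importing loads the Whisper model

text = transcribe_and_ingest("standup.mp4", meeting_id="meeting_042")
print(f"Ingested {len(text)} characters of transcript")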
new.py
ADDED
@@ -0,0 +1,156 @@
+import streamlit as st
+import json
+import os
+from dataclasses import dataclass
+from typing import Literal
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import SentenceTransformerEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.llms import Ollama
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains import create_retrieval_chain
+from langchain_core.output_parsers import StrOutputParser
+import streamlit.components.v1 as components
+import langchain_ollama
+import langchain_huggingface
+# Initialize the LLM (make sure the model name matches one pulled in Ollama)
+llm = Ollama(model="llama3")
+
+# Load the meeting transcript PDF used for retrieval
+loader = PyPDFLoader("Transcript.pdf")
+docs = loader.load()
+
+# Split the document into chunks
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+documents = text_splitter.split_documents(docs)
+
+# Create embeddings and vector database
+embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+db = Chroma.from_documents(documents, embeddings)
+
+# Decide whether a query should be answered from the meeting transcript
+def is_coimbatore_query(query):
+    keywords = ["meeting", "minutes", "transcript", "discussion", "virtual"]
+    return any(k in query.lower() for k in keywords)
+
+# Chat history file
+HISTORY_FILE = "chat_history.json"
+
+@dataclass
+class Message:
+    origin: Literal["human", "ai"]
+    message: str
+
+def load_chat_history():
+    if os.path.exists(HISTORY_FILE):
+        with open(HISTORY_FILE, "r") as f:
+            return json.load(f)
+    return []
+
+def save_chat_history(history):
+    history_to_save = []
+    for msg in history:
+        if isinstance(msg['message'], dict):
+            msg['message'] = str(msg['message'])
+        history_to_save.append(msg)
+    with open(HISTORY_FILE, "w") as f:
+        json.dump(history_to_save, f)
+
+def clear_chat_history():
+    if os.path.exists(HISTORY_FILE):
+        os.remove(HISTORY_FILE)
+
+def initialize_session_state():
+    if "history" not in st.session_state:
+        st.session_state.history = load_chat_history()
+
+    if "retrieval_chain" not in st.session_state:  # guard on the key that is actually set below
+        coimbatore_prompt = """You are an intelligent meeting assistant called RUBY that helps employees clarify their queries regarding the virtual meeting.
+Use the following pieces of retrieved context to answer the question in detail.
+Greet the user if the user greets you.
+If you don't know the answer, just say that you don't know.
+Answer only with relevant content, nothing extra.
+Don't return the prompt in the answer.
+Don't respond with anything irrelevant or outside the context.
+___________
+{context}"""
+        prompt = ChatPromptTemplate.from_messages([
+            ("system", coimbatore_prompt),
+            ("human", "{input}"),
+        ])
+        question_answer_chain = create_stuff_documents_chain(llm, prompt)
+        retriever = db.as_retriever()
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+        st.session_state.retrieval_chain = rag_chain
+
+        general_prompt = ChatPromptTemplate.from_messages([
+            ("system", "You are a highly knowledgeable virtual meeting assistant named RUBY. You help users summarize the meeting, provide semantic analysis, clear doubts, answer who said what, classify opening and closing statements, and identify takeaway points. Always ask follow-up questions."),
+            ("user", "Question: {input}")
+        ])
+        st.session_state.general_chain = general_prompt | llm | StrOutputParser()
+
+# Handle user input and update the chat history
+def on_click_callback():
+    user_input = st.session_state.user_input
+    context = "\n".join([f"{msg['origin']}: {msg['message']}" for msg in st.session_state.history])
+
+    if is_coimbatore_query(user_input):
+        response = st.session_state.retrieval_chain.invoke({"input": context + "\n" + user_input})
+        answer = response['answer']
+    else:
+        response = st.session_state.general_chain.invoke({"input": context + "\n" + user_input})
+        answer = response
+
+    st.session_state.history.append({"origin": "human", "message": user_input})
+    st.session_state.history.append({"origin": "ai", "message": answer})
+    save_chat_history(st.session_state.history)
+
+# New chat button
+if st.sidebar.button("New Chat"):
+    st.session_state.history = []
+    clear_chat_history()
+
+# Initialize session
+initialize_session_state()
+
+
+
+st.title("RUBY, INTELLIGENT MEETING BOT 🤖")
+
+# Display chat history
+chat_placeholder = st.container()
+prompt_placeholder = st.form("chat-form")
+
+with chat_placeholder:
+    for chat in st.session_state.history:
+        div = f"""
+<div class="chat-row {'row-reverse' if chat['origin'] == 'human' else ''}">
+    <div class="chat-bubble {'human-bubble' if chat['origin'] == 'human' else 'ai-bubble'}">
+        {chat['message']}
+    </div>
+</div>
+"""
+        st.markdown(div, unsafe_allow_html=True)
+
+with prompt_placeholder:
+    st.markdown("*Ask RUBY about your meeting!*")
+    cols = st.columns((6, 1))
+    cols[0].text_input("Chat", key="user_input", label_visibility="collapsed")
+    cols[1].form_submit_button("Submit", on_click=on_click_callback)
+
+# Press "Enter" to submit input
+components.html("""
+<script>
+const streamlitDoc = window.parent.document;
+const buttons = Array.from(streamlitDoc.querySelectorAll('.stButton > button'));
+const submitButton = buttons.find(el => el.innerText === 'Submit');
+streamlitDoc.addEventListener('keydown', function(e) {
+    if (e.key === 'Enter') {
+        submitButton.click();
+    }
+});
+</script>
+""", height=0, width=0)
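For intuition, the router in new.py sends any query containing one of five keywords to the retrieval chain and everything else to the general chain. A standalone sketch of that behavior (copied rather than imported, since importing new.py would start the Streamlit app):

keywords = ["meeting", "minutes", "transcript", "discussion", "virtual"]

def routes_to_rag(query: str) -> bool:
    # Same substring check as is_coimbatore_query in new.py
    return any(k in query.lower() for k in keywords)

print(routes_to_rag("Summarize the meeting minutes"))   # True  -> RAG over Transcript.pdf
print(routes_to_rag("What's the weather like today?"))  # False -> general chain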
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+langchain==0.0.197
+gpt4all==0.3.4
+chromadb==0.3.23
+urllib3==2.0.2
+PyMuPDF==1.22.3
+python-dotenv==1.0.0
+unstructured==0.6.6
+extract-msg==0.41.1
+tabulate==0.9.0
+pandoc==2.3
+pypandoc==1.11
+tqdm==4.65.0
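These pins cover the ingestion stack; new.py and ingest.py also import packages not listed here. An unpinned supplement is sketched below, package names only, with versions left to the target environment (an assumption, not part of this commit):

streamlit
openai-whisper
sentence-transformers
pypdf
langchain-community
langchain-core
langchain-text-splitters
langchain-ollama
langchain-huggingface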
sample.py
ADDED
@@ -0,0 +1,9 @@
+import tensorflow as tf
+
+# Check if a GPU is available
+if tf.config.list_physical_devices('GPU'):
+    print("GPU is available")
+else:
+    print("GPU is not available")
+
+# Rest of your code...