itzbhav committed on
Commit 748b6ee · verified · 1 Parent(s): 86b384c

Upload 8 files

Files changed (8)
  1. .env +6 -0
  2. .pre-commit-config.yaml +44 -0
  3. Transcript.pdf +0 -0
  4. constants.py +15 -0
  5. ingest.py +31 -0
  6. new.py +156 -0
  7. requirements.txt +12 -0
  8. sample.py +9 -0
.env ADDED
@@ -0,0 +1,6 @@
+ PERSIST_DIRECTORY=db
+
+ EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
+ MODEL_N_CTX=2048
+ MODEL_N_BATCH=8
+ TARGET_SOURCE_CHUNKS=4
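
Note: constants.py (below) reads these values with python-dotenv. A minimal sanity check, using only the variable names defined above; remember that dotenv values arrive as strings:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env from the current working directory
    assert os.environ.get("PERSIST_DIRECTORY") == "db"
    assert os.environ.get("EMBEDDINGS_MODEL_NAME") == "all-MiniLM-L6-v2"
    n_ctx = int(os.environ["MODEL_N_CTX"])  # cast numeric settings as needed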
.pre-commit-config.yaml ADDED
@@ -0,0 +1,44 @@
+ ---
+ files: ^(.*\.(py|json|md|sh|yaml|cfg|txt))$
+ exclude: ^(\.[^/]*cache/.*|.*/_user.py|source_documents/)$
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.4.0
+     hooks:
+       # - id: no-commit-to-branch
+       #   args: [--branch, main]
+       - id: check-yaml
+         args: [--unsafe]
+       # - id: debug-statements
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+         exclude: \.md$
+       - id: check-json
+       - id: mixed-line-ending
+       # - id: check-builtin-literals
+       # - id: check-ast
+       - id: check-merge-conflict
+       - id: check-executables-have-shebangs
+       - id: check-shebang-scripts-are-executable
+       - id: check-docstring-first
+       - id: fix-byte-order-marker
+       - id: check-case-conflict
+       # - id: check-toml
+   - repo: https://github.com/adrienverge/yamllint.git
+     rev: v1.29.0
+     hooks:
+       - id: yamllint
+         args:
+           - --no-warnings
+           - -d
+           - '{extends: relaxed, rules: {line-length: {max: 90}}}'
+   - repo: https://github.com/codespell-project/codespell
+     rev: v2.2.2
+     hooks:
+       - id: codespell
+         args:
+           # - --builtin=clear,rare,informal,usage,code,names,en-GB_to_en-US
+           - --builtin=clear,rare,informal,usage,code,names
+           - --ignore-words-list=hass,master
+           - --skip="./.*"
+           - --quiet-level=2
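
Note: with pre-commit installed, running "pre-commit install" registers these hooks in the local clone, and "pre-commit run --all-files" applies them to the whole tree in one pass.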
Transcript.pdf ADDED
Binary file (57.2 kB)
constants.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ from dotenv import load_dotenv
+ from chromadb.config import Settings
+
+ load_dotenv()
+
+ # Define the folder for storing the database
+ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
+
+ # Define the Chroma settings
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl='duckdb+parquet',
+     persist_directory=PERSIST_DIRECTORY,
+     anonymized_telemetry=False
+ )
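
Compatibility note: chroma_db_impl belongs to the legacy chromadb 0.3.x settings API, which matches the chromadb==0.3.23 pin in requirements.txt. On chromadb 0.4 and later this Settings field was removed and persistence is configured on the client instead; a minimal sketch of the newer form:

    import chromadb

    # chromadb >= 0.4: persistence is configured on the client itself
    client = chromadb.PersistentClient(path="db")
    collection = client.get_or_create_collection("meeting_notes")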
ingest.py ADDED
@@ -0,0 +1,31 @@
+ # ingest.py
+ import whisper
+ import os
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+
+
+ # Initialize models
+ whisper_model = whisper.load_model("base")  # or "medium"/"large", depending on system resources
+
+ # Set up Chroma DB
+ client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="chroma_db"))
+ # SentenceTransformerEmbeddingFunction expects a model name string, not a model instance
+ collection = client.get_or_create_collection(
+     name="meeting_notes",
+     embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name="all-MiniLM-L6-v2"
+     ),
+ )
+
+ def transcribe_and_ingest(video_path, meeting_id):
+     print(f"Transcribing {video_path}...")
+     result = whisper_model.transcribe(video_path)
+     transcription = result["text"]
+
+     print("Splitting transcription...")
+     chunks = [transcription[i:i+500] for i in range(0, len(transcription), 500)]
+     ids = [f"{meeting_id}_{i}" for i in range(len(chunks))]
+
+     print("Storing embeddings in Chroma DB...")
+     collection.add(documents=chunks, ids=ids, metadatas=[{"meeting_id": meeting_id}] * len(chunks))
+
+     return transcription  # return full transcription for PDF or future use
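
Note: a minimal usage sketch; the file path and meeting id below are placeholders, not part of this commit:

    # Hypothetical invocation: "standup.mp4" and "meeting_001" are made-up examples
    if __name__ == "__main__":
        transcript = transcribe_and_ingest("standup.mp4", "meeting_001")
        print(f"Ingested {len(transcript)} characters")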
new.py ADDED
@@ -0,0 +1,156 @@
+ import streamlit as st
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import Literal
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_community.llms import Ollama
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain.chains import create_retrieval_chain
+ from langchain_core.output_parsers import StrOutputParser
+ import streamlit.components.v1 as components
+
+ # Initialize the LLM (the model name must match one already pulled in Ollama)
+ llm = Ollama(model="llama3")
+
+ # Load the meeting transcript PDF for retrieval
+ loader = PyPDFLoader("Transcript.pdf")
+ docs = loader.load()
+
+ # Split the document into chunks
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+ documents = text_splitter.split_documents(docs)
+
+ # Create embeddings and vector database
+ embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+ db = Chroma.from_documents(documents, embeddings)
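
Design note: Streamlit re-executes the whole script on every interaction, so the PDF loading, splitting, and embedding above are repeated on each rerun. A common mitigation is to wrap the setup in st.cache_resource; a minimal sketch under that assumption:

    @st.cache_resource
    def build_vectorstore(pdf_path: str = "Transcript.pdf"):
        # Executed once per process; Streamlit caches and reuses the returned store
        docs = PyPDFLoader(pdf_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = splitter.split_documents(docs)
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        return Chroma.from_documents(chunks, embeddings)

    db = build_vectorstore()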
+
+ # Determine whether a query should be routed to the transcript retriever
+ def is_meeting_query(query):
+     keywords = ["meeting", "minutes", "transcript", "discussion", "virtual"]
+     return any(k in query.lower() for k in keywords)
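
For example, "summarize the meeting minutes" matches two keywords and is routed to the retrieval chain, while "what is semantic analysis?" matches none and falls through to the general chain.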
+
+ # Chat history file
+ HISTORY_FILE = "chat_history.json"
+
+ @dataclass
+ class Message:
+     origin: Literal["human", "ai"]
+     message: str
+
+ def load_chat_history():
+     if os.path.exists(HISTORY_FILE):
+         with open(HISTORY_FILE, "r") as f:
+             return json.load(f)
+     return []
+
+ def save_chat_history(history):
+     history_to_save = []
+     for msg in history:
+         # Stringify any dict payloads so the history stays JSON-serializable
+         if isinstance(msg['message'], dict):
+             msg['message'] = str(msg['message'])
+         history_to_save.append(msg)
+     with open(HISTORY_FILE, "w") as f:
+         json.dump(history_to_save, f)
+
+ def clear_chat_history():
+     if os.path.exists(HISTORY_FILE):
+         os.remove(HISTORY_FILE)
+
+ def initialize_session_state():
+     if "history" not in st.session_state:
+         st.session_state.history = load_chat_history()
+
+     # Guard on a key that is actually stored, so the chains aren't rebuilt on every rerun
+     if "retrieval_chain" not in st.session_state:
+         meeting_prompt = """You are an intelligent meeting assistant called RUBY that helps employees clarify their queries about the virtual meeting.
+         Use the following pieces of retrieved context to answer the question in detail: {context}
+         Greet the user back if the user greets you.
+         If you don't know the answer, just say that you don't know.
+         Answer only with relevant content, nothing extra.
+         Don't return the prompt in the answer.
+         Don't respond with anything irrelevant or outside the context."""
+         prompt = ChatPromptTemplate.from_messages([
+             ("system", meeting_prompt),
+             ("human", "{input}"),
+         ])
+         question_answer_chain = create_stuff_documents_chain(llm, prompt)
+         retriever = db.as_retriever()
+         rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+         st.session_state.retrieval_chain = rag_chain
+
+         general_prompt = ChatPromptTemplate.from_messages([
+             ("system", "You are a highly knowledgeable virtual meeting assistant named RUBY. You help users summarize the meeting, provide semantic analysis, clear up doubts, answer who said what, classify opening and closing statements, and identify takeaway points. Always ask follow-up questions."),
+             ("user", "Question: {input}")
+         ])
+         st.session_state.general_chain = general_prompt | llm | StrOutputParser()
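
Note: create_retrieval_chain returns a dict whose keys include "input", "context", and "answer", which is why the callback below reads response['answer']; the general chain ends in StrOutputParser and therefore returns a plain string.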
+
+ # Handle user input and update chat
+ def on_click_callback():
+     user_input = st.session_state.user_input
+     context = "\n".join([f"{msg['origin']}: {msg['message']}" for msg in st.session_state.history])
+
+     if is_meeting_query(user_input):
+         response = st.session_state.retrieval_chain.invoke({"input": context + "\n" + user_input})
+         answer = response['answer']
+     else:
+         answer = st.session_state.general_chain.invoke({"input": context + "\n" + user_input})
+
+     st.session_state.history.append({"origin": "human", "message": user_input})
+     st.session_state.history.append({"origin": "ai", "message": answer})
+     save_chat_history(st.session_state.history)
+
+ # New chat button
+ if st.sidebar.button("New Chat"):
+     st.session_state.history = []
+     clear_chat_history()
+
+ # Initialize session
+ initialize_session_state()
+
+ st.title("RUBY, INTELLIGENT MEETING BOT 🤖")
+
+ # Display chat history
+ chat_placeholder = st.container()
+ prompt_placeholder = st.form("chat-form")
+
+ with chat_placeholder:
+     for chat in st.session_state.history:
+         div = f"""
+         <div class="chat-row {'row-reverse' if chat['origin'] == 'human' else ''}">
+             <div class="chat-bubble {'human-bubble' if chat['origin'] == 'human' else 'ai-bubble'}">
+                 {chat['message']}
+             </div>
+         </div>
+         """
+         st.markdown(div, unsafe_allow_html=True)
+
+ with prompt_placeholder:
+     st.markdown("*Ask RUBY about your meeting!*")
+     cols = st.columns((6, 1))
+     cols[0].text_input("Chat", key="user_input", label_visibility="collapsed")
+     cols[1].form_submit_button("Submit", on_click=on_click_callback)
+
+ # Press "Enter" to submit input
+ components.html("""
+ <script>
+ const streamlitDoc = window.parent.document;
+ const buttons = Array.from(streamlitDoc.querySelectorAll('.stButton > button'));
+ const submitButton = buttons.find(el => el.innerText === 'Submit');
+ streamlitDoc.addEventListener('keydown', function(e) {
+     // Guard in case the selector doesn't match the form submit button on this Streamlit version
+     if (e.key === 'Enter' && submitButton) {
+         submitButton.click();
+     }
+ });
+ </script>
+ """, height=0, width=0)
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ langchain==0.0.197
+ gpt4all==0.3.4
+ chromadb==0.3.23
+ urllib3==2.0.2
+ PyMuPDF==1.22.3
+ python-dotenv==1.0.0
+ unstructured==0.6.6
+ extract-msg==0.41.1
+ tabulate==0.9.0
+ pandoc==2.3
+ pypandoc==1.11
+ tqdm==4.65.0
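
Note: these pins predate the code in this commit. new.py imports langchain_community, langchain_core, and langchain_text_splitters, which ship with a far newer langchain than 0.0.197, and ingest.py needs whisper and sentence-transformers; streamlit and pypdf (used by PyPDFLoader) are also absent. A plausible supplement, left unpinned because the intended versions aren't recorded here:

    streamlit
    langchain
    langchain-community
    sentence-transformers
    openai-whisper
    pypdf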
sample.py ADDED
@@ -0,0 +1,9 @@
+ import tensorflow as tf
+
+ # Check if a GPU is available (tf.test.is_gpu_available is deprecated in TF 2.x)
+ if tf.config.list_physical_devices('GPU'):
+     print("GPU is available")
+ else:
+     print("GPU is not available")
+
+ # Rest of your code...