Spaces:
Sleeping
Sleeping
Commit
·
5f8d57d
0
Parent(s):
first commit
Browse files- .gitignore +171 -0
- create_vector_db.py +86 -0
- query_vector_db.py +41 -0
- requirements.txt +6 -0
- src/speech-to-text/speech-to-text.ipynb +271 -0
- src/speech-to-text/vigyan_bhairav_tantra.mp3 +0 -0
.gitignore
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# I am keeping unzipped data from downloaded file, so no sending to github again.
|
2 |
+
tempdata/
|
3 |
+
|
4 |
+
vector_db/
|
5 |
+
|
6 |
+
# Byte-compiled / optimized / DLL files
|
7 |
+
__pycache__/
|
8 |
+
*.py[cod]
|
9 |
+
*$py.class
|
10 |
+
|
11 |
+
# C extensions
|
12 |
+
*.so
|
13 |
+
|
14 |
+
# Distribution / packaging
|
15 |
+
.Python
|
16 |
+
build/
|
17 |
+
develop-eggs/
|
18 |
+
dist/
|
19 |
+
downloads/
|
20 |
+
eggs/
|
21 |
+
.eggs/
|
22 |
+
lib/
|
23 |
+
lib64/
|
24 |
+
parts/
|
25 |
+
sdist/
|
26 |
+
var/
|
27 |
+
wheels/
|
28 |
+
share/python-wheels/
|
29 |
+
*.egg-info/
|
30 |
+
.installed.cfg
|
31 |
+
*.egg
|
32 |
+
MANIFEST
|
33 |
+
|
34 |
+
# Big files
|
35 |
+
*.zip
|
36 |
+
*.pt
|
37 |
+
*.pptx
|
38 |
+
|
39 |
+
|
40 |
+
# PyInstaller
|
41 |
+
# Usually these files are written by a python script from a template
|
42 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
43 |
+
*.manifest
|
44 |
+
*.spec
|
45 |
+
|
46 |
+
# Installer logs
|
47 |
+
pip-log.txt
|
48 |
+
pip-delete-this-directory.txt
|
49 |
+
|
50 |
+
# Unit test / coverage reports
|
51 |
+
htmlcov/
|
52 |
+
.tox/
|
53 |
+
.nox/
|
54 |
+
.coverage
|
55 |
+
.coverage.*
|
56 |
+
.cache
|
57 |
+
nosetests.xml
|
58 |
+
coverage.xml
|
59 |
+
*.cover
|
60 |
+
*.py,cover
|
61 |
+
.hypothesis/
|
62 |
+
.pytest_cache/
|
63 |
+
cover/
|
64 |
+
|
65 |
+
# Translations
|
66 |
+
*.mo
|
67 |
+
*.pot
|
68 |
+
|
69 |
+
# Django stuff:
|
70 |
+
*.log
|
71 |
+
local_settings.py
|
72 |
+
db.sqlite3
|
73 |
+
db.sqlite3-journal
|
74 |
+
|
75 |
+
# Flask stuff:
|
76 |
+
instance/
|
77 |
+
.webassets-cache
|
78 |
+
|
79 |
+
# Scrapy stuff:
|
80 |
+
.scrapy
|
81 |
+
|
82 |
+
# Sphinx documentation
|
83 |
+
docs/_build/
|
84 |
+
|
85 |
+
# PyBuilder
|
86 |
+
.pybuilder/
|
87 |
+
target/
|
88 |
+
|
89 |
+
# Jupyter Notebook
|
90 |
+
.ipynb_checkpoints
|
91 |
+
|
92 |
+
# IPython
|
93 |
+
profile_default/
|
94 |
+
ipython_config.py
|
95 |
+
|
96 |
+
# pyenv
|
97 |
+
# For a library or package, you might want to ignore these files since the code is
|
98 |
+
# intended to run in multiple environments; otherwise, check them in:
|
99 |
+
# .python-version
|
100 |
+
|
101 |
+
# pipenv
|
102 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
103 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
104 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
105 |
+
# install all needed dependencies.
|
106 |
+
#Pipfile.lock
|
107 |
+
|
108 |
+
# poetry
|
109 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
110 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
111 |
+
# commonly ignored for libraries.
|
112 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
113 |
+
#poetry.lock
|
114 |
+
|
115 |
+
# pdm
|
116 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
117 |
+
#pdm.lock
|
118 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
119 |
+
# in version control.
|
120 |
+
# https://pdm.fming.dev/#use-with-ide
|
121 |
+
.pdm.toml
|
122 |
+
|
123 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
124 |
+
__pypackages__/
|
125 |
+
|
126 |
+
# Celery stuff
|
127 |
+
celerybeat-schedule
|
128 |
+
celerybeat.pid
|
129 |
+
|
130 |
+
# SageMath parsed files
|
131 |
+
*.sage.py
|
132 |
+
|
133 |
+
# Environments
|
134 |
+
.env
|
135 |
+
.venv
|
136 |
+
env/
|
137 |
+
venv/
|
138 |
+
ENV/
|
139 |
+
env.bak/
|
140 |
+
venv.bak/
|
141 |
+
|
142 |
+
# Spyder project settings
|
143 |
+
.spyderproject
|
144 |
+
.spyproject
|
145 |
+
|
146 |
+
# Rope project settings
|
147 |
+
.ropeproject
|
148 |
+
|
149 |
+
# mkdocs documentation
|
150 |
+
/site
|
151 |
+
|
152 |
+
# mypy
|
153 |
+
.mypy_cache/
|
154 |
+
.dmypy.json
|
155 |
+
dmypy.json
|
156 |
+
|
157 |
+
# Pyre type checker
|
158 |
+
.pyre/
|
159 |
+
|
160 |
+
# pytype static type analyzer
|
161 |
+
.pytype/
|
162 |
+
|
163 |
+
# Cython debug symbols
|
164 |
+
cython_debug/
|
165 |
+
|
166 |
+
# PyCharm
|
167 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
168 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
169 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
170 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
171 |
+
#.idea/
|
create_vector_db.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List, Dict
|
3 |
+
import PyPDF2
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
import chromadb
|
6 |
+
from chromadb.utils import embedding_functions
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
class PDFVectorizer:
    """Build a persistent ChromaDB vector store from a directory of PDF files.

    Each PDF is split into overlapping text chunks with a recursive character
    splitter, embedded with a sentence-transformers model, and stored in a
    single ChromaDB collection named "osho_books".
    """

    def __init__(self, pdf_dir: str, db_dir: str):
        """
        Args:
            pdf_dir: Directory containing the source .pdf files.
            db_dir: Directory where the ChromaDB database is persisted.
        """
        self.pdf_dir = pdf_dir
        self.db_dir = db_dir
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        # Persistent client so the database survives process restarts.
        self.client = chromadb.PersistentClient(path=db_dir)
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        # BUGFIX: create_collection() raises if the collection already exists,
        # so re-running the script against an existing vector_db crashed.
        # get_or_create_collection() is idempotent.
        self.collection = self.client.get_or_create_collection(
            name="osho_books",
            embedding_function=self.embedding_function,
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract the concatenated text of all pages of *pdf_path*.

        Returns:
            The page texts joined with newlines, or "" if the PDF could not
            be read.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    # BUGFIX: extract_text() may return None for image-only
                    # pages; the original `text += None + "\n"` raised
                    # TypeError and lost the whole document.
                    text += (page.extract_text() or "") + "\n"
                return text
        except Exception as e:
            # Best-effort: report and skip unreadable PDFs rather than abort.
            print(f"Error processing {pdf_path}: {str(e)}")
            return ""

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """Split one PDF into chunks annotated with book name and chunk index.

        Returns:
            A list of {"text": ..., "metadata": {"book": ..., "chunk_index": ...}}
            dicts; empty if no text could be extracted.
        """
        text = self.extract_text_from_pdf(pdf_path)
        if not text:
            return []

        chunks = self.text_splitter.split_text(text)
        book_name = os.path.basename(pdf_path)

        return [{
            "text": chunk,
            "metadata": {
                "book": book_name,
                "chunk_index": i,
            },
        } for i, chunk in enumerate(chunks)]

    def create_vector_database(self):
        """Process every PDF in `pdf_dir` and add its chunks to the collection."""
        # BUGFIX: case-insensitive extension check so '.PDF' files are not
        # silently skipped.
        pdf_files = [f for f in os.listdir(self.pdf_dir)
                     if f.lower().endswith('.pdf')]

        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            pdf_path = os.path.join(self.pdf_dir, pdf_file)
            chunks = self.process_pdf(pdf_path)

            if chunks:
                # IDs combine file name and chunk index so they are unique
                # and stable across runs.
                self.collection.add(
                    documents=[chunk["text"] for chunk in chunks],
                    metadatas=[chunk["metadata"] for chunk in chunks],
                    ids=[f"{pdf_file}_{chunk['metadata']['chunk_index']}" for chunk in chunks],
                )
                print(f"Added {len(chunks)} chunks from {pdf_file}")
74 |
+
|
75 |
+
def _main() -> None:
    """Build the vector database from the PDFs in ./OshoBooks."""
    cwd = os.getcwd()
    pdf_dir = os.path.join(cwd, "OshoBooks")
    db_dir = os.path.join(cwd, "vector_db")

    # ChromaDB persists into db_dir, so make sure it exists first.
    os.makedirs(db_dir, exist_ok=True)

    PDFVectorizer(pdf_dir, db_dir).create_vector_database()
    print("Vector database creation completed!")


if __name__ == "__main__":
    _main()
|
query_vector_db.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chromadb
|
3 |
+
from chromadb.utils import embedding_functions
|
4 |
+
|
5 |
+
def query_vector_db(query: str, n_results: int = 5):
    """Query the persistent vector database and print the best matches.

    Args:
        query: Natural-language query text.
        n_results: Maximum number of passages to retrieve.

    Returns:
        The raw ChromaDB query result dict, so callers can reuse the matches
        programmatically.  (Backward compatible: the original implementation
        implicitly returned None.)
    """
    # Open the same persistent store that create_vector_db.py writes to.
    db_dir = os.path.join(os.getcwd(), "vector_db")
    client = chromadb.PersistentClient(path=db_dir)

    # Must match the embedding function used at creation time, otherwise the
    # query embeddings live in a different vector space than the documents.
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    collection = client.get_collection(
        name="osho_books",
        embedding_function=embedding_function,
    )

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
    )

    print(f"\nQuery: {query}\n")
    # BUGFIX: guard against absent/empty result lists before indexing [0],
    # which raised IndexError/TypeError on an empty collection.
    documents = (results.get('documents') or [[]])[0]
    metadatas = (results.get('metadatas') or [[]])[0]
    if not documents:
        print("No results found.")
    for i, (doc, metadata) in enumerate(zip(documents, metadatas)):
        print(f"\nResult {i+1}:")
        print(f"Book: {metadata['book']}")
        print(f"Passage: {doc[:200]}...")  # Show first 200 characters
        print("-" * 80)
    return results
35 |
+
|
36 |
+
if __name__ == "__main__":
    # Minimal interactive loop: keep querying until the user types 'quit'.
    while (user_query := input("\nEnter your query (or 'quit' to exit): ")).lower() != 'quit':
        query_vector_db(user_query)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PyPDF2==3.0.1
|
2 |
+
langchain==0.0.350
|
3 |
+
chromadb==0.4.20
|
4 |
+
sentence-transformers==2.2.2
|
5 |
+
tqdm==4.66.1
|
6 |
+
huggingface-hub==0.19.4
|
src/speech-to-text/speech-to-text.ipynb
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"!pip install google-cloud-texttospeech"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 12,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"name": "stdout",
|
19 |
+
"output_type": "stream",
|
20 |
+
"text": [
|
21 |
+
"Google Cloud SDK 486.0.0\n",
|
22 |
+
"bq 2.1.7\n",
|
23 |
+
"core 2024.07.26\n",
|
24 |
+
"gcloud-crc32c 1.0.0\n",
|
25 |
+
"gsutil 5.30\n"
|
26 |
+
]
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"name": "stderr",
|
30 |
+
"output_type": "stream",
|
31 |
+
"text": [
|
32 |
+
"Updates are available for some Google Cloud CLI components. To install them,\n",
|
33 |
+
"please run:\n",
|
34 |
+
" $ gcloud components update\n"
|
35 |
+
]
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"source": [
|
39 |
+
"!gcloud --version"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "code",
|
44 |
+
"execution_count": null,
|
45 |
+
"metadata": {},
|
46 |
+
"outputs": [],
|
47 |
+
"source": [
|
48 |
+
"#gcloud auth application-default login"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "markdown",
|
53 |
+
"metadata": {},
|
54 |
+
"source": [
|
55 |
+
"Your browser has been opened to visit:\n",
|
56 |
+
"\n",
|
57 |
+
" https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=rU3Hm3TBkKnist6ySUS79s8XqobXDV&access_type=offline&code_challenge=4aypyhtRgzs0m7Kdcx5q65JEeGeWz19mktZYChTEb4E&code_challenge_method=S256\n",
|
58 |
+
"\n",
|
59 |
+
"\n",
|
60 |
+
"Credentials saved to file: [C:\\Users\\hari_\\AppData\\Roaming\\gcloud\\application_default_credentials.json]\n",
|
61 |
+
"\n",
|
62 |
+
"These credentials will be used by any library that requests Application Default Credentials (ADC).\n",
|
63 |
+
"\n",
|
64 |
+
"Quota project \"demoproject-111-429713\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": 17,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": [
|
73 |
+
"text1 = \"\"\"\n",
|
74 |
+
"ये वीडियो में मैंने विज्ञान भैरव पुस्तक की 112 धारणाओं को हिंदी में सिर्फ बोल के बताया है। हर एक विधि की समझ विस्तार से देखने के लिए मैंने अलग से वीडियो बनाए हैं। उसे आप देख सकते हैं मैंने विज्ञान भैरव तंत्र की।\n",
|
75 |
+
"\n",
|
76 |
+
"112 धारणाओं को 30 भाग में विभाजित किया है, जिसमें कर्म मार्ग के लिए आठ भाग में 31 धारणा भाव मार्ग के लिए 10 भाग में 36 धारणा और ज्ञान मार्ग के लिए 12 भाग में 45 धारणा विभाजित है। आइए।\n",
|
77 |
+
"\n",
|
78 |
+
"अब हर एक भाग की धारणा को संस्कृत और हिंदी में देखते हैं। भाग एक कर्ममार्ग सांस की धारणा ये भाग में कुल पांच धारणाएं हैं धारणा एक भैरव कहते हैं ऊर्जा के बनने और बिखरने के स्वभाव से ही।\n",
|
79 |
+
"\n",
|
80 |
+
"बाहर आता हुआ प्राण श्वास और अंदर जाता हुआ जीव श्वास निरंतर चलता रहता है। दोनों श्वास के उत्पत्ति बिंदु पर भैरव की शक्ति की भावना करने पर उसका भैरव स्वरूप प्रकट होता है। धारणा दो।\n",
|
81 |
+
"\n",
|
82 |
+
"अंदर आती हुई श्वास अंदर आने के बाद और बाहर जाती हुई श्वास बाहर जाने के बाद एक क्षण के लिए विलीन हो जाती है। उस मध्य स्थिति का विकास करने पर चेतना का भैरव स्वरूप प्रकाशित हो जाता है। धारणा तीन श्वास रूपी प्राण शक्ति\n",
|
83 |
+
"\n",
|
84 |
+
"और जीव शक्ति ना बाहर जाए ना अंदर आए उस मध्य स्थिति को विकसित करने पर साधक अपने भैरव स्वरूप को पहचान लेता है। धारणाचार, बाह्य कुंभक और अंतरकुंभक की मध्यस्थिति के विकास करने पर प्राण और अपान।\n",
|
85 |
+
"\"\"\""
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": 19,
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [],
|
93 |
+
"source": [
|
94 |
+
"output_file_name = \"vigyan_bhairav_tantra.mp3\"\n"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 20,
|
100 |
+
"metadata": {},
|
101 |
+
"outputs": [
|
102 |
+
{
|
103 |
+
"name": "stdout",
|
104 |
+
"output_type": "stream",
|
105 |
+
"text": [
|
106 |
+
"Audio content written to file vigyan_bhairav_tantra.mp3\n"
|
107 |
+
]
|
108 |
+
}
|
109 |
+
],
|
110 |
+
"source": [
|
111 |
+
"import os\n",
|
112 |
+
"from google.cloud import texttospeech\n",
|
113 |
+
"\n",
|
114 |
+
"# Clear any existing credentials environment variable\n",
|
115 |
+
"if \"GOOGLE_APPLICATION_CREDENTIALS\" in os.environ:\n",
|
116 |
+
" del os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"]\n",
|
117 |
+
"\n",
|
118 |
+
"# Instantiates a client\n",
|
119 |
+
"client = texttospeech.TextToSpeechClient()\n",
|
120 |
+
"\n",
|
121 |
+
"# Create the text input to be synthesized\n",
|
122 |
+
"synthesis_input = texttospeech.SynthesisInput(text=text1)\n",
|
123 |
+
"\n",
|
124 |
+
"# Build the voice request\n",
|
125 |
+
"voice = texttospeech.VoiceSelectionParams(\n",
|
126 |
+
" language_code=\"hi-IN\",\n",
|
127 |
+
" name=\"hi-IN-Wavenet-B\", # Male voice (Wavenet offers better quality)\n",
|
128 |
+
" ssml_gender=texttospeech.SsmlVoiceGender.MALE\n",
|
129 |
+
")\n",
|
130 |
+
"\n",
|
131 |
+
"# Select the type of audio file\n",
|
132 |
+
"audio_config = texttospeech.AudioConfig(\n",
|
133 |
+
" audio_encoding=texttospeech.AudioEncoding.MP3,\n",
|
134 |
+
" speaking_rate=1.0, # 0.25 to 4.0\n",
|
135 |
+
" pitch=0.0, # -20.0 to 20.0\n",
|
136 |
+
" volume_gain_db=0.0 # -96.0 to 16.0\n",
|
137 |
+
")\n",
|
138 |
+
"\n",
|
139 |
+
"# Perform the text-to-speech request\n",
|
140 |
+
"response = client.synthesize_speech(\n",
|
141 |
+
" input=synthesis_input,\n",
|
142 |
+
" voice=voice,\n",
|
143 |
+
" audio_config=audio_config\n",
|
144 |
+
")\n",
|
145 |
+
"\n",
|
146 |
+
"# The response's audio_content is binary\n",
|
147 |
+
"with open(output_file_name, \"wb\") as out:\n",
|
148 |
+
" out.write(response.audio_content)\n",
|
149 |
+
"print(f\"Audio content written to file {output_file_name}\")"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 18,
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [
|
157 |
+
{
|
158 |
+
"name": "stdout",
|
159 |
+
"output_type": "stream",
|
160 |
+
"text": [
|
161 |
+
"Available Hindi voices:\n",
|
162 |
+
"==================================================\n",
|
163 |
+
"Name: hi-IN-Neural2-A\n",
|
164 |
+
"Gender: 2\n",
|
165 |
+
"Natural Sample Rate Hertz: 24000\n",
|
166 |
+
"--------------------------------------------------\n",
|
167 |
+
"Name: hi-IN-Neural2-B\n",
|
168 |
+
"Gender: 1\n",
|
169 |
+
"Natural Sample Rate Hertz: 24000\n",
|
170 |
+
"--------------------------------------------------\n",
|
171 |
+
"Name: hi-IN-Neural2-C\n",
|
172 |
+
"Gender: 1\n",
|
173 |
+
"Natural Sample Rate Hertz: 24000\n",
|
174 |
+
"--------------------------------------------------\n",
|
175 |
+
"Name: hi-IN-Neural2-D\n",
|
176 |
+
"Gender: 2\n",
|
177 |
+
"Natural Sample Rate Hertz: 24000\n",
|
178 |
+
"--------------------------------------------------\n",
|
179 |
+
"Name: hi-IN-Standard-A\n",
|
180 |
+
"Gender: 2\n",
|
181 |
+
"Natural Sample Rate Hertz: 24000\n",
|
182 |
+
"--------------------------------------------------\n",
|
183 |
+
"Name: hi-IN-Standard-B\n",
|
184 |
+
"Gender: 1\n",
|
185 |
+
"Natural Sample Rate Hertz: 24000\n",
|
186 |
+
"--------------------------------------------------\n",
|
187 |
+
"Name: hi-IN-Standard-C\n",
|
188 |
+
"Gender: 1\n",
|
189 |
+
"Natural Sample Rate Hertz: 24000\n",
|
190 |
+
"--------------------------------------------------\n",
|
191 |
+
"Name: hi-IN-Standard-D\n",
|
192 |
+
"Gender: 2\n",
|
193 |
+
"Natural Sample Rate Hertz: 24000\n",
|
194 |
+
"--------------------------------------------------\n",
|
195 |
+
"Name: hi-IN-Standard-E\n",
|
196 |
+
"Gender: 2\n",
|
197 |
+
"Natural Sample Rate Hertz: 24000\n",
|
198 |
+
"--------------------------------------------------\n",
|
199 |
+
"Name: hi-IN-Standard-F\n",
|
200 |
+
"Gender: 1\n",
|
201 |
+
"Natural Sample Rate Hertz: 24000\n",
|
202 |
+
"--------------------------------------------------\n",
|
203 |
+
"Name: hi-IN-Wavenet-A\n",
|
204 |
+
"Gender: 2\n",
|
205 |
+
"Natural Sample Rate Hertz: 24000\n",
|
206 |
+
"--------------------------------------------------\n",
|
207 |
+
"Name: hi-IN-Wavenet-B\n",
|
208 |
+
"Gender: 1\n",
|
209 |
+
"Natural Sample Rate Hertz: 24000\n",
|
210 |
+
"--------------------------------------------------\n",
|
211 |
+
"Name: hi-IN-Wavenet-C\n",
|
212 |
+
"Gender: 1\n",
|
213 |
+
"Natural Sample Rate Hertz: 24000\n",
|
214 |
+
"--------------------------------------------------\n",
|
215 |
+
"Name: hi-IN-Wavenet-D\n",
|
216 |
+
"Gender: 2\n",
|
217 |
+
"Natural Sample Rate Hertz: 24000\n",
|
218 |
+
"--------------------------------------------------\n",
|
219 |
+
"Name: hi-IN-Wavenet-E\n",
|
220 |
+
"Gender: 2\n",
|
221 |
+
"Natural Sample Rate Hertz: 24000\n",
|
222 |
+
"--------------------------------------------------\n",
|
223 |
+
"Name: hi-IN-Wavenet-F\n",
|
224 |
+
"Gender: 1\n",
|
225 |
+
"Natural Sample Rate Hertz: 24000\n",
|
226 |
+
"--------------------------------------------------\n"
|
227 |
+
]
|
228 |
+
}
|
229 |
+
],
|
230 |
+
"source": [
|
231 |
+
"from google.cloud import texttospeech\n",
|
232 |
+
"\n",
|
233 |
+
"client = texttospeech.TextToSpeechClient()\n",
|
234 |
+
"\n",
|
235 |
+
"# List all available voices\n",
|
236 |
+
"voices = client.list_voices()\n",
|
237 |
+
"\n",
|
238 |
+
"# Filter and print Hindi voices with their details\n",
|
239 |
+
"print(\"Available Hindi voices:\")\n",
|
240 |
+
"print(\"=\" * 50)\n",
|
241 |
+
"for voice in voices.voices:\n",
|
242 |
+
" if \"hi-IN\" in voice.language_codes:\n",
|
243 |
+
" print(f\"Name: {voice.name}\")\n",
|
244 |
+
" print(f\"Gender: {voice.ssml_gender}\")\n",
|
245 |
+
" print(f\"Natural Sample Rate Hertz: {voice.natural_sample_rate_hertz}\")\n",
|
246 |
+
" print(\"-\" * 50)"
|
247 |
+
]
|
248 |
+
}
|
249 |
+
],
|
250 |
+
"metadata": {
|
251 |
+
"kernelspec": {
|
252 |
+
"display_name": ".venv",
|
253 |
+
"language": "python",
|
254 |
+
"name": "python3"
|
255 |
+
},
|
256 |
+
"language_info": {
|
257 |
+
"codemirror_mode": {
|
258 |
+
"name": "ipython",
|
259 |
+
"version": 3
|
260 |
+
},
|
261 |
+
"file_extension": ".py",
|
262 |
+
"mimetype": "text/x-python",
|
263 |
+
"name": "python",
|
264 |
+
"nbconvert_exporter": "python",
|
265 |
+
"pygments_lexer": "ipython3",
|
266 |
+
"version": "3.12.4"
|
267 |
+
}
|
268 |
+
},
|
269 |
+
"nbformat": 4,
|
270 |
+
"nbformat_minor": 2
|
271 |
+
}
|
src/speech-to-text/vigyan_bhairav_tantra.mp3
ADDED
Binary file (601 kB). View file
|
|