Commit 2503d3f
Add Jupyter Notebook for RAG chatbot with FAISS and sample PDFs
1 parent 1cd4d8d
Showing 3 changed files with 318 additions and 0 deletions.
@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e7135278-a502-4ae9-9e61-b00a80a7c7ec",
"metadata": {},
"source": [
"# Build a RAG vector database using FAISS and query the documents\n",
"\n",
"This notebook walks you through the step-by-step process of building a RAG-based chatbot."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c96cd71a-510d-46a2-a06b-8839818e2196",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: langchain in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n",
"Requirement already satisfied: langchain-community in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n",
"Requirement already satisfied: langchain_openai in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.0)\n",
"Requirement already satisfied: faiss-cpu in /home/dheerajreddy/lctest/lib/python3.10/site-packages (1.9.0.post1)\n",
"Requirement already satisfied: pypdf in /home/dheerajreddy/lctest/lib/python3.10/site-packages (5.1.0)\n",
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.0.37)\n",
"Requirement already satisfied: langsmith<0.3,>=0.1.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.2.11)\n",
"Requirement already satisfied: requests<3,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.32.3)\n",
"Requirement already satisfied: numpy<2,>=1.22.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (1.26.4)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (3.11.11)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.10.5)\n",
"Requirement already satisfied: langchain-core<0.4.0,>=0.3.29 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.30)\n",
"Requirement already satisfied: PyYAML>=5.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (6.0.2)\n",
"Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.5)\n",
"Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (9.0.0)\n",
"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (4.0.3)\n",
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.6.7)\n",
"Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (2.7.1)\n",
"Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.4.0)\n",
"Requirement already satisfied: tiktoken<1,>=0.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (0.8.0)\n",
"Requirement already satisfied: openai<2.0.0,>=1.58.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (1.59.8)\n",
"Requirement already satisfied: packaging in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from faiss-cpu) (24.2)\n",
"Requirement already satisfied: typing_extensions>=4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pypdf) (4.12.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.2.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.4)\n",
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n",
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.25.1)\n",
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-core<0.4.0,>=0.3.29->langchain) (1.33)\n",
"Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (1.0.0)\n",
"Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (3.10.14)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (0.28.1)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.8.0)\n",
"Requirement already satisfied: sniffio in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.3.1)\n",
"Requirement already satisfied: tqdm>4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.67.1)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (0.8.2)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.9.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2024.12.14)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.10)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.4.1)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n",
"Requirement already satisfied: regex>=2022.1.18 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from tiktoken<1,>=0.7->langchain_openai) (2024.11.6)\n",
"Requirement already satisfied: exceptiongroup>=1.0.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.58.1->langchain_openai) (1.2.2)\n",
"Requirement already satisfied: httpcore==1.* in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (0.14.0)\n",
"Requirement already satisfied: jsonpointer>=1.9 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.29->langchain) (3.0.0)\n",
"Requirement already satisfied: mypy-extensions>=0.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n"
]
}
],
"source": [ | ||
"! pip install -U langchain langchain-community langchain_openai faiss-cpu pypdf # (newest versions required for multi-modal)" | ||
] | ||
}, | ||
{
"cell_type": "code",
"execution_count": 2,
"id": "d033c505-c805-49cd-905d-97edf606113d",
"metadata": {},
"outputs": [],
"source": [
"# Import all necessary libraries\n",
"# from langchain_community.chat_models import ChatOpenAI\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_community.vectorstores import FAISS\n",
"# from langchain_community.embeddings import OpenAIEmbeddings\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_core.prompts import PromptTemplate\n",
"from langchain_community.document_loaders import PyPDFLoader, TextLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain import hub\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4196d408-efe1-442c-8d8a-856a002ab8ac",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Enter API key for OpenAI: ········\n"
]
}
],
"source": [
"import getpass\n",
"import os\n",
"\n",
"if not os.environ.get(\"OPENAI_API_KEY\"):\n",
"    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c4eb93a9-6e6a-4ab9-822b-1e5106e30258",
"metadata": {},
"outputs": [],
"source": [
"# Initialize vector store and embeddings\n",
"vectorstore = None\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"# Define a prompt template\n",
"prompt_template = PromptTemplate(\n",
"    input_variables=[\"context\", \"question\"],\n",
"    template=\"\"\"\n",
"    You are an intelligent assistant. Use the following context to answer the user's question accurately:\n",
"\n",
"    Context: {context}\n",
"\n",
"    Question: {question}\n",
"\n",
"    Answer: \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9d8e77bc-0e5e-4d8f-8475-d9248b048817",
"metadata": {},
"outputs": [],
"source": [
"# Helper functions\n",
"def process_documents(file_paths):\n",
"    \"\"\"Processes and updates the vector store with new documents.\"\"\"\n",
"    print(file_paths)\n",
"    global vectorstore\n",
"    documents = []\n",
"    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"\n",
"    for file_path in file_paths:\n",
"        print(f\"Processing file: {file_path}\")\n",
"\n",
"        try:\n",
"            loader = PyPDFLoader(file_path) if file_path.endswith(\".pdf\") else TextLoader(file_path)\n",
"            documents.extend(loader.load())\n",
"        except Exception as e:\n",
"            print(f\"Error processing file {file_path}: {e}\")\n",
"            continue\n",
"\n",
"    docs = text_splitter.split_documents(documents)\n",
"\n",
"    # Create or update the vector store\n",
"    if vectorstore is None:\n",
"        vectorstore = FAISS.from_documents(docs, embeddings)\n",
"    else:\n",
"        vectorstore.add_documents(docs)\n",
"    print(vectorstore.index.ntotal)  # total number of vectors in the index\n",
"\n",
"\n",
"def format_docs(docs):\n",
"    \"\"\"Join retrieved document chunks into a single context string.\"\"\"\n",
"    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
"\n",
"\n",
"def get_qa_chain():\n",
"    \"\"\"Creates and returns a RetrievalQA chain.\"\"\"\n",
"    if vectorstore is None:\n",
"        raise ValueError(\"Vector store is not initialized. Upload documents first.\")\n",
"    prompt = hub.pull(\"rlm/rag-prompt\")\n",
"    retriever = vectorstore.as_retriever()\n",
"    qa_chain = (\n",
"        {\n",
"            \"context\": retriever | format_docs,\n",
"            \"question\": RunnablePassthrough(),\n",
"        }\n",
"        | prompt\n",
"        | ChatOpenAI()\n",
"        | StrOutputParser()\n",
"    )\n",
"\n",
"    return qa_chain"
]
},
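{
"cell_type": "markdown",
"id": "custom-prompt-note",
"metadata": {},
"source": [
"Note that `get_qa_chain()` pulls the `rlm/rag-prompt` prompt from the LangChain Hub, so the custom `prompt_template` defined earlier is not used by the default chain. The next cell is a minimal, optional sketch of a chain that plugs in the custom prompt instead; the helper name `get_qa_chain_with_custom_prompt` is new, it reuses the `vectorstore`, `format_docs`, and `prompt_template` objects defined above, and it is not executed in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "custom-prompt-chain-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: the same RAG chain, but using the custom prompt_template defined\n",
"# above instead of the rlm/rag-prompt pulled from the LangChain Hub.\n",
"def get_qa_chain_with_custom_prompt():\n",
"    \"\"\"Creates a RAG chain that uses the locally defined prompt_template.\"\"\"\n",
"    if vectorstore is None:\n",
"        raise ValueError(\"Vector store is not initialized. Upload documents first.\")\n",
"    retriever = vectorstore.as_retriever()\n",
"    return (\n",
"        {\n",
"            \"context\": retriever | format_docs,\n",
"            \"question\": RunnablePassthrough(),\n",
"        }\n",
"        | prompt_template\n",
"        | ChatOpenAI()\n",
"        | StrOutputParser()\n",
"    )"
]
},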
{
"cell_type": "code",
"execution_count": 6,
"id": "c25b33fe-8039-4bbd-a3af-c829179325b5",
"metadata": {},
"outputs": [],
"source": [
"# Add the file paths of all the documents you want the chatbot to retrieve from\n",
"file_paths = ['../docs/docs/example_data/nike-q3-2024-earnings.pdf',\n",
"              '../docs/docs/example_data/puma-q3-2024-earnings.pdf']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3b0ced3b-890d-4fd4-9b3e-849e73451210",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['../docs/docs/example_data/nike-q3-2024-earnings.pdf', '../docs/docs/example_data/puma-q3-2024-earnings.pdf']\n",
"Processing file: ../docs/docs/example_data/nike-q3-2024-earnings.pdf\n",
"Processing file: ../docs/docs/example_data/puma-q3-2024-earnings.pdf\n",
"49\n"
]
}
],
"source": [
"# Create the vector embeddings and build the FAISS index\n",
"process_documents(file_paths)"
]
},
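{
"cell_type": "markdown",
"id": "persist-index-note",
"metadata": {},
"source": [
"Optionally, the FAISS index can be persisted to disk and reloaded later so the PDFs do not have to be re-embedded on every run. The next cell is a minimal sketch using the vector store's `save_local` / `load_local` methods; the folder name `faiss_index` is arbitrary and the cell is not executed in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "persist-index-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: persist the FAISS index so the documents do not need to be\n",
"# re-embedded in a later session. The folder name \"faiss_index\" is arbitrary.\n",
"vectorstore.save_local(\"faiss_index\")\n",
"\n",
"# Reload the index with the same embeddings model. allow_dangerous_deserialization\n",
"# is required because the docstore is stored as a pickle file.\n",
"reloaded_vectorstore = FAISS.load_local(\n",
"    \"faiss_index\", embeddings, allow_dangerous_deserialization=True\n",
")\n",
"print(reloaded_vectorstore.index.ntotal)"
]
},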
{
"cell_type": "code",
"execution_count": 8,
"id": "e261b2c3-d62e-428f-a615-2643e2c3f79d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/dheerajreddy/lctest/lib/python3.10/site-packages/langsmith/client.py:256: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n",
"  warnings.warn(\n"
]
}
],
"source": [
"# Build the question-answering chain\n",
"qa_chain = get_qa_chain()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "723f652b-ca89-4f9e-9b39-60c451336d2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"It is difficult to determine which company's financials are better based on the given context as both Nike and Puma present different aspects of their financial performance. Nike reported revenues of $12.4 billion in the third quarter of fiscal year 2024, while Puma emphasized its focus on managing short-term challenges without compromising long-term momentum and positive feedback on upcoming product releases. Additional analysis and comparison of financial statements would be needed to determine which company's financial condition is better.\n"
]
}
],
"source": [
"# Query the documents\n",
"question = 'Among Nike and Puma whose financials are better'\n",
"response = qa_chain.invoke(question)\n",
"print(response)"
]
},
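{
"cell_type": "markdown",
"id": "inspect-retrieval-note",
"metadata": {},
"source": [
"To see which chunks the retriever hands to the model for a given question, the vector store can also be queried directly. The next cell is a small sketch using `similarity_search`; the value of `k` is arbitrary and the cell is not executed in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "inspect-retrieval-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the chunks retrieved for the question above (k is arbitrary).\n",
"retrieved_docs = vectorstore.similarity_search(question, k=4)\n",
"for i, doc in enumerate(retrieved_docs, start=1):\n",
"    print(f\"--- Chunk {i} (source: {doc.metadata.get('source')}) ---\")\n",
"    print(doc.page_content[:300])"
]
},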
{
"cell_type": "code",
"execution_count": null,
"id": "57fb42dd-684a-4c80-917d-37feaa41a6ea",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
The two sample PDFs (nike-q3-2024-earnings.pdf and puma-q3-2024-earnings.pdf) are binary files and are not shown.