Skip to content

Commit

Permalink
Add Jupyter Notebook for RAG chatbot with FAISS and sample PDFs
Browse files Browse the repository at this point in the history
  • Loading branch information
dheerajreddy2020 committed Jan 18, 2025
1 parent 1cd4d8d commit 2503d3f
Show file tree
Hide file tree
Showing 3 changed files with 318 additions and 0 deletions.
318 changes: 318 additions & 0 deletions cookbook/rag_with_faiss.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e7135278-a502-4ae9-9e61-b00a80a7c7ec",
"metadata": {},
"source": [
"# Build a RAG vector database using FAISS and query your documents\n",
"\n",
"This notebook walks you through the step-by-step process of creating a RAG-based chatbot."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c96cd71a-510d-46a2-a06b-8839818e2196",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: langchain in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n",
"Requirement already satisfied: langchain-community in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n",
"Requirement already satisfied: langchain_openai in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.0)\n",
"Requirement already satisfied: faiss-cpu in /home/dheerajreddy/lctest/lib/python3.10/site-packages (1.9.0.post1)\n",
"Requirement already satisfied: pypdf in /home/dheerajreddy/lctest/lib/python3.10/site-packages (5.1.0)\n",
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.0.37)\n",
"Requirement already satisfied: langsmith<0.3,>=0.1.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.2.11)\n",
"Requirement already satisfied: requests<3,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.32.3)\n",
"Requirement already satisfied: numpy<2,>=1.22.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (1.26.4)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (3.11.11)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.10.5)\n",
"Requirement already satisfied: langchain-core<0.4.0,>=0.3.29 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.30)\n",
"Requirement already satisfied: PyYAML>=5.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (6.0.2)\n",
"Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.5)\n",
"Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (9.0.0)\n",
"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (4.0.3)\n",
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.6.7)\n",
"Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (2.7.1)\n",
"Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.4.0)\n",
"Requirement already satisfied: tiktoken<1,>=0.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (0.8.0)\n",
"Requirement already satisfied: openai<2.0.0,>=1.58.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (1.59.8)\n",
"Requirement already satisfied: packaging in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from faiss-cpu) (24.2)\n",
"Requirement already satisfied: typing_extensions>=4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pypdf) (4.12.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.2.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.4)\n",
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n",
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.25.1)\n",
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-core<0.4.0,>=0.3.29->langchain) (1.33)\n",
"Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (1.0.0)\n",
"Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (3.10.14)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (0.28.1)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.8.0)\n",
"Requirement already satisfied: sniffio in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.3.1)\n",
"Requirement already satisfied: tqdm>4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.67.1)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (0.8.2)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.9.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2024.12.14)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.10)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.4.1)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n",
"Requirement already satisfied: regex>=2022.1.18 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from tiktoken<1,>=0.7->langchain_openai) (2024.11.6)\n",
"Requirement already satisfied: exceptiongroup>=1.0.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.58.1->langchain_openai) (1.2.2)\n",
"Requirement already satisfied: httpcore==1.* in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (0.14.0)\n",
"Requirement already satisfied: jsonpointer>=1.9 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.29->langchain) (3.0.0)\n",
"Requirement already satisfied: mypy-extensions>=0.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n"
]
}
],
"source": [
"# Install the notebook's dependencies into the environment of the running kernel.\n",
"%pip install -qU langchain langchain-community langchain_openai faiss-cpu pypdf"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d033c505-c805-49cd-905d-97edf606113d",
"metadata": {},
"outputs": [],
"source": [
"#Import all necessary libraries\n",
"# from langchain_community.chat_models import ChatOpenAI\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_community.vectorstores import FAISS\n",
"# from langchain_community.embeddings import OpenAIEmbeddings\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_core.prompts import PromptTemplate\n",
"from langchain_community.document_loaders import PyPDFLoader,TextLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain import hub\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4196d408-efe1-442c-8d8a-856a002ab8ac",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Enter API key for OpenAI: ········\n"
]
}
],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# Prompt for the key only when the environment does not already provide a non-empty one.\n",
"_KEY_NAME = \"OPENAI_API_KEY\"\n",
"if not os.environ.get(_KEY_NAME):\n",
"    os.environ[_KEY_NAME] = getpass.getpass(\"Enter API key for OpenAI: \")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c4eb93a9-6e6a-4ab9-822b-1e5106e30258",
"metadata": {},
"outputs": [],
"source": [
"# Embedding model used whenever documents are added to the FAISS index\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"# The vector store starts out empty; it is created when documents are processed\n",
"vectorstore = None\n",
"\n",
"# Prompt with two placeholders: the retrieved context and the user's question\n",
"prompt_template = PromptTemplate(\n",
"    template=\"\"\"\n",
" You are an intelligent assistant. Use the following context to answer the user's question accurately:\n",
"\n",
" Context: {context}\n",
"\n",
" Question: {question}\n",
"\n",
" Answer: \"\"\",\n",
"    input_variables=[\"context\", \"question\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9d8e77bc-0e5e-4d8f-8475-d9248b048817",
"metadata": {},
"outputs": [],
"source": [
"# Helper functions for indexing documents and building the QA chain\n",
"def process_documents(file_paths):\n",
"    \"\"\"Load, split, and index documents into the global FAISS vector store.\n",
"\n",
"    Paths ending in ``.pdf`` (case-insensitive) are read with ``PyPDFLoader``;\n",
"    every other path is read with ``TextLoader``. A file that fails to load is\n",
"    reported and skipped so the remaining files are still processed.\n",
"\n",
"    Args:\n",
"        file_paths: Iterable of path strings for the documents to index.\n",
"\n",
"    Side effects:\n",
"        Creates the global ``vectorstore`` on first use, or adds the new chunks\n",
"        to it on later calls, then prints the total number of indexed vectors.\n",
"        When no content could be loaded, the store is left unchanged.\n",
"    \"\"\"\n",
"    global vectorstore\n",
"    print(file_paths)\n",
"    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"    documents = []\n",
"\n",
"    for file_path in file_paths:\n",
"        print(f\"Processing file: {file_path}\")\n",
"\n",
"        try:\n",
"            is_pdf = file_path.lower().endswith(\".pdf\")\n",
"            loader = PyPDFLoader(file_path) if is_pdf else TextLoader(file_path)\n",
"            documents.extend(loader.load())\n",
"        except Exception as e:\n",
"            # Best effort: report the failure and keep going with the other files.\n",
"            print(f\"Error processing file {file_path}: {e}\")\n",
"            continue\n",
"\n",
"    docs = text_splitter.split_documents(documents)\n",
"    if not docs:\n",
"        # Nothing to embed: building a FAISS index from an empty list fails, so stop here.\n",
"        print(\"No document content was loaded; vector store left unchanged.\")\n",
"        return\n",
"\n",
"    # Create the store on first use, otherwise extend the existing index\n",
"    if vectorstore is None:\n",
"        vectorstore = FAISS.from_documents(docs, embeddings)\n",
"    else:\n",
"        vectorstore.add_documents(docs)\n",
"    print(vectorstore.index.ntotal)\n",
"\n",
"\n",
"def format_docs(docs):\n",
"    \"\"\"Return the page_content of every document in docs, separated by blank lines.\"\"\"\n",
"    page_texts = [doc.page_content for doc in docs]\n",
"    return \"\\n\\n\".join(page_texts)\n",
"\n",
"\n",
"def get_qa_chain(prompt=None, llm=None):\n",
"    \"\"\"Build a retrieval-augmented question-answering chain over the vector store.\n",
"\n",
"    The chain takes a question string, retrieves matching chunks from the global\n",
"    ``vectorstore``, joins them with ``format_docs``, fills the prompt, calls the\n",
"    chat model, and parses the reply with ``StrOutputParser``.\n",
"\n",
"    Args:\n",
"        prompt: Prompt accepting ``context`` and ``question`` variables. Defaults\n",
"            to ``rlm/rag-prompt`` pulled from the LangChain Hub.\n",
"        llm: Chat model used to answer. Defaults to ``ChatOpenAI()``.\n",
"\n",
"    Returns:\n",
"        A runnable chain; call ``.invoke(question)`` to get the answer.\n",
"\n",
"    Raises:\n",
"        ValueError: If the vector store has not been created yet.\n",
"    \"\"\"\n",
"    if vectorstore is None:\n",
"        raise ValueError(\"Vector store is not initialized. Upload documents first.\")\n",
"    if prompt is None:\n",
"        # Only contact the hub when the caller did not supply a prompt.\n",
"        prompt = hub.pull(\"rlm/rag-prompt\")\n",
"    if llm is None:\n",
"        llm = ChatOpenAI()\n",
"    retriever = vectorstore.as_retriever()\n",
"\n",
"    qa_chain = (\n",
"        {\n",
"            \"context\": retriever | format_docs,\n",
"            \"question\": RunnablePassthrough(),\n",
"        }\n",
"        | prompt\n",
"        | llm\n",
"        | StrOutputParser()\n",
"    )\n",
"\n",
"    return qa_chain"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c25b33fe-8039-4bbd-a3af-c829179325b5",
"metadata": {},
"outputs": [],
"source": [
"# Add the paths of all the files you want the retrieval chatbot to cover\n",
"file_paths = ['../docs/docs/example_data/nike-q3-2024-earnings.pdf',\n",
" '../docs/docs/example_data/puma-q3-2024-earnings.pdf']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3b0ced3b-890d-4fd4-9b3e-849e73451210",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['../docs/docs/example_data/nike-q3-2024-earnings.pdf', '../docs/docs/example_data/puma-q3-2024-earnings.pdf']\n",
"Processing file: ../docs/docs/example_data/nike-q3-2024-earnings.pdf\n",
"Processing file: ../docs/docs/example_data/puma-q3-2024-earnings.pdf\n",
"49\n"
]
}
],
"source": [
"# Create the vector embeddings and build the FAISS index\n",
"process_documents(file_paths)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e261b2c3-d62e-428f-a615-2643e2c3f79d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/dheerajreddy/lctest/lib/python3.10/site-packages/langsmith/client.py:256: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n",
" warnings.warn(\n"
]
}
],
"source": [
"qa_chain = get_qa_chain()\n",
"# "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "723f652b-ca89-4f9e-9b39-60c451336d2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"It is difficult to determine which company's financials are better based on the given context as both Nike and Puma present different aspects of their financial performance. Nike reported revenues of $12.4 billion in the third quarter of fiscal year 2024, while Puma emphasized its focus on managing short-term challenges without compromising long-term momentum and positive feedback on upcoming product releases. Additional analysis and comparison of financial statements would be needed to determine which company's financial condition is better.\n"
]
}
],
"source": [
"# Query the documents\n",
"question = 'Among Nike and Puma whose financials are better'\n",
"response = qa_chain.invoke(question)\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57fb42dd-684a-4c80-917d-37feaa41a6ea",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added docs/docs/example_data/nike-q3-2024-earnings.pdf
Binary file not shown.
Binary file added docs/docs/example_data/puma-q3-2024-earnings.pdf
Binary file not shown.

0 comments on commit 2503d3f

Please sign in to comment.