diff --git a/cookbook/rag_with_faiss.ipynb b/cookbook/rag_with_faiss.ipynb new file mode 100644 index 0000000000000..e14365a3af63a --- /dev/null +++ b/cookbook/rag_with_faiss.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e7135278-a502-4ae9-9e61-b00a80a7c7ec", + "metadata": {}, + "source": [ + "# Build a RAG vector database with FAISS and query your documents\n", + "\n", + "This notebook walks you through the step-by-step process of creating a RAG-based chatbot." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c96cd71a-510d-46a2-a06b-8839818e2196", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: langchain in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n", + "Requirement already satisfied: langchain-community in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.14)\n", + "Requirement already satisfied: langchain_openai in /home/dheerajreddy/lctest/lib/python3.10/site-packages (0.3.0)\n", + "Requirement already satisfied: faiss-cpu in /home/dheerajreddy/lctest/lib/python3.10/site-packages (1.9.0.post1)\n", + "Requirement already satisfied: pypdf in /home/dheerajreddy/lctest/lib/python3.10/site-packages (5.1.0)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.0.37)\n", + "Requirement already satisfied: langsmith<0.3,>=0.1.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.2.11)\n", + "Requirement already satisfied: requests<3,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.32.3)\n", + "Requirement already satisfied: numpy<2,>=1.22.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (1.26.4)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (3.11.11)\n", + 
"Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (2.10.5)\n", + "Requirement already satisfied: langchain-core<0.4.0,>=0.3.29 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.30)\n", + "Requirement already satisfied: PyYAML>=5.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (6.0.2)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (0.3.5)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (9.0.0)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain) (4.0.3)\n", + "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.6.7)\n", + "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (2.7.1)\n", + "Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-community) (0.4.0)\n", + "Requirement already satisfied: tiktoken<1,>=0.7 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (0.8.0)\n", + "Requirement already satisfied: openai<2.0.0,>=1.58.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain_openai) (1.59.8)\n", + "Requirement already satisfied: packaging in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from faiss-cpu) (24.2)\n", + "Requirement already satisfied: typing_extensions>=4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pypdf) (4.12.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in 
/home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.2.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.4)\n", + "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.25.1)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langchain-core<0.4.0,>=0.3.29->langchain) (1.33)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (1.0.0)\n", + "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (3.10.14)\n", + "Requirement 
already satisfied: httpx<1,>=0.23.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from langsmith<0.3,>=0.1.17->langchain) (0.28.1)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.8.0)\n", + "Requirement already satisfied: sniffio in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (4.67.1)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (0.8.2)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.9.0)\n", + "Requirement already satisfied: pydantic-core==2.27.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n", + "Requirement already satisfied: python-dotenv>=0.21.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (2024.12.14)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.10)\n", + "Requirement already satisfied: 
charset-normalizer<4,>=2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from requests<3,>=2->langchain) (3.4.1)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n", + "Requirement already satisfied: regex>=2022.1.18 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from tiktoken<1,>=0.7->langchain_openai) (2024.11.6)\n", + "Requirement already satisfied: exceptiongroup>=1.0.2 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.58.1->langchain_openai) (1.2.2)\n", + "Requirement already satisfied: httpcore==1.* in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.0.7)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.29->langchain) (3.0.0)\n", + "Requirement already satisfied: mypy-extensions>=0.3.0 in /home/dheerajreddy/lctest/lib/python3.10/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n" + ] + } + ], + "source": [ + "! 
pip install -U langchain langchain-community langchain_openai faiss-cpu pypdf # (newest versions required for multi-modal)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d033c505-c805-49cd-905d-97edf606113d", + "metadata": {}, + "outputs": [], + "source": [ + "#Import all necessary libraries\n", + "# from langchain_community.chat_models import ChatOpenAI\n", + "from langchain_openai import ChatOpenAI\n", + "from langchain_community.vectorstores import FAISS\n", + "# from langchain_community.embeddings import OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_community.document_loaders import PyPDFLoader,TextLoader\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "from langchain import hub\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4196d408-efe1-442c-8d8a-856a002ab8ac", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter API key for OpenAI: ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c4eb93a9-6e6a-4ab9-822b-1e5106e30258", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize vector store and embeddings\n", + "vectorstore = None\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "# Define a prompt template\n", + "prompt_template = PromptTemplate(\n", + " input_variables=[\"context\", \"question\"],\n", + " template=\"\"\"\n", + " You are an intelligent assistant. 
# Helper functions
def process_documents(file_paths, chunk_size=1000, chunk_overlap=200):
    """Load files, split them into chunks, and add the chunks to the vector store.

    Paths ending in ``.pdf`` (case-insensitive) are read with ``PyPDFLoader``;
    every other path is read with ``TextLoader``. A file that fails to load is
    reported on stdout and skipped, so one bad file does not abort the batch.

    Args:
        file_paths: Iterable of file paths (strings or path-like objects).
        chunk_size: Passed to ``RecursiveCharacterTextSplitter(chunk_size=...)``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter(chunk_overlap=...)``.

    Returns:
        None. The result is stored in the module-level ``vectorstore``.

    Side effects:
        Creates the module-level ``vectorstore`` on first successful use, or
        adds to it on later calls; reads the module-level ``embeddings``;
        prints progress information.
    """
    # The docstring must be the first statement of the function; the original
    # placed a print() before it, which turned it into a discarded expression.
    global vectorstore
    print(file_paths)

    documents = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        # str() lets pathlib.Path inputs work; lower() makes ".PDF" match too.
        path_text = str(file_path)
        try:
            if path_text.lower().endswith(".pdf"):
                loader = PyPDFLoader(path_text)
            else:
                loader = TextLoader(path_text)
            documents.extend(loader.load())
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing file {file_path}: {e}")

    docs = text_splitter.split_documents(documents)

    # Nothing to index (empty input, or every file failed to load). Building a
    # FAISS index from an empty list is expected to fail, so leave the store
    # untouched instead of crashing after the errors were already reported.
    if not docs:
        print("No document chunks were produced; vector store left unchanged.")
        return

    # Create the store on first use, otherwise extend the existing one.
    if vectorstore is None:
        vectorstore = FAISS.from_documents(docs, embeddings)
    else:
        vectorstore.add_documents(docs)
    # Presumably the total number of vectors currently held by the index.
    print(vectorstore.index.ntotal)


def format_docs(docs):
    """Join the ``page_content`` of each document with a blank line between them.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string; the empty string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)


def get_qa_chain():
    """Build the question-answering pipeline over the current vector store.

    The pipeline retrieves documents for the incoming question, formats them
    with ``format_docs`` as the ``context``, passes the question through
    unchanged, and pipes both into the prompt, ``ChatOpenAI()`` and
    ``StrOutputParser()``.

    Returns:
        The composed runnable; the notebook calls it with
        ``qa_chain.invoke(question)`` and prints the result.

    Raises:
        ValueError: If the module-level ``vectorstore`` is still ``None``.
    """
    if vectorstore is None:
        raise ValueError("Vector store is not initialized. Upload documents first.")

    # NOTE(review): the prompt is pulled from the LangChain hub (network
    # access needed); the `prompt_template` defined earlier in the notebook
    # is never used. Confirm which prompt is intended.
    prompt = hub.pull("rlm/rag-prompt")
    retriever = vectorstore.as_retriever()
    qa_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt
        | ChatOpenAI()
        | StrOutputParser()
    )

    return qa_chain
to determine which company's financials are better based on the given context as both Nike and Puma present different aspects of their financial performance. Nike reported revenues of $12.4 billion in the third quarter of fiscal year 2024, while Puma emphasized its focus on managing short-term challenges without compromising long-term momentum and positive feedback on upcoming product releases. Additional analysis and comparison of financial statements would be needed to determine which company's financial condition is better.\n" + ] + } + ], + "source": [ + "#query the documents\n", + "question = 'Among Nike and Puma whose financials are better'\n", + "response = qa_chain.invoke(question)\n", + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57fb42dd-684a-4c80-917d-37feaa41a6ea", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/docs/example_data/nike-q3-2024-earnings.pdf b/docs/docs/example_data/nike-q3-2024-earnings.pdf new file mode 100644 index 0000000000000..78f41618697be Binary files /dev/null and b/docs/docs/example_data/nike-q3-2024-earnings.pdf differ diff --git a/docs/docs/example_data/puma-q3-2024-earnings.pdf b/docs/docs/example_data/puma-q3-2024-earnings.pdf new file mode 100644 index 0000000000000..350cbf5722f1f Binary files /dev/null and b/docs/docs/example_data/puma-q3-2024-earnings.pdf differ