diff --git a/docs/docs/integrations/document_loaders/pymupdf.ipynb b/docs/docs/integrations/document_loaders/pymupdf.ipynb index 81893dc7d684f6..e304a94395b881 100644 --- a/docs/docs/integrations/document_loaders/pymupdf.ipynb +++ b/docs/docs/integrations/document_loaders/pymupdf.ipynb @@ -35,24 +35,24 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": "If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:" }, { + "cell_type": "code", + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:39.287984Z", "start_time": "2025-01-17T11:06:39.285720Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n", "# os.environ[\"LANGSMITH_TRACING\"] = \"true\"" - ], - "outputs": [], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -65,13 +65,13 @@ }, { "cell_type": "code", + "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:42.183569Z", "start_time": "2025-01-17T11:06:40.528770Z" } }, - "source": "%pip install -qU langchain_community pymupdf", "outputs": [ { "name": "stdout", @@ -81,7 +81,9 @@ ] } ], - "execution_count": 2 + "source": [ + "%pip install -qU langchain_community pymupdf" + ] }, { "cell_type": "markdown", @@ -94,20 +96,20 @@ }, { "cell_type": "code", + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:44.403523Z", "start_time": "2025-01-17T11:06:43.736030Z" } }, + "outputs": [], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "\n", "file_path = \"./example_data/layout-parser-paper.pdf\"\n", "loader = PyMuPDFLoader(file_path)" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", @@ -118,16 +120,13 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:46.138267Z", "start_time": "2025-01-17T11:06:46.001187Z" } }, - "source": [ - "docs = loader.load()\n", - "docs[0]" - ], "outputs": [ { "data": { @@ -140,21 +139,20 @@ "output_type": "execute_result" } ], - "execution_count": 4 + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:46.646335Z", "start_time": "2025-01-17T11:06:46.642667Z" } }, - "source": [ - "import pprint\n", - "\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -177,7 +175,11 @@ ] } ], - "execution_count": 5 + "source": [ + "import pprint\n", + "\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -188,23 +190,13 @@ }, { "cell_type": "code", + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:48.147692Z", "start_time": "2025-01-17T11:06:48.094257Z" } }, - "source": [ - "pages = []\n", - "for doc in loader.lazy_load():\n", - " pages.append(doc)\n", - " if len(pages) >= 10:\n", - " # do some paged operation, e.g.\n", - " # index.upsert(page)\n", - "\n", - " pages = []\n", - "len(pages)" - ], "outputs": [ { "data": { @@ -217,20 +209,27 @@ "output_type": "execute_result" } ], - "execution_count": 6 + "source": [ + "pages = []\n", + "for doc in loader.lazy_load():\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", + " # do some paged operation, e.g.\n", + " # index.upsert(page)\n", + "\n", + " pages = []\n", + "len(pages)" 
+ ] }, { "cell_type": "code", + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:50.003790Z", "start_time": "2025-01-17T11:06:50.000060Z" } }, - "source": [ - "print(pages[0].page_content[:100])\n", - "pprint.pp(pages[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -256,7 +255,10 @@ ] } ], - "execution_count": 7 + "source": [ + "print(pages[0].page_content[:100])\n", + "pprint.pp(pages[0].metadata)" + ] }, { "cell_type": "markdown", @@ -301,21 +303,13 @@ }, { "cell_type": "code", + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:53.613494Z", "start_time": "2025-01-17T11:06:53.563930Z" } }, - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -339,7 +333,15 @@ ] } ], - "execution_count": 8 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -357,21 +359,13 @@ }, { "cell_type": "code", + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:55.955935Z", "start_time": "2025-01-17T11:06:55.903604Z" } }, - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"single\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -394,7 +388,15 @@ ] } ], - "execution_count": 9 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"single\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -409,22 +411,14 @@ "source": "### Add a custom *pages_delimiter* to identify where are ends of pages in *single* mode:" }, { + "cell_type": "code", + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:07:31.932597Z", "start_time": "2025-01-17T11:07:31.885499Z" } }, - "cell_type": "code", - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"single\",\n", - " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content[:5780])" - ], "outputs": [ { "name": "stdout", @@ -528,7 +522,15 @@ ] } ], - "execution_count": 11 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"single\",\n", + " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content[:5780])" + ] }, { "cell_type": "markdown", @@ -567,15 +569,13 @@ }, { "cell_type": "code", + "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:07:39.281686Z", "start_time": "2025-01-17T11:07:37.500638Z" } }, - "source": [ - "%pip install -qU rapidocr-onnxruntime" - ], "outputs": [ { "name": "stdout", @@ -585,29 +585,19 @@ ] } ], - "execution_count": 12 + "source": [ + "%pip install -qU rapidocr-onnxruntime" + ] }, { + "cell_type": "code", + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:08:46.036783Z", "start_time": "2025-01-17T11:08:22.713011Z" } }, - 
"cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=RapidOCRBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stdout", @@ -684,7 +674,19 @@ ] } ], - "execution_count": 14 + "source": [ + "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=RapidOCRBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -702,15 +704,13 @@ }, { "cell_type": "code", + "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:08:53.698734Z", "start_time": "2025-01-17T11:08:52.248547Z" } }, - "source": [ - "%pip install -qU pytesseract" - ], "outputs": [ { "name": "stdout", @@ -720,28 +720,19 @@ ] } ], - "execution_count": 15 + "source": [ + "%pip install -qU pytesseract" + ] }, { + "cell_type": "code", + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:03.699153Z", "start_time": "2025-01-17T11:08:55.660127Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"html-img\",\n", - " images_parser=TesseractBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stdout", @@ -818,7 +809,18 @@ ] } ], - "execution_count": 16 + "source": [ + "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"html-img\",\n", + " images_parser=TesseractBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -829,15 +831,13 @@ }, { "cell_type": "code", + "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:08.637429Z", "start_time": "2025-01-17T11:09:07.177157Z" } }, - "source": [ - "%pip install -qU langchain_openai" - ], "outputs": [ { "name": "stdout", @@ -847,23 +847,19 @@ ] } ], - "execution_count": 17 + "source": [ + "%pip install -qU langchain_openai" + ] }, { "cell_type": "code", + "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:09.670266Z", "start_time": "2025-01-17T11:09:09.634422Z" } }, - "source": [ - "import os\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ], "outputs": [ { "data": { @@ -876,47 +872,40 @@ "output_type": "execute_result" } ], - "execution_count": 18 + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] }, { "cell_type": "code", + "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:11.652399Z", "start_time": "2025-01-17T11:09:11.649497Z" } }, + "outputs": [], "source": [ "from getpass import getpass\n", "\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", " 
os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")" - ], - "outputs": [], - "execution_count": 19 + ] }, { + "cell_type": "code", + "execution_count": 43, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:10:15.732342Z", + "end_time": "2025-01-17T12:46:33.398682Z", "start_time": "2025-01-17T11:09:14.102369Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import LLMImageBlobParser\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=LLMImageBlobParser(\n", - " model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stdout", @@ -1016,11 +1005,102 @@ "\n", "..., textblock2, layout1 \\\\]\n", "\n", + "A list of the layout elements](#)\n", + "6\n", + "Z. Shen et al.\n", + "Fig. 2: The relationship between the three types of layout data structures.\n", + "Coordinate supports three kinds of variation; TextBlock consists of the co-\n", + "ordinate information and extra features like block text, types, and reading orders;\n", + "a Layout object is a list of all possible layout elements, including other Layout\n", + "objects. They all support the same set of transformation and operation APIs for\n", + "maximum flexibility.\n", + "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained\n", + "on 5 different datasets. Description of the training dataset is provided alongside\n", + "with the trained models such that users can quickly identify the most suitable\n", + "models for their tasks. Additionally, when such a model is not readily available,\n", + "LayoutParser also supports training customized layout models and community\n", + "sharing of the models (detailed in Section 3.5).\n", + "3.2\n", + "Layout Data Structures\n", + "A critical feature of LayoutParser is the implementation of a series of data\n", + "structures and operations that can be used to efficiently process and manipulate\n", + "the layout elements. In document image analysis pipelines, various post-processing\n", + "on the layout analysis model outputs is usually required to obtain the final\n", + "outputs. Traditionally, this requires exporting DL model outputs and then loading\n", + "the results into other pipelines. All model outputs from LayoutParser will be\n", + "stored in carefully engineered data types optimized for further processing, which\n", + "makes it possible to build an end-to-end document digitization pipeline within\n", + "LayoutParser. There are three key components in the data structure, namely\n", + "the Coordinate system, the TextBlock, and the Layout. They provide different\n", + "levels of abstraction for the layout data, and a set of APIs are supported for\n", + "transformations or operations on these classes.\n", + "\n", + "\n", + "\n", + "\n", + "![**Image Summary for Retrieval:**\n", + "\n", + "Diagram showing a layout model with components such as coordinate (x-interval, y-interval, rectangle, quadrilateral), textblock (coordinate, extra features like block text, type, reading order), and layout list containing layout elements. Includes transformation and operation API references. 
\n", + "\n", + "**Extracted Text:**\n", + "\n", + "Coordinate\n", + "\n", + "Coordinate\n", + "\n", + "start start\n", + "x-interval\n", + "end\n", + "end\n", + "\n", + "y-interval\n", + "\n", + "Rectangle\n", + "(x1, y2) \n", + "\n", + "(x1, y1) (x2, y2) \n", + "(x2, y2)\n", + "\n", + "Quadrilateral\n", + "(x1, y1) (x2, y2)\n", + "\n", + "(x4, y4)\n", + "\n", + "(x3, y3)\n", + "\n", + "The same transformation and operation APIs\n", + "\n", + "textblock\n", + "\n", + "Coordinate\n", + "\n", + "Extra features\n", + "\n", + "Block Text Block Type Reading Order …\n", + "\n", + "layout\n", + "\n", + "[ coordinate1, textblock1, ...\n", + "\n", + "…, textblock2, layout1 \\\\]\n", + "\n", "A list of the layout elements](#)\n" ] } ], - "execution_count": 20 + "source": [ + "from langchain_community.document_loaders.parsers import LLMImageBlobParser\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -1037,40 +1117,41 @@ ] }, { + "cell_type": "code", + "execution_count": 44, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:12:07.687810Z", - "start_time": "2025-01-17T11:12:06.352661Z" + "end_time": "2025-01-17T12:46:34.812794Z", + "start_time": "2025-01-17T12:46:33.475764Z" } }, - "cell_type": "code", - "source": [ - "from IPython.display import display, Markdown\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " extract_tables=\"markdown\",\n", - ")\n", - "docs = loader.load()\n", - "display(Markdown(docs[4].page_content))" - ], "outputs": [ { "data": { + "text/markdown": "LayoutParser: A Unified Toolkit for DL-Based DIA\n5\nTable 1: Current layout detection models in the LayoutParser model zoo\nDataset\nBase Model1 Large Model\nNotes\nPubLayNet [38]\nF / M\nM\nLayouts of modern scientific documents\nPRImA [3]\nM\n-\nLayouts of scanned modern magazines and scientific reports\nNewspaper [17]\nF\n-\nLayouts of scanned US newspapers from the 20th century\nTableBank [18]\nF\nF\nTable region on modern scientific and business document\nHJDataset [31]\nF / M\n-\nLayouts of history Japanese documents\n1 For each dataset, we train several models of different sizes for different needs (the trade-offbetween accuracy\nvs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\nbackbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\nR-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\nzoo in coming months.\nlayout data structures, which are optimized for efficiency and versatility. 3) When\nnecessary, users can employ existing or customized OCR models via the unified\nAPI provided in the OCR module. 4) LayoutParser comes with a set of utility\nfunctions for the visualization and storage of the layout data. 5) LayoutParser\nis also highly customizable, via its integration with functions for layout data\nannotation and model training. 
We now provide detailed descriptions for each\ncomponent.\n3.1\nLayout Detection Models\nIn LayoutParser, a layout model takes a document image as an input and\ngenerates a list of rectangular boxes for the target content regions. Different\nfrom traditional methods, it relies on deep convolutional neural networks rather\nthan manually curated rules to identify content regions. It is formulated as an\nobject detection problem and state-of-the-art models like Faster R-CNN [28] and\nMask R-CNN [12] are used. This yields prediction results of high accuracy and\nmakes it possible to build a concise, generalized interface for layout detection.\nLayoutParser, built upon Detectron2 [35], provides a minimal API that can\nperform layout detection with only four lines of code in Python:\n1 import\nlayoutparser as lp\n2 image = cv2.imread(\"image_file\") # load\nimages\n3 model = lp. Detectron2LayoutModel (\n4\n\"lp:// PubLayNet/ faster_rcnn_R_50_FPN_3x /config\")\n5 layout = model.detect(image)\nLayoutParser provides a wealth of pre-trained model weights using various\ndatasets covering different languages, time periods, and document types. Due to\ndomain shift [7], the prediction performance can notably drop when models are ap-\nplied to target samples that are significantly different from the training dataset. As\ndocument structures and layouts vary greatly in different domains, it is important\nto select models trained on a dataset similar to the test samples. A semantic syntax\nis used for initializing the model weights in LayoutParser, using both the dataset\nname and model name lp:///.\n\n\n|Dataset|Base Model1|Large Model|Notes|\n|---|---|---|---|\n|PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]|F / M M F F F / M|M - - F -|Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents|", "text/plain": [ "" - ], - "text/markdown": "LayoutParser: A Unified Toolkit for DL-Based DIA\n5\nTable 1: Current layout detection models in the LayoutParser model zoo\nDataset\nBase Model1 Large Model\nNotes\nPubLayNet [38]\nF / M\nM\nLayouts of modern scientific documents\nPRImA [3]\nM\n-\nLayouts of scanned modern magazines and scientific reports\nNewspaper [17]\nF\n-\nLayouts of scanned US newspapers from the 20th century\nTableBank [18]\nF\nF\nTable region on modern scientific and business document\nHJDataset [31]\nF / M\n-\nLayouts of history Japanese documents\n1 For each dataset, we train several models of different sizes for different needs (the trade-offbetween accuracy\nvs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\nbackbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\nR-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\nzoo in coming months.\nlayout data structures, which are optimized for efficiency and versatility. 3) When\nnecessary, users can employ existing or customized OCR models via the unified\nAPI provided in the OCR module. 4) LayoutParser comes with a set of utility\nfunctions for the visualization and storage of the layout data. 
5) LayoutParser\nis also highly customizable, via its integration with functions for layout data\nannotation and model training. We now provide detailed descriptions for each\ncomponent.\n3.1\nLayout Detection Models\nIn LayoutParser, a layout model takes a document image as an input and\ngenerates a list of rectangular boxes for the target content regions. Different\nfrom traditional methods, it relies on deep convolutional neural networks rather\nthan manually curated rules to identify content regions. It is formulated as an\nobject detection problem and state-of-the-art models like Faster R-CNN [28] and\nMask R-CNN [12] are used. This yields prediction results of high accuracy and\nmakes it possible to build a concise, generalized interface for layout detection.\nLayoutParser, built upon Detectron2 [35], provides a minimal API that can\nperform layout detection with only four lines of code in Python:\n1 import\nlayoutparser as lp\n2 image = cv2.imread(\"image_file\") # load\nimages\n3 model = lp. Detectron2LayoutModel (\n4\n\"lp:// PubLayNet/ faster_rcnn_R_50_FPN_3x /config\")\n5 layout = model.detect(image)\nLayoutParser provides a wealth of pre-trained model weights using various\ndatasets covering different languages, time periods, and document types. Due to\ndomain shift [7], the prediction performance can notably drop when models are ap-\nplied to target samples that are significantly different from the training dataset. As\ndocument structures and layouts vary greatly in different domains, it is important\nto select models trained on a dataset similar to the test samples. A semantic syntax\nis used for initializing the model weights in LayoutParser, using both the dataset\nname and model name lp:///.\n\n\n|Dataset|Base Model1|Large Model|Notes|\n|---|---|---|---|\n|PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]|F / M M F F F / M|M - - F -|Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents|" + ] }, "metadata": {}, "output_type": "display_data" } ], - "execution_count": 24 + "source": [ + "from IPython.display import Markdown, display\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " extract_tables=\"markdown\",\n", + ")\n", + "docs = loader.load()\n", + "display(Markdown(docs[4].page_content))" + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Working with Files\n", "\n", @@ -1081,29 +1162,14 @@ ] }, { + "cell_type": "code", + "execution_count": 45, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:12:26.844599Z", - "start_time": "2025-01-17T11:12:26.789346Z" + "end_time": "2025-01-17T12:46:34.866868Z", + "start_time": "2025-01-17T12:46:34.819048Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders import FileSystemBlobLoader\n", - "from langchain_community.document_loaders.generic import GenericLoader\n", - "from langchain_community.document_loaders.parsers import PyMuPDFParser\n", - "\n", - "loader = GenericLoader(\n", - " blob_loader=FileSystemBlobLoader(\n", - " path=\"./example_data/\",\n", - " glob=\"*.pdf\",\n", - " ),\n", - " blob_parser=PyMuPDFParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content)\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -1168,7 
+1234,22 @@
      ]
     }
    ],
-   "execution_count": 25
+   "source": [
+    "from langchain_community.document_loaders import FileSystemBlobLoader\n",
+    "from langchain_community.document_loaders.generic import GenericLoader\n",
+    "from langchain_community.document_loaders.parsers import PyMuPDFParser\n",
+    "\n",
+    "loader = GenericLoader(\n",
+    "    blob_loader=FileSystemBlobLoader(\n",
+    "        path=\"./example_data/\",\n",
+    "        glob=\"*.pdf\",\n",
+    "    ),\n",
+    "    blob_parser=PyMuPDFParser(),\n",
+    ")\n",
+    "docs = loader.load()\n",
+    "print(docs[0].page_content)\n",
+    "pprint.pp(docs[0].metadata)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1177,7 +1258,9 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from langchain_community.document_loaders import CloudBlobLoader\n",
     "from langchain_community.document_loaders.generic import GenericLoader\n",
@@ -1192,9 +1275,7 @@
     "docs = loader.load()\n",
     "print(docs[0].page_content)\n",
     "pprint.pp(docs[0].metadata)"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 4eb493cf41e935..254849df802738 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -466,7 +466,7 @@ class PyMuPDFParser(BaseBlobParser):
             parser = PyMuPDFParser(
                 # password = None,
                 mode = "single",
-                pages_delimitor = "\n\f",
+                pages_delimiter = "\n\f",
                 # extract_images = True,
                 # images_parser = TesseractBlobParser(),
                 # extract_tables="markdown",
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index bd98d72db922d3..3c5f2ca9b6357b 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -495,7 +495,7 @@ def __init__(
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
         images_parser: Optional[BaseImageBlobParser] = None,
-        images_inner_format:str="text",
+        images_inner_format: str = "text",
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         headers: Optional[dict] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index ee0fe365885bb5..44cc8294643f1b 100644
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -143,13 +143,31 @@ def _analyze_image(self, img: "Image") -> str:
 
 
 @pytest.mark.parametrize(
-    "mode",
-    ["single", "page"],
+    "mode,image_parser",
+    [("single", EmptyImageBlobParser()), ("page", None)],
 )
 @pytest.mark.parametrize(
-    "image_parser",
-    [EmptyImageBlobParser(), None],
+    "parser_factory,params",
+    [
+        ("PyMuPDFParser", {}),
+    ],
 )
+@pytest.mark.requires("pillow")
+def test_mode_and_extract_images_variations(
+    parser_factory: str,
+    params: dict,
+    mode: str,
+    image_parser: BaseImageBlobParser,
+) -> None:
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format="text",
+    )
+
+
 @pytest.mark.parametrize(
     "images_inner_format",
     ["text", "markdown-img", "html-img"],
@@ -161,7 +179,24 @@ def _analyze_image(self, img: "Image") -> str:
     ],
 )
 @pytest.mark.requires("pillow")
-def test_mode_and_extract_images_variations(
+def test_mode_and_image_formats_variations(
+    parser_factory: str,
+    params: dict,
+    images_inner_format: str,
+) -> None:
+    mode = "single"
+    image_parser = EmptyImageBlobParser()
+
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format,
+    )
+
+
+def _test_matrix(
     parser_factory: str,
     params: dict,
     mode: str,
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index a681dce8c59c01..7eae7ef710d429 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -226,7 +226,16 @@ def test_standard_parameters(
     assert len(docs) == 1
 
     file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = loader_class(file_path, mode="page")
+    loader = loader_class(
+        file_path,
+        mode="page",
+        page_delimiter="---",
+        images_parser=None,
+        images_inner_format="text",
+        password=None,
+        extract_tables=None,
+        extract_tables_settings=None,
+    )
     docs = loader.load()
     assert len(docs) == 16
     assert loader.web_path is None