Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring PDF loaders: 02 PyMuPDF #29063

Open
wants to merge 46 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 41 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
21759e2
Prepare the integration of new versions of PDFLoader.
pprados Jan 2, 2025
4607354
Fix Line too long
pprados Jan 7, 2025
668dc9c
Fix Line too long
pprados Jan 7, 2025
7a5b5c5
Fix Line too long
pprados Jan 7, 2025
6340ded
Fix Line too long
pprados Jan 7, 2025
4845781
Update PyMuPDF
pprados Jan 2, 2025
3beda82
Fix tu
pprados Jan 7, 2025
743a83e
Fix review - step 1
pprados Jan 9, 2025
b623750
Fix all remarques
pprados Jan 10, 2025
20f5a41
Merge remote-tracking branch 'upstream/master' into pprados/02-pymupdf
pprados Jan 10, 2025
91234f0
Fix remarques
pprados Jan 10, 2025
80ee3f7
Fix Images
pprados Jan 13, 2025
66f97cf
Merge remote-tracking branch 'upstream/master' into pprados/02-pymupdf
pprados Jan 13, 2025
0e6c904
Fix Images
pprados Jan 13, 2025
9b45bd8
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 13, 2025
acf4358
Fix deprecated load() with kwargs
pprados Jan 14, 2025
d7d3021
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 14, 2025
4762fab
Change the format for images parser
pprados Jan 14, 2025
6121005
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 14, 2025
5910f99
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 14, 2025
7fc01f3
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 15, 2025
0f654a1
Add format "html" and "markdown" for LLMImageBlobParser
pprados Jan 15, 2025
e4f36ed
Merge remote-tracking branch 'origin/pprados/02-pymupdf' into pprados…
pprados Jan 15, 2025
4a62529
Fix
pprados Jan 15, 2025
1c78325
Bugfix
pprados Jan 15, 2025
1227dbb
Bugfix
pprados Jan 15, 2025
90085e4
Bug fix
pprados Jan 15, 2025
14264e9
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 15, 2025
feacf69
Replace markdown-link to markdown-img
pprados Jan 16, 2025
c074729
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 16, 2025
ee4784d
Update PyMuPDF
pprados Jan 16, 2025
5d4a256
Add default value for properties
pprados Jan 16, 2025
3d15d39
Fix one bug, update some typos, and style doc strings while reading
eyurtsev Jan 16, 2025
0be6c88
Change the strategy for images_inner_format.
pprados Jan 17, 2025
d104ee7
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 17, 2025
023ba11
Fix PIL dependencies
pprados Jan 17, 2025
23a73a9
Fix notebook
pprados Jan 17, 2025
a4587f0
Optimise tests
pprados Jan 17, 2025
d332958
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 17, 2025
4b37b34
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 17, 2025
2281d05
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 17, 2025
0da73f1
Remove Image.__init__
pprados Jan 18, 2025
d012d60
Merge remote-tracking branch 'origin/pprados/02-pymupdf' into pprados…
pprados Jan 18, 2025
882c90d
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 18, 2025
74d3617
Remove Image.__init__
pprados Jan 18, 2025
318f304
Merge branch 'master' into pprados/02-pymupdf
pprados Jan 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,188 changes: 1,154 additions & 34 deletions docs/docs/integrations/document_loaders/pymupdf.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
pdfminer-six>=20221105,<20240706
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from langchain_community.document_loaders.parsers.html import (
BS4HTMLParser,
)
from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
LLMImageBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.language import (
LanguageParser,
)
Expand All @@ -35,15 +41,19 @@
# Maps each exported parser class name to the dotted path of the module it
# lives in. NOTE(review): presumably consumed by the module-level
# `__getattr__` for on-demand imports — its body is not visible here; confirm.
_module_lookup = {
"AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501
"BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
"BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
"LanguageParser": "langchain_community.document_loaders.parsers.language",
"LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
}

Expand All @@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:

# Public names of this package: the parser classes made available to
# `from ... import *` and advertised as the supported API surface.
__all__ = [
"AzureAIDocumentIntelligenceParser",
"BaseImageBlobParser",
"BS4HTMLParser",
"DocAIParser",
"GrobidParser",
"LanguageParser",
"LLMImageBlobParser",
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser",
"PyPDFParser",
"PyPDFium2Parser",
"RapidOCRBlobParser",
"TesseractBlobParser",
"VsdxParser",
]
239 changes: 239 additions & 0 deletions libs/community/langchain_community/document_loaders/parsers/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
import base64
pprados marked this conversation as resolved.
Show resolved Hide resolved
import io
import logging
from abc import abstractmethod
from typing import TYPE_CHECKING, Iterable, Iterator, Literal, Union

import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
from PIL.Image import Image

logger = logging.getLogger(__name__)


class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement `_analyze_image`; `lazy_parse` decodes the blob into
    a PIL image, delegates to `_analyze_image`, and wraps the returned text in
    a `Document`.
    """

    def __init__(
        self,
        *,
        format: Union[Literal["text", "markdown-img", "html-img"], str] = "text",
    ) -> None:
        """Initialize the BaseImageBlobParser.

        Args:
            format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown
                  link, w/ link pointing to `#` (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an
                  `<img>` tag and link to `#` (`<img alt="{body}" src="#"/>`)
                - or other formats if the parser supports it

        NOTE(review): this class only stores ``format``; `lazy_parse` below
        yields the analyzed text unchanged, so any wrapping must be done by
        the subclass or the caller — confirm the intended owner.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a blob and yield a Document with the parsed content.

        Args:
            blob: The blob to be parsed. A blob whose mimetype is
                "application/x-npy" is loaded with `numpy.load` and converted
                to an image; any other blob is opened directly with Pillow.

        Yields:
            A single document whose content is the text returned by
            `_analyze_image` and whose metadata is the blob metadata plus a
            "source" entry set to `blob.source`.

        Raises:
            ImportError: If Pillow is not installed.
        """
        # Only the Pillow import is guarded. Wrapping the whole body would
        # also catch ImportErrors raised by `_analyze_image` (e.g. a missing
        # OCR backend) and misreport them as a missing Pillow install.
        try:
            from PIL import Image as Img
        except ImportError:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            )

        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            content = self._analyze_image(img)
            # Escape newlines so the extracted text stays on one log line.
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={**blob.metadata, "source": blob.source},
            )


class RapidOCRBlobParser(BaseImageBlobParser):
    """Image parser that extracts text with the RapidOCR library.

    Attributes:
        ocr:
            The RapidOCR engine; `None` until the first image is analyzed.
    """

    def __init__(
        self,
    ) -> None:
        """Create the parser without loading the OCR engine yet."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        The engine is created on first use and reused afterwards.

        Args:
            img (Image):
                The image to be analyzed.

        Returns:
            str:
                The second element of every OCR result entry, joined with
                newlines and stripped; an empty string when the engine
                returns no result.

        Raises:
            ImportError: If `rapidocr-onnxruntime` cannot be imported.
        """
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        lines = [entry[1] for entry in detections]
        return "\n".join(lines).strip()


class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library."""

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ) -> None:
        """Initialize the TesseractBlobParser.

        Args:
            langs: The languages to use for OCR. A single language may also be
                given as a plain string (e.g. ``"eng"``).
        """
        super().__init__()
        # A bare string is itself an iterable of characters: without this
        # guard `langs="eng"` would become ["e", "n", "g"] and be sent to
        # Tesseract as "e+n+g".
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            The text returned by `pytesseract.image_to_string`, stripped of
            leading and trailing whitespace.

        Raises:
            ImportError: If `pytesseract` is not installed.
        """
        try:
            import pytesseract
        except ImportError:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            )
        # The configured languages are joined with "+" into one `lang` value.
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()


# Default instruction sent to the chat model by LLMImageBlobParser: asks for a
# retrieval-oriented summary plus the full text of the image, as markdown.
# LLMImageBlobParser calls `str.format` on its prompt, so any literal `{` or
# `}` added here would have to be doubled.
_PROMPT_IMAGES_TO_DESCRIPTION: str = (
"You are an assistant tasked with summarizing images for retrieval. "
"1. These summaries will be embedded and used to retrieve the raw image. "
"Give a concise summary of the image that is well optimized for retrieval\n"
"2. extract all the text from the image. "
"Do not exclude any content from the page.\n"
"Format answer in markdown without explanatory text "
"and without markdown delimiter ``` at the beginning. "
)


class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model (BaseChatModel):
            The language model to use for analysis.
        prompt (str):
            The prompt to provide to the language model.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ) -> None:
        """Initialize the LLMImageBlobParser.

        Args:
            model: The language model to use for analysis.
            prompt: The prompt to provide to the language model. It is passed
                through `str.format` with a `format` keyword, so it may
                contain a `{format}` placeholder and any other literal braces
                must be doubled.
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is re-encoded as PNG, base64-encoded, and sent to the model
        as a data URL together with the prompt in a single human message.

        Args:
            img: The image to be analyzed.

        Returns:
            The textual content of the model's reply.

        Raises:
            TypeError: If the model's reply content is not a plain string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # Use the parser's configured output format, not
                            # the `format` builtin function.
                            "text": self.prompt.format(format=self.format),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The bytes above are PNG, so the declared
                                # media type must be image/png as well.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Explicit check instead of `assert`, which is stripped under -O.
        if not isinstance(result, str):
            raise TypeError(
                "Expected the model to return text content, "
                f"got {type(result).__name__}"
            )
        return result
Loading
Loading