Commit
Fix one bug, update some typos, and style doc strings while reading
eyurtsev committed Jan 16, 2025
1 parent 5d4a256 commit 3d15d39
Showing 3 changed files with 50 additions and 76 deletions.
65 changes: 20 additions & 45 deletions libs/community/langchain_community/document_loaders/parsers/images.py
@@ -22,21 +22,14 @@


class BaseImageBlobParser(BaseBlobParser):
"""
Abstract base class for parsing image blobs into text.
Attributes:
format (Literal["text", "markdown-img", "html-img"]):
Output format of the parsed text.
"""
"""Abstract base class for parsing image blobs into text."""

def __init__(
self,
*,
format: Union[Literal["text", "markdown-img", "html-img"], str] = "text",
):
"""
Initializes the BaseImageBlobParser.
) -> None:
"""Initializes the BaseImageBlobParser.
Args:
format (Literal["text", "markdown-img", "html-img"]|str):
@@ -52,28 +45,21 @@ def __init__(

@abstractmethod
def _analyze_image(self, img: "Image", format: str) -> str:
"""
Abstract method to analyze an image and extract textual content.
"""Abstract method to analyze an image and extract textual content.
Args:
img (Image):
The image to be analyzed.
format (str):
The format to use if it's possible
img: The image to be analyzed.
format: The format to use, if possible.
Returns:
str:
The extracted text content.
The extracted text content.
"""
pass
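
For context on the contract this abstract class defines: a subclass only has to supply `_analyze_image`, while `lazy_parse` takes care of opening the blob and wrapping the result in a `Document`. A minimal hypothetical subclass (illustrative sketch, not part of this commit; the import path follows the repository layout):

```python
from PIL.Image import Image

from langchain_community.document_loaders.parsers.images import BaseImageBlobParser


class ImageSizeBlobParser(BaseImageBlobParser):
    """Hypothetical parser that merely reports the image dimensions."""

    def _analyze_image(self, img: Image, format: str) -> str:
        # A real implementation would run OCR or a vision model here.
        return f"image of size {img.width}x{img.height}"
```
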

def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""
Lazily parses a blob and yields Document objects containing the parsed content.
"""Lazily parse a blob and yields Documents containing the parsed content.
Args:
blob (Blob):
The blob to be parsed.
blob (Blob): The blob to be parsed.
Yields:
Document:
@@ -116,8 +102,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:


class RapidOCRBlobParser(BaseImageBlobParser):
"""
Parser for extracting text from images using the RapidOCR library.
"""Parser for extracting text from images using the RapidOCR library.
Attributes:
ocr:
@@ -183,8 +168,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:
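
A sketch of how this parser is typically driven through the base class's `lazy_parse` (assumes the `rapidocr-onnxruntime` dependency is installed; the file path is a placeholder):

```python
from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser

parser = RapidOCRBlobParser()  # format defaults to "text" per the base class
blob = Blob.from_path("./sample.png")  # placeholder path
for doc in parser.lazy_parse(blob):
    print(doc.page_content)
```
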


class TesseractBlobParser(BaseImageBlobParser):
"""
Parser for extracting text from images using the Tesseract OCR library.
"""Parse for extracting text from images using the Tesseract OCR library.
Attributes:
format (Literal["text", "markdown-img", "html-img"]):
@@ -204,8 +188,7 @@ def __init__(
format: Literal["text", "markdown-img", "html-img"] = "text",
langs: Iterable[str] = ("eng",),
):
"""
Initializes the TesseractBlobParser.
"""Initialize the TesseractBlobParser.
Args:
format (Literal["text", "markdown-img", "html-img"]):
@@ -222,14 +205,11 @@ def __init__(
self.langs = list(langs)

def _analyze_image(self, img: "Image", format: str) -> str:
"""
Analyzes an image and extracts text using Tesseract OCR.
"""Analyze an image and extracts text using Tesseract OCR.
Args:
img (Image):
The image to be analyzed.
format (str):
The format to use if it's possible
img: The image to be analyzed.
format: The format to use, if possible.
Returns:
str: The extracted text content.
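
Usage follows the same pattern; a sketch assuming the `pytesseract` package and a local Tesseract installation (path and language tuple are placeholders; `format` and `langs` are the parameters shown above):

```python
from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import TesseractBlobParser

parser = TesseractBlobParser(format="text", langs=("eng",))
for doc in parser.lazy_parse(Blob.from_path("./scan.png")):  # placeholder path
    print(doc.page_content)
```
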
@@ -257,8 +237,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:


class LLMImageBlobParser(BaseImageBlobParser):
"""
Parser for analyzing images using a language model (LLM).
"""Parser for analyzing images using a language model (LLM).
Attributes:
format (Literal["text", "markdown-img", "html-img"]):
@@ -285,8 +264,7 @@ def __init__(
model: BaseChatModel,
prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
):
"""
Initializes the LLMImageBlobParser.
"""Initializes the LLMImageBlobParser.
Args:
format (Literal["text", "markdown", "html"]):
@@ -301,16 +279,13 @@ def __init__(
self.prompt = prompt

def _analyze_image(self, img: "Image", format: str) -> str:
"""
Analyzes an image using the provided language model.
"""Analyze an image using the provided language model.
Args:
img (Image):
The image to be analyzed.
img: The image to be analyzed.
Returns:
str: *
The extracted textual content.
The extracted textual content.
"""
image_bytes = io.BytesIO()
img.save(image_bytes, format="PNG")
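
A sketch of wiring this parser to a multimodal chat model. `ChatOpenAI` is only an assumption here; any `BaseChatModel` that accepts image content should work:

```python
from langchain_core.documents.base import Blob
from langchain_openai import ChatOpenAI  # assumed; any multimodal BaseChatModel

from langchain_community.document_loaders.parsers.images import LLMImageBlobParser

parser = LLMImageBlobParser(model=ChatOpenAI(model="gpt-4o-mini"))
for doc in parser.lazy_parse(Blob.from_path("./figure.png")):  # placeholder path
    print(doc.page_content)
```
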
36 changes: 20 additions & 16 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -93,19 +93,22 @@ def extract_from_images_with_rapidocr(
_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
_JOIN_IMAGES = "\n"
_JOIN_TABLES = "\n"
_DEFAULT_PAGE_DELIMITOR = "\n\f"
_DEFAULT_PAGES_DELIMITER = "\n\f"

_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Validates the presence of at least the following keys:
"""Validate that the metadata has all the standard keys and the page is an integer.
The standard keys are:
- source
- page (if mode='page')
- total_pages
- creationdate
- creator
- producer
Validate that page is an integer if it is present.
"""
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
raise ValueError("The PDF parser must populate the standard metadata.")
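
For reference, a metadata dict that would pass this check might look like the following (values are purely illustrative):

```python
# Illustrative only: all standard keys present, and "page" is an int.
metadata = {
    "source": "./example.pdf",
    "page": 0,  # only present when mode="page"
    "total_pages": 4,
    "creationdate": "2025-01-16T00:00:00",
    "creator": "LibreOffice",
    "producer": "LibreOffice",
}
```
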
@@ -142,7 +145,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
except ValueError:
new_metadata[k] = v
elif k in map_key:
# Normaliaze key with others PDF parser
# Normalize the key to match other PDF parsers
new_metadata[map_key[k]] = v
new_metadata[k] = v
elif isinstance(v, str):
@@ -152,7 +155,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
return new_metadata


_PARAGRAPH_DELIMITOR = [
_PARAGRAPH_DELIMITER = [
"\n\n\n",
"\n\n",
] # To insert images or table in the middle of the page.
@@ -174,7 +177,7 @@ def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]:
if extras:
for delim in _PARAGRAPH_DELIMITOR:
for delim in _PARAGRAPH_DELIMITER:
pos = text_from_page.rfind(delim)
if pos != -1:
# search penultimate, to bypass an error in footer
@@ -205,7 +208,7 @@ def _recurs_merge_text_and_extras(
all_extras = ""
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = _PARAGRAPH_DELIMITOR[-1] + str_extras
all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
all_text = text_from_page + all_extras

return all_text
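
The net effect of the recursion is to splice the extras in at the last paragraph break of the page text, so a paragraph that continues on the next page is not interrupted. A simplified, non-recursive sketch of that idea (not the actual helper, which additionally recurses to skip a trailing footer):

```python
def merge_text_and_extras_simplified(extras: list[str], text: str) -> str:
    """Insert extras at the last paragraph break, else append them."""
    str_extras = "\n\n".join(x for x in extras if x)
    if not str_extras:
        return text
    for delim in ("\n\n\n", "\n\n"):  # mirrors _PARAGRAPH_DELIMITER
        pos = text.rfind(delim)
        if pos != -1:
            return text[:pos] + delim + str_extras + text[pos:]
    return text + "\n\n" + str_extras
```
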
@@ -470,7 +473,7 @@ def __init__(
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
extract_tables_settings: Optional[dict[str, Any]] = None,
@@ -481,16 +484,14 @@
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimitor: A string delimiter to separate pages in single-mode
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
extract_tables: Whether to extract tables in a specific format, such as
"csv", "markdown", or "html".
extract_tables_settings: Optional dictionary of settings for customizing
table extraction.
**kwargs: Additional keyword arguments for customizing text extraction
behavior.
Returns:
This method does not directly return data. Use the `parse` or `lazy_parse`
@@ -508,7 +509,7 @@ def __init__(
raise ValueError("mode must be markdown")

self.mode = mode
self.pages_delimitor = pages_delimitor
self.pages_delimiter = pages_delimiter
self.password = password
self.text_kwargs = text_kwargs or {}
if extract_images and not images_parser:
@@ -526,14 +527,18 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
def _lazy_parse(
self,
blob: Blob,
text_kwargs: Optional[dict[str, Any]] = None, # deprectaed
# text_kwargs is present for backwards compatibility.
# Users should not use it directly.
text_kwargs: Optional[dict[str, Any]] = None,
) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
text_kwargs: Optional keyword arguments to pass to the `get_text` method.
If provided at run time, it will override the default text_kwargs.
Raises:
ImportError: If the `pypdf` package is not found.
@@ -544,8 +549,7 @@ def _lazy_parse(
try:
import pymupdf

if not text_kwargs:
text_kwargs = {}
text_kwargs = text_kwargs or self.text_kwargs
if not self.extract_tables_settings:
from pymupdf.table import (
DEFAULT_JOIN_TOLERANCE,
@@ -609,7 +613,7 @@ def _lazy_parse(

if self.mode == "single":
yield Document(
page_content=self.pages_delimitor.join(full_content),
page_content=self.pages_delimiter.join(full_content),
metadata=_validate_metadata(doc_metadata),
)
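
A usage sketch of the parser after the rename (assumes `pymupdf` is installed; the path is a placeholder):

```python
from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

parser = PyMuPDFParser(
    mode="single",
    pages_delimiter="\n\f",  # new spelling introduced by this commit
    extract_tables="markdown",
)
docs = list(parser.lazy_parse(Blob.from_path("./doc.pdf")))  # placeholder path
```
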

25 changes: 10 additions & 15 deletions libs/community/langchain_community/document_loaders/pdf.py
@@ -30,7 +30,7 @@
from langchain_community.document_loaders.dedoc import DedocBaseLoader
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
from langchain_community.document_loaders.parsers.pdf import (
_DEFAULT_PAGE_DELIMITOR,
_DEFAULT_PAGES_DELIMITER,
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
@@ -458,7 +458,7 @@ class PyMuPDFLoader(BasePDFLoader):
# headers = None
# password = None,
mode = "single",
pages_delimitor = "\n\f",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
# extract_tables = "markdown",
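
Spelled out as a runnable sketch (placeholder path; keyword names as in this commit):

```python
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(
    "./doc.pdf",  # placeholder path
    mode="single",
    pages_delimiter="\n\f",
)
docs = loader.load()
```
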
@@ -492,7 +492,7 @@ def __init__(
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_images: bool = False,
images_parser: Optional[BaseImageBlobParser] = None,
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
@@ -509,7 +509,7 @@ def __init__(
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimitor: A string delimiter to separate pages in single-mode
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
@@ -533,7 +533,7 @@ def __init__(
self.parser = PyMuPDFParser(
password=password,
mode=mode,
pages_delimitor=pages_delimitor,
pages_delimiter=pages_delimiter,
text_kwargs=kwargs,
extract_images=extract_images,
images_parser=images_parser,
@@ -862,8 +862,8 @@ def lazy_load(
) -> Iterator[Document]:
"""Lazy load documents"""
# the self.file_path is local, but the blob has to include
# the S3 location if the file originated from S3 for multi-page documents
# raises ValueError when multi-page and not on S3"""
# the S3 location if the file originated from S3 for multipage documents
# raises ValueError when multipage and not on S3"""

if self.web_path and self._is_s3_url(self.web_path):
blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc]
@@ -1059,7 +1059,7 @@ class ZeroxPDFLoader(BasePDFLoader):
"""Document loader utilizing Zerox library:
https://github.com/getomni-ai/zerox
Zerox converts PDF document to serties of images (page-wise) and
Zerox converts a PDF document to a series of images (page-wise) and
uses vision-capable LLM model to generate Markdown representation.
Zerox utilizes async operations. Therefore, when using this loader
@@ -1079,7 +1079,7 @@ def __init__(
) -> None:
super().__init__(file_path=file_path)
"""Initialize the parser with arguments to be passed to the zerox function.
Make sure to set necessary environmnet variables such as API key, endpoint, etc.
Make sure to set necessary environment variables such as API key, endpoint, etc.
Check zerox documentation for list of necessary environment variables for
any given model.
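
A usage sketch, assuming the `py-zerox` package is installed and the provider's API key environment variable is set (model name and path are placeholders):

```python
from langchain_community.document_loaders.pdf import ZeroxPDFLoader

loader = ZeroxPDFLoader("./doc.pdf", model="gpt-4o-mini")  # placeholders
for doc in loader.lazy_load():
    print(doc.page_content[:80])
```
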
@@ -1100,12 +1100,7 @@ def __init__(
self.model = model

def lazy_load(self) -> Iterator[Document]:
"""Loads documnts from pdf utilizing zerox library:
https://github.com/getomni-ai/zerox
Returns:
Iterator[Document]: An iterator over parsed Document instances.
"""
"""Lazily load pages."""
import asyncio

from pyzerox import zerox
