microsoft · tungsten106 · Dec 19, 2024 · Dec 23, 2024 · Dec 23, 2024 · Dec 24, 2024
diff --git a/.gitignore b/.gitignore
@@ -52,6 +52,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+tests/out
 
 # Translations
 *.mo

diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
   "pandas",
   "openpyxl",
   "pdfminer.six",
+  "pymupdf4llm",
   "puremagic",
   "pydub",
   "youtube-transcript-api",

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
@@ -57,15 +57,26 @@ def main():
         "--output",
         help="Output file name. If not provided, output is written to stdout.",
     )
+    # Optional engine name; forwarded to the converters as the `engine` keyword argument.
+    parser.add_argument(
+        "-e",
+        "--engine",
+        help="Engine name for converters. If not provided will use default.",
+    )
+
     args = parser.parse_args()
+
+    # Forward `engine` only when it was given on the command line, so converters
+    # otherwise fall back to their own default engine.
+    kwargs = {"engine": args.engine} if args.engine else {}
 
     if args.filename is None:
         markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
+        result = markitdown.convert_stream(sys.stdin.buffer, **kwargs)
         _handle_output(args, result)
     else:
         markitdown = MarkItDown()
-        result = markitdown.convert(args.filename)
+        result = markitdown.convert(args.filename, **kwargs)
         _handle_output(args, result)
 
 

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -14,7 +14,7 @@
 import traceback
 import zipfile
 from xml.dom import minidom
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal, Mapping
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
@@ -24,6 +24,7 @@
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
+import pymupdf4llm
 import pptx
 
 # File-format detection
@@ -676,19 +677,63 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
 
 class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+
+    The extraction backend is chosen per call through the `engine` argument of `convert`;
+    the supported engine names are the keys of `_engines`.
     """
+
+    # Maps each supported engine name to the callable that extracts text from a local PDF path.
+    _engines: Mapping[str, Any] = {
+        "pdfminer": pdfminer.high_level.extract_text,
+        "pymupdf4llm": pymupdf4llm.to_markdown,
+    }
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
+        self,
+        local_path,
+        engine: Optional[Literal["pdfminer", "pymupdf4llm"]] = "pdfminer",
+        engine_kwargs: Optional[Mapping[str, Any]] = None,
+        **kwargs,
+    ) -> Union[None, DocumentConverterResult]:
+        """
+        Convert a local PDF file to text with the chosen engine.
+
+        Args:
+            local_path: Path of the PDF file to convert.
+            engine: Name of the extraction engine; None selects "pdfminer".
+            engine_kwargs: Extra keyword arguments forwarded to the engine callable.
+            **kwargs: Converter options; only "file_extension" is read here.
+
+        Returns:
+            None if "file_extension" is not ".pdf"; otherwise a DocumentConverterResult
+            with no title and the engine's output as text_content.
+
+        Raises:
+            FileConversionException: If engine is not a key of _engines.
+
+        Example:
+        >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
+        >>> markitdown.convert(source, engine="pymupdf4llm")
+        """
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
-        )
+        # None means "no preference": use the default engine instead of failing on the lookup.
+        if engine is None:
+            engine = "pdfminer"
+        if engine not in self._engines:
+            raise FileConversionException(
+                "'engine' not valid for {} files. Please choose between {}.".format(
+                    extension, list(self._engines.keys())
+                )
+            )
+
+        # Build a fresh dict per call so no mutable default is shared between calls.
+        text_content = self._engines[engine](local_path, **(engine_kwargs or {}))
+        return DocumentConverterResult(title=None, text_content=text_content)
 
 
 class DocxConverter(HtmlConverter):

diff --git a/tests/test_files/2308.08155v2.pdf b/tests/test_files/2308.08155v2.pdf
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -7,7 +7,6 @@
 import requests
 
 from warnings import catch_warnings, resetwarnings
-
 from markitdown import MarkItDown
 
 skip_remote = (
@@ -299,6 +298,42 @@ def test_markitdown_llm() -> None:
     for test_string in ["red", "circle", "blue", "square"]:
         assert test_string in result.text_content.lower()
 
+
+def test_markitdown_pdf() -> None:
+    """Check each PDF engine against the local sample PDF (PDF_TEST_URL may also work)."""
+    markitdown = MarkItDown()
+    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")
+
+    # By pymupdf4llm
+    result = markitdown.convert(
+        pdf_path,
+        engine="pymupdf4llm",
+        engine_kwargs={"show_progress": False, "pages": range(10)},  # additional kwargs
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By pymupdf4llm and extract images
+    result = markitdown.convert(
+        pdf_path,
+        engine="pymupdf4llm",
+        engine_kwargs={
+            "show_progress": False,
+            "write_images": True,
+            "image_path": "tests/out",
+            "pages": range(10),
+        },  # `write_images` must be True, setting `image_path` for images saving dir.
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By pdfminer; the keyword must be spelled `engine_kwargs` to reach the engine.
+    result = markitdown.convert(
+        pdf_path, engine="pdfminer", engine_kwargs={"page_numbers": range(10)}
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
@@ -307,3 +342,4 @@ def test_markitdown_llm() -> None:
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_pdf()