diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..b397685 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1344,7 +1344,7 @@ def convert_stream( result = None try: # Write to the temporary file - content = stream.read() + content = self._strip_leading_blanks(stream.read()) if isinstance(content, str): fh.write(content.encode("utf-8")) else: @@ -1367,6 +1367,10 @@ def convert_stream( return result + def _strip_leading_blanks(self, content: bytes) -> bytes: + """Helper function to strip leading blank characters or line breaks from content.""" + return content.lstrip() + def convert_url( self, url: str, **kwargs: Any ) -> DocumentConverterResult: # TODO: fix kwargs type diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..27a1160 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -300,6 +300,15 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +def test_markitdown_strip_leading_blanks() -> None: + markitdown = MarkItDown() + + # Test input with leading blank characters + input_data = b" \n\n\n

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "

Test

" in result.text_content + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -307,3 +316,4 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool() test_markitdown_deprecation() test_markitdown_llm() + test_markitdown_strip_leading_blanks()