diff --git a/docs/docs/integrations/document_loaders/unstructured_file.ipynb b/docs/docs/integrations/document_loaders/unstructured_file.ipynb index 0dcf691c8b647..fa766385eca41 100644 --- a/docs/docs/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/docs/integrations/document_loaders/unstructured_file.ipynb @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "79d3e549", "metadata": {}, "outputs": [], @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "8da59ef8", "metadata": {}, "outputs": [ @@ -139,17 +139,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO: NumExpr defaulting to 12 threads.\n", "INFO: pikepdf C++ to Python logger bridge initialized\n" ] }, { "data": { "text/plain": [ - "Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')" + "Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -162,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "97f7aa1f", "metadata": {}, "outputs": [ @@ -170,7 +169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n" + "{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n" ] } ], @@ -188,17 +187,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "b05604d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')" + "Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -279,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "386eb63c", "metadata": {}, "outputs": [ @@ -299,7 +298,7 @@ "Document(metadata={'source': 'example_data/fake.docx', 'category_depth': 0, 'filename': 'fake.docx', 'languages': ['por', 'cat'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '56d531394823d81787d77a04462ed096'}, page_content='Lorem ipsum dolor sit amet.')" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -327,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "a3d7c846", "metadata": {}, "outputs": [ @@ -375,16 +374,22 @@ "### Unstructured SDK Client\n", "\n", "Partitioning with the Unstructured API relies on the [Unstructured SDK\n", - "Client](https://docs.unstructured.io/api-reference/api-services/sdk).\n", - "\n", - "Below is an example showing how you can customize some features of the client and use your own `requests.Session()`, pass in an alternative `server_url`, or customize the `RetryConfig` object for more control over how failed requests are handled.\n", + "Client](https://docs.unstructured.io/api-reference/api-services/accessing-unstructured-api).\n", "\n", - "Note that the example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs." + "If you want to customize the client, you will have to pass an `UnstructuredClient` instance to the `UnstructuredLoader`. Below is an example showing how you can customize features of the client such as using your own `requests.Session()`, passing an alternative `server_url`, and customizing the `RetryConfig` object. For more information about customizing the client or what additional parameters the sdk client accepts, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs and the client section of the [API Parameters](https://docs.unstructured.io/api-reference/api-services/api-parameters) docs. Note that all API Parameters should be passed to the `UnstructuredLoader`." + ] + }, + { + "cell_type": "markdown", + "id": "ebb69c85", + "metadata": {}, + "source": [ + "
Warning: The example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the Unstructured Python SDK docs.
" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "58e55264", "metadata": {}, "outputs": [ @@ -394,13 +399,15 @@ "text": [ "INFO: Preparing to split document for partition.\n", "INFO: Concurrency level set to 5\n", - "INFO: Splitting pages 1 to 16 (16 total)\n", - "INFO: Determined optimal split size of 4 pages.\n", - "INFO: Partitioning 4 files with 4 page(s) each.\n", - "INFO: Partitioning set #1 (pages 1-4).\n", - "INFO: Partitioning set #2 (pages 5-8).\n", - "INFO: Partitioning set #3 (pages 9-12).\n", - "INFO: Partitioning set #4 (pages 13-16).\n", + "INFO: Splitting pages 1 to 10 (10 total)\n", + "INFO: Determined optimal split size of 2 pages.\n", + "INFO: Partitioning 5 files with 2 page(s) each.\n", + "INFO: Partitioning set #1 (pages 1-2).\n", + "INFO: Partitioning set #2 (pages 3-4).\n", + "INFO: Partitioning set #3 (pages 5-6).\n", + "INFO: Partitioning set #4 (pages 7-8).\n", + "INFO: Partitioning set #5 (pages 9-10).\n", + "INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n", "INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n", "INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n", "INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n", @@ -408,6 +415,7 @@ "INFO: Successfully partitioned set #2, elements added to the final result.\n", "INFO: Successfully partitioned set #3, elements added to the final result.\n", "INFO: Successfully partitioned set #4, elements added to the final result.\n", + "INFO: Successfully partitioned set #5, elements added to the final result.\n", "INFO: Successfully partitioned the document.\n" ] }, @@ -429,8 +437,8 @@ " api_key_auth=os.getenv(\n", " \"UNSTRUCTURED_API_KEY\"\n", " ), # Note: the client API param is \"api_key_auth\" instead of \"api_key\"\n", - " client=requests.Session(),\n", - " server_url=\"https://api.unstructuredapp.io/general/v0/general\",\n", + " client=requests.Session(), # Define your own requests session\n", + " server_url=\"https://api.unstructuredapp.io/general/v0/general\", # Define your own api url\n", " retry_config=RetryConfig(\n", " strategy=\"backoff\",\n", " retry_connection_errors=True,\n", @@ -440,13 +448,15 @@ " exponent=1.5,\n", " max_elapsed_time=900000,\n", " ),\n", - " ),\n", + " ), # Define your own retry config\n", ")\n", "\n", "loader = UnstructuredLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", " partition_via_api=True,\n", " client=client,\n", + " split_pdf_page=True,\n", + " split_pdf_page_range=[1, 10],\n", ")\n", "\n", "docs = loader.load()\n", @@ -479,17 +489,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "e9f1c20d", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: Partitioning locally even though api_key is defined since partition_via_api=False.\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -542,7 +545,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/libs/partners/unstructured/Makefile b/libs/partners/unstructured/Makefile index 5c4e2e7692f1e..1b03b8e76f303 100644 --- a/libs/partners/unstructured/Makefile +++ b/libs/partners/unstructured/Makefile @@ -34,14 +34,14 @@ lint_tests: PYTHON_FILES=tests lint_tests: MYPY_CACHE=.mypy_cache_test lint lint_diff lint_package lint_tests: - poetry run ruff . + poetry run ruff check . poetry run ruff format $(PYTHON_FILES) --diff - poetry run ruff --select I $(PYTHON_FILES) + poetry run ruff check --select I $(PYTHON_FILES) mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE) format format_diff: poetry run ruff format $(PYTHON_FILES) - poetry run ruff --select I --fix $(PYTHON_FILES) + poetry run ruff check --select I --fix $(PYTHON_FILES) spell_check: poetry run codespell --toml pyproject.toml diff --git a/libs/partners/unstructured/poetry.lock b/libs/partners/unstructured/poetry.lock index 7655416222147..09e12dd9a135c 100644 --- a/libs/partners/unstructured/poetry.lock +++ b/libs/partners/unstructured/poetry.lock @@ -3465,28 +3465,29 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.1.15" +version = "0.5.7" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5fe8d54df166ecc24106db7dd6a68d44852d14eb0729ea4672bb4d96c320b7df"}, - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f0bfbb53c4b4de117ac4d6ddfd33aa5fc31beeaa21d23c45c6dd249faf9126f"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d432aec35bfc0d800d4f70eba26e23a352386be3a6cf157083d18f6f5881c8"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9405fa9ac0e97f35aaddf185a1be194a589424b8713e3b97b762336ec79ff807"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66ec24fe36841636e814b8f90f572a8c0cb0e54d8b5c2d0e300d28a0d7bffec"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6f8ad828f01e8dd32cc58bc28375150171d198491fc901f6f98d2a39ba8e3ff5"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86811954eec63e9ea162af0ffa9f8d09088bab51b7438e8b6488b9401863c25e"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd4025ac5e87d9b80e1f300207eb2fd099ff8200fa2320d7dc066a3f4622dc6b"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b17b93c02cdb6aeb696effecea1095ac93f3884a49a554a9afa76bb125c114c1"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ddb87643be40f034e97e97f5bc2ef7ce39de20e34608f3f829db727a93fb82c5"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abf4822129ed3a5ce54383d5f0e964e7fef74a41e48eb1dfad404151efc130a2"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c629cf64bacfd136c07c78ac10a54578ec9d1bd2a9d395efbee0935868bf852"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1bab866aafb53da39c2cadfb8e1c4550ac5340bb40300083eb8967ba25481447"}, - {file = "ruff-0.1.15-py3-none-win32.whl", hash = "sha256:2417e1cb6e2068389b07e6fa74c306b2810fe3ee3476d5b8a96616633f40d14f"}, - {file = "ruff-0.1.15-py3-none-win_amd64.whl", hash = "sha256:3837ac73d869efc4182d9036b1405ef4c73d9b1f88da2413875e34e0d6919587"}, - {file = "ruff-0.1.15-py3-none-win_arm64.whl", hash = "sha256:9a933dfb1c14ec7a33cceb1e49ec4a16b51ce3c20fd42663198746efc0427360"}, - {file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"}, + {file = "ruff-0.5.7-py3-none-linux_armv6l.whl", hash = "sha256:548992d342fc404ee2e15a242cdbea4f8e39a52f2e7752d0e4cbe88d2d2f416a"}, + {file = "ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be"}, + {file = "ruff-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a01c34400097b06cf8a6e61b35d6d456d5bd1ae6961542de18ec81eaf33b4cb8"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcc8054f1a717e2213500edaddcf1dbb0abad40d98e1bd9d0ad364f75c763eea"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f70284e73f36558ef51602254451e50dd6cc479f8b6f8413a95fcb5db4a55fc"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:a78ad870ae3c460394fc95437d43deb5c04b5c29297815a2a1de028903f19692"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ccd078c66a8e419475174bfe60a69adb36ce04f8d4e91b006f1329d5cd44bcf"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e31c9bad4ebf8fdb77b59cae75814440731060a09a0e0077d559a556453acbb"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a09ea2c3f7778cc635e7f6edf57d566a8ee8f485f3c4454db7771efb692c499"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a36d8dcf55b3a3bc353270d544fb170d75d2dff41eba5df57b4e0b67a95bb64e"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9369c218f789eefbd1b8d82a8cf25017b523ac47d96b2f531eba73770971c9e5"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b88ca3db7eb377eb24fb7c82840546fb7acef75af4a74bd36e9ceb37a890257e"}, + {file = "ruff-0.5.7-py3-none-win32.whl", hash = "sha256:33d61fc0e902198a3e55719f4be6b375b28f860b09c281e4bdbf783c0566576a"}, + {file = "ruff-0.5.7-py3-none-win_amd64.whl", hash = "sha256:083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3"}, + {file = "ruff-0.5.7-py3-none-win_arm64.whl", hash = "sha256:2dca26154ff9571995107221d0aeaad0e75a77b5a682d6236cf89a58c70b76f4"}, + {file = "ruff-0.5.7.tar.gz", hash = "sha256:8dfc0a458797f5d9fb622dd0efc52d796f23f0a1493a9527f4e49a550ae9a7e5"}, ] [[package]] @@ -4256,13 +4257,13 @@ xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] [[package]] name = "unstructured-client" -version = "0.24.1" +version = "0.25.5" description = "Python Client SDK for Unstructured API" optional = false python-versions = ">=3.8" files = [ - {file = "unstructured-client-0.24.1.tar.gz", hash = "sha256:1bd82a532497783dd77b30ed4e56837d6abfae8cc6d61442acac0bcacbd568c8"}, - {file = "unstructured_client-0.24.1-py3-none-any.whl", hash = "sha256:044dab0c3079f908f6adf7088ad44f0e17476b47e2b04e0de608134a482bd0e3"}, + {file = "unstructured-client-0.25.5.tar.gz", hash = "sha256:adb97ea56ce65f8b277d5b05f093e9d13a3320ac8dea7265ffa71f5e13ed5f84"}, + {file = "unstructured_client-0.25.5-py3-none-any.whl", hash = "sha256:23537fee984e43d06a75f986a73e420a9659cc92010afb8324fbf67c85962eaf"}, ] [package.dependencies] @@ -4472,4 +4473,4 @@ local = ["unstructured"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "d95a01d052e3f6175a45c5a589692274300a88782938ed71f835c5f68842d821" +content-hash = "d4f2a9a001f41c52964628e5d82d74e0938c75f75dbc5a08ff9d829c53fb74b9" diff --git a/libs/partners/unstructured/pyproject.toml b/libs/partners/unstructured/pyproject.toml index c8c95b62a092c..6c6cbf671a090 100644 --- a/libs/partners/unstructured/pyproject.toml +++ b/libs/partners/unstructured/pyproject.toml @@ -14,7 +14,7 @@ license = "MIT" [tool.poetry.dependencies] python = ">=3.9,<4.0" langchain-core = "^0.2.23" -unstructured-client = { version = "^0.24.1" } +unstructured-client = { version = "^0.25.0" } unstructured = { version = "^0.15.7", optional = true, python = "<3.13", extras = [ "all-docs", ] } @@ -46,7 +46,7 @@ optional = true optional = true [tool.poetry.group.lint.dependencies] -ruff = "^0.1.8" +ruff = "^0.5" [tool.poetry.group.typing.dependencies] mypy = "^1.7.1"