diff --git a/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py b/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py index d174e950bd3d5..0ef52ddde32e3 100644 --- a/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py +++ b/libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py @@ -1,4 +1,4 @@ -from typing import Any, List, Sequence +from typing import Any, Iterator, List, Sequence, cast from langchain.schema import BaseDocumentTransformer, Document @@ -98,18 +98,15 @@ def extract_tags(html_content: str, tags: List[str]) -> str: from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, "html.parser") - text_parts = [] - for tag in tags: - elements = soup.find_all(tag) - for element in elements: - if tag == "a": - href = element.get("href") - if href: - text_parts.append(f"{element.get_text()} ({href})") - else: - text_parts.append(element.get_text()) - else: - text_parts.append(element.get_text()) + text_parts: List[str] = [] + for element in soup.find_all(): + if element.name in tags: + # Extract all navigable strings recursively from this element. + text_parts += get_navigable_strings(element) + + # To avoid duplicate text, remove all descendants from the soup. + element.decompose() + return " ".join(text_parts) @staticmethod @@ -126,13 +123,7 @@ def remove_unnecessary_lines(content: str) -> str: lines = content.split("\n") stripped_lines = [line.strip() for line in lines] non_empty_lines = [line for line in stripped_lines if line] - seen = set() - deduped_lines = [] - for line in non_empty_lines: - if line not in seen: - seen.add(line) - deduped_lines.append(line) - cleaned_content = " ".join(deduped_lines) + cleaned_content = " ".join(non_empty_lines) return cleaned_content async def atransform_documents( @@ -141,3 +132,16 @@ async def atransform_documents( **kwargs: Any, ) -> Sequence[Document]: raise NotImplementedError + + +def get_navigable_strings(element: Any) -> Iterator[str]: + from bs4 import NavigableString, Tag + + for child in cast(Tag, element).children: + if isinstance(child, Tag): + yield from get_navigable_strings(child) + elif isinstance(child, NavigableString): + if (element.name == "a") and (href := element.get("href")): + yield f"{child.strip()} ({href})" + else: + yield child.strip() diff --git a/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py b/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py index d3ebe1d8c73c7..8644ed2e42bac 100644 --- a/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py +++ b/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py @@ -15,14 +15,54 @@ def test_transform_empty_html() -> None: @pytest.mark.requires("bs4") -def test_extract_paragraph() -> None: +def test_extract_paragraphs() -> None: bs_transformer = BeautifulSoupTransformer() - paragraphs_html = "

First paragraph.

Second paragraph.

" + paragraphs_html = ( + "

Header

First paragraph.

" + "

Second paragraph.

Ignore at end

" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "First paragraph. Second paragraph." + + +@pytest.mark.requires("bs4") +def test_strip_whitespace() -> None: + bs_transformer = BeautifulSoupTransformer() + paragraphs_html = ( + "

Header

First paragraph.

" + "

Second paragraph.

" + ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph. Second paragraph." +@pytest.mark.requires("bs4") +def test_extract_html() -> None: + bs_transformer = BeautifulSoupTransformer() + paragraphs_html = ( + "Begin of html tag" + "

Header

" + "

First paragraph.

" + "Middle of html tag" + "

Second paragraph.

" + "End of html tag" + "" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = bs_transformer.transform_documents( + documents, tags_to_extract=["html", "p"] + ) + assert docs_transformed[0].page_content == ( + "Begin of html tag " + "Header First paragraph. " + "Middle of html tag " + "Second paragraph. " + "End of html tag" + ) + + @pytest.mark.requires("bs4") def test_remove_style() -> None: bs_transformer = BeautifulSoupTransformer() @@ -30,21 +70,97 @@ def test_remove_style() -> None: "

First paragraph.

" ) documents = [Document(page_content=with_style_html)] - docs_transformed = bs_transformer.transform_documents(documents) + docs_transformed = bs_transformer.transform_documents( + documents, tags_to_extract=["html"] + ) assert docs_transformed[0].page_content == "First paragraph." +@pytest.mark.requires("bs4") +def test_remove_nested_tags() -> None: + """ + If a tag_to_extract is _inside_ an unwanted_tag, it should be removed + (e.g. a

inside a if
is unwanted).) + If an unwanted tag is _inside_ a tag_to_extract, it should be removed, + but the rest of the tag_to_extract should stay. + This means that "unwanted_tags" have a higher "priority" than "tags_to_extract". + """ + bs_transformer = BeautifulSoupTransformer() + with_style_html = ( + "" + "

First paragraph, inside a table.

" + "

Second paragraph
with a cell
.

" + "" + ) + documents = [Document(page_content=with_style_html)] + docs_transformed = bs_transformer.transform_documents( + documents, unwanted_tags=["script", "style", "table"] + ) + assert docs_transformed[0].page_content == "Second paragraph." + + @pytest.mark.requires("bs4") def test_remove_unwanted_lines() -> None: bs_transformer = BeautifulSoupTransformer() with_lines_html = "\n\n

First \n\n paragraph.

\n\n\n" documents = [Document(page_content=with_lines_html)] - docs_transformed = bs_transformer.transform_documents(documents) + docs_transformed = bs_transformer.transform_documents(documents, remove_lines=True) assert docs_transformed[0].page_content == "First paragraph." -# FIXME: This test proves that the order of the tags is NOT preserved. -# Documenting the current behavior here, but this should be fixed. +@pytest.mark.requires("bs4") +def test_do_not_remove_repeated_content() -> None: + bs_transformer = BeautifulSoupTransformer() + with_lines_html = "

1\n1\n1\n1

" + documents = [Document(page_content=with_lines_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "1 1 1 1" + + +@pytest.mark.requires("bs4") +def test_extract_nested_tags() -> None: + bs_transformer = BeautifulSoupTransformer() + nested_html = ( + "
" + "

First paragraph.

" + "

Second

paragraph.

" + "

Third paragraph.

" + "
" + ) + documents = [Document(page_content=nested_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert ( + docs_transformed[0].page_content + == "First paragraph. Second paragraph. Third paragraph." + ) + + +@pytest.mark.requires("bs4") +def test_extract_more_nested_tags() -> None: + bs_transformer = BeautifulSoupTransformer() + nested_html = ( + "
" + "

First paragraph.

" + "

Second paragraph.

" + "

Third paragraph with a list:" + "

" + "

" + "

Fourth paragraph.

" + "
" + ) + documents = [Document(page_content=nested_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "First paragraph. Second paragraph. " + "Third paragraph with a list: " + "First list item. Second list item. " + "Fourth paragraph." + ) + + @pytest.mark.requires("bs4") def test_transform_keeps_order() -> None: bs_transformer = BeautifulSoupTransformer() @@ -56,33 +172,61 @@ def test_transform_keeps_order() -> None: ) documents = [Document(page_content=multiple_tags_html)] - # order of "p" and "h1" in the "tags_to_extract" parameter is important here: - # it will first extract all "p" tags, then all "h1" tags, breaking the order - # of the HTML. + # Order of "p" and "h1" in the "tags_to_extract" parameter is NOT important here: + # it will keep the order of the original HTML. docs_transformed_p_then_h1 = bs_transformer.transform_documents( documents, tags_to_extract=["p", "h1"] ) assert ( docs_transformed_p_then_h1[0].page_content - == "First paragraph. Second paragraph. First heading. Second heading." + == "First heading. First paragraph. Second heading. Second paragraph." ) # Recreating `documents` because transform_documents() modifies it. documents = [Document(page_content=multiple_tags_html)] - # changing the order of "h1" and "p" in "tags_to_extract" flips the order of - # the extracted tags: + # changing the order of "h1" and "p" in "tags_to_extract" does NOT flip the order + # of the extracted tags: docs_transformed_h1_then_p = bs_transformer.transform_documents( documents, tags_to_extract=["h1", "p"] ) assert ( docs_transformed_h1_then_p[0].page_content - == "First heading. Second heading. First paragraph. Second paragraph." + == "First heading. First paragraph. Second heading. Second paragraph." ) - # The correct result should be: - # - # "First heading. First paragraph. Second heading. Second paragraph." - # - # That is the order in the original HTML, that should be preserved to preserve - # the semantic "meaning" of the text. + +@pytest.mark.requires("bs4") +def test_extracts_href() -> None: + bs_transformer = BeautifulSoupTransformer() + multiple_tags_html = ( + "

First heading.

" + "

First paragraph with an example

" + "

Second paragraph with an a tag without href

" + ) + documents = [Document(page_content=multiple_tags_html)] + + docs_transformed = bs_transformer.transform_documents( + documents, tags_to_extract=["p"] + ) + assert docs_transformed[0].page_content == ( + "First paragraph with an example (http://example.com) " + "Second paragraph with an a tag without href" + ) + + +@pytest.mark.requires("bs4") +def test_invalid_html() -> None: + bs_transformer = BeautifulSoupTransformer() + invalid_html_1 = "

First heading." + invalid_html_2 = "