From 0e2a74a8838faffe570097be338df951cd7ba4cb Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Tue, 26 Nov 2024 23:19:18 +0100 Subject: [PATCH 01/15] Add HPU tests for embeddings --- .../embeddings/self_hosted_hugging_face.py | 9 +++ .../embeddings/test_huggingface_hpu.py | 57 +++++++++++++++++++ .../embeddings/test_huggingface_hub_hpu.py | 39 +++++++++++++ .../embeddings/test_self_hosted_hpu.py | 57 +++++++++++++++++++ .../embeddings/huggingface.py | 14 +++++ 5 files changed, 176 insertions(+) create mode 100644 libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py create mode 100644 libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py create mode 100644 libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py diff --git a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py index d45a802492045..27d3508b6e32c 100644 --- a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py +++ b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py @@ -34,6 +34,15 @@ def load_embedding_model(model_id: str, instruct: bool = False, device: int = 0) client = INSTRUCTOR(model_id) + if importlib.util.find_spec("habana_frameworks") is not None: + import habana_frameworks.torch.hpu as hthpu + + if hthpu.is_available(): + import torch + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + client = wrap_in_hpu_graph(client) + return client.eval().to(torch.device(device)) + if importlib.util.find_spec("torch") is not None: import torch diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py new file mode 100644 index 0000000000000..7656054fda5ea --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py @@ -0,0 +1,57 @@ +"""Test huggingface embeddings.""" + +from langchain_community.embeddings.huggingface import ( + HuggingFaceEmbeddings, + HuggingFaceInstructEmbeddings, +) + + +def test_huggingface_embedding_documents_on_hpu() -> None: + """Test huggingface embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceEmbeddings(model_kwargs={"device": "hpu"}) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingface_embedding_query_on_hpu() -> None: + """Test huggingface embeddings.""" + document = "foo bar" + embedding = HuggingFaceEmbeddings(encode_kwargs={"batch_size": 16}, model_kwargs={"device": "hpu"}) + output = embedding.embed_query(document) + assert len(output) == 768 + + +def test_huggingface_instructor_embedding_documents_on_hpu() -> None: + """Test huggingface embeddings.""" + documents = ["foo bar"] + model_name = "hkunlp/instructor-base" + embedding = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": "hpu"}) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingface_instructor_embedding_query_on_hpu() -> None: + """Test huggingface embeddings.""" + query = "foo bar" + model_name = "hkunlp/instructor-base" + embedding = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": "hpu"}) + output = embedding.embed_query(query) + assert len(output) == 768 + + +def test_huggingface_instructor_embedding_normalize_on_hpu() -> None: + """Test huggingface embeddings.""" + 
query = "foo bar" + model_name = "hkunlp/instructor-base" + encode_kwargs = {"normalize_embeddings": True} + embedding = HuggingFaceInstructEmbeddings( + model_name=model_name, encode_kwargs=encode_kwargs, model_kwargs={"device": "hpu"} + ) + output = embedding.embed_query(query) + assert len(output) == 768 + eps = 1e-5 + norm = sum([o**2 for o in output]) + assert abs(1 - norm) <= eps diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py new file mode 100644 index 0000000000000..3c29889ec9773 --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py @@ -0,0 +1,39 @@ +"""Test HuggingFaceHub embeddings.""" + +import pytest + +from langchain_community.embeddings import HuggingFaceHubEmbeddings + + +def test_huggingfacehub_embedding_documents_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +async def test_huggingfacehub_embedding_async_documents_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = await embedding.aembed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingfacehub_embedding_query_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + document = "foo bar" + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = embedding.embed_query(document) + assert len(output) == 768 + + +async def test_huggingfacehub_embedding_async_query_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + document = "foo bar" + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = await embedding.aembed_query(document) + assert len(output) == 768 diff --git a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py new file mode 100644 index 0000000000000..0dfcb4360b12d --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py @@ -0,0 +1,57 @@ +"""Test self-hosted embeddings.""" + +from typing import Any + +from langchain_community.embeddings import ( + SelfHostedEmbeddings, + SelfHostedHuggingFaceEmbeddings, + SelfHostedHuggingFaceInstructEmbeddings, +) + + +def get_remote_instance() -> Any: + """Get remote instance for testing using HPU.""" + import runhouse as rh + + # Intel Gaudi instance + hpu = rh.cluster(name="gaudi-instance", instance_type="dl1.24xlarge") + hpu.install_packages(["pip:./"]) + return hpu + + +def test_self_hosted_huggingface_embedding_documents_hpu() -> None: + """Test self-hosted huggingface embeddings using HPU.""" + documents = ["foo bar"] + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceEmbeddings(hardware=hpu) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_self_hosted_huggingface_embedding_query_hpu() -> None: + """Test self-hosted huggingface embeddings using HPU.""" + document = "foo bar" + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceEmbeddings(hardware=hpu) + 
output = embedding.embed_query(document) + assert len(output) == 768 + + +def test_self_hosted_huggingface_instructor_embedding_documents_hpu() -> None: + """Test self-hosted huggingface instruct embeddings using HPU.""" + documents = ["foo bar"] + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceInstructEmbeddings(hardware=hpu) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_self_hosted_huggingface_instructor_embedding_query_hpu() -> None: + """Test self-hosted huggingface instruct embeddings using HPU.""" + query = "foo bar" + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceInstructEmbeddings(hardware=hpu) + output = embedding.embed_query(query) + assert len(output) == 768 diff --git a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py index 2bbc551f4e0b1..29919d47f31b3 100644 --- a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py +++ b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py @@ -24,6 +24,20 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): model_kwargs=model_kwargs, encode_kwargs=encode_kwargs ) + + Example using HPU: + .. code-block:: python + + from langchain_huggingface import HuggingFaceEmbeddings + + model_name = "sentence-transformers/all-mpnet-base-v2" + model_kwargs = {'device': 'hpu'} + encode_kwargs = {'normalize_embeddings': False} + hf = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs=model_kwargs, + encode_kwargs=encode_kwargs + ) """ model_name: str = DEFAULT_MODEL_NAME From 1bc96e808c8101a408e115d05cc3ac89f67ba0fd Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 28 Nov 2024 09:42:48 +0100 Subject: [PATCH 02/15] Fix imports in tests --- .../integration_tests/embeddings/test_huggingface_hub_hpu.py | 2 -- .../tests/integration_tests/embeddings/test_self_hosted_hpu.py | 1 - 2 files changed, 3 deletions(-) diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py index 3c29889ec9773..97a9f2f4f8338 100644 --- a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py @@ -1,7 +1,5 @@ """Test HuggingFaceHub embeddings.""" -import pytest - from langchain_community.embeddings import HuggingFaceHubEmbeddings diff --git a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py index 0dfcb4360b12d..7b22e3f8ae432 100644 --- a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py +++ b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py @@ -3,7 +3,6 @@ from typing import Any from langchain_community.embeddings import ( - SelfHostedEmbeddings, SelfHostedHuggingFaceEmbeddings, SelfHostedHuggingFaceInstructEmbeddings, ) From b837a35c079e77cabf38041e17065fe2a51b7a90 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Fri, 29 Nov 2024 02:39:00 +0100 Subject: [PATCH 03/15] Add Pipeline implementation for Gaudi HPU --- .../langchain_huggingface/hpu_utils.py | 55 +++++++++++++++++++ .../llms/huggingface_pipeline.py | 16 ++++++ 2 files changed, 71 insertions(+) create mode 100644 libs/partners/huggingface/langchain_huggingface/hpu_utils.py diff --git 
a/libs/partners/huggingface/langchain_huggingface/hpu_utils.py b/libs/partners/huggingface/langchain_huggingface/hpu_utils.py
new file mode 100644
index 0000000000000..4a22fa33e319a
--- /dev/null
+++ b/libs/partners/huggingface/langchain_huggingface/hpu_utils.py
@@ -0,0 +1,55 @@
+try:
+    from transformers import (  # type: ignore[import]
+        AutoModelForCausalLM,
+        AutoModelForSeq2SeqLM,
+        AutoTokenizer,
+    )
+    from transformers import pipeline as hf_pipeline  # type: ignore[import]
+
+except ImportError:
+    raise ValueError(
+        "Could not import transformers python package. "
+        "Please install it with `pip install transformers`."
+    )
+
+
+def use_hpu_model_device(model_kwargs: dict) -> bool:
+    """Check whether the model is configured to run on the HPU device."""
+    return model_kwargs.get("device") == "hpu"
+
+
+# HuggingFacePipeline usage
+def get_gaudi_auto_model_for_causal_lm(model_id: str) -> AutoModelForCausalLM:
+    """Get the Gaudi-optimized model for causal LM."""
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+    from optimum.habana.utils import set_seed
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+    import torch
+
+    adapt_transformers_to_gaudi()
+    set_seed(27)
+    model_dtype = torch.bfloat16
+
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype)
+    model = model.eval().to("hpu")
+    model = wrap_in_hpu_graph(model)
+
+    return model
+
+
+def get_gaudi_auto_model_for_seq2seq_lm(model_id: str) -> AutoModelForSeq2SeqLM:
+    """Get the Gaudi-optimized model for seq2seq LM."""
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+    from optimum.habana.utils import set_seed
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+    import torch
+
+    adapt_transformers_to_gaudi()
+    set_seed(27)
+    model_dtype = torch.bfloat16
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=model_dtype)
+    model = model.eval().to("hpu")
+    model = wrap_in_hpu_graph(model)
+
+    return model
diff --git a/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py b/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py
index 3e743a64289fc..69f153133d8d0 100644
--- a/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py
+++ b/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py
@@ -9,6 +9,8 @@
 from langchain_core.outputs import Generation, GenerationChunk, LLMResult
 from pydantic import ConfigDict, model_validator
 
+from ..hpu_utils import use_hpu_model_device, get_gaudi_auto_model_for_causal_lm, get_gaudi_auto_model_for_seq2seq_lm
+
 DEFAULT_MODEL_ID = "gpt2"
 DEFAULT_TASK = "text-generation"
 VALID_TASKS = (
@@ -126,6 +128,16 @@ def from_model_id(
             _model_kwargs["device_map"] = device_map
         tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
 
+        if use_hpu_model_device(_model_kwargs):
+            if backend == "openvino":
+                raise ValueError(
+                    "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time. "
+                    "Please remove `hpu` from `model_kwargs['device']` or set `backend=default`."
+                )
+            # Set `backend` to `HPU` to avoid the error caused by attempting to move a model
+            # that was already loaded on the HPU by the Accelerate module to the same or another device.
+            backend = "HPU"
+
         try:
             if task == "text-generation":
                 if backend == "openvino":
@@ -151,6 +163,8 @@ def from_model_id(
                     model = OVModelForCausalLM.from_pretrained(
                         model_id, export=True, **_model_kwargs
                     )
+                elif use_hpu_model_device(_model_kwargs):
+                    model = get_gaudi_auto_model_for_causal_lm(model_id)
                 else:
                     model = AutoModelForCausalLM.from_pretrained(
                         model_id, **_model_kwargs
@@ -177,6 +191,8 @@ def from_model_id(
                     model = OVModelForSeq2SeqLM.from_pretrained(
                         model_id, export=True, **_model_kwargs
                     )
+                elif use_hpu_model_device(_model_kwargs):
+                    model = get_gaudi_auto_model_for_seq2seq_lm(model_id)
                 else:
                     model = AutoModelForSeq2SeqLM.from_pretrained(
                         model_id, **_model_kwargs

From 78102d17984b13cfd919f62f157548fe7021cf7a Mon Sep 17 00:00:00 2001
From: PiotrBLL
Date: Fri, 29 Nov 2024 02:39:54 +0100
Subject: [PATCH 04/15] Add HPU tests: text generation, text2text generation and invalid hpu & openvino backend

---
 .../llms/test_huggingface_pipeline_hpu.py | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py

diff --git a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py
new file mode 100644
index 0000000000000..ed0819c596708
--- /dev/null
+++ b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py
@@ -0,0 +1,38 @@
+"""Test HuggingFace Pipeline wrapper."""
+
+from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+
+
+def test_huggingface_pipeline_text_generation_on_hpu() -> None:
+    """Test valid call to HuggingFace text generation model."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2",
+        task="text-generation",
+        pipeline_kwargs={"max_new_tokens": 10},
+        model_kwargs={"device": "hpu"},
+    )
+    output = llm.invoke("Say foo:")
+    assert isinstance(output, str)
+
+
+def test_huggingface_pipeline_text2text_generation_on_hpu() -> None:
+    """Test valid call to HuggingFace text2text generation model."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="google/flan-t5-small",
+        task="text2text-generation",
+        model_kwargs={"device": "hpu"},
+    )
+    output = llm.invoke("Say foo:")
+    assert isinstance(output, str)
+
+
+def test_huggingface_pipeline_invalid_hpu_and_openvino_backend() -> None:
+    """Test invalid backend."""
+    try:
+        HuggingFacePipeline.from_model_id(
+            model_id="google/flan-t5-small",
+            task="text2text-generation",
+            model_kwargs={"device": "hpu", "backend": "openvino"},
+        )
+    except ValueError as e:
+        assert "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time."
in str(e) From dda18303ed91d575a97449d49d8de5296067774f Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Fri, 29 Nov 2024 02:40:59 +0100 Subject: [PATCH 05/15] Add HPU tests: summarization --- .../llms/test_huggingface_pipeline_hpu.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py index ed0819c596708..6f78e64937047 100644 --- a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py +++ b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py @@ -36,3 +36,14 @@ def test_huggingface_pipeline_invalid_hpu_and_openvino_backend() -> None: ) except ValueError as e: assert "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time." in str(e) + + +def test_huggingface_pipeline_summarization_on_hpu() -> None: + """Test valid call to HuggingFace summarization model.""" + llm = HuggingFacePipeline.from_model_id( + model_id="facebook/bart-large-cnn", + task="summarization", + model_kwargs={"device": "hpu"}, + ) + output = llm.invoke("Say foo:") + assert isinstance(output, str) From 56d167035402aaccfc0db53ac937b634ae2c17ea Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Tue, 3 Dec 2024 21:10:27 +0100 Subject: [PATCH 06/15] Add `test_huggingface_pipeline_streaming_on_hpu` test --- .../tests/integration_tests/test_llms.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/libs/partners/huggingface/tests/integration_tests/test_llms.py b/libs/partners/huggingface/tests/integration_tests/test_llms.py index e251c5bdb67f7..905826d2ef32d 100644 --- a/libs/partners/huggingface/tests/integration_tests/test_llms.py +++ b/libs/partners/huggingface/tests/integration_tests/test_llms.py @@ -16,3 +16,21 @@ def test_huggingface_pipeline_streaming() -> None: assert isinstance(chunk, str) stream_results_string = chunk assert len(stream_results_string.strip()) > 1 + + +def test_huggingface_pipeline_streaming_on_hpu() -> None: + """Test streaming tokens from huggingface_pipeline on HPU.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + pipeline_kwargs={"max_new_tokens": 10}, + model_kwargs={"device": "hpu"}, + ) + generator = llm.stream("Q: How do you say 'hello' in German? 
A:'", stop=["."]) + stream_results_string = "" + assert isinstance(generator, Generator) + + for chunk in generator: + assert isinstance(chunk, str) + stream_results_string = chunk + assert len(stream_results_string.strip()) > 1 From d3ffdb42c22c86a802e8244c513e53cbe5854b3b Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Tue, 3 Dec 2024 21:46:32 +0100 Subject: [PATCH 07/15] Split hpu llms test files --- .../tests/integration_tests/test_llms.py | 18 ---------------- .../tests/integration_tests/test_llms_hpu.py | 21 +++++++++++++++++++ 2 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py diff --git a/libs/partners/huggingface/tests/integration_tests/test_llms.py b/libs/partners/huggingface/tests/integration_tests/test_llms.py index 905826d2ef32d..e251c5bdb67f7 100644 --- a/libs/partners/huggingface/tests/integration_tests/test_llms.py +++ b/libs/partners/huggingface/tests/integration_tests/test_llms.py @@ -16,21 +16,3 @@ def test_huggingface_pipeline_streaming() -> None: assert isinstance(chunk, str) stream_results_string = chunk assert len(stream_results_string.strip()) > 1 - - -def test_huggingface_pipeline_streaming_on_hpu() -> None: - """Test streaming tokens from huggingface_pipeline on HPU.""" - llm = HuggingFacePipeline.from_model_id( - model_id="gpt2", - task="text-generation", - pipeline_kwargs={"max_new_tokens": 10}, - model_kwargs={"device": "hpu"}, - ) - generator = llm.stream("Q: How do you say 'hello' in German? A:'", stop=["."]) - stream_results_string = "" - assert isinstance(generator, Generator) - - for chunk in generator: - assert isinstance(chunk, str) - stream_results_string = chunk - assert len(stream_results_string.strip()) > 1 diff --git a/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py b/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py new file mode 100644 index 0000000000000..c5740cf685c12 --- /dev/null +++ b/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py @@ -0,0 +1,21 @@ +from typing import Generator + +from langchain_huggingface.llms import HuggingFacePipeline + + +def test_huggingface_pipeline_streaming_on_hpu() -> None: + """Test streaming tokens from huggingface_pipeline on HPU.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + pipeline_kwargs={"max_new_tokens": 10}, + model_kwargs={"device": "hpu"}, + ) + generator = llm.stream("Q: How do you say 'hello' in German? A:'", stop=["."]) + stream_results_string = "" + assert isinstance(generator, Generator) + + for chunk in generator: + assert isinstance(chunk, str) + stream_results_string = chunk + assert len(stream_results_string.strip()) > 1 From 3e44cdf95baf662473086741ab2d19a7a24bb882 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Tue, 3 Dec 2024 21:46:56 +0100 Subject: [PATCH 08/15] Add Dockerfile and readme to run pipelines on hpu --- README_hpu.md | 55 +++++++++++++++++++++++++++++++++++++++++ docker/gaudi/Dockerfile | 27 ++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 README_hpu.md create mode 100644 docker/gaudi/Dockerfile diff --git a/README_hpu.md b/README_hpu.md new file mode 100644 index 0000000000000..75218ec03010f --- /dev/null +++ b/README_hpu.md @@ -0,0 +1,55 @@ +# Running HuggingFace on Intel Gaudi (HPU) + +## Prerequisites + +Before you begin, ensure you have Docker installed and can run Docker containers on your machine. You'll also need access to Intel Gaudi hardware (HPUs). 
+ +## Build the Docker Image + +1. Build the Docker image using the provided Dockerfile. + + ```bash + cd docker/gaudi + ``` + + ```bash + docker build -t langchain-hpu . + ``` + + This will create a Docker image called `langchain-hpu`, which includes all necessary dependencies for running HuggingFace on Intel Gaudi (HPU). + +## Run the Docker Container + +1. Start the Docker container with an interactive terminal. + + ```bash + docker run -it langchain-hpu + ``` + +2. Once inside the container, navigate to the HuggingFace integration folder. + + ```bash + cd /workspace/langchain/libs/partners/huggingface + ``` + +3. Now, you are ready to run any scripts or tests for HuggingFace models on HPU. For example, you can start a training script or load models for inference on the Intel Gaudi (HPU) device. + + ### Example: + + To run a sample script, use: + + ```bash + poetry run pytest tests/integration_tests/test_llms_hpu.py + ``` + + Replace `test_llms_hpu.py` with the actual script you'd like to execute, and make sure to configure your environment to use HPU during model execution. + +## Dependencies + +The Dockerfile installs both general and HPU-specific dependencies. If you need to update or add any additional dependencies for your HuggingFace integration, you can modify the `requirements_hpu.txt` file located in the `/libs/partners/huggingface/` directory and rebuild the image. + +## Notes + +- Ensure that the container has access to Intel Gaudi hardware (HPU) to properly execute the scripts. +- You may want to use `poetry` or `pip` for managing Python dependencies in the container, depending on your project's setup. +- If you're using `poetry`, you can install the dependencies by running `poetry install` inside the container. diff --git a/docker/gaudi/Dockerfile b/docker/gaudi/Dockerfile new file mode 100644 index 0000000000000..41bf14c5d18a3 --- /dev/null +++ b/docker/gaudi/Dockerfile @@ -0,0 +1,27 @@ +# Use the official Gaudi Docker image with PyTorch +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +# Set the working directory to where the project will be copied +WORKDIR /workspace + +# Copy the entire project into the container (this assumes the whole project is in the context directory) +COPY . 
/workspace/langchain + +# Install general dependencies +RUN apt-get update && apt-get install -y \ + python3-pip \ + python3-dev \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Poetry for managing dependencies (optional if using Poetry) +RUN curl -sSL https://install.python-poetry.org | python3 - + +# Copy and install the base dependencies (optional if using Poetry) +RUN poetry install + +# Install HPU-specific dependencies from a separate file +COPY libs/partners/huggingface/requirements_hpu.txt /workspace/langchain/libs/partners/huggingface/requirements_hpu.txt + +# Install HPU-specific dependencies using Poetry or pip +RUN poetry add $(cat /workspace/langchain/libs/partners/huggingface/requirements_hpu.txt) From 38b831d9b64f08bf5011759fcbe048b33a101075 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Tue, 26 Nov 2024 23:19:18 +0100 Subject: [PATCH 09/15] Add Dockerfile and readme to run pipelines on hpu Split hpu llms test files Add `test_huggingface_pipeline_streaming_on_hpu` test Add HPU tests: summarization Add HPU tests: text generation, text2text geneartion and invalid hpu & openvino backend Add Pipeline implementation for Gaudi HPU Fix imports in tests Add HPU tests for embeddings --- README_hpu.md | 55 ++++++++++++++++++ docker/gaudi/Dockerfile | 27 +++++++++ .../embeddings/self_hosted_hugging_face.py | 9 +++ .../embeddings/test_huggingface_hpu.py | 57 +++++++++++++++++++ .../embeddings/test_huggingface_hub_hpu.py | 37 ++++++++++++ .../embeddings/test_self_hosted_hpu.py | 56 ++++++++++++++++++ .../llms/test_huggingface_pipeline_hpu.py | 49 ++++++++++++++++ .../embeddings/huggingface.py | 14 +++++ .../langchain_huggingface/hpu_utils.py | 55 ++++++++++++++++++ .../llms/huggingface_pipeline.py | 16 ++++++ .../tests/integration_tests/test_llms_hpu.py | 21 +++++++ 11 files changed, 396 insertions(+) create mode 100644 README_hpu.md create mode 100644 docker/gaudi/Dockerfile create mode 100644 libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py create mode 100644 libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py create mode 100644 libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py create mode 100644 libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py create mode 100644 libs/partners/huggingface/langchain_huggingface/hpu_utils.py create mode 100644 libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py diff --git a/README_hpu.md b/README_hpu.md new file mode 100644 index 0000000000000..75218ec03010f --- /dev/null +++ b/README_hpu.md @@ -0,0 +1,55 @@ +# Running HuggingFace on Intel Gaudi (HPU) + +## Prerequisites + +Before you begin, ensure you have Docker installed and can run Docker containers on your machine. You'll also need access to Intel Gaudi hardware (HPUs). + +## Build the Docker Image + +1. Build the Docker image using the provided Dockerfile. + + ```bash + cd docker/gaudi + ``` + + ```bash + docker build -t langchain-hpu . + ``` + + This will create a Docker image called `langchain-hpu`, which includes all necessary dependencies for running HuggingFace on Intel Gaudi (HPU). + +## Run the Docker Container + +1. Start the Docker container with an interactive terminal. + + ```bash + docker run -it langchain-hpu + ``` + +2. Once inside the container, navigate to the HuggingFace integration folder. + + ```bash + cd /workspace/langchain/libs/partners/huggingface + ``` + +3. Now, you are ready to run any scripts or tests for HuggingFace models on HPU. 
For example, you can start a training script or load models for inference on the Intel Gaudi (HPU) device. + + ### Example: + + To run a sample script, use: + + ```bash + poetry run pytest tests/integration_tests/test_llms_hpu.py + ``` + + Replace `test_llms_hpu.py` with the actual script you'd like to execute, and make sure to configure your environment to use HPU during model execution. + +## Dependencies + +The Dockerfile installs both general and HPU-specific dependencies. If you need to update or add any additional dependencies for your HuggingFace integration, you can modify the `requirements_hpu.txt` file located in the `/libs/partners/huggingface/` directory and rebuild the image. + +## Notes + +- Ensure that the container has access to Intel Gaudi hardware (HPU) to properly execute the scripts. +- You may want to use `poetry` or `pip` for managing Python dependencies in the container, depending on your project's setup. +- If you're using `poetry`, you can install the dependencies by running `poetry install` inside the container. diff --git a/docker/gaudi/Dockerfile b/docker/gaudi/Dockerfile new file mode 100644 index 0000000000000..41bf14c5d18a3 --- /dev/null +++ b/docker/gaudi/Dockerfile @@ -0,0 +1,27 @@ +# Use the official Gaudi Docker image with PyTorch +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +# Set the working directory to where the project will be copied +WORKDIR /workspace + +# Copy the entire project into the container (this assumes the whole project is in the context directory) +COPY . /workspace/langchain + +# Install general dependencies +RUN apt-get update && apt-get install -y \ + python3-pip \ + python3-dev \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Poetry for managing dependencies (optional if using Poetry) +RUN curl -sSL https://install.python-poetry.org | python3 - + +# Copy and install the base dependencies (optional if using Poetry) +RUN poetry install + +# Install HPU-specific dependencies from a separate file +COPY libs/partners/huggingface/requirements_hpu.txt /workspace/langchain/libs/partners/huggingface/requirements_hpu.txt + +# Install HPU-specific dependencies using Poetry or pip +RUN poetry add $(cat /workspace/langchain/libs/partners/huggingface/requirements_hpu.txt) diff --git a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py index d45a802492045..27d3508b6e32c 100644 --- a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py +++ b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py @@ -34,6 +34,15 @@ def load_embedding_model(model_id: str, instruct: bool = False, device: int = 0) client = INSTRUCTOR(model_id) + if importlib.util.find_spec("habana_frameworks") is not None: + import habana_frameworks.torch.hpu as hthpu + + if hthpu.is_available(): + import torch + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + client = wrap_in_hpu_graph(client) + return client.eval().to(torch.device(device)) + if importlib.util.find_spec("torch") is not None: import torch diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py new file mode 100644 index 0000000000000..7656054fda5ea --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py @@ -0,0 +1,57 @@ +"""Test huggingface embeddings.""" + +from 
langchain_community.embeddings.huggingface import ( + HuggingFaceEmbeddings, + HuggingFaceInstructEmbeddings, +) + + +def test_huggingface_embedding_documents_on_hpu() -> None: + """Test huggingface embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceEmbeddings(model_kwargs={"device": "hpu"}) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingface_embedding_query_on_hpu() -> None: + """Test huggingface embeddings.""" + document = "foo bar" + embedding = HuggingFaceEmbeddings(encode_kwargs={"batch_size": 16}, model_kwargs={"device": "hpu"}) + output = embedding.embed_query(document) + assert len(output) == 768 + + +def test_huggingface_instructor_embedding_documents_on_hpu() -> None: + """Test huggingface embeddings.""" + documents = ["foo bar"] + model_name = "hkunlp/instructor-base" + embedding = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": "hpu"}) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingface_instructor_embedding_query_on_hpu() -> None: + """Test huggingface embeddings.""" + query = "foo bar" + model_name = "hkunlp/instructor-base" + embedding = HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": "hpu"}) + output = embedding.embed_query(query) + assert len(output) == 768 + + +def test_huggingface_instructor_embedding_normalize_on_hpu() -> None: + """Test huggingface embeddings.""" + query = "foo bar" + model_name = "hkunlp/instructor-base" + encode_kwargs = {"normalize_embeddings": True} + embedding = HuggingFaceInstructEmbeddings( + model_name=model_name, encode_kwargs=encode_kwargs, model_kwargs={"device": "hpu"} + ) + output = embedding.embed_query(query) + assert len(output) == 768 + eps = 1e-5 + norm = sum([o**2 for o in output]) + assert abs(1 - norm) <= eps diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py new file mode 100644 index 0000000000000..97a9f2f4f8338 --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py @@ -0,0 +1,37 @@ +"""Test HuggingFaceHub embeddings.""" + +from langchain_community.embeddings import HuggingFaceHubEmbeddings + + +def test_huggingfacehub_embedding_documents_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +async def test_huggingfacehub_embedding_async_documents_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = await embedding.aembed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_huggingfacehub_embedding_query_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + document = "foo bar" + embedding = HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = embedding.embed_query(document) + assert len(output) == 768 + + +async def test_huggingfacehub_embedding_async_query_on_hpu() -> None: + """Test huggingfacehub embeddings.""" + document = "foo bar" + embedding = 
HuggingFaceHubEmbeddings(model_kwargs={"device": "hpu"}) # type: ignore[call-arg] + output = await embedding.aembed_query(document) + assert len(output) == 768 diff --git a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py new file mode 100644 index 0000000000000..7b22e3f8ae432 --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py @@ -0,0 +1,56 @@ +"""Test self-hosted embeddings.""" + +from typing import Any + +from langchain_community.embeddings import ( + SelfHostedHuggingFaceEmbeddings, + SelfHostedHuggingFaceInstructEmbeddings, +) + + +def get_remote_instance() -> Any: + """Get remote instance for testing using HPU.""" + import runhouse as rh + + # Intel Gaudi instance + hpu = rh.cluster(name="gaudi-instance", instance_type="dl1.24xlarge") + hpu.install_packages(["pip:./"]) + return hpu + + +def test_self_hosted_huggingface_embedding_documents_hpu() -> None: + """Test self-hosted huggingface embeddings using HPU.""" + documents = ["foo bar"] + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceEmbeddings(hardware=hpu) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_self_hosted_huggingface_embedding_query_hpu() -> None: + """Test self-hosted huggingface embeddings using HPU.""" + document = "foo bar" + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceEmbeddings(hardware=hpu) + output = embedding.embed_query(document) + assert len(output) == 768 + + +def test_self_hosted_huggingface_instructor_embedding_documents_hpu() -> None: + """Test self-hosted huggingface instruct embeddings using HPU.""" + documents = ["foo bar"] + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceInstructEmbeddings(hardware=hpu) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + + +def test_self_hosted_huggingface_instructor_embedding_query_hpu() -> None: + """Test self-hosted huggingface instruct embeddings using HPU.""" + query = "foo bar" + hpu = get_remote_instance() + embedding = SelfHostedHuggingFaceInstructEmbeddings(hardware=hpu) + output = embedding.embed_query(query) + assert len(output) == 768 diff --git a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py new file mode 100644 index 0000000000000..6f78e64937047 --- /dev/null +++ b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py @@ -0,0 +1,49 @@ +"""Test HuggingFace Pipeline wrapper.""" + +from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline + + +def test_huggingface_pipeline_text_generation_on_hpu() -> None: + """Test valid call to HuggingFace text generation model.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + pipeline_kwargs={"max_new_tokens": 10}, + model_kwargs={"device": "hpu"}, + ) + output = llm.invoke("Say foo:") + assert isinstance(output, str) + + +def test_huggingface_pipeline_text2text_generation_on_hpu() -> None: + """Test valid call to HuggingFace text2text generation model.""" + llm = HuggingFacePipeline.from_model_id( + model_id="google/flan-t5-small", + task="text2text-generation", + model_kwargs={"device": "hpu"}, + ) + output = llm.invoke("Say foo:") + assert isinstance(output, str) + + +def 
test_huggingface_pipeline_invalid_hpu_and_openvino_backend() -> None: + """Test invalid backend.""" + try: + HuggingFacePipeline.from_model_id( + model_id="google/flan-t5-small", + task="text2text-generation", + model_kwargs={"device": "hpu", "backend": "openvino"}, + ) + except ValueError as e: + assert "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time." in str(e) + + +def test_huggingface_pipeline_summarization_on_hpu() -> None: + """Test valid call to HuggingFace summarization model.""" + llm = HuggingFacePipeline.from_model_id( + model_id="facebook/bart-large-cnn", + task="summarization", + model_kwargs={"device": "hpu"}, + ) + output = llm.invoke("Say foo:") + assert isinstance(output, str) diff --git a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py index 2bbc551f4e0b1..29919d47f31b3 100644 --- a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py +++ b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py @@ -24,6 +24,20 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): model_kwargs=model_kwargs, encode_kwargs=encode_kwargs ) + + Example using HPU: + .. code-block:: python + + from langchain_huggingface import HuggingFaceEmbeddings + + model_name = "sentence-transformers/all-mpnet-base-v2" + model_kwargs = {'device': 'hpu'} + encode_kwargs = {'normalize_embeddings': False} + hf = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs=model_kwargs, + encode_kwargs=encode_kwargs + ) """ model_name: str = DEFAULT_MODEL_NAME diff --git a/libs/partners/huggingface/langchain_huggingface/hpu_utils.py b/libs/partners/huggingface/langchain_huggingface/hpu_utils.py new file mode 100644 index 0000000000000..4a22fa33e319a --- /dev/null +++ b/libs/partners/huggingface/langchain_huggingface/hpu_utils.py @@ -0,0 +1,55 @@ +try: + from transformers import ( # type: ignore[import] + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + ) + from transformers import pipeline as hf_pipeline # type: ignore[import] + +except ImportError: + raise ValueError( + "Could not import transformers python package. " + "Please install it with `pip install transformers`." 
+ ) + + +def use_hpu_model_device(model_kwargs: dict) -> None: + """check if the model is using the hpu device.""" + return model_kwargs.get("device") == "hpu" + + +# HuggingFacePipeline usage +def get_gaudi_auto_model_for_causal_lm(model_id: str) -> AutoModelForCausalLM: + """get the model for causal lm.""" + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + from optimum.habana.utils import set_seed + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + import torch + + adapt_transformers_to_gaudi() + set_seed(27) + model_dtype = torch.bfloat16 + + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype) + model = model.eval().to("hpu") + model = wrap_in_hpu_graph(model) + + return model + + +def get_gaudi_auto_model_for_seq2seq_lm(model_id: str) -> AutoModelForSeq2SeqLM: + """get the model for seq2seq lm.""" + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + from optimum.habana.utils import set_seed + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + import torch + + adapt_transformers_to_gaudi() + set_seed(27) + model_dtype = torch.bfloat16 + + model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=model_dtype) + model = model.eval().to("hpu") + model = wrap_in_hpu_graph(model) + + return model diff --git a/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py b/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py index 3e743a64289fc..69f153133d8d0 100644 --- a/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py +++ b/libs/partners/huggingface/langchain_huggingface/llms/huggingface_pipeline.py @@ -9,6 +9,8 @@ from langchain_core.outputs import Generation, GenerationChunk, LLMResult from pydantic import ConfigDict, model_validator +from ..hpu_utils import use_hpu_model_device, get_gaudi_auto_model_for_causal_lm, get_gaudi_auto_model_for_seq2seq_lm + DEFAULT_MODEL_ID = "gpt2" DEFAULT_TASK = "text-generation" VALID_TASKS = ( @@ -126,6 +128,16 @@ def from_model_id( _model_kwargs["device_map"] = device_map tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs) + if use_hpu_model_device(_model_kwargs): + if backend == "openvino": + raise ValueError( + "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time. " + "Please remove `hpu` from `model_kwargs['device']` or set `backend=default`." + ) + # setting the `backend` to `HPU` to avoid the error caused by attempting to move the model + # that was already loaded on the HPU using the Accelerate module to the same or another device. 
+ backend = "HPU" + try: if task == "text-generation": if backend == "openvino": @@ -151,6 +163,8 @@ def from_model_id( model = OVModelForCausalLM.from_pretrained( model_id, export=True, **_model_kwargs ) + elif use_hpu_model_device(_model_kwargs): + model = get_gaudi_auto_model_for_causal_lm(model_id) else: model = AutoModelForCausalLM.from_pretrained( model_id, **_model_kwargs @@ -177,6 +191,8 @@ def from_model_id( model = OVModelForSeq2SeqLM.from_pretrained( model_id, export=True, **_model_kwargs ) + elif use_hpu_model_device(_model_kwargs): + model = get_gaudi_auto_model_for_seq2seq_lm(model_id) else: model = AutoModelForSeq2SeqLM.from_pretrained( model_id, **_model_kwargs diff --git a/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py b/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py new file mode 100644 index 0000000000000..c5740cf685c12 --- /dev/null +++ b/libs/partners/huggingface/tests/integration_tests/test_llms_hpu.py @@ -0,0 +1,21 @@ +from typing import Generator + +from langchain_huggingface.llms import HuggingFacePipeline + + +def test_huggingface_pipeline_streaming_on_hpu() -> None: + """Test streaming tokens from huggingface_pipeline on HPU.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + pipeline_kwargs={"max_new_tokens": 10}, + model_kwargs={"device": "hpu"}, + ) + generator = llm.stream("Q: How do you say 'hello' in German? A:'", stop=["."]) + stream_results_string = "" + assert isinstance(generator, Generator) + + for chunk in generator: + assert isinstance(chunk, str) + stream_results_string = chunk + assert len(stream_results_string.strip()) > 1 From d4075b41ae6dd59906ddc048780d4f861b65f1d8 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 9 Jan 2025 13:55:13 +0100 Subject: [PATCH 10/15] Add optional run for HPU in tests --- .../embeddings/test_huggingface_hpu.py | 10 ++++++++-- .../embeddings/test_huggingface_hub_hpu.py | 7 ++++++- .../embeddings/test_self_hosted_hpu.py | 11 ++++++++--- .../llms/test_huggingface_pipeline_hpu.py | 7 ++++++- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py index 7656054fda5ea..aa88d5ffb9636 100644 --- a/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hpu.py @@ -1,11 +1,13 @@ """Test huggingface embeddings.""" - +import os +import pytest from langchain_community.embeddings.huggingface import ( HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, ) +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_embedding_documents_on_hpu() -> None: """Test huggingface embeddings.""" documents = ["foo bar"] @@ -15,6 +17,7 @@ def test_huggingface_embedding_documents_on_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_embedding_query_on_hpu() -> None: """Test huggingface embeddings.""" document = "foo bar" @@ -23,6 +26,7 @@ def test_huggingface_embedding_query_on_hpu() -> None: assert len(output) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_instructor_embedding_documents_on_hpu() -> None: """Test huggingface embeddings.""" documents = ["foo bar"] @@ -33,6 +37,7 @@ def 
test_huggingface_instructor_embedding_documents_on_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_instructor_embedding_query_on_hpu() -> None: """Test huggingface embeddings.""" query = "foo bar" @@ -42,6 +47,7 @@ def test_huggingface_instructor_embedding_query_on_hpu() -> None: assert len(output) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_instructor_embedding_normalize_on_hpu() -> None: """Test huggingface embeddings.""" query = "foo bar" @@ -53,5 +59,5 @@ def test_huggingface_instructor_embedding_normalize_on_hpu() -> None: output = embedding.embed_query(query) assert len(output) == 768 eps = 1e-5 - norm = sum([o**2 for o in output]) + norm = sum([o ** 2 for o in output]) assert abs(1 - norm) <= eps diff --git a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py index 97a9f2f4f8338..fab3e58023081 100644 --- a/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py +++ b/libs/community/tests/integration_tests/embeddings/test_huggingface_hub_hpu.py @@ -1,8 +1,10 @@ """Test HuggingFaceHub embeddings.""" - +import os +import pytest from langchain_community.embeddings import HuggingFaceHubEmbeddings +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingfacehub_embedding_documents_on_hpu() -> None: """Test huggingfacehub embeddings.""" documents = ["foo bar"] @@ -12,6 +14,7 @@ def test_huggingfacehub_embedding_documents_on_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") async def test_huggingfacehub_embedding_async_documents_on_hpu() -> None: """Test huggingfacehub embeddings.""" documents = ["foo bar"] @@ -21,6 +24,7 @@ async def test_huggingfacehub_embedding_async_documents_on_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingfacehub_embedding_query_on_hpu() -> None: """Test huggingfacehub embeddings.""" document = "foo bar" @@ -29,6 +33,7 @@ def test_huggingfacehub_embedding_query_on_hpu() -> None: assert len(output) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") async def test_huggingfacehub_embedding_async_query_on_hpu() -> None: """Test huggingfacehub embeddings.""" document = "foo bar" diff --git a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py index 7b22e3f8ae432..a99db8a39a1cc 100644 --- a/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py +++ b/libs/community/tests/integration_tests/embeddings/test_self_hosted_hpu.py @@ -1,13 +1,14 @@ """Test self-hosted embeddings.""" - -from typing import Any - +import os +import pytest from langchain_community.embeddings import ( SelfHostedHuggingFaceEmbeddings, SelfHostedHuggingFaceInstructEmbeddings, ) +from typing import Any +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def get_remote_instance() -> Any: """Get remote instance for testing using HPU.""" import runhouse as rh @@ -18,6 +19,7 @@ def get_remote_instance() -> Any: return hpu +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not 
set") def test_self_hosted_huggingface_embedding_documents_hpu() -> None: """Test self-hosted huggingface embeddings using HPU.""" documents = ["foo bar"] @@ -28,6 +30,7 @@ def test_self_hosted_huggingface_embedding_documents_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_self_hosted_huggingface_embedding_query_hpu() -> None: """Test self-hosted huggingface embeddings using HPU.""" document = "foo bar" @@ -37,6 +40,7 @@ def test_self_hosted_huggingface_embedding_query_hpu() -> None: assert len(output) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_self_hosted_huggingface_instructor_embedding_documents_hpu() -> None: """Test self-hosted huggingface instruct embeddings using HPU.""" documents = ["foo bar"] @@ -47,6 +51,7 @@ def test_self_hosted_huggingface_instructor_embedding_documents_hpu() -> None: assert len(output[0]) == 768 +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_self_hosted_huggingface_instructor_embedding_query_hpu() -> None: """Test self-hosted huggingface instruct embeddings using HPU.""" query = "foo bar" diff --git a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py index 6f78e64937047..e1cc581b6bee8 100644 --- a/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py +++ b/libs/community/tests/integration_tests/llms/test_huggingface_pipeline_hpu.py @@ -1,8 +1,10 @@ """Test HuggingFace Pipeline wrapper.""" - +import os +import pytest from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_pipeline_text_generation_on_hpu() -> None: """Test valid call to HuggingFace text generation model.""" llm = HuggingFacePipeline.from_model_id( @@ -15,6 +17,7 @@ def test_huggingface_pipeline_text_generation_on_hpu() -> None: assert isinstance(output, str) +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_pipeline_text2text_generation_on_hpu() -> None: """Test valid call to HuggingFace text2text generation model.""" llm = HuggingFacePipeline.from_model_id( @@ -26,6 +29,7 @@ def test_huggingface_pipeline_text2text_generation_on_hpu() -> None: assert isinstance(output, str) +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_pipeline_invalid_hpu_and_openvino_backend() -> None: """Test invalid backend.""" try: @@ -38,6 +42,7 @@ def test_huggingface_pipeline_invalid_hpu_and_openvino_backend() -> None: assert "Cannot specify `model_kwargs{'device': 'hpu'}` and `backend=openvino` at the same time." 
in str(e) +@pytest.mark.skipif(not os.getenv('RUN_HPU_TEST'), reason="RUN_HPU_TEST is not set") def test_huggingface_pipeline_summarization_on_hpu() -> None: """Test valid call to HuggingFace summarization model.""" llm = HuggingFacePipeline.from_model_id( From 45afd72705b3e0bf2e4642a826a992b2873bc859 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 9 Jan 2025 13:59:57 +0100 Subject: [PATCH 11/15] Add running HPU tests in Makefile --- libs/community/Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libs/community/Makefile b/libs/community/Makefile index 55b63f009b519..372ffaee6690c 100644 --- a/libs/community/Makefile +++ b/libs/community/Makefile @@ -1,11 +1,11 @@ -.PHONY: all format lint test tests test_watch integration_tests docker_tests help extended_tests +.PHONY: all format lint test tests test_watch integration_tests hpu_tests skip_hpu_tests docker_tests help extended_tests # Default target executed when no arguments are given to make. all: help # Define a variable for the test file path. TEST_FILE ?= tests/unit_tests/ -integration_tests: TEST_FILE = tests/integration_tests/ +HPU_TEST_FILES=$(shell find tests -name '*_hpu.py') # Run unit tests and generate a coverage report. coverage: @@ -24,8 +24,10 @@ integration_tests: test_watch: poetry run ptw --disable-socket --allow-unix-socket --snapshot-update --now . -- -vv tests/unit_tests -check_imports: $(shell find langchain_community -name '*.py') - poetry run python ./scripts/check_imports.py $^ +# Run HPU-specific tests only if RUN_HPU_TEST is set. +hpu_tests: + @echo "Setting RUN_HPU_TEST to 1 to enable HPU-specific tests." + export RUN_HPU_TEST=1 && poetry run pytest --disable-socket --allow-unix-socket $(HPU_TEST_FILES) extended_tests: poetry run pytest --disable-socket --allow-unix-socket --only-extended tests/unit_tests @@ -74,3 +76,4 @@ help: @echo 'tests - run unit tests' @echo 'test TEST_FILE= - run all tests in file' @echo 'test_watch - run unit tests in watch mode' + @echo 'hpu_tests - run all HPU-specific tests (files ending with *_hpu.py) if RUN_HPU_TEST is set' From 5eb22a5ad279fa2bd7fdd942c3b9d14eb953d797 Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 9 Jan 2025 14:03:34 +0100 Subject: [PATCH 12/15] Add `RUN_HPU_TEST` in .env.example --- libs/community/tests/integration_tests/.env.example | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/community/tests/integration_tests/.env.example b/libs/community/tests/integration_tests/.env.example index cf7d891b143b4..7616676f3d040 100644 --- a/libs/community/tests/integration_tests/.env.example +++ b/libs/community/tests/integration_tests/.env.example @@ -66,3 +66,7 @@ UPSTASH_VECTOR_URL=your_upstash_vector_url UPSTASH_VECTOR_TOKEN=your_upstash_vector_token UPSTASH_VECTOR_URL_EMBEDDING=your_upstash_vector_embedding_url UPSTASH_VECTOR_TOKEN_EMBEDDING=your_upstash_vector_embedding_token + + +# IntelĀ® GaudiĀ® +RUN_HPU_TEST=0 From cfa37442d558744af4263245914a3e269733bd8a Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 9 Jan 2025 14:06:42 +0100 Subject: [PATCH 13/15] Update the README_hpu.md to run hpu tests --- README_hpu.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/README_hpu.md b/README_hpu.md index 75218ec03010f..2a54a65b237d8 100644 --- a/README_hpu.md +++ b/README_hpu.md @@ -34,15 +34,25 @@ Before you begin, ensure you have Docker installed and can run Docker containers 3. Now, you are ready to run any scripts or tests for HuggingFace models on HPU. 
For example, you can start a training script or load models for inference on the Intel Gaudi (HPU) device. + ### Running HPU-Specific Tests + + To run HPU-specific tests, use the following command: + + ```bash + export RUN_HPU_TEST=1 && make hpu_tests + ``` + + This will set the `RUN_HPU_TEST` environment variable and run all tests that require HPU (those files ending with `_hpu.py`). + ### Example: - To run a sample script, use: + To run a specific test file that requires HPU, use: ```bash - poetry run pytest tests/integration_tests/test_llms_hpu.py + export RUN_HPU_TEST=1 && poetry run pytest tests/integration_tests/test_llms_hpu.py ``` - Replace `test_llms_hpu.py` with the actual script you'd like to execute, and make sure to configure your environment to use HPU during model execution. + Replace `test_llms_hpu.py` with the actual script you'd like to execute, and ensure that the environment is configured to use HPU during model execution. ## Dependencies From 6c5b534de5bb6001fc5cc76cc92d94a2e4d74b2a Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 9 Jan 2025 15:30:25 +0100 Subject: [PATCH 14/15] Rollback deleted lines in Makefile --- libs/community/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libs/community/Makefile b/libs/community/Makefile index 372ffaee6690c..8fc1cd8ceeaa0 100644 --- a/libs/community/Makefile +++ b/libs/community/Makefile @@ -1,10 +1,11 @@ -.PHONY: all format lint test tests test_watch integration_tests hpu_tests skip_hpu_tests docker_tests help extended_tests +.PHONY: all format lint test tests test_watch integration_tests hpu_tests docker_tests help extended_tests # Default target executed when no arguments are given to make. all: help # Define a variable for the test file path. TEST_FILE ?= tests/unit_tests/ +integration_tests: TEST_FILE = tests/integration_tests/ HPU_TEST_FILES=$(shell find tests -name '*_hpu.py') # Run unit tests and generate a coverage report. @@ -24,6 +25,9 @@ integration_tests: test_watch: poetry run ptw --disable-socket --allow-unix-socket --snapshot-update --now . -- -vv tests/unit_tests +check_imports: $(shell find langchain_community -name '*.py') + poetry run python ./scripts/check_imports.py $^ + # Run HPU-specific tests only if RUN_HPU_TEST is set. hpu_tests: @echo "Setting RUN_HPU_TEST to 1 to enable HPU-specific tests." From a03eaccd18a3dcb8bc74b67d7fff62408438b2ce Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Mon, 13 Jan 2025 15:34:38 +0100 Subject: [PATCH 15/15] self_hosted_hugging_face.py - add running hpu device with string input --- .../langchain_community/embeddings/self_hosted_hugging_face.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py index 27d3508b6e32c..455ff8bd01a84 100644 --- a/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py +++ b/libs/community/langchain_community/embeddings/self_hosted_hugging_face.py @@ -41,7 +41,7 @@ def load_embedding_model(model_id: str, instruct: bool = False, device: int = 0) import torch from habana_frameworks.torch.hpu import wrap_in_hpu_graph client = wrap_in_hpu_graph(client) - return client.eval().to(torch.device(device)) + return client.eval().to(torch.device("hpu")) if importlib.util.find_spec("torch") is not None: import torch
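A minimal usage sketch of the HPU path introduced by this series (it assumes an Intel Gaudi host with the Habana PyTorch bridge and `optimum-habana` available, for example inside the `langchain-hpu` image described in README_hpu.md):

```python
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline

# Embeddings on Gaudi: passing device="hpu" moves the sentence-transformers model to the HPU.
embeddings = HuggingFaceEmbeddings(model_kwargs={"device": "hpu"})
vector = embeddings.embed_query("foo bar")
print(len(vector))  # 768 for the default all-mpnet-base-v2 model

# Text generation on Gaudi: from_model_id() detects device="hpu" and loads the model
# through optimum-habana (adapt_transformers_to_gaudi + HPU graph wrapping).
llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
    model_kwargs={"device": "hpu"},
)
print(llm.invoke("Say foo:"))
```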