[TensorRT] [ORT] support Stable Diffusion text2img Pipeline (#432)
* update cpp ddimscheduler.h

* add reference github repo

* update clip output to [batch,77,768], and change the clip model onnx input name

* update ort clip code

* update trt clip code

* ort unet code init

* add exec name, avoid multiple definitions

* add unet test code

* add unet code

* add unet code in models

* update unet code #TODO fix NaN error

* del useless func

* update unet to produce correct results; this run is still a single inference and needs to become a loop over time_steps

* add vae test code

* add vae model in models.h

* add vae test code

* update unet code #TODO clean code

* update unet.h code #TODO clean code

* add vae implement code #TODO clean code

* add tensorrt vae implement code #TODO clean code

* add tensorrt vae implement code #TODO clean code

* modify unet code to use ddim scheduler cpp version # TODO clean code

* add tensorrt vae define

* initialize a DDIM sampler first; the ORT one can also be used

* update trt unet in models.h

* trt unet code implement

* target link ddim.so

* update trt unet code

* update trt unet test code

* update trt vae test code

* update trt unet code to fix bug

* update code to fp16

* update code

* Added dynamic library file for ddim scheduler

* Update vae code to remove useless code and functions

* update utils code to add new func, add bin to vector

* Update the test code of VAE

* update trt clip code to clean

* clean trt unet code

* add new func save vector to bin, and random latents generator

* update trt test code #TODO  clean code

* add ort and trt pipeline in model

* ort pipeline implement

* trt pipeline implement

* new inference interface for pipeline

* new inference interface for pipeline

* new inference interface for pipeline

* new inference interface for pipeline

* add pipeline code, include ort and trt

* update test code

* update test code

* add pipeline code

* update the code to clean code

* update the code to clean code

* update onnx path, engine path and save path

* combine all sd test files into one pipeline file

* delete vae unet clip test code

* update pipeline code, opt json file

* delay model init when video memory is low

* update code

* update .so filename

* update test code

* update code, low VRAM mode

* update code
wangzijian1010 authored Sep 4, 2024
1 parent fae5d3d commit 6256f13
Showing 30 changed files with 1,454 additions and 121 deletions.
4 changes: 2 additions & 2 deletions cmake/utils.cmake
@@ -45,13 +45,13 @@ function(add_lite_ai_toolkit_shared_library version soversion)
if (ENABLE_ONNXRUNTIME)
include(cmake/onnxruntime.cmake)
set(LITE_SRCS ${LITE_SRCS} ${ORT_SRCS})
-set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} onnxruntime)
+set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} onnxruntime ddim_scheduler_cpp)
endif ()

if (ENABLE_TENSORRT)
include(cmake/tensorrt.cmake)
set(LITE_SRCS ${LITE_SRCS} ${TRT_SRCS})
-set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cudart nvinfer nvonnxparser nvinfer_plugin)
+set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cudart nvinfer nvonnxparser nvinfer_plugin ddim_scheduler_cpp)
endif ()

if (ENABLE_MNN)
2 changes: 1 addition & 1 deletion examples/lite/CMakeLists.txt
@@ -102,4 +102,4 @@ add_lite_executable(lite_face_parsing_bisenet cv)
add_lite_executable(lite_face_parsing_bisenet_dyn cv)
add_lite_executable(lite_yolov8face cv)
add_lite_executable(lite_yolov8 cv)
-add_lite_executable(lite_clip sd)
+add_lite_executable(lite_sd_pipeline sd)
53 changes: 0 additions & 53 deletions examples/lite/sd/test_lite_clip.cpp

This file was deleted.

69 changes: 69 additions & 0 deletions examples/lite/sd/test_lite_sd_pipeline.cpp
@@ -0,0 +1,69 @@
//
// Created by wangzijian on 8/31/24.
//
#include "lite/lite.h"

static void test_default()
{
std::string clip_onnx_path = "../../../examples/hub/onnx/sd/clip_model.onnx";
std::string unet_onnx_path = "../../../examples/hub/onnx/sd/unet_model.onnx";
std::string vae_onnx_path = "../../../examples/hub/onnx/sd/vae_model.onnx";

auto *pipeline = new lite::onnxruntime::sd::pipeline::Pipeline(clip_onnx_path, unet_onnx_path,
vae_onnx_path,
1);

std::string prompt = "1girl with red hair,blue eyes,smile, looking at viewer";
std::string negative_prompt = "";
std::string save_path = "../../../examples/logs/output_merge.png";
std::string scheduler_config_path = "../../../lite/ort/sd/scheduler_config.json";

pipeline->inference(prompt,negative_prompt,save_path,scheduler_config_path);

delete pipeline;

}


static void test_trt_pipeline()
{
// record the start time
std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();

std::string clip_engine_path = "../../../examples/hub/trt/clip_text_model_fp16.engine";
std::string unet_engine_path = "../../../examples/hub/trt/unet_fp16.engine";
std::string vae_engine_path = "../../../examples/hub/trt/vae_model_fp16.engine";


auto *pipeline = new lite::trt::sd::pipeline::PipeLine(
clip_engine_path, unet_engine_path, vae_engine_path
);


std::string prompt = "1girl with red hair,blue eyes,smile, looking at viewer";
std::string negative_prompt = "";
std::string save_path = "../../../examples/logs/output_merge_tensorrt.png";
std::string scheduler_config_path = "../../../lite/ort/sd/scheduler_config.json";
pipeline->inference(prompt,negative_prompt,save_path,scheduler_config_path);

// record the end time and print the elapsed duration
std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
std::cout << "Elapsed time: " << elapsed_seconds.count() << " seconds" << std::endl;

delete pipeline;

}

static void test_lite()
{
test_trt_pipeline();

// test_default();
}

int main()
{
test_lite();
return 0;
}
Binary file added lite/bin/libddim_scheduler_cpp.so
38 changes: 36 additions & 2 deletions lite/models.h
@@ -114,7 +114,9 @@
#include "lite/ort/cv/face_parsing_bisenet_dyn.h"
#include "lite/ort/cv/yolofacev8.h"
#include "lite/ort/sd/clip.h"

#include "lite/ort/sd/unet.h"
#include "lite/ort/sd/vae.h"
#include "lite/ort/sd/pipeline.h"
#endif


@@ -130,6 +132,9 @@
#include "lite/trt/cv/trt_yolov6.h"
#include "lite/trt/cv/trt_yolov5_blazeface.h"
#include "lite/trt/sd/trt_clip.h"
#include "lite/trt/sd/trt_vae.h"
#include "lite/trt/sd/trt_unet.h"
#include "lite/trt/sd/trt_pipeline.h"
#endif

// ENABLE_MNN
@@ -671,10 +676,25 @@ namespace lite
namespace sd
{
typedef ortsd::Clip _ONNXClip;
typedef ortsd::UNet _ONNXUNet;
typedef ortsd::Vae _ONNXVae;
typedef ortsd::Pipeline _ONNXPipeline;
namespace text_encoder
{
typedef _ONNXClip Clip;
}
namespace denoise
{
typedef _ONNXUNet UNet;
}
namespace image_decoder
{
typedef _ONNXVae Vae;
}
namespace pipeline
{
typedef _ONNXPipeline Pipeline;
}
}

}
@@ -717,12 +737,26 @@ namespace lite{
}
namespace sd
{

typedef trtsd::TRTUNet _TRT_UNet;
typedef trtsd::TRTClip _TRT_Clip;
typedef trtsd::TRTVae _TRT_Vae;
typedef trtsd::TRTPipeline _TRT_Pipeline;
namespace text_encoder
{
typedef _TRT_Clip Clip;
}
namespace image_decoder
{
typedef _TRT_Vae Vae;
}
namespace denoise
{
typedef _TRT_UNet UNet;
}
namespace pipeline
{
typedef _TRT_Pipeline PipeLine;
}
}
}
#endif
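For orientation, a minimal sketch of how the new namespace aliases are consumed by a caller, with the constructor and inference signatures taken from test_lite_sd_pipeline.cpp above; all paths are placeholders, and the meaning of the trailing 1 in the ORT constructor is an assumption:

#include <string>
#include "lite/lite.h"

int main() {
    // All paths below are placeholders for real model files.
    std::string clip_path = "clip_model.onnx";
    std::string unet_path = "unet_model.onnx";
    std::string vae_path = "vae_model.onnx";

    // ORT build; the trailing 1 mirrors test_lite_sd_pipeline.cpp (assumed thread count).
    auto *pipeline = new lite::onnxruntime::sd::pipeline::Pipeline(
            clip_path, unet_path, vae_path, 1);

    std::string prompt = "1girl with red hair,blue eyes,smile, looking at viewer";
    std::string negative_prompt;
    std::string save_path = "output.png";
    std::string scheduler_config = "scheduler_config.json";
    pipeline->inference(prompt, negative_prompt, save_path, scheduler_config);
    delete pipeline;

    // A TensorRT build would instead construct lite::trt::sd::pipeline::PipeLine
    // from three .engine paths, as in test_trt_pipeline() above.
    return 0;
}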
26 changes: 0 additions & 26 deletions lite/ort/cv/yolofacev8.cpp
@@ -153,32 +153,6 @@ void YoloFaceV8::generate_box(std::vector<Ort::Value> &ort_outputs,
}


void save_tensor_to_file(const Ort::Value& tensor, const std::string& filename) {
// get the tensor's type and shape information
auto type_and_shape_info = tensor.GetTensorTypeAndShapeInfo();
std::vector<int64_t> shape = type_and_shape_info.GetShape();
size_t element_count = type_and_shape_info.GetElementCount();

ONNXTensorElementDataType type = type_and_shape_info.GetElementType();
if (type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
std::cerr << "Unsupported tensor data type. Only float tensors are supported." << std::endl;
return;
}

const float* pdata = tensor.GetTensorData<float>();

std::ofstream file(filename);
if (!file.is_open()) {
std::cerr << "Could not open file for writing: " << filename << std::endl;
return;
}

for (size_t i = 0; i < element_count; ++i) {
file << pdata[i] << "\n";
}
file.close();
}


void YoloFaceV8::detect(const cv::Mat &mat,std::vector<lite::types::Boxf> &boxes,
float conf_threshold, float iou_threshold) {
18 changes: 8 additions & 10 deletions lite/ort/sd/clip.cpp
@@ -45,8 +45,6 @@ void Clip::inference(std::vector<std::string> input, std::vector<std::vector<flo
std::vector<std::vector<int>> output_encode;

encode_text(input,output_encode);


// flat out output_encode
std::vector<int32_t> flat_output_encode;
for (const auto& vec : output_encode) {
@@ -56,7 +54,7 @@

// make tensor
int batch = output_encode.size();
-std::vector<int64_t> input_node_dims1 = {batch, 77};
+std::vector<int64_t> input_node_dims1 = {batch, input_axes};

Ort::MemoryInfo allocator_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

@@ -83,19 +81,19 @@

const float *text_feature_ptr = ort_outputs[0].GetTensorMutableData<float>();


+// update to [b,77,768]
for (int i = 0 ; i < batch ; ++i)
{
std::vector<float> temp;
-for (int j = 0 ; j < 512 ; ++j)
+for (int j = 0 ; j < output_tensor_size ; ++j)
{
-temp.push_back(text_feature_ptr[ i * 512 + j]);
+temp.push_back(text_feature_ptr[ i * output_tensor_size + j]);
}
output.push_back(temp);
temp.clear();
}



}


@@ -111,9 +109,9 @@ void Clip::encode_text(std::vector<std::string> input_text, std::vector<std::vec
for (int i = 0 ; i < input_text.size(); ++i)
{
auto temp = tokenizer.tokenize(input_text[i], on_new_token_cb);
-temp.push_back(49407);
-if (temp.size() < 77) {
-temp.resize(77, 0);
+temp.push_back(end_flag_num);
+if (temp.size() < input_axes) {
+temp.resize(input_axes, 0);
}
output.push_back(temp);
}
7 changes: 5 additions & 2 deletions lite/ort/sd/clip.h
@@ -32,13 +32,16 @@ namespace ortsd
const char *log_id = nullptr;
bool context_is_update = false;
const unsigned int num_threads; // initialize at runtime.
+const int input_axes = 77;
+const int output_tensor_size = 77 * 768;
+const int end_flag_num = 49407;

std::vector<const char *> input_node_names = {
"TEXT"
"input_ids"
};

std::vector<const char *> output_node_names = {
"TEXT_EMBEDDING"
"text_embeddings"
};

public:
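Read together with the clip.cpp hunk above, the new constants pin down the text-encoder contract: each prompt is tokenized, terminated, and zero-padded to a fixed 77-token context (input_axes), and encodes to 77 * 768 floats (output_tensor_size), i.e. one 768-d hidden state per token position. A minimal sketch of the implied padding rule, assuming 49407 is CLIP's end-of-text id and 0 the pad id, as in Clip::encode_text:

#include <vector>

// Pad/terminate one tokenized prompt to CLIP's fixed context length
// (a sketch mirroring Clip::encode_text above, not the library code itself).
std::vector<int> pad_to_context(std::vector<int> tokens) {
    const size_t input_axes = 77;    // fixed context length
    const int end_flag_num = 49407;  // assumed <|endoftext|> token id
    tokens.push_back(end_flag_num);
    if (tokens.size() < input_axes)
        tokens.resize(input_axes, 0); // zero-pad the remainder
    return tokens;
}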
60 changes: 60 additions & 0 deletions lite/ort/sd/ddimscheduler.h
@@ -0,0 +1,60 @@
//
// Created by TalkUHulk on 2024/4/25.
//

// referenced from https://github.com/TalkUHulk/ddim_scheduler_cpp.git

#ifndef DDIM_SCHEDULER_CPP_DDIMSCHEDULER_HPP
#define DDIM_SCHEDULER_CPP_DDIMSCHEDULER_HPP

#include <iostream>
#include <vector>
#include <string>
namespace Scheduler {

#if defined(_MSC_VER)
#if defined(BUILDING_AIENGINE_DLL)
#define DDIM_PUBLIC __declspec(dllexport)
#elif defined(USING_AIENGINE_DLL)
#define DDIM_PUBLIC __declspec(dllimport)
#else
#define DDIM_PUBLIC
#endif
#else
#define DDIM_PUBLIC __attribute__((visibility("default")))
#endif

struct DDIMMeta;
class DDIM_PUBLIC DDIMScheduler {

private:
DDIMMeta* meta_ptr = nullptr;
int num_inference_steps = 0;

public:
explicit DDIMScheduler(const std::string &config);

~DDIMScheduler();

// Sets the discrete timesteps used for the diffusion chain (to be run before inference).
int set_timesteps(int num_inference_steps);

void get_timesteps(std::vector<int> &dst);

float get_init_noise_sigma() const;

int step(std::vector<float> &model_output, const std::vector<int> &model_output_size,
std::vector<float> &sample, const std::vector<int> &sample_size,
std::vector<float> &prev_sample,
int timestep, float eta = 0.0, bool use_clipped_model_output = false);

int add_noise(std::vector<float> &sample, const std::vector<int> &sample_size,
std::vector<float> &noise, const std::vector<int> &noise_size, int timesteps,
std::vector<float> &noisy_samples);
private:
float get_variance(int timestep, int prev_timestep);
};
}


#endif //DDIM_SCHEDULER_CPP_DDIMSCHEDULER_HPP
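
Since this header is the scheduler API that both the ORT and TRT UNet loops link against (see the ddim_scheduler_cpp dependency added in cmake/utils.cmake above), here is a minimal usage sketch of the denoising loop; the config path, step count, latent shape, and the stub standing in for the UNet forward pass are illustrative assumptions, not part of this commit:

#include <vector>
#include "lite/ort/sd/ddimscheduler.h"

int main() {
    Scheduler::DDIMScheduler scheduler("scheduler_config.json"); // assumed config path
    scheduler.set_timesteps(30); // discretize the diffusion chain before inference

    std::vector<int> timesteps;
    scheduler.get_timesteps(timesteps);

    std::vector<int> shape = {1, 4, 64, 64};           // assumed latent shape
    std::vector<float> latents(1 * 4 * 64 * 64, 0.0f); // would start as Gaussian noise,
                                                       // scaled by get_init_noise_sigma()
    for (int t : timesteps) {
        std::vector<float> noise_pred = latents; // stub for the real UNet prediction at t
        std::vector<float> prev_sample;
        scheduler.step(noise_pred, shape, latents, shape, prev_sample, t);
        latents = prev_sample; // x_t -> x_{t-1}
    }
    return 0;
}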