openvinotoolkit · itikhono · Dec 26, 2024 · Dec 26, 2024
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
@@ -48,6 +48,32 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
     const ov::AnyMap& properties,
     const DeviceConfig& device_config,
     ov::Core& core) {
+
+    /*int idx = 0; // debug only (disabled): instrument only the 1st PagedAttentionExtension op
+    for (const auto& op : model->get_ordered_ops()) {
+        if (idx == 0) {
+            if (std::string(op->get_type_name()) == "PagedAttentionExtension") {
+                std::cout << "PA name " << op->get_friendly_name() << std::endl;
+
+                int j = 0;
+                for (auto& in : op->input_values()) {
+                    {
+                        in.add_names({"pa_in_" + std::to_string(j++)});
+                        model->add_output(in);
+                    }
+
+                }
+
+                auto transpose_out = op->output(0)
+                                         .get_target_inputs()
+                                         .begin()
+                                         ->get_node()
+                                         ->output(0);
+                transpose_out.add_names({"pa_" + std::to_string(idx++)});
+                model->add_output(transpose_out);
+            }
+        }
+    }*/
     auto compiled_model = core.compile_model(model, device_config.get_device(), properties);
     ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
     ov::InferRequest infer_request = compiled_model.create_infer_request();

diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
@@ -211,7 +211,7 @@ GenerationConfig beam_search() {
 
 GenerationConfig greedy() {
     GenerationConfig greedy_config;
-    greedy_config.max_new_tokens = 30;
+    greedy_config.max_new_tokens = 10;  // NOTE(review): lowered from 30 by this patch - looks like a debugging aid; confirm before merging
     return greedy_config;
 }
 

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
@@ -76,6 +76,31 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
         utils::slice_matmul_statefull_model(model);
+
+        // Debug instrumentation (not for production): expose the inputs and the
+        // output of the first ScaledDotProductAttention op returned by
+        // get_ordered_ops() as additional model outputs, named "sdpa_in_<j>" and
+        // "sdpa_0", so that they can be read back by name after inference.
+        for (const auto& op : model->get_ordered_ops()) {
+            if (std::string(op->get_type_name()) != "ScaledDotProductAttention") {
+                continue;
+            }
+            std::cout << "SDPA name " << op->get_friendly_name() << std::endl;
+
+            // Tag each SDPA input with a stable name and publish it as a model output.
+            int input_idx = 0;
+            for (auto& in : op->input_values()) {
+                in.add_names({"sdpa_in_" + std::to_string(input_idx++)});
+                model->add_output(in);
+            }
+
+            // Publish the SDPA result itself.
+            op->output(0).add_names({"sdpa_0"});
+            model->add_output(op->output(0));
+
+            break;  // only the first SDPA op is instrumented
+        }
+
         m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
 
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {

diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
@@ -117,6 +117,50 @@ std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
     raw_perf_counters.m_new_token_times.emplace_back(infer_end);
     raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
+    // Debug only: dump the SDPA output tensor(s) to sdpa_outs.txt; only for the prompt initialization inference
+    std::ofstream myfile("sdpa_outs.txt");
+    if (!myfile.is_open()) {
+        std::cerr << "XXXXXX failed to open sdpa_outs.txt for writing" << std::endl;
+    }
+    // Outputs (tensor.data<float>() expects an f32 tensor):
+    for (int i = 0; i < 1; ++i) {
+        std::cout << "XXXXXX " << "print sdpa out " << std::endl;
+        auto tensor = m_llm.get_tensor("sdpa_" + std::to_string(i));
+        const float* data = tensor.data<float>();
+        std::cout << "XXXXXX Out tensor size: " << tensor.get_size() << std::endl;
+        for (size_t j = 0, count = tensor.get_size(); j < count; ++j) {
+            myfile << data[j] << " ";
+        }
+        myfile << std::endl;
+    }
+    /*
+
+     // INPUTS:
+    for (int i = 0; i < 4; ++i) {
+        auto tensor = m_llm.get_tensor("sdpa_in_" + std::to_string(i));
+        std::cout << "SDPA in " << i << " " << tensor.get_shape() << std::endl;
+        if (tensor.get_element_type() == ov::element::i32) {
+            const auto& data = tensor.data<int>();
+            for (int j = 0; j < tensor.get_size(); ++j) {
+                myfile << data[j] << " ";
+            }
+            myfile << std::endl;
+        } else if (tensor.get_element_type() == ov::element::i64) {
+            const auto& data = tensor.data<int64_t>();
+            for (int j = 0; j < tensor.get_size(); ++j) {
+                myfile << data[j] << " ";
+            }
+            myfile << std::endl;
+        } else {
+            const auto& data = tensor.data<float>();
+            for (int j = 0; j < tensor.get_size(); ++j) {
+                myfile << data[j] << " ";
+            }
+            myfile << std::endl;
+        }
+    }
+    */
+
     auto logits = m_llm.get_tensor("logits");
 
     int64_t sequence_len = logits.get_shape().at(1);

diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp
@@ -14,7 +14,7 @@
 #include "timer.hpp"
 
 #include "attention_output.hpp"
-
+inline int i_init = 0;  // debug-only one-shot flag for the disabled PA dump in ModelRunner; `inline` gives one shared definition across translation units
 namespace ov::genai {
 
 inline std::string get_paged_attention_score_output_for_decoder_layer(size_t decoder_layer_id) {
@@ -184,6 +184,56 @@ class ModelRunner {
             timer.end();
         }
 
+ /*       // debug only (disabled): write PA results data to pa_outs.txt; only for the prompt initialization inference
+                if (i_init == 0) {
+                    i_init++;
+                    std::ofstream myfile;
+                    myfile.open("pa_outs.txt");
+
+                    // Outputs:
+                    for (int i = 0; i < 1; ++i) {
+                        //myfile<< i << ": ";
+                        std::cout << "XXXXXX " << "print PA out " << std::endl;
+                        auto tensor = m_request.get_tensor("pa_" + std::to_string(i));
+                        const auto& data = tensor.data<float>();
+                        std::cout << "XXXXXX Out tensor size: " << tensor.get_size() << std::endl;
+                        for (int j = 0; j < tensor.get_size(); ++j) {
+                            myfile << data[j] << " ";
+                        }
+                        myfile << std::endl;
+                    }
+                    // INPUTS:
+                    *//* for (int i = 0; i < 13; ++i) {
+                         auto tensor = m_request.get_tensor("pa_in_" + std::to_string(i));
+                         std::cout << "PA in " << i << " " << tensor.get_shape() << std::endl;
+                         if (tensor.get_element_type() == ov::element::i32) {
+                             const auto& data = tensor.data<int>();
+                             for (int j = 0; j < tensor.get_size(); ++j) {
+                                 myfile << data[j] << " ";
+                             }
+                             myfile << std::endl;
+                         } else if (tensor.get_element_type() == ov::element::i64) {
+                             const auto& data = tensor.data<int64_t>();
+                             for (int j = 0; j < tensor.get_size(); ++j) {
+                                 myfile << data[j] << " ";
+                             }
+                             myfile << std::endl;
+                         } else if (tensor.get_element_type() == ov::element::f16){
+                                 const auto& data = tensor.data<ov::float16>();
+                                 for (int j = 0; j < tensor.get_size(); ++j) {
+                                     myfile << data[j] << " ";
+                                 }
+                                 myfile << std::endl;
+                             }else {
+                             const auto& data = tensor.data<float>();
+                             for (int j = 0; j < tensor.get_size(); ++j) {
+                                 myfile << data[j] << " ";
+                             }
+                             myfile << std::endl;
+                         }
+                     }*//*
+
+                }*/
         if (m_collect_attention_scores) {
             _collect_attention_scores(sequence_groups, scheduler_output);
         }

diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
@@ -511,7 +511,16 @@ Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const {
         }));
         max_value = -log_sum;
     }
-
+/*        Logits copy = logits;
+        copy.initialize_vector();
+        std::vector<Token> sorted = copy.m_vector;
+        std::sort(sorted.begin(), sorted.end(), [](const Token& left, const Token& right) {
+                return left.m_log_prob > right.m_log_prob;
+            });
+        for (int i = 0; i < 5; ++i) {
+                std::cout << "Token: " << sorted[i].m_index << " logit: " << sorted[i].m_log_prob << std::endl;
+        }
+    std::cout << "End of logits\n";*/
     return Token(max_value, max_index);
 }