Commit fd30685: "fmt"

1 parent c345a8b

File tree: 4 files changed, +104 -14 lines

Binary file not shown.

examples/cpp/aot_inductor/resnet/resnet50_torch_export.py (+4, -3)

```diff
@@ -5,7 +5,7 @@
 
 torch.set_float32_matmul_precision("high")
 
-MAX_BATCH_SIZE = 32
+MAX_BATCH_SIZE = 15
 
 model = resnet50(weights=ResNet50_Weights.DEFAULT)
 model.eval()
@@ -15,11 +15,12 @@
     device = "cuda"
 else:
     device = "cpu"
+# The max batch size is less than 16; the following settings only work in PT 2.3.
 # We need to turn off the below optimizations to support batch_size = 16,
 # which is treated like a special case
 # https://github.com/pytorch/pytorch/pull/116152
-torch.backends.mkldnn.set_flags(False)
-torch.backends.nnpack.set_flags(False)
+# torch.backends.mkldnn.set_flags(False)
+# torch.backends.nnpack.set_flags(False)
 
 model = model.to(device=device)
 example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
```
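The artifact produced by this export script is consumed on the C++ side through torch::inductor::AOTIModelContainerRunner (see the handler changes below). As a sanity check of the compiled shared library outside TorchServe, a sketch like the following can be used, assuming the same PyTorch version and headers the handler includes; "resnet50_pt2.so" is a hypothetical path for the exported artifact.

```cpp
#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_model_container_runner.h>
#include <iostream>

int main() {
  // "resnet50_pt2.so" is a placeholder for the artifact produced by the
  // export script above; adjust the path for your build.
  torch::inductor::AOTIModelContainerRunnerCpu runner("resnet50_pt2.so");

  // Same input shape as example_inputs in the export script.
  std::vector<torch::Tensor> inputs = {torch::randn({2, 3, 224, 224})};
  std::vector<torch::Tensor> outputs = runner.run(inputs);

  std::cout << outputs[0].sizes() << std::endl;  // expect [2, 1000]
}
```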

examples/cpp/aot_inductor/resnet/src/resnet_handler.cc (+92, -9)
```diff
@@ -21,6 +21,16 @@ const folly::dynamic& ResnetCppHandler::GetJsonValue(std::unique_ptr<folly::dyna
   }
 }
 
+std::string ResnetCppHandler::MapClassToLabel(const torch::Tensor& classes, const torch::Tensor& probs) {
+  folly::dynamic map = folly::dynamic::object;
+  for (int i = 0; i < classes.sizes()[0]; i++) {
+    auto class_value = GetJsonValue(mapping_json_, std::to_string(classes[i].item<long>()));
+    map[class_value[1].asString()] = probs[i].item<float>();
+  }
+
+  return folly::toJson(map);
+}
+
 std::pair<std::shared_ptr<void>, std::shared_ptr<torch::Device>>
 ResnetCppHandler::LoadModel(
     std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) {
```
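MapClassToLabel reads mapping_json_ via class_value[1], which implies an ImageNet-style index_to_name.json layout where each class index maps to a [synset_id, label] pair; that layout is an assumption here, not stated in the diff. A self-contained sketch of the same logic with illustrative mapping entries:

```cpp
#include <folly/dynamic.h>
#include <folly/json.h>
#include <torch/torch.h>
#include <iostream>
#include <string>

int main() {
  // Hypothetical stand-in for mapping_json_ (index_to_name.json style).
  folly::dynamic mapping = folly::parseJson(
      R"({"281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"]})");

  torch::Tensor classes = torch::tensor({281, 282});
  torch::Tensor probs = torch::tensor({0.72f, 0.18f});

  // Same loop as MapClassToLabel: label -> probability.
  folly::dynamic map = folly::dynamic::object;
  for (int i = 0; i < classes.sizes()[0]; i++) {
    auto& class_value = mapping[std::to_string(classes[i].item<long>())];
    map[class_value[1].asString()] = probs[i].item<float>();
  }
  // Prints e.g. {"tabby":0.72,"tiger_cat":0.18}
  std::cout << folly::toJson(map) << std::endl;
}
```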
```diff
@@ -60,45 +70,118 @@ ResnetCppHandler::LoadModel(
   }
 }
 
+c10::IValue ResnetCppHandler::Preprocess(
+    std::shared_ptr<torch::Device>& device,
+    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
+    std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
+    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
+  auto batch_ivalue = c10::impl::GenericList(c10::TensorType::get());
+
+  std::vector<torch::Tensor> batch_tensors;
+  uint8_t idx = 0;
+  for (auto& request : *request_batch) {
+    (*response_batch)[request.request_id] =
+        std::make_shared<torchserve::InferenceResponse>(request.request_id);
+    idx_to_req_id.first += idx_to_req_id.first.empty()
+                               ? request.request_id
+                               : "," + request.request_id;
+    auto data_it =
+        request.parameters.find(torchserve::PayloadType::kPARAMETER_NAME_DATA);
+    auto dtype_it =
+        request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE);
+    if (data_it == request.parameters.end()) {
+      data_it = request.parameters.find(
+          torchserve::PayloadType::kPARAMETER_NAME_BODY);
+      dtype_it =
+          request.headers.find(torchserve::PayloadType::kHEADER_NAME_BODY_TYPE);
+    }
+
+    if (data_it == request.parameters.end() ||
+        dtype_it == request.headers.end()) {
+      TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id);
+      (*response_batch)[request.request_id]->SetResponse(
+          500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
+          "Empty payload");
+      continue;
+    }
+
+    try {
+      if (dtype_it->second == torchserve::PayloadType::kDATA_TYPE_BYTES) {
+        batch_tensors.emplace_back(
+            torch::pickle_load(data_it->second).toTensor());
+        idx_to_req_id.second[idx++] = request.request_id;
+      } else {
+        TS_LOG(ERROR, "Unsupported input format; this example only supports bytes input");
+        (*response_batch)[request.request_id]->SetResponse(
+            500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
+            "Unsupported input format; this example only supports bytes input");
+        continue;
+      }
+    } catch (const std::runtime_error& e) {
+      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
+              request.request_id, e.what());
+      auto response = (*response_batch)[request.request_id];
+      response->SetResponse(500, "data_type",
+                            torchserve::PayloadType::kDATA_TYPE_STRING,
+                            "runtime_error, failed to load tensor");
+    } catch (const c10::Error& e) {
+      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}",
+              request.request_id, e.msg());
+      auto response = (*response_batch)[request.request_id];
+      response->SetResponse(500, "data_type",
+                            torchserve::PayloadType::kDATA_TYPE_STRING,
+                            "c10 error, failed to load tensor");
+    }
+  }
+  if (!batch_tensors.empty()) {
+    batch_ivalue.emplace_back(torch::stack(batch_tensors).to(*device));
+  }
+
+  return batch_ivalue;
+}
```
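Preprocess expects each request body to be a pickled tensor that torch::pickle_load can decode. A minimal sketch of that contract, using torch::pickle_save as the serializing counterpart; in TorchServe the bytes would arrive from a client rather than being produced in-process, and the pickle header path is an assumption based on where these functions are declared in libtorch.

```cpp
#include <torch/csrc/jit/serialization/pickle.h>
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor image = torch::randn({3, 224, 224});

  // Serialize to the raw bytes format Preprocess expects.
  std::vector<char> payload = torch::pickle_save(image);

  // Mirrors the handler: decode the bytes back into a tensor.
  torch::Tensor decoded = torch::pickle_load(payload).toTensor();
  std::cout << decoded.sizes() << std::endl;  // [3, 224, 224]
}
```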
```diff
 
 c10::IValue ResnetCppHandler::Inference(
     std::shared_ptr<void> model, c10::IValue &inputs,
     std::shared_ptr<torch::Device> &device,
     std::pair<std::string &, std::map<uint8_t, std::string> &> &idx_to_req_id,
     std::shared_ptr<torchserve::InferenceResponseBatch> &response_batch) {
   c10::InferenceMode mode;
+  auto batch_ivalue = c10::impl::GenericList(c10::TensorType::get());
   try {
     std::shared_ptr<torch::inductor::AOTIModelContainerRunner> runner;
     if (device->is_cuda()) {
       runner = std::static_pointer_cast<torch::inductor::AOTIModelContainerRunnerCuda>(model);
     } else {
       runner = std::static_pointer_cast<torch::inductor::AOTIModelContainerRunnerCpu>(model);
     }
-
-    auto batch_output_tensor_vector = runner->run(inputs.toTensorVector());
-    return c10::IValue(batch_output_tensor_vector[0]);
+    auto data = inputs.toTensorList()[0].get().toTensor();
+    std::vector<torch::Tensor> input_vec;
+    input_vec.emplace_back(data);
+    auto batch_output_tensor_vector = runner->run(input_vec);
+    batch_ivalue.emplace_back(torch::stack(batch_output_tensor_vector).to(*device));
   } catch (std::runtime_error& e) {
     TS_LOG(ERROR, e.what());
   } catch (const c10::Error& e) {
     TS_LOGF(ERROR, "Failed to apply inference on input, c10 error:{}", e.msg());
   }
+  return batch_ivalue;
 }
```
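The new control flow threads one batched tensor through a GenericList-backed IValue: Preprocess stacks the per-request tensors and wraps them in the list, and Inference unwraps element 0 and hands it to the runner. A standalone sketch of just that plumbing, with no model involved:

```cpp
#include <torch/torch.h>

int main() {
  // Preprocess side: stack per-request tensors into one batch tensor and
  // wrap it in a GenericList so it travels as a single c10::IValue.
  std::vector<torch::Tensor> batch_tensors = {torch::randn({3, 224, 224}),
                                              torch::randn({3, 224, 224})};
  auto batch_list = c10::impl::GenericList(c10::TensorType::get());
  batch_list.emplace_back(torch::stack(batch_tensors));  // [2, 3, 224, 224]
  c10::IValue inputs(batch_list);

  // Inference side: unwrap element 0 and pass it to the runner as a
  // std::vector<torch::Tensor>.
  torch::Tensor data = inputs.toTensorList().get(0);
  std::vector<torch::Tensor> input_vec = {data};
  // runner->run(input_vec) would produce the output tensors here.
}
```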
```diff
 
 void ResnetCppHandler::Postprocess(
     c10::IValue &inputs,
     std::pair<std::string &, std::map<uint8_t, std::string> &> &idx_to_req_id,
     std::shared_ptr<torchserve::InferenceResponseBatch> &response_batch) {
-  auto& data = inputs.toTensor();
+  auto data = inputs.toTensorList().get(0);
+  auto ps = torch::softmax(data[0], 1);
+  auto top5 = torch::topk(ps, 5, 1);
   for (const auto &kv : idx_to_req_id.second) {
     try {
-      auto out = data[kv.first].unsqueeze(0);
-      auto y_hat = torch::argmax(out, 1).item<int>();
-      auto predicted_idx = std::to_string(y_hat);
+      auto probs = std::get<0>(top5)[kv.first];
+      auto classes = std::get<1>(top5)[kv.first];
       auto response = (*response_batch)[kv.second];
-
       response->SetResponse(200, "data_type",
                             torchserve::PayloadType::kDATA_TYPE_STRING,
-                            (*mapping_json_)[predicted_idx].asString());
+                            MapClassToLabel(classes, probs));
     } catch (const std::runtime_error &e) {
       TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
               kv.second, e.what());
```
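Postprocess now replaces the per-request argmax with a single softmax plus top-5 over the whole batch, then slices one row per request. The tensor math in isolation, with random logits standing in for the stacked model output:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Random logits standing in for the model output: one row of 1000
  // class scores per request in the batch.
  torch::Tensor logits = torch::randn({4, 1000});

  auto ps = torch::softmax(logits, 1);  // per-request probabilities
  auto top5 = torch::topk(ps, 5, 1);    // pair of [4, 5] tensors

  // Slice out request 0, as Postprocess does with kv.first.
  auto probs = std::get<0>(top5)[0];    // top-5 probabilities
  auto classes = std::get<1>(top5)[0];  // matching class indices
  std::cout << classes << "\n" << probs << std::endl;
}
```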

examples/cpp/aot_inductor/resnet/src/resnet_handler.hh (+8, -2)
```diff
@@ -5,8 +5,6 @@
 #include <folly/json.h>
 #include <fmt/format.h>
 #include <iostream>
-#include <sentencepiece_processor.h>
-#include <sentencepiece_trainer.h>
 #include <torch/torch.h>
 #include <torch/csrc/inductor/aoti_model_container_runner.h>
 #include <torch/csrc/inductor/aoti_model_container_runner_cuda.h>
```
```diff
@@ -25,6 +23,13 @@ class ResnetCppHandler : public torchserve::BaseHandler {
       std::shared_ptr<torchserve::LoadModelRequest>& load_model_request)
       override;
 
+  c10::IValue Preprocess(
+      std::shared_ptr<torch::Device>& device,
+      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
+      std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
+      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
+      override;
+
   c10::IValue Inference(
       std::shared_ptr<void> model, c10::IValue& inputs,
       std::shared_ptr<torch::Device>& device,
```
```diff
@@ -41,6 +46,7 @@ class ResnetCppHandler : public torchserve::BaseHandler {
  private:
   std::unique_ptr<folly::dynamic> LoadJsonFile(const std::string& file_path);
   const folly::dynamic& GetJsonValue(std::unique_ptr<folly::dynamic>& json, const std::string& key);
+  std::string MapClassToLabel(const torch::Tensor& classes, const torch::Tensor& probs);
 
   std::unique_ptr<folly::dynamic> config_json_;
   std::unique_ptr<folly::dynamic> mapping_json_;
```
