I. Deploying a PyTorch Model with TensorRT
1. PyTorch deployment workflow
Deploying a PyTorch model with TensorRT is a three-step process: export the trained model to ONNX with PyTorch, convert the ONNX model into a serialized TensorRT engine, and then run inference on that engine. The ONNX export step looks like this:
import torch

# model: a trained torch.nn.Module; x: an example input tensor with the expected shape
torch.onnx.export(model,                # model to export
                  x,                    # example input used to trace the model
                  "model.onnx",         # output file path
                  export_params=True,   # store the trained weights inside the ONNX file
                  opset_version=10)     # ONNX opset version
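The exported ONNX file still has to be turned into a serialized TensorRT engine before the inference code below can load it. A minimal sketch of that conversion, assuming the TensorRT 7/8 Python builder API (on recent releases the same result can be obtained with the trtexec command-line tool, e.g. trtexec --onnx=model.onnx --saveEngine=model.engine):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Parse the ONNX model into an explicit-batch TensorRT network
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError("Failed to parse model.onnx")

# Build and serialize the engine (build_engine is the TensorRT 7/8 API)
config = builder.create_builder_config()
config.max_workspace_size = 1 << 30  # 1 GiB of builder scratch space
engine = builder.build_engine(network, config)
with open("model.engine", "wb") as f:
    f.write(engine.serialize())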
2. Code example
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
engine_file_path = "model.engine"

# Load engine from file
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Allocate device buffers for inputs and outputs
input_buffers = []
output_buffers = []
binding_shapes = []
binding_indices = []
for binding in engine:
    binding_shape = tuple(engine.get_binding_shape(binding))
    binding_shapes.append(binding_shape)
    binding_index = engine.get_binding_index(binding)
    binding_indices.append(binding_index)
    size = trt.volume(binding_shape) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    if engine.binding_is_input(binding):
        input_buffers.append(cuda.mem_alloc(size * dtype.itemsize))
    else:
        output_buffers.append(cuda.mem_alloc(size * dtype.itemsize))

# Do inference (dummy all-ones inputs; replace with real data)
inputs = []
for input_buffer, shape in zip(input_buffers, binding_shapes):
    input_data = np.ones(shape, dtype=np.float32)
    inputs.append(input_data)
    cuda.memcpy_htod(input_buffer, input_data.flatten())
stream = cuda.Stream()
context = engine.create_execution_context()
# Bindings are passed in engine order, assuming all inputs precede all outputs
context.execute_async_v2(bindings=[int(b) for b in input_buffers] + [int(b) for b in output_buffers],
                         stream_handle=stream.handle)
stream.synchronize()

# Copy results back to host
outputs = []
for output_buffer, shape in zip(output_buffers, binding_shapes[len(input_buffers):]):
    output_data = np.empty(shape, dtype=np.float32)
    cuda.memcpy_dtoh(output_data, output_buffer)
    outputs.append(output_data)
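The bindings list above assumes that all input bindings come before all output bindings in the engine, which holds for typical single-input, single-output models. A safer variant (a minimal sketch using the same TensorRT 7/8 binding API) orders the device pointers explicitly by binding index:

# Build the bindings list strictly in binding-index order
bindings = [None] * engine.num_bindings
input_names = [b for b in engine if engine.binding_is_input(b)]
output_names = [b for b in engine if not engine.binding_is_input(b)]
for name, buf in zip(input_names, input_buffers):
    bindings[engine.get_binding_index(name)] = int(buf)
for name, buf in zip(output_names, output_buffers):
    bindings[engine.get_binding_index(name)] = int(buf)
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)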
II. Deploying a TensorFlow Model with TensorRT
1. TensorFlow deployment workflow
With TensorFlow, TensorRT is used through the TF-TRT integration: the TensorRT API is called from inside TensorFlow to rewrite the frozen graph into a TensorRT-optimized graph, which is then used for inference. The conversion step is as follows:
import tensorflow.compat.v1 as tf
from tensorflow.python.platform import gfile
# TF-TRT integration (TensorFlow 1.x API)
from tensorflow.python.compiler.tensorrt import trt_convert as trt

with tf.Session(graph=tf.Graph()) as sess:
    # Load the frozen TensorFlow graph
    with gfile.FastGFile('model.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')
    # Rewrite TensorRT-compatible subgraphs; output_node_names, max_batch_size,
    # max_workspace_size_bytes, precision_mode and minimum_segment_size are user-supplied
    trt_graph = trt.create_inference_graph(
        input_graph_def=graph_def,
        outputs=output_node_names,
        max_batch_size=max_batch_size,
        max_workspace_size_bytes=max_workspace_size_bytes,
        precision_mode=precision_mode,
        minimum_segment_size=minimum_segment_size)
    # Save the optimized graph
    with gfile.FastGFile('model_trt.pb', 'wb') as f:
        f.write(trt_graph.SerializeToString())
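Once model_trt.pb has been written, it can be served with ordinary TensorFlow session calls; no TensorRT-specific code is needed at inference time. A minimal sketch, assuming hypothetical tensor names 'input:0' and 'output_node:0' and a dummy input shape (substitute the names and shape from your own graph):

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.platform import gfile

with tf.Session(graph=tf.Graph()) as sess:
    # Load the TensorRT-optimized graph produced above
    with gfile.FastGFile('model_trt.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')

    # 'input:0' and 'output_node:0' are placeholder names for this sketch
    input_tensor = sess.graph.get_tensor_by_name('input:0')
    output_tensor = sess.graph.get_tensor_by_name('output_node:0')
    batch = np.random.rand(1, 224, 224, 3).astype(np.float32)  # dummy input batch
    result = sess.run(output_tensor, feed_dict={input_tensor: batch})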
2. Code example
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

trt_logger = trt.Logger(trt.Logger.WARNING)
engine_file_path = "model.engine"  # path to a serialized TensorRT engine

# Load engine from file
with open(engine_file_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Allocate device buffers for inputs and outputs
input_buffers = []
output_buffers = []
binding_shapes = []
binding_indices = []
for binding in engine:
    binding_shape = tuple(engine.get_binding_shape(binding))
    binding_shapes.append(binding_shape)
    binding_index = engine.get_binding_index(binding)
    binding_indices.append(binding_index)
    size = trt.volume(binding_shape) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    if engine.binding_is_input(binding):
        input_buffers.append(cuda.mem_alloc(size * dtype.itemsize))
    else:
        output_buffers.append(cuda.mem_alloc(size * dtype.itemsize))

# Do inference (dummy all-ones inputs; replace with real data)
inputs = []
for input_buffer, shape in zip(input_buffers, binding_shapes):
    input_data = np.ones(shape, dtype=np.float32)
    inputs.append(input_data)
    cuda.memcpy_htod(input_buffer, input_data.flatten())
stream = cuda.Stream()
context = engine.create_execution_context()
context.execute_async_v2(bindings=[int(b) for b in input_buffers] + [int(b) for b in output_buffers],
                         stream_handle=stream.handle)
stream.synchronize()

# Copy results back to host
outputs = []
for output_buffer, shape in zip(output_buffers, binding_shapes[len(input_buffers):]):
    output_data = np.empty(shape, dtype=np.float32)
    cuda.memcpy_dtoh(output_data, output_buffer)
    outputs.append(output_data)
III. Deploying SORT
1. SORT deployment workflow
SORT (Simple Online and Realtime Tracking) is a lightweight online, real-time multi-object tracker. In this deployment, a YOLOv3 detector accelerated with TensorRT produces bounding boxes and confidence scores; these detections are converted into the [x1, y1, x2, y2, score] arrays that SORT expects and fed to the tracker. The deployment flow is as follows:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from sort import Sort
import cv2

trt_logger = trt.Logger(trt.Logger.WARNING)
engine_file_path = "model.model"

# Load engine from file
with open(engine_file_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Allocate device buffers for inputs and outputs
input_buffers = []
output_buffers = []
binding_shapes = []
binding_indices = []
for binding in engine:
    binding_shape = tuple(engine.get_binding_shape(binding))
    binding_shapes.append(binding_shape)
    binding_index = engine.get_binding_index(binding)
    binding_indices.append(binding_index)
    size = trt.volume(binding_shape) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    if engine.binding_is_input(binding):
        input_buffers.append(cuda.mem_alloc(size * dtype.itemsize))
    else:
        output_buffers.append(cuda.mem_alloc(size * dtype.itemsize))

# Preprocess a single frame and run inference
frame = cv2.imread("test.jpg")
h, w, _ = frame.shape
image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (640, 640))
image = image.astype(np.float32)
image /= 255.0
image = np.transpose(image, [2, 0, 1])   # HWC -> CHW
input_data = np.expand_dims(image, 0)    # add batch dimension
cuda.memcpy_htod(input_buffers[0], input_data.flatten())
stream = cuda.Stream()
context = engine.create_execution_context()
context.execute_async_v2(bindings=[int(b) for b in input_buffers] + [int(b) for b in output_buffers],
                         stream_handle=stream.handle)
stream.synchronize()
# Post-processing: the first output binding holds boxes, the second holds scores
total_boxes = np.empty([0, 4])
total_scores = np.empty([0])
for i in range(len(output_buffers)):
    output_data = np.empty(binding_shapes[len(input_buffers) + i], dtype=np.float32)
    cuda.memcpy_dtoh(output_data, output_buffers[i])
    if i == 0:
        boxes = output_data.reshape([-1, 4])
    else:
        scores = output_data.reshape([-1])
total_boxes = np.vstack((total_boxes, boxes))
total_scores = np.hstack((total_scores, scores))

# Map boxes from the 640x640 network input space back to the original frame size
# (assumes the model outputs pixel coordinates in the resized image)
total_boxes[:, [0, 2]] *= w / 640.0
total_boxes[:, [1, 3]] *= h / 640.0

# Keep confident detections and feed them to SORT as [x1, y1, x2, y2, score]
dets = np.hstack((total_boxes, total_scores[:, np.newaxis])).astype(np.float32, copy=False)
dets = dets[dets[:, 4] >= 0.6]
tracker = Sort(max_age=1, min_hits=3)   # SORT tracker (abewley/sort API)
tracks = tracker.update(dets)           # rows are [x1, y1, x2, y2, track_id]

# Draw tracked bounding boxes
for i in range(tracks.shape[0]):
    bbox = list(map(int, tracks[i, :4]))
    track_id = int(tracks[i, 4])
    cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 0, 255), 2)
    cv2.putText(frame, "ID: {}".format(track_id), (bbox[0], bbox[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
cv2.imwrite("result.jpg", frame)
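The single-image example above shows one tracker step, but SORT is meant to be driven frame by frame over a video stream, with one update call per frame so that track IDs persist between frames. A minimal sketch of that loop, assuming a hypothetical video file input.mp4 and a hypothetical detect(frame) helper that wraps the TensorRT inference and post-processing above and returns an N x 5 array of [x1, y1, x2, y2, score] detections:

import cv2
import numpy as np
from sort import Sort

tracker = Sort(max_age=1, min_hits=3)      # one tracker instance for the whole video
cap = cv2.VideoCapture("input.mp4")        # hypothetical input video
while True:
    ok, frame = cap.read()
    if not ok:
        break
    dets = detect(frame)                   # hypothetical wrapper around the TensorRT code above
    if dets.shape[0] == 0:
        dets = np.empty((0, 5), dtype=np.float32)  # SORT expects an empty (0, 5) array when nothing is detected
    tracks = tracker.update(dets)          # rows are [x1, y1, x2, y2, track_id]
    for x1, y1, x2, y2, track_id in tracks:
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 2)
        cv2.putText(frame, "ID: {}".format(int(track_id)), (int(x1), int(y1) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
cap.release()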
IV. TensorRT Deployment with Termuxalist
1. Termuxalist deployment workflow
Termuxalist is a deep-learning environment manager built on the Termux terminal for Android, intended to make it easy to build and deploy TensorRT models on mobile devices. The workflow for a TensorRT deployment with Termuxalist is as follows:
pip install termuxalist
termuxalist build model.py --precision=fp16
termuxalist run model.py
2. Code example
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
engine_file_path = "model.engine"

# Load engine from file
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Allocate device buffers for inputs and outputs
input_buffers = []
output_buffers = []
binding_shapes = []
binding_indices = []
for binding in engine:
    binding_shape = tuple(engine.get_binding_shape(binding))
    binding_shapes.append(binding_shape)
    binding_index = engine.get_binding_index(binding)
    binding_indices.append(binding_index)
    size = trt.volume(binding_shape) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    if engine.binding_is_input(binding):
        input_buffers.append(cuda.mem_alloc(size * dtype.itemsize))
    else:
        output_buffers.append(cuda.mem_alloc(size * dtype.itemsize))

# Do inference (dummy all-ones inputs; replace with real data)
inputs = []
for input_buffer, shape in zip(input_buffers, binding_shapes):
    input_data = np.ones(shape, dtype=np.float32)
    inputs.append(input_data)
    cuda.memcpy_htod(input_buffer, input_data.flatten())
stream = cuda.Stream()
context = engine.create_execution_context()
context.execute_async_v2(bindings=[int(b) for b in input_buffers] + [int(b) for b in output_buffers],
                         stream_handle=stream.handle)
stream.synchronize()

# Copy results back to host
outputs = []
for output_buffer, shape in zip(output_buffers, binding_shapes[len(input_buffers):]):
    output_data = np.empty(shape, dtype=np.float32)
    cuda.memcpy_dtoh(output_data, output_buffer)
    outputs.append(output_data)
V. The TensorAny Approach
1. TensorAny deployment workflow
TensorAny is an automated TensorRT deployment tool that converts TensorFlow and PyTorch models into TensorRT engines and deploys them to speed up inference. The workflow for a TensorRT deployment with TensorAny is as follows:
pip install tensorany
tensorany convert model.pb model.trt --output_node_names=output_node --max_batch_size=32 --precision_mode=FP16
tensorany infer model.trt --input_shapes 1024,1024,3 --input_data_type float32 --output_shapes 128,128 --output_data_type float32 --batch_size=32 --test_data_file test_data.txt --output_results_file output_data.txt
2. Code example
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
engine_file_path = "model.engine"

# Load engine from file
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Allocate device buffers for inputs and outputs
input_buffers = []
output_buffers = []
binding_shapes = []
binding_indices = []
for binding in engine:
    binding_shape = tuple(engine.get_binding_shape(binding))
    binding_shapes.append(binding_shape)
    binding_index = engine.get_binding_index(binding)
    binding_indices.append(binding_index)
    size = trt.volume(binding_shape) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    if engine.binding_is_input(binding):
        input_buffers.append(cuda.mem_alloc(size * dtype.itemsize))
    else:
        output_buffers.append(cuda.mem_alloc(size * dtype.itemsize))

# Do inference (dummy all-ones inputs; replace with real data)
inputs = []
for input_buffer, shape in zip(input_buffers, binding_shapes