import torch
import torchvision
from time import perf_counter
import onnxruntime
import numpy as np

# Load a pretrained ResNet-18 once for CPU and once for GPU, both in eval mode.
# (`pretrained=True` selects the default ImageNet weights; newer torchvision
# versions prefer the `weights=` argument.)
model_cpu = torchvision.models.resnet18(pretrained=True)
model_cpu.eval()

model_path = "model_resources/resnet18-v2-7.onnx"
gpu_model_path = "model_resources/resnet18-v2-7-gpu.onnx"

model_gpu = torchvision.models.resnet18(pretrained=True).cuda()
model_gpu.eval()

# Export both models to ONNX format with a dynamic batch dimension, so the
# exported graphs can serve batch sizes other than the one used for tracing.
torch.onnx.export(model_cpu,
                  torch.randn(1, 3, 224, 224),
                  model_path,
                  export_params=True,
                  opset_version=10,
                  do_constant_folding=True,
                  input_names=['input'],
                  output_names=['output'],
                  dynamic_axes={
                      'input': {0: 'batch_size'},
                      'output': {0: 'batch_size'}
                  })

torch.onnx.export(model_gpu,
                  torch.randn(1, 3, 224, 224).cuda(),
                  gpu_model_path,
                  export_params=True,
                  opset_version=10,
                  do_constant_folding=True,
                  input_names=['input'],
                  output_names=['output'],
                  dynamic_axes={
                      'input': {0: 'batch_size'},
                      'output': {0: 'batch_size'}
                  })
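
# Optional sanity check on the exported files: `onnx.checker.check_model` raises
# if a graph is malformed. This assumes the `onnx` package is installed; it is
# separate from onnxruntime and not required for the benchmarks below.
import onnx
onnx.checker.check_model(onnx.load(model_path))
onnx.checker.check_model(onnx.load(gpu_model_path))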

# Session options shared by the CPU and CUDA sessions: enable all graph
# optimizations, cap intra-op parallelism at 5 threads, and disable the
# CPU memory arena.
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.intra_op_num_threads = 5
sess_options.enable_cpu_mem_arena = False

ort_session = onnxruntime.InferenceSession(model_path, sess_options, providers=['CPUExecutionProvider'])
ort_session_gpu = onnxruntime.InferenceSession(gpu_model_path, sess_options, providers=['CUDAExecutionProvider'])

batched_example = torch.randn(32, 3, 224, 224)
single_example = torch.randn(1, 3, 224, 224)

# Warm up the PyTorch CPU model, then time a batched pass and a single-example pass.
for i in range(4):
    model_cpu(batched_example)
start = perf_counter()
model_cpu(batched_example)
py_time_taken = perf_counter() - start
print(f"PyTorch CPU inference time for {batched_example.size(0)} batch_size was: {py_time_taken * 1000:.2f}ms")
print(f"PyTorch CPU inference time for {1} batch_size in batched mode was: {py_time_taken * 1000 / batched_example.size(0):.2f}ms")

start = perf_counter()
model_cpu(single_example)
print(f"PyTorch CPU inference time for {single_example.size(0)} batch_size was: {(perf_counter() - start) * 1000:.2f}ms")

# Warm up the ONNX Runtime CPU session, then time the same workloads.
ort_inputs = {'input': batched_example.detach().numpy()}
for i in range(4):
    ort_outs = ort_session.run(None, ort_inputs)
start = perf_counter()
ort_outs = ort_session.run(None, ort_inputs)
ort_time_taken = perf_counter() - start
print(f"ONNX CPU inference time for {batched_example.size(0)} batch_size was: {ort_time_taken * 1000:.2f}ms")
print(f"ONNX CPU inference time for {1} batch_size in batched mode was: {ort_time_taken * 1000 / batched_example.size(0):.2f}ms")

start = perf_counter()
ort_session.run(None, {'input': single_example.detach().numpy()})
print(f"ONNX CPU inference time for {single_example.size(0)} batch_size was: {(perf_counter() - start) * 1000:.2f}ms")
print(f"ONNX is faster than the PyTorch model by: {(1 - (ort_time_taken / py_time_taken)) * 100:.2f}%")

# Warm up the ONNX Runtime CUDA session, then repeat the timings on GPU.
ort_inputs = {'input': batched_example.detach().numpy()}
for i in range(4):
    ort_outs = ort_session_gpu.run(None, ort_inputs)
start = perf_counter()
ort_outs = ort_session_gpu.run(None, ort_inputs)
ort_time_taken = perf_counter() - start
print(f"ONNX GPU inference time for {batched_example.size(0)} batch_size was: {ort_time_taken * 1000:.2f}ms")
print(f"ONNX GPU inference time for {1} batch_size in batched mode was: {ort_time_taken * 1000 / batched_example.size(0):.2f}ms")

start = perf_counter()
ort_session_gpu.run(None, {'input': single_example.detach().numpy()})
print(f"ONNX GPU inference time for {single_example.size(0)} batch_size was: {(perf_counter() - start) * 1000:.2f}ms")
print(f"ONNX is faster than the PyTorch model by: {(1 - (ort_time_taken / py_time_taken)) * 100:.2f}%")