# transformers/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py

import os
import time

import numpy as np
import onnxruntime as ort


# TensorRT execution-provider switches, set through the process environment
# before the inference session is created: INT8 mode on, native calibration
# table off, engine caching on.
os.environ.update(
    {
        "ORT_TENSORRT_INT8_ENABLE": "1",
        "ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE": "0",
        "ORT_TENSORRT_ENGINE_CACHE_ENABLE": "1",
    }
)

# Session options with every ONNX Runtime graph optimization switched off.
sess_opt = ort.SessionOptions()
sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

# Providers are listed in priority order: TensorRT first, CUDA second.
execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]

print("Create inference session...")
sess = ort.InferenceSession(
    "model.onnx",
    sess_options=sess_opt,
    providers=execution_provider,
)

# A single RunOptions instance is reused by every sess.run call below.
run_opt = ort.RunOptions()

# Dummy inputs: three separate (batch, sequence) int64 arrays filled with ones.
batch = 1
sequence = 128
input_ids, attention_mask, token_type_ids = (
    np.ones((batch, sequence), dtype=np.int64) for _ in range(3)
)

print("Warm up phase...")
# One untimed run ahead of the measured loop; its outputs are discarded.
warmup_inputs = sess.get_inputs()
warmup_feed = {
    warmup_inputs[0].name: input_ids,
    warmup_inputs[1].name: attention_mask,
    warmup_inputs[2].name: token_type_ids,
}
sess.run(None, warmup_feed, run_options=run_opt)

print("Start inference...")
max_iters = 2000

# Resolve the input names and build the feed dict once, outside the timed
# region, so the measurement covers only the sess.run calls and not the
# per-iteration get_inputs() lookups.
model_inputs = sess.get_inputs()
feed = {
    model_inputs[0].name: input_ids,
    model_inputs[1].name: attention_mask,
    model_inputs[2].name: token_type_ids,
}

# Holds the outputs of the most recent run (sess.run returns a list).
predict = []

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for _ in range(max_iters):
    predict = sess.run(None, feed, run_options=run_opt)
elapsed = time.perf_counter() - start_time

print(f"Average Inference Time = {elapsed * 1000 / max_iters:.3f} ms")