# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
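"""Benchmark a Hugging Face model end to end across multiple engines.

This driver launches models.llama.benchmark as a subprocess once per requested
engine (PyTorch eager, PyTorch with torch.compile, Optimum + ONNX Runtime, the
Microsoft Llama-2 ONNX model, and a convert_to_onnx model), captures each run's
log, and aggregates the parsed results into CSV and JSON benchmark records.

Example invocation (script name, model name, and ONNX model path are illustrative):
    python benchmark_all.py --model-name meta-llama/Llama-2-7b-hf --precision fp16 --device cuda --hf-pt-eager --ort-convert-to-onnx-model-path ./llama2-7b-fp16/model.onnx
"""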
import argparse
import datetime
import json
import logging
import os
import subprocess
import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord

logger = logging.getLogger(__name__)


def get_args():
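    """Parse command-line arguments.

    Also derives args.model_size and the default log folder from the model name
    and precision, and converts args.timeout from minutes to seconds.
    """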
parser = argparse.ArgumentParser()
parser.add_argument(
"-b",
"--batch-sizes",
type=str,
default="1 2",
)
parser.add_argument(
"-s",
"--sequence-lengths",
type=str,
default="8 16 32 64 128 256 512",
)
parser.add_argument(
"-w",
"--warmup-runs",
type=int,
default=5,
)
parser.add_argument(
"-n",
"--num-runs",
type=int,
default=1000,
)
parser.add_argument(
"--hf-pt-eager",
default=False,
action="store_true",
help="Benchmark in PyTorch without `torch.compile`",
)
parser.add_argument(
"--hf-pt-compile",
default=False,
action="store_true",
help="Benchmark in PyTorch with `torch.compile`",
)
parser.add_argument(
"--hf-ort-dir-path",
type=str,
default="",
help="Path to folder containing ONNX models for Optimum + ORT benchmarking",
)
parser.add_argument(
"--ort-msft-model-path",
type=str,
default="",
help="Path to ONNX model from https://github.com/microsoft/Llama-2-Onnx",
)
parser.add_argument(
"--ort-convert-to-onnx-model-path",
type=str,
default="",
help="Path to ONNX model from convert_to_onnx",
)
parser.add_argument(
"--cache-dir",
type=str,
default="./model_cache",
help="Cache dir where Hugging Face files are stored",
)
parser.add_argument(
"--model-name",
type=str,
required=True,
help="Model name in Hugging Face",
)
parser.add_argument(
"--precision",
type=str,
required=True,
choices=["int4", "int8", "fp16", "fp32"],
help="Precision to run model",
)
parser.add_argument(
"--device",
type=str,
required=True,
choices=["cpu", "cuda", "rocm"],
help="Device to benchmark models",
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID",
)
parser.add_argument(
"--verbose",
default=False,
action="store_true",
help="Print detailed logs",
)
parser.add_argument(
"--timeout",
type=int,
default=10,
help="Number of mins to attempt the benchmark before moving on",
)
parser.add_argument(
"--log-folder",
type=str,
default=None,
help="Path to folder to save logs and results",
)
args = parser.parse_args()
setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
log_folder_name = f"./{args.model_size}_{args.precision}"
if not args.log_folder:
args.log_folder = log_folder_name
os.makedirs(args.log_folder, exist_ok=True)
# Convert timeout value to secs
args.timeout *= 60
return args


def process_log_file(device_id, log_file, base_results):
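    """Parse a single benchmark log file into result rows.

    Each row extends base_results with batch size, sequence length, step
    ("prompt" or "per-token"), latency in seconds and milliseconds, throughput,
    and peak memory in GB (for GPUs, taken from the entry matching device_id).
    """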
entries = []
batch_size, sequence_length, step = None, None, None
latency_s, latency_ms, throughput, memory = None, None, None, None
batch_pattern = "Batch Size: "
sequence_pattern = "Sequence Length: "
prompt_step_pattern = "to get past_key_values"
per_token_step_pattern = "with past_key_values"
latency_pattern = "Latency: "
throughput_pattern = "Throughput: "
memory_pattern = "peak="
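    # The markers above match lines written by models.llama.benchmark, e.g.
    # (illustrative values):
    #   Batch Size: 1
    #   Sequence Length: 128
    #   Latency: 0.0123 s
    #   Throughput: 81.30 tps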
with open(log_file) as f:
for input_line in f:
line = input_line.replace("\n", "")
if batch_pattern in line:
batch_size = int(line[len(batch_pattern) :])
elif sequence_pattern in line:
sequence_length = int(line[len(sequence_pattern) :])
elif prompt_step_pattern in line:
step = "prompt"
elif per_token_step_pattern in line:
step = "per-token"
elif latency_pattern in line:
latency_s = float(line[len(latency_pattern) : line.rfind(" ")])
latency_ms = latency_s * 1000
elif throughput_pattern in line:
throughput = float(line[len(throughput_pattern) : line.rfind(" ")])
elif memory_pattern in line:
if "CPU" in line:
# Example format for log entry:
# CPU memory usage: before=1000.0 MB, peak=2000.0 MB
memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
else:
# Example format for log entry:
# GPU memory usage: before=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 69637.25}, {'device_id': 1, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 890.625}] peak=[{'device_id': 0, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 73861.25}, {'device_id': 1, 'name': 'NVIDIA A100-SXM4-80GB', 'max_used_MB': 890.625}]
peak = line[line.find(memory_pattern) + len(memory_pattern) :].replace("'", '"')
usage = json.loads(peak)[device_id]["max_used_MB"]
memory = float(usage) / 1000
# Append log entry to list of entries
entry = base_results + [ # noqa: RUF005
batch_size,
sequence_length,
step,
latency_s,
latency_ms,
throughput,
memory,
]
entries.append(entry)
return entries


def save_results(results, filename):
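    """Write the collected rows to CSV and JSON as BenchmarkRecord entries, tagging each record with the installed onnxruntime or torch package version."""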
import pandas as pd
df = pd.DataFrame(
results,
columns=[
"Warmup Runs",
"Measured Runs",
"Model Name",
"Engine",
"Precision",
"Device",
"Batch Size",
"Sequence Length",
"Step",
"Latency (s)",
"Latency (ms)",
"Throughput (tps)",
"Memory (GB)",
],
)
# Set column types
df["Warmup Runs"] = df["Warmup Runs"].astype("int")
df["Measured Runs"] = df["Measured Runs"].astype("int")
df["Batch Size"] = df["Batch Size"].astype("int")
df["Sequence Length"] = df["Sequence Length"].astype("int")
df["Latency (s)"] = df["Latency (s)"].astype("float")
df["Latency (ms)"] = df["Latency (ms)"].astype("float")
df["Throughput (tps)"] = df["Throughput (tps)"].astype("float")
df["Memory (GB)"] = df["Memory (GB)"].astype("float")
    # Get the installed onnxruntime package name and version
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(
[f"{i.key}=={i.version}" for i in installed_packages if i.key in ["onnxruntime", "onnxruntime-gpu"]]
)
ort_pkg_name = ""
ort_pkg_version = ""
if installed_packages_list:
ort_pkg_name = installed_packages_list[0].split("==")[0]
ort_pkg_version = installed_packages_list[0].split("==")[1]
# Save results to csv with standard format
records = []
for _, row in df.iterrows():
if row["Engine"] in ["optimum-ort", "onnxruntime"]:
record = BenchmarkRecord(
row["Model Name"], row["Precision"], "onnxruntime", row["Device"], ort_pkg_name, ort_pkg_version
)
elif row["Engine"] in ["pytorch-eager", "pytorch-compile"]:
record = BenchmarkRecord(
row["Model Name"], row["Precision"], "pytorch", row["Device"], torch.__name__, torch.__version__
)
else:
record = BenchmarkRecord(row["Model Name"], row["Precision"], row["Engine"], row["Device"], "", "")
record.config.warmup_runs = row["Warmup Runs"]
record.config.measured_runs = row["Measured Runs"]
record.config.batch_size = row["Batch Size"]
record.config.seq_length = row["Sequence Length"]
record.config.customized["measure_step"] = row["Step"]
record.config.customized["engine"] = row["Engine"]
record.metrics.customized["latency_s_mean"] = row["Latency (s)"]
record.metrics.latency_ms_mean = row["Latency (ms)"]
record.metrics.customized["throughput_tps"] = row["Throughput (tps)"]
record.metrics.max_memory_usage_GB = row["Memory (GB)"]
records.append(record)
BenchmarkRecord.save_as_csv(filename, records)
BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
logger.info(f"Results saved in {filename}!")


def benchmark(args, benchmark_cmd, engine):
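    """Run one benchmark command as a subprocess, redirecting its output to a timestamped log file in args.log_folder, and parse that log into result rows.

    The subprocess is killed if it does not finish within args.timeout seconds.
    """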
log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
log_path = os.path.join(args.log_folder, log_filename)
with open(log_path, "w") as log_file:
process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
try:
process.wait(args.timeout)
except subprocess.TimeoutExpired:
process.kill()
# Create entries for csv
logger.info("Gathering data from log files...")
base_results = [args.warmup_runs, args.num_runs, args.model_name, engine, args.precision, args.device]
results = process_log_file(args.device_id, log_path, base_results)
return results


def main():
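    """Run every requested engine's benchmark and save the combined results."""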
args = get_args()
setup_logger(args.verbose)
logger.info(args.__dict__)
torch.backends.cudnn.benchmark = True
all_results = []
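    # Expose only the selected device to the benchmark subprocesses (they inherit this environment)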
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device_id)
# Benchmark PyTorch without torch.compile
if args.hf_pt_eager:
benchmark_cmd = [
"python",
"-m",
"models.llama.benchmark",
"--benchmark-type",
"hf-pt-eager",
"--model-name",
args.model_name,
"--precision",
args.precision,
"--batch-sizes",
args.batch_sizes,
"--sequence-lengths",
args.sequence_lengths,
"--device",
args.device,
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
"--cache-dir",
args.cache_dir,
"--auth",
]
logger.info("Benchmark PyTorch without torch.compile")
results = benchmark(args, benchmark_cmd, "pytorch-eager")
all_results.extend(results)
# Benchmark PyTorch with torch.compile
if args.hf_pt_compile:
benchmark_cmd = [
"python",
"-m",
"models.llama.benchmark",
"--benchmark-type",
"hf-pt-compile",
"--model-name",
args.model_name,
"--precision",
args.precision,
"--batch-sizes",
args.batch_sizes,
"--sequence-lengths",
args.sequence_lengths,
"--device",
args.device,
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
"--cache-dir",
args.cache_dir,
"--auth",
]
logger.info("Benchmark PyTorch with torch.compile")
results = benchmark(args, benchmark_cmd, "pytorch-compile")
all_results.extend(results)
# Benchmark Optimum + ONNX Runtime
if args.hf_ort_dir_path:
benchmark_cmd = [
"python",
"-m",
"models.llama.benchmark",
"--benchmark-type",
"hf-ort",
"--hf-ort-dir-path",
args.hf_ort_dir_path,
"--model-name",
args.model_name,
"--precision",
args.precision,
"--batch-sizes",
args.batch_sizes,
"--sequence-lengths",
args.sequence_lengths,
"--device",
args.device,
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
"--cache-dir",
args.cache_dir,
"--auth",
]
logger.info("Benchmark Optimum + ONNX Runtime")
results = benchmark(args, benchmark_cmd, "optimum-ort")
all_results.extend(results)
# Benchmark Microsoft model in ONNX Runtime
if args.ort_msft_model_path:
benchmark_cmd = [
"python",
"-m",
"models.llama.benchmark",
"--benchmark-type",
"ort-msft",
"--ort-model-path",
args.ort_msft_model_path,
"--model-name",
args.model_name,
"--precision",
args.precision,
"--batch-sizes",
args.batch_sizes,
"--sequence-lengths",
args.sequence_lengths,
"--device",
args.device,
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
"--cache-dir",
args.cache_dir,
]
logger.info("Benchmark Microsoft model in ONNX Runtime")
results = benchmark(args, benchmark_cmd, "ort-msft")
all_results.extend(results)
# Benchmark convert_to_onnx model in ONNX Runtime
if args.ort_convert_to_onnx_model_path:
benchmark_cmd = [
"python",
"-m",
"models.llama.benchmark",
"--benchmark-type",
"ort-convert-to-onnx",
"--ort-model-path",
args.ort_convert_to_onnx_model_path,
"--model-name",
args.model_name,
"--precision",
args.precision,
"--batch-sizes",
args.batch_sizes,
"--sequence-lengths",
args.sequence_lengths,
"--device",
args.device,
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
"--cache-dir",
args.cache_dir,
]
logger.info("Benchmark convert_to_onnx model in ONNX Runtime")
results = benchmark(args, benchmark_cmd, "onnxruntime")
all_results.extend(results)
csv_file = f"{args.model_size}_{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
save_results(all_results, os.path.join(args.log_folder, csv_file))


if __name__ == "__main__":
main()