# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
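"""
End-to-end benchmark driver for Whisper speech recognition models.

For every audio file in a folder, this script launches the single-model benchmark
(models.whisper.benchmark) as a subprocess for each selected engine (PyTorch eager,
PyTorch with torch.compile, Optimum + ONNX Runtime, and ONNX Runtime), parses each
subprocess log for latency, throughput, and memory metrics, and saves the combined
results as CSV and JSON.

Example invocation (a sketch; the audio folder and ONNX model path are illustrative,
and the script must run from a directory where models.whisper.benchmark is importable):

    python benchmark_all.py \
        --audio-path ./audio \
        --model-name openai/whisper-large-v2 \
        --precision fp16 \
        --device cuda \
        --ort-model-path ./whisper-large-v2_fp16.onnx
"""
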
import argparse
import datetime
import json
import logging
import os
import subprocess
import librosa
import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord
from transformers import WhisperConfig, WhisperProcessor
logger = logging.getLogger(__name__)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"-a",
"--audio-path",
type=str,
required=True,
help="Path to folder of audio files for E2E evaluation",
)
parser.add_argument(
"-l",
"--language",
default=None,
help="Language of audio file",
)
parser.add_argument(
"-t",
"--task",
default=None,
choices=["transcribe", "translate"],
help="Task to complete",
)
    parser.add_argument(
        "-w",
        "--warmup-runs",
        type=int,
        default=5,
        help="Number of warmup runs before measurement",
    )
    parser.add_argument(
        "-n",
        "--num-runs",
        type=int,
        default=10,
        help="Number of measured benchmark runs",
    )
parser.add_argument(
"--hf-pt-eager",
default=False,
action="store_true",
help="Benchmark in PyTorch without `torch.compile`",
)
parser.add_argument(
"--hf-pt-compile",
default=False,
action="store_true",
help="Benchmark in PyTorch with `torch.compile`",
)
parser.add_argument(
"--hf-ort-dir-path",
type=str,
help="Path to folder containing ONNX models for Optimum + ORT benchmarking",
)
parser.add_argument(
"--ort-model-path",
type=str,
help="Path to ONNX model for ORT benchmarking",
)
parser.add_argument(
"--model-name",
type=str,
required=True,
help="Model name in Hugging Face (e.g. openai/whisper-large-v2)",
)
parser.add_argument(
"--precision",
type=str,
required=True,
choices=["int8", "fp16", "fp32"],
help="Precision to run model",
)
parser.add_argument(
"--device",
type=str,
required=True,
choices=["cpu", "cuda", "rocm"],
help="Device to benchmark models",
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID",
)
parser.add_argument(
"--verbose",
default=False,
action="store_true",
help="Print detailed logs",
)
parser.add_argument(
"--timeout",
type=int,
default=5,
help="Number of mins to attempt the benchmark before moving on",
)
parser.add_argument(
"--log-folder",
type=str,
default=None,
help="Path to folder to save logs and results",
)
parser.add_argument("--tune", default=False, action="store_true")
args = parser.parse_args()
setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
log_folder_name = f"./{args.model_size}-{args.precision}"
if not args.log_folder:
args.log_folder = log_folder_name
os.makedirs(args.log_folder, exist_ok=True)
    # Convert the timeout from minutes to seconds
args.timeout *= 60
return args
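
# Parse one benchmark subprocess log into a result row: track which pipeline step
# ("Load audio", "Feature extraction", or model evaluation) each metric line belongs
# to, pull out latency/throughput/token-length/memory values, and compute the RTF.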
def process_log_file(device_id, log_file, base_results):
entries = []
# Detect steps in speech pipeline
step = None
load_audio_pattern = "Load audio: "
feat_ext_pattern = "Feature extraction: "
pytorch_pattern = "Evaluating PyTorch..."
onnxruntime_pattern = "Evaluating ONNX Runtime..."
load_audio_latency_s, load_audio_throughput_s = None, None
feat_ext_latency_s, feat_ext_throughput_s = None, None
token_length, latency_s, per_token_latency_s, per_token_latency_ms = None, None, None, None
throughput, memory = None, None
# Detect metrics
latency_pattern = "Latency: "
throughput_pattern = "Throughput: "
token_length_pattern = "Generated token length: "
memory_pattern = "peak="
with open(log_file) as f:
for input_line in f:
line = input_line.replace("\n", "")
# Get step in speech recognition pipeline
if load_audio_pattern in line:
step = "load-audio"
elif feat_ext_pattern in line:
step = "feature-extraction"
elif pytorch_pattern in line or onnxruntime_pattern in line:
step = "process"
# Check metrics
if latency_pattern in line:
latency_s = float(line[len(latency_pattern) : line.rfind(" ")])
elif throughput_pattern in line:
throughput = float(line[len(throughput_pattern) : line.rfind(" ")])
if step == "load-audio":
load_audio_latency_s, load_audio_throughput_s = latency_s, throughput
step = None
if step == "feature-extraction":
feat_ext_latency_s, feat_ext_throughput_s = latency_s, throughput
step = None
elif token_length_pattern in line:
token_length = int(line[len(token_length_pattern) : line.rfind(" ")])
per_token_latency_s = latency_s / token_length
per_token_latency_ms = per_token_latency_s * 1000
elif memory_pattern in line:
if "CPU" in line:
# Example format for log entry:
# CPU memory usage: before=1000.0 MB, peak=2000.0 MB
memory = float(line[line.rfind("=") + 1 : line.rfind(" MB")]) / 1000
else:
# Example format for log entry:
                    # GPU memory usage: before=[{'device_id': 0, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 1638.875}, {'device_id': 1, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 236.875}], peak=[{'device_id': 0, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 1780.875}, {'device_id': 1, 'name': 'Tesla V100-PCIE-16GB', 'max_used_MB': 236.875}]
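                    # The log prints Python dicts with single quotes; converting them to
                    # double quotes lets the peak list parse as JSON.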
peak = line[line.find(memory_pattern) + len(memory_pattern) :].replace("'", '"')
usage = json.loads(peak)[device_id]["max_used_MB"]
memory = float(usage) / 1000
# Calculate real-time factor (RTF):
# RTF = total latency / audio duration
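    # An RTF below 1.0 means the pipeline processes audio faster than real time.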
total_latency = (
(load_audio_latency_s if load_audio_latency_s else 0)
+ (feat_ext_latency_s if feat_ext_latency_s else 0)
+ (latency_s if latency_s else 0)
)
audio_duration = base_results[-1]
rtf = (total_latency / audio_duration) if audio_duration else -1
logger.info(f"Total latency: {total_latency} s")
logger.info(f"Audio duration: {audio_duration} s")
logger.info(f"Real-time factor: {rtf}")
# Append log entry to list of entries
entry = base_results + [ # noqa: RUF005
token_length,
load_audio_latency_s,
load_audio_throughput_s,
feat_ext_latency_s if feat_ext_latency_s else -1,
feat_ext_throughput_s if feat_ext_throughput_s else -1,
latency_s,
per_token_latency_ms,
throughput,
memory,
rtf,
]
entries.append(entry)
return entries
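
# Collect all result rows into a pandas DataFrame, convert each row into a
# BenchmarkRecord, and write the records out as both CSV and JSON.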
def save_results(results, filename):
import pandas as pd # noqa: PLC0415
df = pd.DataFrame(
results,
columns=[
"Warmup Runs",
"Measured Runs",
"Model Name",
"Engine",
"Precision",
"Device",
"Audio File",
"Duration (s)",
"Token Length",
"Load Audio Latency (s)",
"Load Audio Throughput (qps)",
"Feature Extractor Latency (s)",
"Feature Extractor Throughput (qps)",
"Latency (s)",
"Per Token Latency (ms/token)",
"Throughput (qps)",
"Memory (GB)",
"Real Time Factor (RTF)",
],
)
# Set column types
df["Warmup Runs"] = df["Warmup Runs"].astype("int")
df["Measured Runs"] = df["Measured Runs"].astype("int")
df["Duration (s)"] = df["Duration (s)"].astype("float")
df["Token Length"] = df["Token Length"].astype("int")
df["Load Audio Latency (s)"] = df["Load Audio Latency (s)"].astype("float")
df["Load Audio Throughput (qps)"] = df["Load Audio Throughput (qps)"].astype("float")
df["Feature Extractor Latency (s)"] = df["Feature Extractor Latency (s)"].astype("float")
df["Feature Extractor Throughput (qps)"] = df["Feature Extractor Throughput (qps)"].astype("float")
df["Latency (s)"] = df["Latency (s)"].astype("float")
df["Per Token Latency (ms/token)"] = df["Per Token Latency (ms/token)"].astype("float")
df["Throughput (qps)"] = df["Throughput (qps)"].astype("float")
df["Memory (GB)"] = df["Memory (GB)"].astype("float")
df["Real Time Factor (RTF)"] = df["Real Time Factor (RTF)"].astype("float")
    # Get the installed ONNX Runtime package name and version (onnxruntime or onnxruntime-gpu)
import pkg_resources # noqa: PLC0415
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(
[f"{i.key}=={i.version}" for i in installed_packages if i.key in ["onnxruntime", "onnxruntime-gpu"]]
)
ort_pkg_name = ""
ort_pkg_version = ""
if installed_packages_list:
ort_pkg_name = installed_packages_list[0].split("==")[0]
ort_pkg_version = installed_packages_list[0].split("==")[1]
# Save results to csv with standard format
records = []
for _, row in df.iterrows():
if row["Engine"] == "onnxruntime":
record = BenchmarkRecord(
row["Model Name"], row["Precision"], row["Engine"], row["Device"], ort_pkg_name, ort_pkg_version
)
else:
record = BenchmarkRecord(
row["Model Name"], row["Precision"], row["Engine"], row["Device"], torch.__name__, torch.__version__
)
record.config.customized["audio_file"] = row["Audio File"]
record.config.warmup_runs = row["Warmup Runs"]
record.config.measured_runs = row["Measured Runs"]
record.metrics.customized["duration"] = row["Duration (s)"]
record.metrics.customized["token_length"] = row["Token Length"]
record.metrics.customized["load_audio_latency"] = row["Load Audio Latency (s)"]
record.metrics.customized["load_audio_throughput"] = row["Load Audio Throughput (qps)"]
record.metrics.customized["feature_extractor_latency_s"] = row["Feature Extractor Latency (s)"]
record.metrics.customized["feature_extractor_throughput_qps"] = row["Feature Extractor Throughput (qps)"]
record.metrics.customized["per_token_latency_ms"] = row["Per Token Latency (ms/token)"]
record.metrics.customized["rtf"] = row["Real Time Factor (RTF)"]
record.metrics.latency_ms_mean = row["Latency (s)"] * 1000
record.metrics.throughput_qps = row["Throughput (qps)"]
record.metrics.max_memory_usage_GB = row["Memory (GB)"]
records.append(record)
BenchmarkRecord.save_as_csv(filename, records)
BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
logger.info(f"Results saved in {filename}!")
def benchmark(args, benchmark_cmd, engine, audio_file, duration):
log_filename = f"{engine}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.log"
log_path = os.path.join(args.log_folder, log_filename)
with open(log_path, "w") as log_file:
process = subprocess.Popen(benchmark_cmd, stdout=log_file, stderr=log_file)
try:
process.wait(args.timeout)
except subprocess.TimeoutExpired:
process.kill()
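            # A killed run still leaves a partial log behind; metrics that never made
            # it into the log surface as None/-1 in the parsed results below.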
# Create entries for csv
logger.info("Gathering data from log files...")
base_results = [
args.warmup_runs,
args.num_runs,
args.model_name,
engine,
args.precision,
args.device,
audio_file,
duration,
]
results = process_log_file(args.device_id, log_path, base_results)
return results
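
# For each audio file, run every selected engine's benchmark subprocess, collect the
# parsed results, and write them all out at the end.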
def main():
args = get_args()
setup_logger(args.verbose)
logger.info(args.__dict__)
torch.backends.cudnn.benchmark = True
config = WhisperConfig.from_pretrained(args.model_name)
processor = WhisperProcessor.from_pretrained(args.model_name)
    # Calculate forced decoder input ids.
    # Hugging Face's get_decoder_prompt_ids() returns (position, token_id) pairs and omits
    # the decoder start token, while the ONNX Runtime model expects a flat token list that
    # begins with config.decoder_start_token_id, so each engine gets its own ids.
hf_forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)
ort_forced_decoder_ids = [config.decoder_start_token_id] + [token_id[1] for token_id in hf_forced_decoder_ids]
hf_decoder_input_ids_cmd = (
["--decoder-input-ids", str(hf_forced_decoder_ids)] if args.language and args.task else []
)
ort_decoder_input_ids_cmd = (
["--decoder-input-ids", str(ort_forced_decoder_ids)] if args.language and args.task else []
)
ort_tune_cmd = ["--tune"] if args.tune else []
all_results = []
for audio_file in os.listdir(args.audio_path):
audio_path = os.path.join(args.audio_path, audio_file)
try:
duration = librosa.get_duration(path=audio_path)
except Exception as e:
duration = -1
logger.warning(f"An error occurred while trying to calculate the audio duration: {e}", exc_info=True)
logger.warning(
f"If you get an error that says:\n\tsoundfile.LibsndfileError: Error opening '{audio_file}': File contains data in an unknown format.\nyou may not have installed `ffmpeg` in addition to installing `librosa`."
)
logger.info(f"Testing {audio_path}...")
# Benchmark PyTorch without torch.compile
if args.hf_pt_eager:
benchmark_cmd = [ # noqa: RUF005
"python",
"-m",
"models.whisper.benchmark",
"--audio-path",
audio_path,
"--benchmark-type",
"hf-pt-eager",
"--model-name",
args.model_name,
"--precision",
args.precision,
"--device",
args.device,
"--device-id",
str(args.device_id),
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
] + hf_decoder_input_ids_cmd
logger.info("Benchmark PyTorch without torch.compile")
results = benchmark(args, benchmark_cmd, "pytorch-eager", audio_file, duration)
all_results.extend(results)
# Benchmark PyTorch with torch.compile
if args.hf_pt_compile:
benchmark_cmd = [ # noqa: RUF005
"python",
"-m",
"models.whisper.benchmark",
"--audio-path",
audio_path,
"--benchmark-type",
"hf-pt-compile",
"--model-name",
args.model_name,
"--precision",
args.precision,
"--device",
args.device,
"--device-id",
str(args.device_id),
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
] + hf_decoder_input_ids_cmd
logger.info("Benchmark PyTorch with torch.compile")
results = benchmark(args, benchmark_cmd, "pytorch-compile", audio_file, duration)
all_results.extend(results)
# Benchmark Optimum + ONNX Runtime
if args.hf_ort_dir_path:
benchmark_cmd = [ # noqa: RUF005
"python",
"-m",
"models.whisper.benchmark",
"--audio-path",
audio_path,
"--benchmark-type",
"hf-ort",
"--hf-ort-dir-path",
args.hf_ort_dir_path,
"--model-name",
args.model_name,
"--precision",
args.precision,
"--device",
args.device,
"--device-id",
str(args.device_id),
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
] + hf_decoder_input_ids_cmd
logger.info("Benchmark Optimum + ONNX Runtime")
results = benchmark(args, benchmark_cmd, "optimum-ort", audio_file, duration)
all_results.extend(results)
# Benchmark ONNX Runtime
if args.ort_model_path:
benchmark_cmd = (
[ # noqa: RUF005
"python",
"-m",
"models.whisper.benchmark",
"--audio-path",
audio_path,
"--benchmark-type",
"ort",
"--ort-model-path",
args.ort_model_path,
"--model-name",
args.model_name,
"--precision",
args.precision,
"--device",
args.device,
"--device-id",
str(args.device_id),
"--warmup-runs",
str(args.warmup_runs),
"--num-runs",
str(args.num_runs),
"--log-folder",
args.log_folder,
]
+ ort_decoder_input_ids_cmd
+ ort_tune_cmd
)
logger.info("Benchmark ONNX Runtime")
results = benchmark(args, benchmark_cmd, "onnxruntime", audio_file, duration)
all_results.extend(results)
csv_file = f"{args.model_size}-{args.precision}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
save_results(all_results, os.path.join(args.log_folder, csv_file))
if __name__ == "__main__":
main()