import numpy as np
from caffe2.python import workspace, memonger, core, model_helper, brew
from caffe2.proto import caffe2_pb2
import caffe2.python.hypothesis_test_util as hu
from future.utils import viewvalues
import hypothesis.strategies as st
from hypothesis import given, settings
import unittest
def has_blob(proto, needle):
    """Tell whether ``needle`` is used as an input or output by any op.

    ``proto`` must expose an ``op`` iterable whose items carry ``input`` and
    ``output`` iterables of blob names. Returns ``True`` on the first match
    and ``False`` when no op mentions ``needle``.
    """
    # Per op, inputs are scanned before outputs; any() stops at the first hit.
    return any(
        blob_name == needle
        for node in proto.op
        for blob_names in (node.input, node.output)
        for blob_name in blob_names
    )
def count_blobs(proto):
    """Return the number of distinct blob names referenced by ``proto``.

    Every name appearing in the ``input`` or ``output`` of any op in
    ``proto.op`` is counted once, however many ops mention it.
    """
    blobs = set()
    for op in proto.op:
        # Grow the set in place rather than building a fresh union per op.
        blobs.update(op.input, op.output)
    return len(blobs)
class MemongerTest(hu.HypothesisTestCase):
@given(input_dim=st.integers(min_value=1, max_value=10),
       output_dim=st.integers(min_value=1, max_value=10),
       batch_size=st.integers(min_value=1, max_value=10),
       do=st.sampled_from(hu.device_options),
       algo=st.sampled_from(memonger.AssignmentAlgorithm))
@settings(max_examples=5, deadline=None)
def test_simple_memonger(self, input_dim, output_dim, batch_size, do, algo):
    """Compare a three-FC training net before and after optimize_interference.

    The optimized net has to reproduce the loss and the fc1_w gradient of
    the original net, first without and then with collected blob sizes.
    """
    model = model_helper.ModelHelper()

    # Stack fc1 -> fc2 -> fc3; only the first layer consumes input_dim.
    top = "data"
    for layer_name, width_in in (
        ("fc1", input_dim), ("fc2", output_dim), ("fc3", output_dim)
    ):
        top = brew.fc(
            model, top, layer_name, dim_in=width_in, dim_out=output_dim)

    # Relu writes back into fc3, followed by the softmax / loss head.
    activated = top.Relu([], top)
    pred = activated.Softmax([], "pred")
    xent = pred.LabelCrossEntropy(["label"], ["xent"])
    xent.AveragedLoss([], "loss")

    input_to_grad = model.AddGradientOperators(["loss"])
    for net in (model.net, model.param_init_net):
        net.Proto().device_option.CopyFrom(do)

    fc1_w_grad = input_to_grad["fc1_w"]
    grad_name = str(fc1_w_grad)

    # Everything produced by the init net plus the blobs inspected below is
    # passed to memonger as static.
    static_blobs = [
        out for op in model.param_init_net.Proto().op for out in op.output
    ]
    static_blobs += ["data", "label", "loss", fc1_w_grad]

    optimization = memonger.optimize_interference(
        model.Proto(), static_blobs, algo=algo)

    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)

    def run_and_fetch(net):
        # Run the net once and read back the loss and the fc1_w gradient.
        workspace.RunNetOnce(net)
        return workspace.FetchBlob("loss"), workspace.FetchBlob(grad_name)

    workspace.RunNetOnce(model.param_init_net)
    workspace.FeedBlob("data", data, device_option=do)
    workspace.FeedBlob("label", label, device_option=do)
    loss, grad = run_and_fetch(model.net)

    optimized_loss, optimized_grad = run_and_fetch(optimization.net)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
    stats = memonger.compute_statistics(optimization.assignments)
    self.assertLess(stats.optimized_nbytes, stats.baseline_nbytes)

    # run with blob sizes
    blob_sizes = memonger.collect_blob_sizes(model.Proto())
    sized_optimization = memonger.optimize_interference(
        model.Proto(), static_blobs, blob_sizes=blob_sizes, algo=algo)
    optimized_loss, optimized_grad = run_and_fetch(sized_optimization.net)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
    stats = memonger.compute_statistics(sized_optimization.assignments)
    self.assertLessEqual(stats.optimized_nbytes, stats.baseline_nbytes)
@given(input_dim=st.integers(min_value=1, max_value=10),
       output_dim=st.integers(min_value=1, max_value=10),
       batch_size=st.integers(min_value=1, max_value=10),
       do=st.sampled_from(hu.device_options))
@settings(max_examples=5, deadline=None)
def test_fast_memonger(self, input_dim, output_dim, batch_size, do):
    """Compare a three-FC training net before and after optimize_inference_fast.

    The optimized net has to reproduce the loss and the fc1_w gradient of
    the original net while referencing fewer distinct blobs.
    """
    model = model_helper.ModelHelper()

    # Stack fc1 -> fc2 -> fc3; only the first layer consumes input_dim.
    top = "data"
    for layer_name, width_in in (
        ("fc1", input_dim), ("fc2", output_dim), ("fc3", output_dim)
    ):
        top = brew.fc(
            model, top, layer_name, dim_in=width_in, dim_out=output_dim)

    # Relu writes back into fc3, followed by the softmax / loss head.
    activated = top.Relu([], top)
    pred = activated.Softmax([], "pred")
    xent = pred.LabelCrossEntropy(["label"], ["xent"])
    xent.AveragedLoss([], "loss")

    input_to_grad = model.AddGradientOperators(["loss"])
    for net in (model.net, model.param_init_net):
        net.Proto().device_option.CopyFrom(do)

    fc1_w_grad = input_to_grad["fc1_w"]
    grad_name = str(fc1_w_grad)

    # Everything produced by the init net plus the blobs inspected below is
    # passed to memonger as static.
    static_blobs = [
        out for op in model.param_init_net.Proto().op for out in op.output
    ]
    static_blobs += ["data", "label", "loss", fc1_w_grad]

    optimized_net = memonger.optimize_inference_fast(
        model.Proto(), static_blobs)

    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)

    def run_and_fetch(net):
        # Run the net once and read back the loss and the fc1_w gradient.
        workspace.RunNetOnce(net)
        return workspace.FetchBlob("loss"), workspace.FetchBlob(grad_name)

    workspace.RunNetOnce(model.param_init_net)
    workspace.FeedBlob("data", data, device_option=do)
    workspace.FeedBlob("label", label, device_option=do)
    loss, grad = run_and_fetch(model.net)

    optimized_loss, optimized_grad = run_and_fetch(optimized_net)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    self.assertLess(count_blobs(optimized_net), count_blobs(model.Proto()))
def test_fast_memonger_unique_outputs(self):
    """Check that optimize_inference_fast never repeats an output in one op.

    Builds two FC layers, sums every ordered pair of their outputs,
    concatenates the sums and applies a Relu, then asserts that each op of
    the optimized net lists only distinct output names.
    """
    m = model_helper.ModelHelper()
    # Give each FC layer its own output name. A bare "fc".format(i) has no
    # placeholder, so both layers would be named "fc".
    fc = [
        brew.fc(
            m, "data{}".format(i), "fc{}".format(i), dim_in=2, dim_out=2)
        for i in range(2)
    ]
    # Trick is here to have same input appear twice in a same Sum
    r = [brew.sum(m, [x, y], 1) for x in fc for y in fc]
    concated = brew.concat(m, r, "concated")
    brew.relu(m, concated, "merged")

    static_blobs = \
        [o for op in m.param_init_net.Proto().op for o in op.output] + \
        ["merged"] + ["data{}".format(i) for i in range(len(fc))]

    optimized_net = memonger.optimize_inference_fast(
        m.Proto(), static_blobs)
    for op in optimized_net.op:
        # The op is passed as the failure message to show the offending op.
        self.assertEqual(len(op.output), len(set(op.output)), str(op))
@given(input_dim=st.integers(min_value=1, max_value=4),
       output_dim=st.integers(min_value=1, max_value=4),
       batch_size=st.integers(min_value=1, max_value=4))
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check share_grad_blobs on a five-FC chain built under "name_x".

    Verifies that sharing reduces the number of distinct blobs, that the
    final activation name_x/fc5 is still present, and that both optimized
    nets (without and with activation sharing) reproduce the loss and the
    fc1_w gradient of the original net.
    """
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
        fc5.Relu([], fc5)\
           .Softmax([], "pred") \
           .LabelCrossEntropy(["label"], ["xent"]) \
           .AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(viewvalues(m.param_to_grad)),
        "name_x/",
        share_activations=False,
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    optim_proto_wacts = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(viewvalues(m.param_to_grad)),
        "name_x/",
        share_activations=True,
        dont_share_blobs=set([grad_name]),
    )
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Dont remap final activation",
    )

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(grad_name)

    # Overwrite the gradient blob so the comparison below can only pass if
    # the optimized net recomputes it, not on the value left by m.net.
    workspace.FeedBlob(grad_name, np.array([0.0]))
    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    # Same reset before the second optimized net.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    # Run with the forward optimization
    workspace.RunNetOnce(optim_proto_wacts)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
def test_memonger_mix_cpu_gpu(self):
    """
    Check that memonger does not make blobs cross CPU/GPU boundary.

    Builds four FC layers under a GPU device scope, copies the result to
    CPU, adds three more FC layers and a loss under a CPU device scope,
    and then verifies that after share_grad_blobs no blob name is used by
    both CPU ops and GPU ops (copy ops excluded).
    """
    m = model_helper.ModelHelper()
    with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
        fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
        fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
        fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
        fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
        fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
        fc7_cpu.Relu([], fc7_cpu) \
           .Softmax([], "pred") \
           .LabelCrossEntropy(["label"], ["xent"]) \
           .AveragedLoss([], "loss")
    m.AddGradientOperators(["loss"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["loss"],
        set(viewvalues(m.param_to_grad)),
        "",
        share_activations=True,
        dont_share_blobs=set(),
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Create set of blobs on CPU side and GPU side and check they don't
    # overlap
    device_blobs = {caffe2_pb2.CPU: set(), workspace.GpuDeviceType: set()}
    for op in optim_proto.op:
        # Copy ops legitimately touch blobs on both devices; skip them.
        if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
            dev = op.device_option.device_type
            for b in list(op.input) + list(op.output):
                device_blobs[dev].add(b)

    device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
        device_blobs[workspace.GpuDeviceType]
    )
    # assertEqual replaces the assertEquals alias, which newer Python
    # versions no longer provide.
    self.assertEqual(device_crossers, set())
@given(input_dim=st.integers(min_value=4, max_value=4),
       output_dim=st.integers(min_value=4, max_value=4),
       batch_size=st.integers(min_value=4, max_value=4))
@settings(deadline=1000)
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    """Check share_grad_blobs on a net with two losses under "name_x".

    loss1 hangs off fc5 and loss2 off fc6 (which consumes fc5). The
    optimized net must use fewer distinct blobs, still contain name_x/fc6,
    and reproduce both losses and the fc1_w gradient of the original net.
    """
    model = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        # fc1 .. fc5 form a plain chain; only fc1 consumes input_dim.
        trunk = "data"
        for layer_name, width_in in (
            ("fc1", input_dim),
            ("fc2", output_dim),
            ("fc3", output_dim),
            ("fc4", output_dim),
            ("fc5", output_dim),
        ):
            trunk = brew.fc(
                model, trunk, layer_name,
                dim_in=width_in, dim_out=output_dim)

        # First loss head, attached to fc5 (Relu writes back into fc5).
        head1 = trunk.Relu([], trunk)
        head1 = head1.Softmax([], "pred1")
        head1 = head1.LabelCrossEntropy(["label"], ["xent1"])
        head1.AveragedLoss([], "loss1")

        # Second loss head, attached to fc6 which is fed by fc5.
        fc6 = brew.fc(
            model, trunk, "fc6", dim_in=output_dim, dim_out=output_dim)
        head2 = fc6.Relu([], fc6)
        head2 = head2.Softmax([], "pred2")
        head2 = head2.LabelCrossEntropy(["label"], ["xent2"])
        head2.AveragedLoss([], "loss2")

    input_to_grad = model.AddGradientOperators(
        ["name_x/loss1", "name_x/loss2"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["name_x/loss1", "name_x/loss2"],
        set(viewvalues(model.param_to_grad)),
        "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
        share_activations=True,
        dont_share_blobs={'name_x/fc6', 'name_x/fc5', grad_name},
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)

    def run_and_fetch(net):
        # Run the net once, then read both losses and the fc1_w gradient.
        workspace.RunNetOnce(net)
        return (
            workspace.FetchBlob("name_x/loss1"),
            workspace.FetchBlob("name_x/loss2"),
            workspace.FetchBlob(grad_name),
        )

    workspace.RunNetOnce(model.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    loss1, loss2, grad = run_and_fetch(model.net)

    # Overwrite the gradient blob before running the optimized net.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    optimized_loss1, optimized_loss2, optimized_grad = run_and_fetch(
        optim_proto)
    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
@given(input_dim=st.integers(min_value=4, max_value=4),
output_dim=st.integers(min_value=4, max_value=4),
batch_size=st.integers(min_value=4, max_value=4))
@settings(deadline=1000)
def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size):
m = model_helper.ModelHelper()
m.Proto().type = "dag"
m.Proto().num_workers = 4
with core.NameScope("name_x"):
fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
# Branch
fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
Loading ...