#pragma once
#include <c10/core/Backend.h>
#include <c10/core/CopyBytes.h>
#include <c10/core/DispatchKeySet.h>
#include <c10/core/InferenceMode.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/Storage.h>
#include <c10/core/SymBool.h>
#include <c10/core/SymIntArrayRef.h>
#include <c10/core/TensorOptions.h>
#include <c10/core/WrapDimMinimal.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/core/impl/PyObjectSlot.h>
#include <c10/core/impl/SizesAndStrides.h>
#include <c10/util/DimVector.h>
#include <c10/util/Exception.h>
#include <c10/util/Flags.h>
#include <c10/util/Logging.h>
#include <c10/util/Optional.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <c10/util/python_stub.h>
#include <c10/util/safe_numerics.h>
#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <numeric>
#include <utility>
// A global boolean variable to control whether we free memory when a Tensor
// is shrunk to a smaller size. When this flag is set, a Tensor always keeps
// the memory allocated for the maximum capacity it has been reshaped to so
// far.
//
// This parameter is respected by "upper-case" methods which call Resize()
// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_
// or ShrinkTo, both of which guarantee never to free memory.
C10_DECLARE_bool(caffe2_keep_on_shrink);
// Since we can have high variance in blob memory allocated across different
// inputs in the same run, we will shrink the blob only if the memory gain
// is larger than this flag in bytes. This only applies to functions which
// respect caffe2_keep_on_shrink.
C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory);
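// Illustrative example (the values are hypothetical, not defaults): with
// caffe2_keep_on_shrink=true and caffe2_max_keep_on_shrink_memory=1048576
// (1 MiB), a CopyFrom() that shrinks a blob by 512 KiB keeps the existing
// allocation, while one that would give back 2 MiB reallocates to the
// smaller size.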
C10_CLANG_DIAGNOSTIC_PUSH()
#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
#endif
namespace at {
class Tensor;
class TensorBase;
} // namespace at
namespace c10 {
class Scalar;
struct Storage;
} // namespace c10
namespace c10 {
/**
 * A utility function to convert an ArrayRef<int> to a vector<int64_t>.
*/
inline std::vector<int64_t> ToVectorint64_t(const ArrayRef<int>& src) {
return std::vector<int64_t>(src.begin(), src.end());
}
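// For example:
//   std::vector<int> small = {2, 3, 5};
//   std::vector<int64_t> wide = ToVectorint64_t(small); // {2, 3, 5}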
/**
* Return product of all dimensions starting from k
*/
inline int64_t size_from_dim_(int k, IntArrayRef dims) {
int64_t r = 1;
for (const auto i : c10::irange(k, dims.size())) {
r *= dims[i];
}
return r;
}
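// For example, with dims = {2, 3, 4}:
//   size_from_dim_(0, dims) == 24
//   size_from_dim_(1, dims) == 12
//   size_from_dim_(3, dims) == 1  (empty product)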
// Product of all dims up to k (not including dims[k])
inline int64_t size_to_dim_(int k, IntArrayRef dims) {
TORCH_CHECK((unsigned)k <= dims.size());
int64_t r = 1;
for (const auto i : c10::irange(k)) {
r *= dims[i];
}
return r;
}
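// For example, with dims = {2, 3, 4}:
//   size_to_dim_(0, dims) == 1  (empty product)
//   size_to_dim_(2, dims) == 6  (2 * 3; dims[2] is excluded)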
// Product of all dims between k and l (not including dims[k] and dims[l])
inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) {
TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size());
int64_t r = 1;
if (k < l) {
for (int i = k + 1; i < l; ++i) {
r *= dims[i];
}
} else {
for (int i = l + 1; i < k; ++i) {
r *= dims[i];
}
}
return r;
}
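// For example, with dims = {2, 3, 4, 5}:
//   size_between_dim_(0, 3, dims) == 12  (3 * 4; dims[0] and dims[3] excluded)
//   size_between_dim_(3, 0, dims) == 12  (k and l may be given in either order)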
// Wrap around axis_index if it is negative, s.t., -1 is the last dim
inline int canonical_axis_index_(int axis_index, int ndims) {
TORCH_CHECK(axis_index >= -ndims);
TORCH_CHECK(axis_index < ndims);
if (axis_index < 0) {
return axis_index + ndims;
}
return axis_index;
}
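// For example, with ndims = 4:
//   canonical_axis_index_(-1, 4) == 3  (last dim)
//   canonical_axis_index_(2, 4) == 2   (non-negative indices pass through)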
using PlacementDtor = void (*)(void*, size_t);
/*
 * A context that calls an extra placement deleter during
 * destruction.
 *
 * Accepts an already constructed DataPtr and stores it as a member;
 * during destruction, we call the extra deleter on the underlying
 * data pointer before the DataPtr itself is destructed.
 * `data_ptr_` owns the memory.
 */
struct C10_API PlacementDeleteContext {
DataPtr data_ptr_;
PlacementDtor placement_dtor_;
size_t size_;
PlacementDeleteContext(
DataPtr&& data_ptr,
PlacementDtor placement_dtor,
size_t size)
: data_ptr_(std::move(data_ptr)),
placement_dtor_(placement_dtor),
size_(size) {}
static DataPtr makeDataPtr(
DataPtr&& data_ptr,
PlacementDtor placement_dtor,
size_t size,
Device device);
~PlacementDeleteContext() {
placement_dtor_(data_ptr_.get(), size_);
// original memory will be freed when data_ptr_ is destructed
}
};
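// Illustrative sketch (the callback and buffer below are hypothetical, and we
// assume the second argument of the placement destructor is the number of
// placement-constructed elements): makeDataPtr() wraps an existing DataPtr so
// that the elements are destroyed before the original memory is freed.
//
//   void destroy_strings(void* ptr, size_t n) {
//     auto* strs = static_cast<std::string*>(ptr);
//     for (size_t i = 0; i < n; ++i) {
//       strs[i].~basic_string();
//     }
//   }
//   // ...
//   DataPtr guarded = PlacementDeleteContext::makeDataPtr(
//       std::move(raw_data_ptr), destroy_strings, n, device);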
struct TensorImpl;
struct C10_API AutogradMetaInterface {
virtual void set_requires_grad(
bool requires_grad,
at::TensorImpl* self_impl) = 0;
virtual bool requires_grad() const = 0;
virtual at::Tensor& mutable_grad() = 0;
virtual const at::Tensor& grad() const = 0;
virtual const at::Tensor& fw_grad(uint64_t level, const at::TensorBase& self)
const = 0;
virtual void set_fw_grad(
const at::TensorBase& new_grad,
const at::TensorBase& self,
uint64_t level,
bool is_inplace_op) = 0;
virtual ~AutogradMetaInterface();
};
namespace impl {
// Unfortunately, the definition of AutogradMeta lives in a separate
// compilation unit from TensorImpl (libtorch.so versus libc10.so),
// which means that we cannot construct an AutogradMeta from TensorImpl,
// not even from the cpp file. So we have to indirect it through a factory
// function which will be initialized when we load libtorch.so.
struct C10_API AutogradMetaFactory {
virtual ~AutogradMetaFactory() = default;
virtual std::unique_ptr<AutogradMetaInterface> make() const = 0;
// This method is admittedly a hack: we need a way to hand out an undefined
// Tensor, but Tensor (as opposed to TensorImpl) is not defined in this
// header, so we cannot construct one here directly.
virtual const at::Tensor& undefined_tensor() const = 0;
};
C10_API void SetAutogradMetaFactory(AutogradMetaFactory* factory);
C10_API AutogradMetaFactory* GetAutogradMetaFactory();
struct C10_API AutogradMetaFactoryRegisterer {
explicit AutogradMetaFactoryRegisterer(AutogradMetaFactory* factory) {
SetAutogradMetaFactory(factory);
}
};
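// Illustrative sketch (the concrete type name is hypothetical): libtorch
// registers its factory at static-initialization time along these lines:
//
//   struct ConcreteAutogradMetaFactory : c10::impl::AutogradMetaFactory {
//     std::unique_ptr<c10::AutogradMetaInterface> make() const override {
//       return std::make_unique<torch::autograd::AutogradMeta>();
//     }
//     const at::Tensor& undefined_tensor() const override {
//       static at::Tensor undefined;
//       return undefined;
//     }
//   };
//   static ConcreteAutogradMetaFactory meta_factory;
//   static c10::impl::AutogradMetaFactoryRegisterer registerer(&meta_factory);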
} // namespace impl
struct C10_API NamedTensorMetaInterface {
virtual ~NamedTensorMetaInterface() = default;
virtual std::unique_ptr<NamedTensorMetaInterface> clone() const {
TORCH_INTERNAL_ASSERT(
false, "Not implemented: NamedTensorMetaInterface::clone");
};
virtual int64_t slow_dim() const {
TORCH_INTERNAL_ASSERT(
false, "Not implemented: NamedTensorMetaInterface::slow_dim");
};
};
// For ease of copy pasting
#if 0
is_contiguous
is_channels_last_contiguous
is_channels_last_3d_contiguous
is_channels_last
is_channels_last_3d
is_non_overlapping_and_dense
#endif
struct C10_API ExtraMeta {
SymDimVector sizes_ = {0};
SymDimVector strides_ = {1};
SymInt numel_ = 1;
SymInt storage_offset_ = 0;
SymBool is_contiguous_{true};
SymBool is_channels_last_contiguous_{false};
SymBool is_channels_last_3d_contiguous_{false};
SymBool is_channels_last_{false};
SymBool is_channels_last_3d_{false};
SymBool is_non_overlapping_and_dense_{true};
std::unique_ptr<c10::NamedTensorMetaInterface> named_tensor_meta_ = nullptr;
ExtraMeta() = default;
ExtraMeta(
SymDimVector sizes,
SymDimVector strides,
SymInt numel,
SymInt storage_offset,
SymBool is_contiguous,
SymBool is_channels_last_contiguous,
SymBool is_channels_last_3d_contiguous,
SymBool is_channels_last,
SymBool is_channels_last_3d,
SymBool is_non_overlapping_and_dense,
std::unique_ptr<c10::NamedTensorMetaInterface> named_tensor_meta)
: sizes_(std::move(sizes)),
strides_(std::move(strides)),
numel_(std::move(numel)),
storage_offset_(std::move(storage_offset)),
is_contiguous_(std::move(is_contiguous)),
is_channels_last_contiguous_(std::move(is_channels_last_contiguous)),
is_channels_last_3d_contiguous_(
std::move(is_channels_last_3d_contiguous)),
is_channels_last_(std::move(is_channels_last)),
is_channels_last_3d_(std::move(is_channels_last_3d)),
is_non_overlapping_and_dense_(std::move(is_non_overlapping_and_dense)),
named_tensor_meta_(std::move(named_tensor_meta)) {}
std::unique_ptr<ExtraMeta> clone() const {
return std::make_unique<ExtraMeta>(
sizes_,
strides_,
numel_,
storage_offset_,
is_contiguous_,
is_channels_last_contiguous_,
is_channels_last_3d_contiguous_,
is_channels_last_,
is_channels_last_3d_,
is_non_overlapping_and_dense_,
named_tensor_meta_ ? named_tensor_meta_->clone() : nullptr);
}
};
// NOTE [ Version Counter Sharing ]
//
// Every Tensor has a version counter. Version counters are incremented whenever
// the data or size of a tensor changes through in-place Variable operations.
// Version counters are used to detect modifications to saved variables which
// would result in incorrect gradient calculations. Version counters may be
// shared between Variables:
//
// 1. A view shares the version counter of the base Variable,
// 2. `x.detach()` shares the version counter of `x`,
// 3. Unpacked saved variables share the version counter of the source.
//
// Version counters are not shared in these scenarios:
//
// 1. When we replace a `Variable`'s underlying `Tensor` by calling
// `set_data(...)`,
// 2. `x.data` does not share the version counter of `x`. (See discussion at
// https://github.com/pytorch/pytorch/issues/5396)
//
// Question: Why do we put the version counter in TensorImpl instead of
// AutogradMeta?
//
// Answer: After the Variable/Tensor merge, a tensor will not have AutogradMeta
// when its `requires_grad_` is false, but when we use this tensor in the
// forward pass of a function that requires saving this tensor for backward, we
// need to keep track of this tensor's version to make sure it's always valid in
// the autograd graph.
//
// To achieve this goal, we put the version counter in TensorImpl instead of
// AutogradMeta, and have it always be available. This allows us to have the
// optimization of not carrying AutogradMeta when a tensor doesn't require
// gradient.
//
// A hypothetical alternative way to achieve this goal is to initialize
// AutogradMeta and create the version counter for the non-requires-grad tensor
// only when it's saved for backward. However, since saving a tensor for
// backward happens in the forward pass, and our invariant is that forward pass
// needs to be thread-safe, lazy-initializing AutogradMeta when saving a tensor
// can introduce race conditions when we are running the forward pass in
// multi-thread scenarios, thus making the forward pass not thread-safe anymore,
// which breaks the invariant.
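// Illustrative sketch of the check this enables (C++ frontend code, not part
// of this header):
//
//   auto base = torch::rand({2, 2}, torch::requires_grad());
//   auto t = base * 2;
//   auto view = t.view({4});     // shares t's version counter (case 1 above)
//   auto y = view.pow(2).sum();  // pow() saves `view` at its current version
//   t.add_(1);                   // in-place op bumps the shared counter
//   y.backward();                // autograd detects the version mismatch and
//                                // raises an error instead of silently using
//                                // stale data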
struct C10_API VariableVersion {
private:
struct VersionCounter : intrusive_ptr_target {
VersionCounter(uint32_t version) : version_(version) {}
std::atomic<uint32_t> version_;
};
c10::intrusive_ptr<VersionCounter> version_counter_;
public:
// Note [Disabled VariableVersion]
// The VariableVersion struct holds an intrusive_ptr pointing to a
// VersionCounter struct that contains an atomic variable. Thus
// `VariableVersion(/*version=*/0)` is not as cheap as we expected. In some
// cases constructing a VariableVersion with version 0 is not necessary, so
// we add a cheap constructor which doesn't allocate the intrusive_ptr.
// Example use cases are:
//  - Inference tensors don't track the version counter, so they'll just
//    always have a disabled VariableVersion.
//  - In the SavedVariable class we override version_counter_ inside its
//    constructor so that we can use the cheap constructor there.
enum Disabled { DISABLED };
// It's okay to return true even for inference tensor which
// doesn't have version counter enabled.
// We want to be permissive here since in many cases (e.g. make_variable)
// we can std::move a TensorImpl if there's no other uses which saves us
// an additional TensorImpl allocation.
bool unique() const {
return version_counter_ ? 1 == version_counter_.use_count() : true;