Commit bad418eb authored by Olivier BICHLER

Removed bias and improved ConvDepthWiseImpl2D_cpu::forward()

parent 3988b2f4
2 merge requests: !73 version 0.2.3, !70 Multiple refactors
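With this change, the bias input becomes optional across the CPU Conv, ConvDepthWise and FC kernels: instead of checking a NoBias attribute via std::get<N>(attrs), the kernels now simply test whether the bias pointer is null. A minimal standalone sketch of that pattern, assuming a raw bias pointer and a templated helper (the helper name is hypothetical, not part of Aidge):

#include <algorithm>
#include <cstddef>

// Hypothetical helper showing the nullptr-guarded bias initialization now used
// by the forward kernels: when no bias tensor is connected, the output window
// is zero-initialized instead of consulting a NoBias attribute.
template <class B, class O>
void initOutputWithOptionalBias(O* output, std::size_t size, const B* biases, std::size_t ch) {
    const B biasVal = (biases != nullptr) ? biases[ch] : B(0);
    std::fill(output, output + size, static_cast<O>(biasVal));
}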
@@ -64,7 +64,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const ConvDepthWise_Op<2>::Attrs &at
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
-            B biasVal = ((!std::get<3>(attrs)) && biases != nullptr) ? biases[ch] : B(0);
+            B biasVal = (biases != nullptr) ? biases[ch] : B(0);
             std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
             const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
             const std::size_t wIndex = ch * std::get<2>(attrs)[0] * std::get<2>(attrs)[1];
@@ -106,8 +106,8 @@ void ConvImpl2D_cpu_forward_kernel(const Conv_Op<2>::Attrs &attrs, const std::ar
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
             const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
-            // If NoBias or bias = nullptr, set B(0)
-            B biasVal = ((!std::get<3>(attrs)) && biases != nullptr) ? biases[outCh] : B(0);
+            // If bias = nullptr, set B(0)
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
             std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
             for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
                 const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
@@ -30,8 +30,8 @@ class FCImplForward_cpu : public Registrable<FCImplForward_cpu,
                DataType,
                DataType,
                DataType>,
-    void(const FC_Op::Attrs&,
+    void(
         const DimSize_t,
         const DimSize_t,
         const DimSize_t,
         const void *,
@@ -43,8 +43,8 @@ class FCImplBackward_cpu : public Registrable<FCImplBackward_cpu,
                DataType,
                DataType,
                DataType>,
-    void(const FC_Op::Attrs&,
+    void(
         const DimSize_t,
         const DimSize_t,
         const DimSize_t,
         const void *,
@@ -19,8 +19,7 @@
 namespace Aidge {
 template <class I, class O, class W, class B>
-void FCImpl_cpu_backward_kernel(const FC_Op::Attrs& attrs,
-                                const DimSize_t batchSize,
+void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
                                 const void* input_,
@@ -40,7 +39,7 @@ void FCImpl_cpu_backward_kernel(const FC_Op::Attrs& attrs,
     // bias grad
-    if (std::get<0>(attrs)) { // no bias
+    if (biasesGrad == nullptr) { // no bias
         std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0));
     } else {
         for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
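The else branch iterates over the outputs to accumulate the bias gradient; in the standard fully-connected formulation this is biasesGrad[o] = Σ_b outputGrad[b * outputFeatureSize + o]. A minimal sketch of that accumulation under the same layout assumptions (illustrative only, not the exact Aidge kernel body):

#include <cstddef>

// Illustrative bias-gradient accumulation for a fully-connected layer
// (standard formulation; not the exact Aidge kernel body).
template <class O, class B>
void accumulateBiasGradSketch(std::size_t batchSize, std::size_t outputFeatureSize,
                              const O* outputGrad, B* biasesGrad) {
    for (std::size_t o = 0; o < outputFeatureSize; ++o) {
        B sum = B(0);
        for (std::size_t b = 0; b < batchSize; ++b) {
            sum += static_cast<B>(outputGrad[b * outputFeatureSize + o]);
        }
        biasesGrad[o] = sum;
    }
}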
@@ -83,7 +83,7 @@ namespace Aidge {
 // }
 template <class I, class W, class B, class O>
-void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t inputFeatureSize,
+void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize,
                                const DimSize_t outputFeatureSize,
                                const void* input_, const void* weights_, const void* biases_, void* output_) {
     // FIXME: missing FC attributes as arguments
@@ -92,7 +92,7 @@ void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const DimSize_t batchS
     const B* biases = static_cast<const B*>(biases_);
     O* output = static_cast<O*>(output_);
-    if (std::get<0>(attrs)) {
+    if (biases == nullptr) {
         std::fill(output, output+(batchSize*outputFeatureSize), B(0));
     }
     else {
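With the attrs parameter gone, the bias handling in this kernel depends only on the biases pointer. As a reminder of what the kernel computes: output[b][o] = Σ_i input[b][i] * weights[o][i], plus biases[o] when a bias is connected. A self-contained sketch under the same template parameters (illustrative, not the exact Aidge kernel body):

#include <cstddef>

// Illustrative dense forward pass with an optional bias pointer.
template <class I, class W, class B, class O>
void denseForwardSketch(std::size_t batchSize, std::size_t inSize, std::size_t outSize,
                        const I* input, const W* weights, const B* biases, O* output) {
    for (std::size_t b = 0; b < batchSize; ++b) {
        for (std::size_t o = 0; o < outSize; ++o) {
            // Start from the bias value if a bias tensor is connected, otherwise zero.
            O acc = (biases != nullptr) ? static_cast<O>(biases[o]) : O(0);
            for (std::size_t i = 0; i < inSize; ++i) {
                acc += static_cast<O>(input[b * inSize + i]) * static_cast<O>(weights[o * inSize + i]);
            }
            output[b * outSize + o] = acc;
        }
    }
}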
@@ -28,23 +28,48 @@ Aidge::Elts_t Aidge::ConvDepthWiseImpl2D_cpu::getNbRequiredProtected(IOIndex_t /
 }
 void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
+    const auto& opTensor = static_cast<const OperatorTensor&>(mOp);
     assert(mOp.getRawInput(0) && "missing input #0");
     assert(mOp.getRawInput(1) && "missing input #1");
     assert(mOp.getRawInput(2) && "missing input #2");
-    assert((std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->nbDims() == 4) && "support for 4-dimensions tensors only");
+    assert((std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->nbDims() == 4) && "support for 3-dimensions tensors only");
     // Find the correct kernel type
-    auto kernelFunc =
-        Registrar<ConvDepthWiseImpl2DForward_cpu>::create({std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
+    const auto outputDataType = opTensor.getOutput(0)->dataType();
+    const Registrar<ConvDepthWiseImpl2DForward_cpu>::registrar_key registrarKey = {
+        opTensor.getInput(0)->dataType(),
+        opTensor.getInput(1)->dataType(),
+        ((opTensor.getInput(2)) ? opTensor.getInput(2)->dataType() : opTensor.getInput(1)->dataType()),
+        outputDataType};
+    Registrar<ConvDepthWiseImpl2DForward_cpu>::registrar_type kernelFunc;
+    if (Registrar<ConvDepthWiseImpl2DForward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<ConvDepthWiseImpl2DForward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<ConvDepthWiseImpl2DForward_cpu>::create({
+            outputDataType, outputDataType, outputDataType, outputDataType});
+    }
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = opTensor.getInput(0)->refCastFrom(input0Fallback, *opTensor.getOutput(0));
+    const auto& input1 = opTensor.getInput(1)->refCastFrom(input1Fallback, *opTensor.getOutput(0));
+    const auto& input2 = (opTensor.getInput(2)) ? opTensor.getInput(2)->refCastFrom(input2Fallback, *opTensor.getOutput(0)) : Tensor();
     // Call kernel
-    kernelFunc(dynamic_cast<const ConvDepthWise_Op<2>&>(mOp).getStaticAttributes(), std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(),
-               getCPUPtr(mOp.getRawInput(0)),
-               getCPUPtr(mOp.getRawInput(1)),
-               getCPUPtr(mOp.getRawInput(2)),
-               getCPUPtr(mOp.getRawOutput(0)));
+    kernelFunc(dynamic_cast<const ConvDepthWise_Op<2>&>(mOp).getStaticAttributes(), // Conv attributes
+               opTensor.getInput(0)->template dims<4>(), // input dimensions
+               input0.getImpl()->rawPtr(), // input
+               input1.getImpl()->rawPtr(), // weight
+               (opTensor.getInput(2)) ? input2.getImpl()->rawPtr() : nullptr, // bias
+               getCPUPtr(mOp.getRawOutput(0)) // output
+    );
 }
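The rewritten ConvDepthWiseImpl2D_cpu::forward() adopts the same pattern already used by ConvImpl2D_cpu::forward(): build a registrar key from the actual input/output data types (reusing the weight type for the bias slot when no bias is connected), pick the exact-match kernel if one is registered, otherwise fall back to the kernel whose types all match the output type and convert the inputs with refCastFrom. A self-contained sketch of that selection logic, using a simplified stand-in registry rather than the real Aidge Registrar API:

#include <array>
#include <functional>
#include <map>

// Simplified stand-in types for illustration (not the Aidge API).
enum class DataType { Float32, Float64, Int32 };
using Key = std::array<DataType, 4>;   // {input, weight, bias, output}
using Kernel = std::function<void()>;

std::map<Key, Kernel> gRegistry;       // hypothetical kernel registry

Kernel selectKernel(DataType in, DataType w, bool hasBias, DataType bias, DataType out) {
    // When no bias is connected, reuse the weight type for the bias slot of the key.
    const Key exactKey = {in, w, hasBias ? bias : w, out};
    const auto it = gRegistry.find(exactKey);
    if (it != gRegistry.end()) {
        return it->second;             // exact input/output type match
    }
    // Otherwise, fall back to the kernel where all types match the output type;
    // inputs are then cast to that type before the call (refCastFrom in Aidge).
    return gRegistry.at({out, out, out, out});
}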
@@ -40,7 +40,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = {
         opTensor.getInput(0)->dataType(),
         opTensor.getInput(1)->dataType(),
-        opTensor.getInput(2)->dataType(),
+        ((opTensor.getInput(2)) ? opTensor.getInput(2)->dataType() : opTensor.getInput(1)->dataType()),
         outputDataType};
     Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc;
@@ -61,7 +61,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
     const auto& input0 = opTensor.getInput(0)->refCastFrom(input0Fallback, *opTensor.getOutput(0));
     const auto& input1 = opTensor.getInput(1)->refCastFrom(input1Fallback, *opTensor.getOutput(0));
-    const auto& input2 = opTensor.getInput(2)->refCastFrom(input2Fallback, *opTensor.getOutput(0));
+    const auto& input2 = (opTensor.getInput(2)) ? opTensor.getInput(2)->refCastFrom(input2Fallback, *opTensor.getOutput(0)) : Tensor();
     // Call kernel
     kernelFunc(dynamic_cast<const Conv_Op<2>&>(mOp).getStaticAttributes(), // Conv attributes
@@ -69,7 +69,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
                dynamic_cast<const Conv_Op<2>&>(mOp).outChannels(), // outChannels
                input0.getImpl()->rawPtr(), // input
                input1.getImpl()->rawPtr(), // weight
-               input2.getImpl()->rawPtr(), // bias
+               (opTensor.getInput(2)) ? input2.getImpl()->rawPtr() : nullptr, // bias
                getCPUPtr(mOp.getRawOutput(0)) // output
     );
 }
@@ -34,9 +34,9 @@ void Aidge::FCImpl_cpu::forward()
     // Find the correct kernel type
     const auto outputDataType = op_.getOutput(0)->dataType();
     const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
-        outputDataType,
-        outputDataType,
-        outputDataType,
+        op_.getInput(0)->dataType(),
+        op_.getInput(1)->dataType(),
+        ((op_.getInput(2)) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
         outputDataType};
     Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
@@ -57,15 +57,16 @@ void Aidge::FCImpl_cpu::forward()
     std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
     const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
     const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0)));
-    const auto& input2 = op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0)));
+    const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor();
     // Call kernel
     const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
-    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
-               batchSize,
+    kernelFunc(batchSize,
                input1.dims()[1], // nb input features
                input1.dims()[0], // nb output features
-               input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
+               input0.getImpl()->rawPtr(),
+               input1.getImpl()->rawPtr(),
+               (op_.getInput(2)) ? input2.getImpl()->rawPtr() : nullptr,
                getCPUPtr(mOp.getRawOutput(0)));
 }
@@ -81,9 +82,9 @@ void Aidge::FCImpl_cpu::backward()
     // Find the correct kernel type
     const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
         fc_grad->dataType(),
-        op_.getInput(0)->grad()->dataType(),
         op_.getInput(1)->grad()->dataType(),
-        op_.getInput(2)->grad()->dataType()};
+        (op_.getInput(2)) ? op_.getInput(2)->grad()->dataType() : op_.getInput(1)->grad()->dataType(),
+        op_.getInput(0)->grad()->dataType()};
     Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
     if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
@@ -103,12 +104,11 @@ void Aidge::FCImpl_cpu::backward()
     std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
-    const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
+    const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor();
     // Call kernel
     const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
-    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
-               batchSize,
+    kernelFunc(batchSize,
                input1grad.dims()[1], // nb input features
                input1grad.dims()[0], // nb output features
                getCPUPtr(fc_grad),
@@ -116,5 +116,5 @@ void Aidge::FCImpl_cpu::backward()
                getCPUPtr(mOp.getRawInput(1)),
                input0grad.getImpl()->rawPtr(),
                input1grad.getImpl()->rawPtr(),
-               input2grad.getImpl()->rawPtr());
+               (op_.getInput(2)) ? input2grad.getImpl()->rawPtr() : nullptr);
 }