/* ===================================================================
*
* Copyright (c) 2018, Helder Eijs <helderijs@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* ===================================================================
*/
#include "common.h"
FAKE_INIT(ghash_clmul)
#if defined(HAVE_INTRIN_H)
#include <intrin.h>
#elif defined(HAVE_X86INTRIN_H)
#include <x86intrin.h>
#elif defined(HAVE_EMMINTRIN_H)
#include <xmmintrin.h>
#include <emmintrin.h>
#else
#error No SSE2 headers available
#endif
#if defined(HAVE_WMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
#include <wmmintrin.h>
#include <tmmintrin.h>
#else
#error No CLMUL headers available
#endif
/**
 * This module implements the basic GHASH multiplication, as described in
* NIST SP 800-38D.
*
* At the core, we perform a binary polynomial multiplication (carry-less)
 * modulo P(x) = x^128 + x^7 + x^2 + x + 1, that is, a finite field
 * multiplication in GF(2^128).
*
 * The GCM standard requires that the coefficients of the two polynomials be
 * encoded little-endian byte-wise but also (oddly enough) bit-wise (i.e. within
 * each byte).
*
 * In other words, the unity element (the polynomial 1) is encoded in memory as:
*
* 0x80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 = (binary) 100000.....0
*
* Note how the least significant bit (LSB) is the leftmost one in the first byte.
*
 * However, the native CPU instructions assume the conventional representation of
 * bits within a byte (with the leftmost bit being the most significant).
*
* To that end, instead of performing expensive bit-swapping, computing the
* multiplication, and then bit-swapping again, we can use an equivalent
* expression (see [2][3]) that operates directly on *bit-reflected* data.
 * This expression interprets the original encoding of the factors as if the
 * rightmost bit were the most significant.
*
 * The new expression is A * (B * x) * x^{-128} modulo
 * p(x) = x^128 + x^127 + x^126 + x^121 + 1 (the bit-reflection of P(x)).
*
 * For instance, the byte pattern that encodes the unity element 1 for the
 * original multiplication now reads as the value x^127 under the equivalent
 * expression. Within each byte, the leftmost bit is the most significant, as
 * desired.
*
* However, this also means that bytes end up encoded big-endian in memory. In
* order to use x86 arithmetic (and the XMM registers), data must be
* byte-swapped when loaded from or stored into memory.
*
* References:
* [1] http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.447.379&rep=rep1&type=pdf
* [2] https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
* [3] https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
*/
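/*
 * Illustrative sketch, not compiled into this module (the helper name
 * gcm_mulx_ref is hypothetical): multiplication by x directly in the original
 * GCM encoding, where bit i (counting from the leftmost bit of byte 0) is the
 * coefficient of x^i. Raising every degree by one is then a right shift of the
 * bit string, and the constant 0xE1 encodes the reduction
 * x^128 = x^7 + x^2 + x + 1.
 */
#if 0
static void gcm_mulx_ref(uint8_t out[16], const uint8_t in[16])
{
    unsigned i;
    uint8_t carry = in[15] & 1;     /* coefficient of x^127 */

    for (i = 15; i > 0; i--)
        out[i] = (uint8_t)((in[i] >> 1) | (in[i-1] << 7));
    out[0] = in[0] >> 1;
    if (carry)
        out[0] ^= 0xE1;             /* fold x^128 back into the result */
}
#endif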
struct exp_key {
/**
 * Powers of H (already swapped in byte endianness, and pre-multiplied by x)
*
* h[0] is x*H mod p(x)
* h[1] is x*H^2 mod p(x)
* h[2] is x*H^3 mod p(x)
* h[3] is x*H^4 mod p(x)
*/
__m128i h[4];
};
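/*
 * Storing the first four powers of H lets ghash_clmul() below process four
 * blocks per iteration with a single reduction, thanks to the identity:
 *
 *   Y_i = ((((Y_{i-4} + X_{i-3})*H + X_{i-2})*H + X_{i-1})*H + X_i)*H
 *       = (Y_{i-4} + X_{i-3})*H^4 + X_{i-2}*H^3 + X_{i-1}*H^2 + X_i*H
 */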
/**
* Perform the Montgomery reduction on a polynomial of degree 255,
* using basis x^128 and modulus p(x) = x^128 + x^127 + x^126 + x^121 + 1.
*
 * See the bottom of this file for a step-by-step derivation.
*/
STATIC FUNC_SSE2 __m128i reduce(__m128i prod_high, __m128i prod_low)
{
const uint64_t c2 = (uint64_t)0xc2 << 56;
__m128i t1, t2, t3, t4, t7;
t1 = prod_high; /* U3:U2 */
t7 = prod_low; /* U1:U0 */
t3 = _mm_loadl_epi64((__m128i*)&c2);
t2 = _mm_clmulepi64_si128(t3, t7, 0x00); /* A */
t4 = _mm_shuffle_epi32(t7, _MM_SHUFFLE(1,0,3,2)); /* U0:U1 */
t4 = _mm_xor_si128(t4, t2); /* B */
t2 = _mm_clmulepi64_si128(t3, t4, 0x00); /* C */
t4 = _mm_shuffle_epi32(t4, _MM_SHUFFLE(1,0,3,2)); /* B0:B1 */
t4 = _mm_xor_si128(t4, t2); /* D */
t1 = _mm_xor_si128(t1, t4); /* T */
return t1;
}
/**
 * Perform the carry-less multiplication of two polynomials of degree 127.
 *
 * Splitting A = A1:A0 and B = B1:B0 into 64-bit halves, the 256-bit product
 * is assembled from four 64x64 carry-less multiplications:
 *
 *   A*B = A1*B1*x^128 + (A1*B0 + A0*B1)*x^64 + A0*B0
 */
STATIC FUNC_SSE2 void clmult(__m128i *prod_high, __m128i *prod_low, __m128i a, __m128i b)
{
__m128i c, d, e, f, g, h, i;
c = _mm_clmulepi64_si128(a, b, 0x00); /* A0*B0 */
d = _mm_clmulepi64_si128(a, b, 0x11); /* A1*B1 */
e = _mm_clmulepi64_si128(a, b, 0x10); /* A0*B1 */
f = _mm_clmulepi64_si128(a, b, 0x01); /* A1*B0 */
g = _mm_xor_si128(e, f); /* E1+F1:E0+F0 */
h = _mm_slli_si128(g, 8); /* E0+F0:0 */
i = _mm_srli_si128(g, 8); /* 0:E1+F1 */
*prod_high = _mm_xor_si128(d, i);
*prod_low = _mm_xor_si128(c, h);
}
/**
* Multiply a polynomial of degree 127 by x, modulo p(x) = x^128 + x^127 + x^126 + x^121 + 1
*/
STATIC FUNC_SSE2 __m128i multx(__m128i a)
{
int msb;
int64_t r;
uint64_t p0, p1;
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
msb = _mm_movemask_epi8(a) >> 15; /* Bit 0 is a[127] */
r = (msb ^ 1) - 1; /* MSB is copied into all 64 positions */
p0 = (uint64_t)r & 0x0000000000000001U; /* Zero or XOR mask (low) */
p1 = (uint64_t)r & ((uint64_t)0xc2 << 56); /* Zero or XOR mask (high) */
t0 = _mm_loadl_epi64((__m128i*)&p0);
t1 = _mm_loadl_epi64((__m128i*)&p1);
t2 = _mm_unpacklo_epi64(t0, t1); /* Zero or XOR mask */
/* Shift value a left by 1 bit */
t3 = _mm_slli_si128(a, 8); /* Shift a left by 64 bits (lower 64 bits are zero) */
t4 = _mm_srli_epi64(t3, 63); /* Bit 64 is now a[63], all other bits are 0 */
t5 = _mm_slli_epi64(a, 1); /* Shift left by 1 bit, but bit 64 is zero, not a[63] */
t6 = _mm_or_si128(t4, t5); /* Actual result of shift left by 1 bit */
/* XOR conditional mask */
t7 = _mm_xor_si128(t2, t6);
return t7;
}
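/*
 * Scalar equivalent of multx(), as an illustrative sketch that is not
 * compiled into this module (the helper name multx_ref is hypothetical).
 * The 128-bit value is assumed to be held in two uint64_t words hi:lo,
 * already in the reflected representation used throughout this module.
 */
#if 0
static void multx_ref(uint64_t *hi, uint64_t *lo)
{
    uint64_t mask = (uint64_t)0 - (*hi >> 63);  /* all ones iff x^127 is set */

    *hi = (*hi << 1) | (*lo >> 63);
    *lo <<= 1;

    /* Conditionally fold x^128 = x^127 + x^126 + x^121 + 1 back in */
    *hi ^= mask & ((uint64_t)0xc2 << 56);
    *lo ^= mask & 1;
}
#endif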
/** Swap bytes in an XMM register **/
STATIC FUNC_SSE2 __m128i swap(__m128i a)
{
__m128i mask;
mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
return _mm_shuffle_epi8(a, mask);
}
EXPORT_SYM int ghash_expand_clmul(const uint8_t h[16], struct exp_key **expanded)
{
__m128i h128;
unsigned i;
if (NULL==h || NULL==expanded)
return ERR_NULL;
*expanded = align_alloc(sizeof(struct exp_key), 16);
if (NULL == *expanded)
return ERR_MEMORY;
h128 = swap(_mm_loadu_si128((__m128i*)h));
(*expanded)->h[0] = multx(h128); /** x*H **/
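    /*
     * Each operand below already carries one factor of x, and reduce()
     * divides by x^128; per the identity A * (B * x) * x^{-128} described
     * above, the result keeps a single factor of x, so h[i] = x*H^{i+1}.
     */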
for (i=1; i<4; i++) {
__m128i r0, r1;
clmult(&r0, &r1, (*expanded)->h[i-1], (*expanded)->h[0]);
(*expanded)->h[i] = reduce(r0, r1);
}
return 0;
}
EXPORT_SYM int ghash_destroy_clmul(struct exp_key *expanded)
{
align_free(expanded);
return 0;
}
EXPORT_SYM FUNC_SSE2 int ghash_clmul(
uint8_t y_out[16],
const uint8_t block_data[],
size_t len,
const uint8_t y_in[16],
struct exp_key *expanded
)
{
size_t i;
__m128i y_temp;
size_t len16;
if (NULL==y_out || NULL==block_data || NULL==y_in || NULL==expanded)
return ERR_NULL;
if (len % 16)
return ERR_NOT_ENOUGH_DATA;
y_temp = swap(_mm_loadu_si128((__m128i*)y_in));
    /** Authenticate 4 blocks (64 bytes) per loop iteration **/
    len16 = len ^ (len & 0x3F);     /* len rounded down to a multiple of 64 */
for (i=0; i<len16; i+=16*4) {
__m128i sum0, sum1;
__m128i xm3, xm2, xm1, x;
__m128i r0, r1, r2, r3, r4, r5, r6;
xm3 = swap(_mm_loadu_si128((__m128i*)&block_data[i]));
xm2 = swap(_mm_loadu_si128((__m128i*)&block_data[i+16]));
xm1 = swap(_mm_loadu_si128((__m128i*)&block_data[i+16*2]));
x = swap(_mm_loadu_si128((__m128i*)&block_data[i+16*3]));
/** (X_{i-3} + Y_{i-4}) * H^4 **/
r0 = _mm_xor_si128(xm3, y_temp);
clmult(&sum0, &sum1, r0, expanded->h[3]);
/** X_{i-2} * H^3 **/
clmult(&r1, &r2, xm2, expanded->h[2]);
sum0 = _mm_xor_si128(sum0, r1);
sum1 = _mm_xor_si128(sum1, r2);
/** X_{i-1} * H^2 **/
clmult(&r3, &r4, xm1, expanded->h[1]);
sum0 = _mm_xor_si128(sum0, r3);
sum1 = _mm_xor_si128(sum1, r4);
/** X_{i} * H **/
clmult(&r5, &r6, x, expanded->h[0]);
sum0 = _mm_xor_si128(sum0, r5);
sum1 = _mm_xor_si128(sum1, r6);
/** mod P **/
y_temp = reduce(sum0, sum1);
}
/** Y_i = (X_i + Y_{i-1}) * H mod P **/
for (; i<len; i+=16) {
__m128i z, xi;
__m128i prod_hi, prod_lo;
xi = swap(_mm_loadu_si128((__m128i*)&block_data[i]));
z = _mm_xor_si128(y_temp, xi);
clmult(&prod_hi, &prod_lo, z, expanded->h[0]);
y_temp = reduce(prod_hi, prod_lo);
}
y_temp = swap(y_temp);
_mm_storeu_si128((__m128i*)y_out, y_temp);
return 0;
}
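/*
 * Usage sketch (illustrative only, not compiled into this module). The helper
 * name ghash_example and its parameters are hypothetical; in GCM the hash
 * subkey H is the block cipher applied to the all-zero block, and deriving it
 * is left to the caller.
 */
#if 0
static int ghash_example(uint8_t y[16], const uint8_t h[16],
                         const uint8_t *data, size_t data_len)
{
    struct exp_key *ek;
    unsigned i;
    int res;

    for (i=0; i<16; i++)
        y[i] = 0;                   /* GHASH starts from the zero block */
    res = ghash_expand_clmul(h, &ek);
    if (res)
        return res;
    res = ghash_clmul(y, data, data_len, y, ek);    /* data_len % 16 == 0 */
    ghash_destroy_clmul(ek);
    return res;
}
#endif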
/**
 * The function reduce() computes the Montgomery reduction
 * of U (a 256-bit input) with the FastREDC algorithm:
 *
 * Q = (U mod X^128) * p' mod X^128
* T = (U + Q*p) div X^128
*
* where:
 * p  = 1:c200000000000000:1 = 1:c2:1
 * p' = p^{-1} mod X^128 = c200000000000000:1 = c2:1
*
* U3:U2 : U1:U0 (256 bit)
* Q1:Q0 (128 bit)
* T1:T0 (128 bit)
*
* Q = (U mod X^128) * p' mod X^128
* = (U1:U0) * p' mod X^128 = (U1:U0) * (c2:1) mod X^128 = Q1:Q0
* Q0 = U0
* Q1 = L(U0*c2) + U1
*
* T = (U + Q*p) div X^128 = T1:T0
*
* Q*p = S = Q1:Q0 * 1:c2:1
* S0 = Q0 (dropped)
* S1 = L(c2*Q0) + Q1 (dropped)
* S2 = Q0 + L(c2*Q1) + H(c2*Q0)
* S3 = Q1 + H(c2*Q1)
*
* T0 = S2 + U2
* T1 = S3 + U3
*
* Q1 = L(U0*c2) + U1
* T0 = U0 + L(c2*Q1) + H(c2*U0) + U2
* T1 = Q1 + H(c2*Q1) + U3
*
* A = c2*U0
* Q1 = A0 + U1
* T0 = U0 + L(c2*Q1) + A1 + U2
* T1 = Q1 + H(c2*Q1) + U3
*
* A = c2*U0
* B = A + U0:U1 = B1:Q1
* T0 = L(c2*B0) + B1 + U2
* T1 = B0 + H(c2*B0) + U3
*
* A = c2*U0
* B = A + U0:U1
* C = c2*B0
* T0 = C0 + B1 + U2
* T1 = B0 + C1 + U3
*
* A = c2*U0
* B = A + U0:U1
* C = c2*B0
* D = C + B0:B1
* T0 = D0 + U2
* T1 = D1 + U3
*/
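/*
 * The last five values above (A, B, C, D and T) map one-to-one onto the
 * annotations in the body of reduce().
 */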