Crypto++  8.6
Free C++ class library of cryptographic schemes
lsh512_avx.cpp
1 // lsh512_avx.cpp - written and placed in the public domain by Jeffrey Walton
2 // Based on the specification and source code provided by
3 // Korea Internet & Security Agency (KISA) website. Also
4 // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5 // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6 
7 // We are hitting some sort of GCC bug in the LSH AVX2 code path.
8 // Clang is OK on the AVX2 code path. We believe it is GCC Issue
9 // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10 // makes using zeroupper a little tricky.
11 
12 #include "pch.h"
13 #include "config.h"
14 
15 #include "lsh.h"
16 #include "misc.h"
17 
18 #if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
19 
20 #if defined(CRYPTOPP_AVX2_AVAILABLE)
21 # include <emmintrin.h>
22 # include <immintrin.h>
23 #endif
24 
25 // <x86intrin.h> is included for GCC 4.5 and later. Clang support is unknown. Also see https://stackoverflow.com/a/42493893.
26 #if (CRYPTOPP_GCC_VERSION >= 40500)
27 # include <x86intrin.h>
28 #endif
29 
30 ANONYMOUS_NAMESPACE_BEGIN
31 
32 /* LSH Constants */
33 
34 const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
35 // const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
36 // const unsigned int LSH512_CV_BYTE_LEN = 128;
37 const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
38 
39 // const unsigned int MSG_BLK_WORD_LEN = 32;
40 const unsigned int CV_WORD_LEN = 16;
41 const unsigned int CONST_WORD_LEN = 8;
42 // const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
43 const unsigned int NUM_STEPS = 28;
44 
45 const unsigned int ROT_EVEN_ALPHA = 23;
46 const unsigned int ROT_EVEN_BETA = 59;
47 const unsigned int ROT_ODD_ALPHA = 7;
48 const unsigned int ROT_ODD_BETA = 3;
49 
50 const unsigned int LSH_TYPE_512_512 = 0x0010040;
51 const unsigned int LSH_TYPE_512_384 = 0x0010030;
52 const unsigned int LSH_TYPE_512_256 = 0x0010020;
53 const unsigned int LSH_TYPE_512_224 = 0x001001C;
54 
55 // const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
56 // const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
57 
58 /* Error Code */
59 
60 const unsigned int LSH_SUCCESS = 0x0;
61 // const unsigned int LSH_ERR_NULL_PTR = 0x2401;
62 // const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
63 const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
64 const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
65 
66 /* Index into our state array */
67 
68 const unsigned int AlgorithmType = 80;
69 const unsigned int RemainingBits = 81;
70 
71 NAMESPACE_END
72 
73 NAMESPACE_BEGIN(CryptoPP)
74 NAMESPACE_BEGIN(LSH)
75 
76 // lsh512.cpp
77 extern const word64 LSH512_IV224[CV_WORD_LEN];
78 extern const word64 LSH512_IV256[CV_WORD_LEN];
79 extern const word64 LSH512_IV384[CV_WORD_LEN];
80 extern const word64 LSH512_IV512[CV_WORD_LEN];
81 extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
82 
83 NAMESPACE_END // LSH
84 NAMESPACE_END // Crypto++
85 
86 ANONYMOUS_NAMESPACE_BEGIN
87 
88 using CryptoPP::byte;
89 using CryptoPP::word32;
90 using CryptoPP::word64;
93 
94 using CryptoPP::GetBlock;
98 
99 using CryptoPP::LSH::LSH512_IV224;
100 using CryptoPP::LSH::LSH512_IV256;
101 using CryptoPP::LSH::LSH512_IV384;
102 using CryptoPP::LSH::LSH512_IV512;
103 using CryptoPP::LSH::LSH512_StepConstants;
104 
105 typedef byte lsh_u8;
106 typedef word32 lsh_u32;
107 typedef word64 lsh_u64;
108 typedef word32 lsh_uint;
109 typedef word32 lsh_err;
110 typedef word32 lsh_type;
111 
112 struct LSH512_AVX2_Context
113 {
114  LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
115  cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
116  last_block(reinterpret_cast<byte*>(state+48)),
117  remain_databitlen(remainingBitLength),
118  alg_type(static_cast<lsh_type>(algType)) {}
119 
120  lsh_u64* cv_l; // start of our state block
121  lsh_u64* cv_r;
122  lsh_u64* sub_msgs;
123  lsh_u8* last_block;
124  lsh_u64& remain_databitlen;
125  lsh_type alg_type;
126 };
127 
128 struct LSH512_AVX2_Internal
129 {
130  LSH512_AVX2_Internal(word64* state) :
131  submsg_e_l(state+16), submsg_e_r(state+24),
132  submsg_o_l(state+32), submsg_o_r(state+40) { }
133 
134  lsh_u64* submsg_e_l; /* even left sub-message */
135  lsh_u64* submsg_e_r; /* even right sub-message */
136  lsh_u64* submsg_o_l; /* odd left sub-message */
137  lsh_u64* submsg_o_r; /* odd right sub-message */
138 };
139 
140 // Zero the upper 128 bits of all YMM registers on exit.
141 // It avoids AVX state transition penalties when saving state.
142 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735
143 // makes using zeroupper a little tricky.
144 
// Scope guard: executes _mm256_zeroupper() when it goes out of scope.
struct AVX_Cleanup
{
    ~AVX_Cleanup()
    {
        _mm256_zeroupper();
    }
};
151 
152 // const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
153 
154 /* LSH AlgType Macro */
155 
156 inline bool LSH_IS_LSH512(lsh_uint val) {
157  return (val & 0xf0000) == 0x10000;
158 }
159 
160 inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
161  return val >> 24;
162 }
163 
164 inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
165  return val & 0xffff;
166 }
167 
168 inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
169  return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
170 }
171 
172 inline lsh_u64 loadLE64(lsh_u64 v) {
174 }
175 
176 lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
177  return rotlFixed(x, r);
178 }
179 
180 // Original code relied upon unaligned lsh_u64 buffer
181 inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
182 {
183  lsh_u64* submsg_e_l = i_state->submsg_e_l;
184  lsh_u64* submsg_e_r = i_state->submsg_e_r;
185  lsh_u64* submsg_o_l = i_state->submsg_o_l;
186  lsh_u64* submsg_o_r = i_state->submsg_o_r;
187 
188  _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
189  _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
190  _mm256_storeu_si256(M256_CAST(submsg_e_l+4),
191  _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
192 
193  _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
194  _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
195  _mm256_storeu_si256(M256_CAST(submsg_e_r+4),
196  _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
197 
198  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
199  _mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
200  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
201  _mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));
202 
203  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
204  _mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
205  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
206  _mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
207 }
208 
209 inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
210 {
211  CRYPTOPP_ASSERT(i_state != NULLPTR);
212 
213  lsh_u64* submsg_e_l = i_state->submsg_e_l;
214  lsh_u64* submsg_e_r = i_state->submsg_e_r;
215  lsh_u64* submsg_o_l = i_state->submsg_o_l;
216  lsh_u64* submsg_o_r = i_state->submsg_o_r;
217 
218  _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
219  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
220  _mm256_permute4x64_epi64(
221  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
222  _MM_SHUFFLE(1,0,2,3))));
223  _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
224  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
225  _mm256_permute4x64_epi64(
226  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
227  _MM_SHUFFLE(2,1,0,3))));
228 
229  _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
230  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
231  _mm256_permute4x64_epi64(
232  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
233  _MM_SHUFFLE(1,0,2,3))));
234  _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
235  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
236  _mm256_permute4x64_epi64(
237  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
238  _MM_SHUFFLE(2,1,0,3))));
239 }
240 
241 inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
242 {
243  CRYPTOPP_ASSERT(i_state != NULLPTR);
244 
245  lsh_u64* submsg_e_l = i_state->submsg_e_l;
246  lsh_u64* submsg_e_r = i_state->submsg_e_r;
247  lsh_u64* submsg_o_l = i_state->submsg_o_l;
248  lsh_u64* submsg_o_r = i_state->submsg_o_r;
249 
250  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
251  _mm256_add_epi64(
252  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
253  _mm256_permute4x64_epi64(
254  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
255  _MM_SHUFFLE(1,0,2,3))));
256  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
257  _mm256_add_epi64(
258  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
259  _mm256_permute4x64_epi64(
260  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
261  _MM_SHUFFLE(2,1,0,3))));
262 
263  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
264  _mm256_add_epi64(
265  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
266  _mm256_permute4x64_epi64(
267  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
268  _MM_SHUFFLE(1,0,2,3))));
269  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
270  _mm256_add_epi64(
271  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
272  _mm256_permute4x64_epi64(
273  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
274  _MM_SHUFFLE(2,1,0,3))));
275 }
276 
277 inline void load_sc(const lsh_u64** p_const_v, size_t i)
278 {
279  *p_const_v = &LSH512_StepConstants[i];
280 }
281 
282 inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
283 {
284  CRYPTOPP_ASSERT(i_state != NULLPTR);
285 
286  lsh_u64* submsg_e_l = i_state->submsg_e_l;
287  lsh_u64* submsg_e_r = i_state->submsg_e_r;
288 
289  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
290  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
291  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
292  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
293  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
294  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));
295 
296  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
297  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
298  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
299  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
300  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
301  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
302 }
303 
304 inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
305 {
306  CRYPTOPP_ASSERT(i_state != NULLPTR);
307 
308  lsh_u64* submsg_o_l = i_state->submsg_o_l;
309  lsh_u64* submsg_o_r = i_state->submsg_o_r;
310 
311  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
312  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
313  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
314  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
315  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
316  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
317 
318  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
319  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
320  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
321  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
322  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
323  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
324 }
325 
326 inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
327 {
328  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
329  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
330  _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
331  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
332  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
333  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
334 }
335 
336 template <unsigned int R>
337 inline void rotate_blk(lsh_u64 cv[8])
338 {
339  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
340  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
341  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
342  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
343  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
344  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
345 }
346 
347 inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
348 {
349  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
350  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
351  _mm256_loadu_si256(CONST_M256_CAST(const_v))));
352  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
353  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
354  _mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
355 }
356 
357 inline void rotate_msg_gamma(lsh_u64 cv_r[8])
358 {
359  // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
360  _mm256_storeu_si256(M256_CAST(cv_r+0),
361  _mm256_shuffle_epi8(
362  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
363  _mm256_set_epi8(
364  /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
365  /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
366  _mm256_storeu_si256(M256_CAST(cv_r+4),
367  _mm256_shuffle_epi8(
368  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
369  _mm256_set_epi8(
370  /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
371  /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
372 }
373 
374 inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
375 {
376  __m256i temp[2];
377  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
378  _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
379  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
380  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
381  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
382  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
383  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
384  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
385 
386  temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
387  temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));
388 
389  _mm256_storeu_si256(M256_CAST(cv_l+0),
390  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
391  _mm256_storeu_si256(M256_CAST(cv_l+4),
392  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));
393 
394  _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
395  _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
396 };
397 
398 /* -------------------------------------------------------- *
399 * step function
400 * -------------------------------------------------------- */
401 
402 template <unsigned int Alpha, unsigned int Beta>
403 inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
404 {
405  add_blk(cv_l, cv_r);
406  rotate_blk<Alpha>(cv_l);
407  xor_with_const(cv_l, const_v);
408  add_blk(cv_r, cv_l);
409  rotate_blk<Beta>(cv_r);
410  add_blk(cv_l, cv_r);
411  rotate_msg_gamma(cv_r);
412 }
413 
414 /* -------------------------------------------------------- *
415 * compression function
416 * -------------------------------------------------------- */
417 
418 inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
419 {
420  CRYPTOPP_ASSERT(ctx != NULLPTR);
421 
422  LSH512_AVX2_Internal s_state(ctx->cv_l);
423  LSH512_AVX2_Internal* i_state = &s_state;
424 
425  const lsh_u64* const_v = NULL;
426  lsh_u64 *cv_l = ctx->cv_l;
427  lsh_u64 *cv_r = ctx->cv_r;
428 
429  load_msg_blk(i_state, pdMsgBlk);
430 
431  msg_add_even(cv_l, cv_r, i_state);
432  load_sc(&const_v, 0);
433  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
434  word_perm(cv_l, cv_r);
435 
436  msg_add_odd(cv_l, cv_r, i_state);
437  load_sc(&const_v, 8);
438  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
439  word_perm(cv_l, cv_r);
440 
441  for (size_t i = 1; i < NUM_STEPS / 2; i++)
442  {
443  msg_exp_even(i_state);
444  msg_add_even(cv_l, cv_r, i_state);
445  load_sc(&const_v, 16 * i);
446  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
447  word_perm(cv_l, cv_r);
448 
449  msg_exp_odd(i_state);
450  msg_add_odd(cv_l, cv_r, i_state);
451  load_sc(&const_v, 16 * i + 8);
452  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
453  word_perm(cv_l, cv_r);
454  }
455 
456  msg_exp_even(i_state);
457  msg_add_even(cv_l, cv_r, i_state);
458 }
459 
460 /* -------------------------------------------------------- */
461 
462 inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
463 {
464  // The IV's are 32-byte aligned so we can use aligned loads.
465  _mm256_storeu_si256(M256_CAST(cv_l+0),
466  _mm256_load_si256(CONST_M256_CAST(iv+0)));
467  _mm256_storeu_si256(M256_CAST(cv_l+4),
468  _mm256_load_si256(CONST_M256_CAST(iv+4)));
469 
470  _mm256_storeu_si256(M256_CAST(cv_r+0),
471  _mm256_load_si256(CONST_M256_CAST(iv+8)));
472  _mm256_storeu_si256(M256_CAST(cv_r+4),
473  _mm256_load_si256(CONST_M256_CAST(iv+12)));
474 }
475 
476 inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
477 {
478  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
479  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
480  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
481  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
482 }
483 
484 inline void zero_submsgs(LSH512_AVX2_Context* ctx)
485 {
486  lsh_u64* sub_msgs = ctx->sub_msgs;
487 
488  _mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
489  _mm256_setzero_si256());
490  _mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
491  _mm256_setzero_si256());
492 
493  _mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
494  _mm256_setzero_si256());
495  _mm256_storeu_si256(M256_CAST(sub_msgs+12),
496  _mm256_setzero_si256());
497 }
498 
499 inline void init224(LSH512_AVX2_Context* ctx)
500 {
501  CRYPTOPP_ASSERT(ctx != NULLPTR);
502 
503  zero_submsgs(ctx);
504  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
505 }
506 
507 inline void init256(LSH512_AVX2_Context* ctx)
508 {
509  CRYPTOPP_ASSERT(ctx != NULLPTR);
510 
511  zero_submsgs(ctx);
512  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
513 }
514 
515 inline void init384(LSH512_AVX2_Context* ctx)
516 {
517  CRYPTOPP_ASSERT(ctx != NULLPTR);
518 
519  zero_submsgs(ctx);
520  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
521 }
522 
523 inline void init512(LSH512_AVX2_Context* ctx)
524 {
525  CRYPTOPP_ASSERT(ctx != NULLPTR);
526 
527  zero_submsgs(ctx);
528  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
529 }
530 
531 /* -------------------------------------------------------- */
532 
533 inline void fin(LSH512_AVX2_Context* ctx)
534 {
535  CRYPTOPP_ASSERT(ctx != NULLPTR);
536 
537  _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
538  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
539  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
540 
541  _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
542  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
543  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
544 }
545 
546 /* -------------------------------------------------------- */
547 
548 inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
549 {
550  CRYPTOPP_ASSERT(ctx != NULLPTR);
551  CRYPTOPP_ASSERT(ctx->alg_type != 0);
552  CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
553 
554  lsh_uint alg_type = ctx->alg_type;
555  lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
556  lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
557 
558  // Multiplying by sizeof(lsh_u8) looks odd...
559  memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
560  if (hash_val_bit_len){
561  pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
562  }
563 }
564 
565 /* -------------------------------------------------------- */
566 
567 lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
568 {
569  CRYPTOPP_ASSERT(ctx != NULLPTR);
570  CRYPTOPP_ASSERT(ctx->alg_type != 0);
571 
572  lsh_u32 alg_type = ctx->alg_type;
573  const lsh_u64* const_v = NULL;
574  ctx->remain_databitlen = 0;
575 
576  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
577  AVX_Cleanup cleanup;
578 
579  switch (alg_type){
580  case LSH_TYPE_512_512:
581  init512(ctx);
582  return LSH_SUCCESS;
583  case LSH_TYPE_512_384:
584  init384(ctx);
585  return LSH_SUCCESS;
586  case LSH_TYPE_512_256:
587  init256(ctx);
588  return LSH_SUCCESS;
589  case LSH_TYPE_512_224:
590  init224(ctx);
591  return LSH_SUCCESS;
592  default:
593  break;
594  }
595 
596  lsh_u64* cv_l = ctx->cv_l;
597  lsh_u64* cv_r = ctx->cv_r;
598 
599  zero_iv(cv_l, cv_r);
600  cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
601  cv_l[1] = LSH_GET_HASHBIT(alg_type);
602 
603  for (size_t i = 0; i < NUM_STEPS / 2; i++)
604  {
605  //Mix
606  load_sc(&const_v, i * 16);
607  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
608  word_perm(cv_l, cv_r);
609 
610  load_sc(&const_v, i * 16 + 8);
611  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
612  word_perm(cv_l, cv_r);
613  }
614 
615  return LSH_SUCCESS;
616 }
617 
618 lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
619 {
620  CRYPTOPP_ASSERT(ctx != NULLPTR);
621  CRYPTOPP_ASSERT(data != NULLPTR);
622  CRYPTOPP_ASSERT(databitlen % 8 == 0);
623  CRYPTOPP_ASSERT(ctx->alg_type != 0);
624 
625  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
626  AVX_Cleanup cleanup;
627 
628  if (databitlen == 0){
629  return LSH_SUCCESS;
630  }
631 
632  // We are byte oriented. tail bits will always be 0.
633  size_t databytelen = databitlen >> 3;
634  // lsh_uint pos2 = databitlen & 0x7;
635  const size_t pos2 = 0;
636 
637  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
638  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
639  const size_t remain_msg_bit = 0;
640 
641  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
642  return LSH_ERR_INVALID_STATE;
643  }
644  if (remain_msg_bit > 0){
645  return LSH_ERR_INVALID_DATABITLEN;
646  }
647 
648  if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
649  memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
650  ctx->remain_databitlen += (lsh_uint)databitlen;
651  remain_msg_byte += (lsh_uint)databytelen;
652  if (pos2){
653  ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
654  }
655  return LSH_SUCCESS;
656  }
657 
658  if (remain_msg_byte > 0){
659  size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
660  memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
661  compress(ctx, ctx->last_block);
662  data += more_byte;
663  databytelen -= more_byte;
664  remain_msg_byte = 0;
665  ctx->remain_databitlen = 0;
666  }
667 
668  while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
669  {
670  // This call to compress caused some trouble.
671  // The data pointer can become unaligned in the
672  // previous block.
673  compress(ctx, data);
674  data += LSH512_MSG_BLK_BYTE_LEN;
675  databytelen -= LSH512_MSG_BLK_BYTE_LEN;
676  }
677 
678  if (databytelen > 0){
679  memcpy(ctx->last_block, data, databytelen);
680  ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
681  }
682 
683  if (pos2){
684  ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
685  ctx->remain_databitlen += pos2;
686  }
687  return LSH_SUCCESS;
688 }
689 
690 lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
691 {
692  CRYPTOPP_ASSERT(ctx != NULLPTR);
693  CRYPTOPP_ASSERT(hashval != NULLPTR);
694 
695  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
696  AVX_Cleanup cleanup;
697 
698  // We are byte oriented. tail bits will always be 0.
699  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
700  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
701  const size_t remain_msg_bit = 0;
702 
703  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
704  return LSH_ERR_INVALID_STATE;
705  }
706 
707  if (remain_msg_bit){
708  ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
709  }
710  else{
711  ctx->last_block[remain_msg_byte] = 0x80;
712  }
713  memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
714 
715  compress(ctx, ctx->last_block);
716 
717  fin(ctx);
718  get_hash(ctx, hashval);
719 
720  return LSH_SUCCESS;
721 }
722 
723 ANONYMOUS_NAMESPACE_END
724 
725 NAMESPACE_BEGIN(CryptoPP)
726 
727 extern
728 void LSH512_Base_Restart_AVX2(word64* state)
729 {
730  state[RemainingBits] = 0;
731  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
732  lsh_err err = lsh512_init_avx2(&ctx);
733 
734  if (err != LSH_SUCCESS)
735  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
736 }
737 
738 extern
739 void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
740 {
741  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
742  lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
743 
744  if (err != LSH_SUCCESS)
745  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
746 }
747 
748 extern
749 void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
750 {
751  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
752  lsh_err err = lsh512_final_avx2(&ctx, hash);
753 
754  if (err != LSH_SUCCESS)
755  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
756 }
757 
758 NAMESPACE_END
759 
760 #endif // CRYPTOPP_AVX2_AVAILABLE
Base class for all exceptions thrown by the library.
Definition: cryptlib.h:159
@ OTHER_ERROR
Some other error occurred not belonging to other categories.
Definition: cryptlib.h:177
Library configuration file.
unsigned char byte
8-bit unsigned datatype
Definition: config_int.h:56
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:91
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:145
EnumToType< ByteOrder, LITTLE_ENDIAN_ORDER > LittleEndian
Provides a constant for LittleEndian.
Definition: cryptlib.h:150
Classes for the LSH hash functions.
Utility functions for the Crypto++ library.
T rotlConstant(T x)
Performs a left rotate.
Definition: misc.h:1547
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
Definition: misc.h:2187
T rotlFixed(T x, unsigned int y)
Performs a left rotate.
Definition: misc.h:1598
Crypto++ library namespace.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68