mdds
Loading...
Searching...
No Matches
soa/block_util.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*************************************************************************
3 *
4 * Copyright (c) 2021 Kohei Yoshida
5 *
6 * Permission is hereby granted, free of charge, to any person
7 * obtaining a copy of this software and associated documentation
8 * files (the "Software"), to deal in the Software without
9 * restriction, including without limitation the rights to use,
10 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following
13 * conditions:
14 *
15 * The above copyright notice and this permission notice shall be
16 * included in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 * OTHER DEALINGS IN THE SOFTWARE.
26 *
27 ************************************************************************/
28
29#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
30#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
31
32#include "mdds/global.hpp"
33#include "../types.hpp"
34
35#if defined(__SSE2__)
36#include <emmintrin.h>
37#endif
38#if defined(__AVX2__)
39#include <immintrin.h>
40#endif
41
42namespace mdds { namespace mtv { namespace soa { namespace detail {
43
44template<typename Blks, lu_factor_t F>
46{
47 void operator()(Blks& /*block_store*/, int64_t /*start_block_index*/, int64_t /*delta*/) const
48 {
49 static_assert(invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
50 }
51};
52
53template<typename Blks>
54struct adjust_block_positions<Blks, lu_factor_t::none>
55{
56 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
57 {
58 int64_t n = block_store.positions.size();
59
60 if (start_block_index >= n)
61 return;
62
63#if MDDS_USE_OPENMP
64#pragma omp parallel for
65#endif
66 for (int64_t i = start_block_index; i < n; ++i)
67 block_store.positions[i] += delta;
68 }
69};
70
71template<typename Blks>
72struct adjust_block_positions<Blks, lu_factor_t::lu4>
73{
74 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
75 {
76 int64_t n = block_store.positions.size();
77
78 if (start_block_index >= n)
79 return;
80
81 // Ensure that the section length is divisible by 4.
82 int64_t len = n - start_block_index;
83 int64_t rem = len & 3; // % 4
84 len -= rem;
85 len += start_block_index;
86#if MDDS_USE_OPENMP
87#pragma omp parallel for
88#endif
89 for (int64_t i = start_block_index; i < len; i += 4)
90 {
91 block_store.positions[i + 0] += delta;
92 block_store.positions[i + 1] += delta;
93 block_store.positions[i + 2] += delta;
94 block_store.positions[i + 3] += delta;
95 }
96
97 rem += len;
98 for (int64_t i = len; i < rem; ++i)
99 block_store.positions[i] += delta;
100 }
101};
102
103template<typename Blks>
104struct adjust_block_positions<Blks, lu_factor_t::lu8>
105{
106 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
107 {
108 int64_t n = block_store.positions.size();
109
110 if (start_block_index >= n)
111 return;
112
113 // Ensure that the section length is divisible by 8.
114 int64_t len = n - start_block_index;
115 int64_t rem = len & 7; // % 8
116 len -= rem;
117 len += start_block_index;
118#if MDDS_USE_OPENMP
119#pragma omp parallel for
120#endif
121 for (int64_t i = start_block_index; i < len; i += 8)
122 {
123 block_store.positions[i + 0] += delta;
124 block_store.positions[i + 1] += delta;
125 block_store.positions[i + 2] += delta;
126 block_store.positions[i + 3] += delta;
127 block_store.positions[i + 4] += delta;
128 block_store.positions[i + 5] += delta;
129 block_store.positions[i + 6] += delta;
130 block_store.positions[i + 7] += delta;
131 }
132
133 rem += len;
134 for (int64_t i = len; i < rem; ++i)
135 block_store.positions[i] += delta;
136 }
137};
138
139template<typename Blks>
140struct adjust_block_positions<Blks, lu_factor_t::lu16>
141{
142 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
143 {
144 int64_t n = block_store.positions.size();
145
146 if (start_block_index >= n)
147 return;
148
149 // Ensure that the section length is divisible by 16.
150 int64_t len = n - start_block_index;
151 int64_t rem = len & 15; // % 16
152 len -= rem;
153 len += start_block_index;
154#if MDDS_USE_OPENMP
155#pragma omp parallel for
156#endif
157 for (int64_t i = start_block_index; i < len; i += 16)
158 {
159 block_store.positions[i + 0] += delta;
160 block_store.positions[i + 1] += delta;
161 block_store.positions[i + 2] += delta;
162 block_store.positions[i + 3] += delta;
163 block_store.positions[i + 4] += delta;
164 block_store.positions[i + 5] += delta;
165 block_store.positions[i + 6] += delta;
166 block_store.positions[i + 7] += delta;
167 block_store.positions[i + 8] += delta;
168 block_store.positions[i + 9] += delta;
169 block_store.positions[i + 10] += delta;
170 block_store.positions[i + 11] += delta;
171 block_store.positions[i + 12] += delta;
172 block_store.positions[i + 13] += delta;
173 block_store.positions[i + 14] += delta;
174 block_store.positions[i + 15] += delta;
175 }
176
177 rem += len;
178 for (int64_t i = len; i < rem; ++i)
179 block_store.positions[i] += delta;
180 }
181};
182
183template<typename Blks>
184struct adjust_block_positions<Blks, lu_factor_t::lu32>
185{
186 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
187 {
188 int64_t n = block_store.positions.size();
189
190 if (start_block_index >= n)
191 return;
192
193 // Ensure that the section length is divisible by 32.
194 int64_t len = n - start_block_index;
195 int64_t rem = len & 31; // % 32
196 len -= rem;
197 len += start_block_index;
198#if MDDS_USE_OPENMP
199#pragma omp parallel for
200#endif
201 for (int64_t i = start_block_index; i < len; i += 32)
202 {
203 block_store.positions[i + 0] += delta;
204 block_store.positions[i + 1] += delta;
205 block_store.positions[i + 2] += delta;
206 block_store.positions[i + 3] += delta;
207 block_store.positions[i + 4] += delta;
208 block_store.positions[i + 5] += delta;
209 block_store.positions[i + 6] += delta;
210 block_store.positions[i + 7] += delta;
211 block_store.positions[i + 8] += delta;
212 block_store.positions[i + 9] += delta;
213 block_store.positions[i + 10] += delta;
214 block_store.positions[i + 11] += delta;
215 block_store.positions[i + 12] += delta;
216 block_store.positions[i + 13] += delta;
217 block_store.positions[i + 14] += delta;
218 block_store.positions[i + 15] += delta;
219 block_store.positions[i + 16] += delta;
220 block_store.positions[i + 17] += delta;
221 block_store.positions[i + 18] += delta;
222 block_store.positions[i + 19] += delta;
223 block_store.positions[i + 20] += delta;
224 block_store.positions[i + 21] += delta;
225 block_store.positions[i + 22] += delta;
226 block_store.positions[i + 23] += delta;
227 block_store.positions[i + 24] += delta;
228 block_store.positions[i + 25] += delta;
229 block_store.positions[i + 26] += delta;
230 block_store.positions[i + 27] += delta;
231 block_store.positions[i + 28] += delta;
232 block_store.positions[i + 29] += delta;
233 block_store.positions[i + 30] += delta;
234 block_store.positions[i + 31] += delta;
235 }
236
237 rem += len;
238 for (int64_t i = len; i < rem; ++i)
239 block_store.positions[i] += delta;
240 }
241};
242
243#if defined(__SSE2__)
244
245template<typename Blks>
246struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
247{
248 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
249 {
250 static_assert(
251 sizeof(typename decltype(block_store.positions)::value_type) == 8,
252 "This code works only when the position values are 64-bit wide.");
253
254 int64_t n = block_store.positions.size();
255
256 if (start_block_index >= n)
257 return;
258
259 // Ensure that the section length is divisible by 2.
260 int64_t len = n - start_block_index;
261 bool odd = len & 1;
262 if (odd)
263 len -= 1;
264
265 len += start_block_index;
266
267 __m128i right = _mm_set_epi64x(delta, delta);
268
269#if MDDS_USE_OPENMP
270#pragma omp parallel for
271#endif
272 for (int64_t i = start_block_index; i < len; i += 2)
273 {
274 __m128i* dst = (__m128i*)&block_store.positions[i];
275 _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
276 }
277
278 if (odd)
279 block_store.positions[len] += delta;
280 }
281};
282
283template<typename Blks>
284struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
285{
286 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
287 {
288 static_assert(
289 sizeof(typename decltype(block_store.positions)::value_type) == 8,
290 "This code works only when the position values are 64-bit wide.");
291
292 int64_t n = block_store.positions.size();
293
294 if (start_block_index >= n)
295 return;
296
297 // Ensure that the section length is divisible by 8.
298 int64_t len = n - start_block_index;
299 int64_t rem = len & 7; // % 8
300 len -= rem;
301 len += start_block_index;
302
303 __m128i right = _mm_set_epi64x(delta, delta);
304
305#if MDDS_USE_OPENMP
306#pragma omp parallel for
307#endif
308 for (int64_t i = start_block_index; i < len; i += 8)
309 {
310 __m128i* dst0 = (__m128i*)&block_store.positions[i];
311 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
312
313 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
314 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
315
316 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
317 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
318
319 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
320 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
321 }
322
323 rem += len;
324 for (int64_t i = len; i < rem; ++i)
325 block_store.positions[i] += delta;
326 }
327};
328
329template<typename Blks>
330struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
331{
332 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
333 {
334 static_assert(
335 sizeof(typename decltype(block_store.positions)::value_type) == 8,
336 "This code works only when the position values are 64-bit wide.");
337
338 int64_t n = block_store.positions.size();
339
340 if (start_block_index >= n)
341 return;
342
343 // Ensure that the section length is divisible by 16.
344 int64_t len = n - start_block_index;
345 int64_t rem = len & 15; // % 16
346 len -= rem;
347 len += start_block_index;
348
349 __m128i right = _mm_set_epi64x(delta, delta);
350
351#if MDDS_USE_OPENMP
352#pragma omp parallel for
353#endif
354 for (int64_t i = start_block_index; i < len; i += 16)
355 {
356 __m128i* dst0 = (__m128i*)&block_store.positions[i];
357 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
358
359 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
360 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
361
362 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
363 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
364
365 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
366 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
367
368 __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
369 _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));
370
371 __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
372 _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));
373
374 __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
375 _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));
376
377 __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
378 _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
379 }
380
381 rem += len;
382 for (int64_t i = len; i < rem; ++i)
383 block_store.positions[i] += delta;
384 }
385};
386
387template<typename Blks>
388struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
389{
390 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
391 {
392 static_assert(
393 sizeof(typename decltype(block_store.positions)::value_type) == 8,
394 "This code works only when the position values are 64-bit wide.");
395
396 int64_t n = block_store.positions.size();
397
398 if (start_block_index >= n)
399 return;
400
401 // Ensure that the section length is divisible by 32.
402 int64_t len = n - start_block_index;
403 int64_t rem = len & 31; // % 32
404 len -= rem;
405 len += start_block_index;
406
407 __m128i right = _mm_set_epi64x(delta, delta);
408
409#if MDDS_USE_OPENMP
410#pragma omp parallel for
411#endif
412 for (int64_t i = start_block_index; i < len; i += 32)
413 {
414 __m128i* dst0 = (__m128i*)&block_store.positions[i];
415 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
416
417 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
418 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
419
420 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
421 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
422
423 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
424 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
425
426 __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
427 _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));
428
429 __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
430 _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));
431
432 __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
433 _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));
434
435 __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
436 _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
437
438 __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
439 _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));
440
441 __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
442 _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));
443
444 __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
445 _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));
446
447 __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
448 _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));
449
450 __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
451 _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));
452
453 __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
454 _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));
455
456 __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
457 _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));
458
459 __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
460 _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
461 }
462
463 rem += len;
464 for (int64_t i = len; i < rem; ++i)
465 block_store.positions[i] += delta;
466 }
467};
468
469#endif // __SSE2__
470
471#if defined(__AVX2__)
472
473template<typename Blks>
474struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
475{
476 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
477 {
478 static_assert(
479 sizeof(typename decltype(block_store.positions)::value_type) == 8,
480 "This code works only when the position values are 64-bit wide.");
481
482 int64_t n = block_store.positions.size();
483
484 if (start_block_index >= n)
485 return;
486
487 // Ensure that the section length is divisible by 4.
488 int64_t len = n - start_block_index;
489 int64_t rem = len & 3; // % 4
490 len -= rem;
491 len += start_block_index;
492
493 __m256i right = _mm256_set1_epi64x(delta);
494
495#if MDDS_USE_OPENMP
496#pragma omp parallel for
497#endif
498 for (int64_t i = start_block_index; i < len; i += 4)
499 {
500 __m256i* dst = (__m256i*)&block_store.positions[i];
501 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
502 }
503
504 rem += len;
505 for (int64_t i = len; i < rem; ++i)
506 block_store.positions[i] += delta;
507 }
508};
509
510template<typename Blks>
511struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
512{
513 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
514 {
515 static_assert(
516 sizeof(typename decltype(block_store.positions)::value_type) == 8,
517 "This code works only when the position values are 64-bit wide.");
518
519 int64_t n = block_store.positions.size();
520
521 if (start_block_index >= n)
522 return;
523
524 // Ensure that the section length is divisible by 16.
525 int64_t len = n - start_block_index;
526 int64_t rem = len & 15; // % 16
527 len -= rem;
528 len += start_block_index;
529
530 __m256i right = _mm256_set1_epi64x(delta);
531
532#if MDDS_USE_OPENMP
533#pragma omp parallel for
534#endif
535 for (int64_t i = start_block_index; i < len; i += 16)
536 {
537 __m256i* dst = (__m256i*)&block_store.positions[i];
538 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
539
540 __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
541 _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));
542
543 __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
544 _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));
545
546 __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
547 _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
548 }
549
550 rem += len;
551 for (int64_t i = len; i < rem; ++i)
552 block_store.positions[i] += delta;
553 }
554};
555
556template<typename Blks>
557struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
558{
559 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
560 {
561 static_assert(
562 sizeof(typename decltype(block_store.positions)::value_type) == 8,
563 "This code works only when the position values are 64-bit wide.");
564
565 int64_t n = block_store.positions.size();
566
567 if (start_block_index >= n)
568 return;
569
570 // Ensure that the section length is divisible by 16.
571 int64_t len = n - start_block_index;
572 int64_t rem = len & 31; // % 32
573 len -= rem;
574 len += start_block_index;
575
576 __m256i right = _mm256_set1_epi64x(delta);
577
578#if MDDS_USE_OPENMP
579#pragma omp parallel for
580#endif
581 for (int64_t i = start_block_index; i < len; i += 32)
582 {
583 __m256i* dst = (__m256i*)&block_store.positions[i];
584 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
585
586 __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
587 _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));
588
589 __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
590 _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));
591
592 __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
593 _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
594
595 __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
596 _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));
597
598 __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
599 _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));
600
601 __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
602 _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));
603
604 __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
605 _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
606 }
607
608 rem += len;
609 for (int64_t i = len; i < rem; ++i)
610 block_store.positions[i] += delta;
611 }
612};
613
614#endif // __AVX2__
615
616}}}} // namespace mdds::mtv::soa::detail
617
618#endif
619
620/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition soa/block_util.hpp:46