HOME | DD
Published: 2017-11-16 19:48:48 +0000 UTC; Views: 768; Favourites: 7; Downloads: 14
Redirect to original
Description
Addition to the code here: it can now do edges (and tiling). Rendered a few versions of this, but picked this one due to the number of spirals.
In the previous code add this to the setup method:
// Broadcast the grid limits into every 32-bit lane once, so the wrap-around
// Laplacian can compare and correct all lane coordinates with vector operations.
height_minus1 = _mm256_set1_epi32( data_height - 1 ) ;
height = _mm256_set1_epi32( data_height ) ;
width_minus1 = _mm256_set1_epi32( data_width - 1 ) ;
And in the protected section, after Laplacian_Operator_AVX2, add:
// Lane-wise constants used by Laplacian_Operator_Wrap_AVX2 for its range checks.
// All lanes zero: lower bound for the "coordinate < 0" compare.
__m256i zero = _mm256_setzero_si256() ;
// data_width - 1 in every lane (assigned in the setup method): upper bound for columns.
__m256i width_minus1 ;
// data_height in every lane (assigned in the setup method): correction applied to rows.
__m256i height ;
// data_height - 1 in every lane (assigned in the setup method): upper bound for rows.
__m256i height_minus1 ;
float Laplacian_Operator_Wrap_AVX2( int32_t x , int32_t y , const float *read_data )
{
// Generate memory index
__m256i px = _mm256_add_epi32( _mm256_set1_epi32( x ) , dx ) ;
__m256i py = _mm256_add_epi32( _mm256_set1_epi32( y ) , dy ) ;
// If x < 0, add width
__m256i mask = _mm256_cmpgt_epi32( zero , px ) ;
px = _mm256_add_epi32( px , _mm256_and_si256( mask , width ) ) ;
// If x >= width, sub width
mask = _mm256_cmpgt_epi32( px , width_minus1 ) ;
px = _mm256_sub_epi32( px , _mm256_and_si256( mask , width ) ) ;
// If y < 0, add height
mask = _mm256_cmpgt_epi32( zero , py ) ;
py = _mm256_add_epi32( py , _mm256_and_si256( mask , height ) ) ;
// If y >= height, sub height
mask = _mm256_cmpgt_epi32( py , height_minus1 ) ;
py = _mm256_sub_epi32( py , _mm256_and_si256( mask , height ) ) ;
__m256i index = _mm256_add_epi32( px , _mm256_mullo_epi32( py , width ) ) ;
// Get values from memory and scale by weights, 4 = sizeof(float)
__m256 v = _mm256_mul_ps( _mm256_i32gather_ps( read_data , index , 4 ) , lw ) ;
// Get sum of all elements
__m128 sum = _mm_add_ps( _mm256_extractf128_ps( v , 1 ) , _mm256_castps256_ps128( v ) ) ;
sum = _mm_add_ps( sum , _mm_movehl_ps( sum , sum ) ) ;
sum = _mm_add_ss( sum , _mm_shuffle_ps( sum , sum , 0x55 ) ) ;
// Add centre with weight -1
return _mm_cvtss_f32( sum ) - read_data[ x + y * data_width ] ;
}
Optional: add a new iteration method to the public section.
/// Advance the simulation by one step, using the wrap-around Laplacian for
/// every cell so the grid edges tile.
///
/// Reads the current grids `a` / `b`, writes the clamped results to `a_next` /
/// `b_next`, then swaps the buffers so `a` / `b` hold the new state.
/// Rows are distributed over OpenMP threads; each iteration writes only its own
/// index of the output grids.
void Iterate_Wrap()
{
    // 32-bit counters: an int16_t counter can never exceed 32767, so a grid
    // dimension above that would wrap the counter and the loop would not end.
    // int32_t also matches the parameters of Laplacian_Operator_Wrap_AVX2.
    #pragma omp parallel for schedule ( dynamic )
    for ( int32_t y = 0 ; y < data_height ; y++ )
    {
        // Offset of the first cell in this row; constant for the inner loop
        const int32_t row_start = y * data_width ;

        for ( int32_t x = 0 ; x < data_width ; x++ )
        {
            const int32_t index = x + row_start ;

            const float a_current = a[ index ] ;
            const float b_current = b[ index ] ;

            // Gray-Scott model
            // NOTE(review): f and k are presumably the feed and kill rates - confirm.
            const float reaction_rate = a_current * b_current * b_current ;

            a_next[ index ] = clamp( a_current + ( Laplacian_Operator_Wrap_AVX2( x , y , a ) - reaction_rate + f * ( 1.f - a_current ) ) ) ;
            // The Laplacian of b is scaled by 0.5 relative to a
            b_next[ index ] = clamp( b_current + ( Laplacian_Operator_Wrap_AVX2( x , y , b ) * 0.5f + reaction_rate - ( f + k ) * b_current ) ) ;
        }
    }

    // Publish the new state; the old grids become the scratch buffers
    std::swap( a , a_next ) ;
    std::swap( b , b_next ) ;
}

























