HOME | DD

Description
This is an addition to the code here; it can now handle edges (and tiling). If you find this useful, please give credit and leave a comment below. Enjoy, and download for the full image.

Rendered a few versions of this, but picked this one due to the number of spirals.

In the previous code, add this to the setup method:
// Broadcast the wrap limits into every 32-bit lane once, so that
// Laplacian_Operator_Wrap_AVX2 can compare and adjust neighbour coordinates
// without rebuilding these vectors on each call.
// Largest valid x coordinate (compared against to detect x >= width).
width_minus1 = _mm256_set1_epi32( data_width - 1 ) ;
// Amount added to / subtracted from y when it falls outside the grid.
height = _mm256_set1_epi32( data_height ) ;
// Largest valid y coordinate (compared against to detect y >= height).
height_minus1 = _mm256_set1_epi32( data_height - 1 ) ;

And in the protected section, after Laplacian_Operator_AVX2, add:
// Vector constants used by Laplacian_Operator_Wrap_AVX2 to wrap neighbour coordinates.
// All lanes zero: lower bound for the "coordinate < 0" comparison.
__m256i zero = _mm256_set1_epi32( 0 ) ;
// data_width - 1 in every lane (assigned in the setup method): upper bound for x.
__m256i width_minus1 ;
// data_height in every lane (assigned in the setup method): wrap period for y.
__m256i height ;
// data_height - 1 in every lane (assigned in the setup method): upper bound for y.
__m256i height_minus1 ;

float Laplacian_Operator_Wrap_AVX2( int32_t x , int32_t y , const float *read_data )
{
    // Generate memory index
    __m256i px = _mm256_add_epi32( _mm256_set1_epi32( x ) , dx ) ;
    __m256i py = _mm256_add_epi32( _mm256_set1_epi32( y ) , dy ) ;
    // If x < 0, add width
    __m256i mask = _mm256_cmpgt_epi32( zero , px ) ;
    px = _mm256_add_epi32( px , _mm256_and_si256( mask , width ) ) ;
    // If x >= width, sub width
    mask = _mm256_cmpgt_epi32( px , width_minus1 ) ;
    px = _mm256_sub_epi32( px , _mm256_and_si256( mask , width ) ) ;
    // If y < 0, add height
    mask = _mm256_cmpgt_epi32( zero , py ) ;
    py = _mm256_add_epi32( py , _mm256_and_si256( mask , height ) ) ;
    // If y >= height, sub height
    mask = _mm256_cmpgt_epi32( py , height_minus1 ) ;
    py = _mm256_sub_epi32( py , _mm256_and_si256( mask , height ) ) ;
    __m256i index = _mm256_add_epi32( px , _mm256_mullo_epi32( py , width ) ) ;

    // Get values from memory and scale by weights, 4 = sizeof(float)
    __m256 v = _mm256_mul_ps( _mm256_i32gather_ps( read_data , index , 4 ) , lw ) ;

    // Get sum of all elements
    __m128 sum = _mm_add_ps( _mm256_extractf128_ps( v , 1 ) , _mm256_castps256_ps128( v ) ) ;
    sum = _mm_add_ps( sum , _mm_movehl_ps( sum , sum ) ) ;
    sum = _mm_add_ss( sum , _mm_shuffle_ps( sum , sum , 0x55 ) ) ;

    // Add centre with weight -1
    return _mm_cvtss_f32( sum ) - read_data[ x + y * data_width ] ;
}

Optional: add a new iteration method to the public section.
// Advances the simulation by one step using the wrap-around Laplacian, so the
// grid edges are treated as joined (tileable output).
// For every cell the new a/b values are written to a_next/b_next; once all
// cells are done the current and next buffers are exchanged.
void Iterate_Wrap()
{
    // Rows are distributed across OpenMP threads; each iteration only writes
    // its own cells of a_next / b_next.
#pragma omp parallel for schedule ( dynamic )
    // 32-bit counters: a 16-bit counter can never reach a bound above 32767,
    // and these values are passed on as int32_t coordinates anyway.
    for ( int32_t y = 0 ; y < data_height ; y++ )
    {
        for ( int32_t x = 0 ; x < data_width ; x++ )
        {
            int32_t index = x + y * data_width ;
            float a_current = a[ index ] ;
            float b_current = b[ index ] ;

            // Gray-Scott model (f and k are presumably the feed and kill rates - confirm against the class)
            float reaction_rate = a_current * b_current * b_current ;
            // a: diffusion - reaction + feed term
            a_next[ index ] = clamp( a_current + ( Laplacian_Operator_Wrap_AVX2( x , y , a ) - reaction_rate + f * ( 1.f - a_current ) ) ) ;
            // b: diffusion scaled by 0.5 + reaction - removal term
            b_next[ index ] = clamp( b_current + ( Laplacian_Operator_Wrap_AVX2( x , y , b ) * 0.5f + reaction_rate - ( f + k ) * b_current ) ) ;
        }
    }

    // The freshly written buffers become the current state
    std::swap( a , a_next ) ;
    std::swap( b , b_next ) ;
}

Description
This is an addition to the code here; it can now handle edges (and tiling). If you find this useful, please give credit and leave a comment below. Enjoy, and download for the full image.

Rendered a few versions of this, but picked this one due to the number of spirals.

In the previous code, add this to the setup method:
// Broadcast the wrap limits into every 32-bit lane once, so that
// Laplacian_Operator_Wrap_AVX2 can compare and adjust neighbour coordinates
// without rebuilding these vectors on each call.
// Largest valid x coordinate (compared against to detect x >= width).
width_minus1 = _mm256_set1_epi32( data_width - 1 ) ;
// Amount added to / subtracted from y when it falls outside the grid.
height = _mm256_set1_epi32( data_height ) ;
// Largest valid y coordinate (compared against to detect y >= height).
height_minus1 = _mm256_set1_epi32( data_height - 1 ) ;

And in the protected section, after Laplacian_Operator_AVX2, add:
// Vector constants used by Laplacian_Operator_Wrap_AVX2 to wrap neighbour coordinates.
// All lanes zero: lower bound for the "coordinate < 0" comparison.
__m256i zero = _mm256_set1_epi32( 0 ) ;
// data_width - 1 in every lane (assigned in the setup method): upper bound for x.
__m256i width_minus1 ;
// data_height in every lane (assigned in the setup method): wrap period for y.
__m256i height ;
// data_height - 1 in every lane (assigned in the setup method): upper bound for y.
__m256i height_minus1 ;

float Laplacian_Operator_Wrap_AVX2( int32_t x , int32_t y , const float *read_data )
{
    // Generate memory index
    __m256i px = _mm256_add_epi32( _mm256_set1_epi32( x ) , dx ) ;
    __m256i py = _mm256_add_epi32( _mm256_set1_epi32( y ) , dy ) ;
    // If x < 0, add width
    __m256i mask = _mm256_cmpgt_epi32( zero , px ) ;
    px = _mm256_add_epi32( px , _mm256_and_si256( mask , width ) ) ;
    // If x >= width, sub width
    mask = _mm256_cmpgt_epi32( px , width_minus1 ) ;
    px = _mm256_sub_epi32( px , _mm256_and_si256( mask , width ) ) ;
    // If y < 0, add height
    mask = _mm256_cmpgt_epi32( zero , py ) ;
    py = _mm256_add_epi32( py , _mm256_and_si256( mask , height ) ) ;
    // If y >= height, sub height
    mask = _mm256_cmpgt_epi32( py , height_minus1 ) ;
    py = _mm256_sub_epi32( py , _mm256_and_si256( mask , height ) ) ;
    __m256i index = _mm256_add_epi32( px , _mm256_mullo_epi32( py , width ) ) ;

    // Get values from memory and scale by weights, 4 = sizeof(float)
    __m256 v = _mm256_mul_ps( _mm256_i32gather_ps( read_data , index , 4 ) , lw ) ;

    // Get sum of all elements
    __m128 sum = _mm_add_ps( _mm256_extractf128_ps( v , 1 ) , _mm256_castps256_ps128( v ) ) ;
    sum = _mm_add_ps( sum , _mm_movehl_ps( sum , sum ) ) ;
    sum = _mm_add_ss( sum , _mm_shuffle_ps( sum , sum , 0x55 ) ) ;

    // Add centre with weight -1
    return _mm_cvtss_f32( sum ) - read_data[ x + y * data_width ] ;
}

Optional: add a new iteration method to the public section.
// Advances the simulation by one step using the wrap-around Laplacian, so the
// grid edges are treated as joined (tileable output).
// For every cell the new a/b values are written to a_next/b_next; once all
// cells are done the current and next buffers are exchanged.
void Iterate_Wrap()
{
    // Rows are distributed across OpenMP threads; each iteration only writes
    // its own cells of a_next / b_next.
#pragma omp parallel for schedule ( dynamic )
    // 32-bit counters: a 16-bit counter can never reach a bound above 32767,
    // and these values are passed on as int32_t coordinates anyway.
    for ( int32_t y = 0 ; y < data_height ; y++ )
    {
        for ( int32_t x = 0 ; x < data_width ; x++ )
        {
            int32_t index = x + y * data_width ;
            float a_current = a[ index ] ;
            float b_current = b[ index ] ;

            // Gray-Scott model (f and k are presumably the feed and kill rates - confirm against the class)
            float reaction_rate = a_current * b_current * b_current ;
            // a: diffusion - reaction + feed term
            a_next[ index ] = clamp( a_current + ( Laplacian_Operator_Wrap_AVX2( x , y , a ) - reaction_rate + f * ( 1.f - a_current ) ) ) ;
            // b: diffusion scaled by 0.5 + reaction - removal term
            b_next[ index ] = clamp( b_current + ( Laplacian_Operator_Wrap_AVX2( x , y , b ) * 0.5f + reaction_rate - ( f + k ) * b_current ) ) ;
        }
    }

    // The freshly written buffers become the current state
    std::swap( a , a_next ) ;
    std::swap( b , b_next ) ;
}