HOME | DD

Description
I have not been doing any coding with OpenCL because I have been waiting for a graphic card that can use OpenCL 2.0, and that will take some time, so I have begun to work with AVX2 and FMA (my CPU is too old for AVX-512 ). The code below is my first go at rendering the 2D Julia set, the masking stuff is needed since AVX2/FMA is not really intended for calculations that "branches". The code have only been tested with Visual Studio 2017, the code for saving the image is trivial so is not included, output image is here. If you find the code useful, please give credit and leave a comment below if possible.

Enjoy.

#include

//Project-> Properties -> C/C++ -> Code Generation : Change Enable Enhanced Instruction Set to AVX2
#include

//Project-> Properties -> C/C++ -> Language : Change OpenMP Support to Yes(/openmp )
#include

int16_t render_width = 8 ;
int16_t render_height = 8 ;
uint32_t max_iterations = 8 ;

__m256 x_min ;
__m256 y_min ;
__m256 x_delta ;
__m256 y_delta ;
__m256 bailout = _mm256_set1_ps( 4.f ) ;
__m256 one = _mm256_set1_ps( 1.f ) ;
__m256 two = _mm256_set1_ps( 2.f ) ;
__m256 minus_one = _mm256_set1_ps( -1.f ) ;
__m256 offset = _mm256_setr_ps( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ) ;

void Julia_FMA_AVX2( uint8_t *buffer , float creal , float cimag )
{
    // Set c
    __m256 c_real = _mm256_set1_ps( creal ) ;
    __m256 c_imag = _mm256_set1_ps( cimag ) ;

#pragma omp parallel for schedule ( dynamic )
    for ( int16_t y = 0 ; y < render_height ; y++ )
    {
        for ( int16_t x = 0 ; x < render_width ; x += 8 )
        {
            // Grab 8 grouped pixels in a row
            __m256 pixel_x = _mm256_add_ps( _mm256_set1_ps( x ) , offset ) ;
            __m256 pixel_y = _mm256_set1_ps( y ) ;
            // Scale pixel to coordinate system
            __m256 z_real = _mm256_fmadd_ps( pixel_x , x_delta , x_min ) ;
            __m256 z_imag = _mm256_fmadd_ps( pixel_y , y_delta , y_min ) ;

            uint32_t loop = 0 ;
            __m256 iterations = _mm256_set1_ps( 0 ) ; // No blendv with mask for 32bit integers, so using floats

            do
            {
                // zn = z * z + c
                __m256 temp = _mm256_mul_ps( z_real , z_imag ) ; // z real * z imag
                z_real = _mm256_sub_ps( _mm256_fmadd_ps( z_real , z_real , c_real ) , _mm256_mul_ps( z_imag , z_imag ) ) ; // zn real = ( z real * z real + c real ) - ( z imag * z imag )
                z_imag = _mm256_fmadd_ps( temp , two , c_imag ) ; // zn imag = ( z real * z imag ) * 2 + c imag

                // Update iterations depending on magnitude
                __m256 magnitude = _mm256_add_ps( _mm256_mul_ps( z_real , z_real ) , _mm256_mul_ps( z_imag , z_imag ) ) ; // Not squared, waste of cycles
                __m256 mask = _mm256_cmp_ps( magnitude , bailout , _CMP_LT_OS ) ; // Generate mask from compare magnitude < threshold. CMP_LT = compare less than
                iterations = _mm256_add_ps( _mm256_and_ps( mask , one ) , iterations ) ; // iterations += mask (bitwise)AND 1, so add 0 or 1

                // All iterations done?
                if ( _mm256_testz_ps( mask , minus_one ) ) break ; // Test zero, if all zero -> all magnitudes > threshold -> done
            }
            while ( ++loop < max_iterations ) ;

            __m256i row = _mm256_cvtps_epi32( iterations ) ; // Convert floats to 32bit integers
            uint32_t *row_pointer = (uint32_t *)&row ;
            uint8_t *buffer_pointer = buffer + x + y * render_width ;
            for ( uint8_t i = 0 ; i < 8 ; i++ )
            {
                // If on or inside fractal write 0, else 127/255 to buffer
                buffer_pointer[ i ] = row_pointer[ i ] >= max_iterations ? 0 : ( row_pointer[ i ] % 2 ) * 128 + 127 ;
            }
        }
    }
}

int main( int argc , char **argv )
{
    render_width = 1000 ; // Must be multiples of 8
    render_height = 1000 ;
    max_iterations = 10000 ;

    float zoom = 0.25f ; // Higher positive values to zoom in
    float x_centre = 0.0f ;
    float y_centre = 0.0f ;
    float y_span = std::exp( -zoom ) * 4.f ;
    float x_span = y_span * ( (float)render_width / (float)render_height ) ;

    x_min = _mm256_set1_ps( x_centre - x_span * 0.5f ) ;
    y_min = _mm256_set1_ps( y_centre + y_span * 0.5f ) ; // Sign change
    x_delta = _mm256_set1_ps( x_span / (float)render_width ) ;
    y_delta = _mm256_set1_ps( -y_span / (float)render_height ) ; // Sign change

    uint8_t *buffer = nullptr ;
    buffer = new ( std::nothrow ) uint8_t[ render_width * render_height ] ;
    if ( buffer == nullptr )
    {
        std::cout << "Failure: Could not assign data for buffer with size: " << render_width + 0 << " by " << render_height + 0 << std::endl ;
        return EXIT_FAILURE ;
    }

    Julia_FMA_AVX2( buffer , 0.3 , 0.5 ) ;

    // Save image

    return EXIT_SUCCESS ;
}

Description
I have not been doing any coding with OpenCL because I have been waiting for a graphic card that can use OpenCL 2.0, and that will take some time, so I have begun to work with AVX2 and FMA (my CPU is too old for AVX-512 ). The code below is my first go at rendering the 2D Julia set, the masking stuff is needed since AVX2/FMA is not really intended for calculations that "branches". The code have only been tested with Visual Studio 2017, the code for saving the image is trivial so is not included, output image is here. If you find the code useful, please give credit and leave a comment below if possible.

Enjoy.

#include

//Project-> Properties -> C/C++ -> Code Generation : Change Enable Enhanced Instruction Set to AVX2
#include

//Project-> Properties -> C/C++ -> Language : Change OpenMP Support to Yes(/openmp )
#include

int16_t render_width = 8 ;
int16_t render_height = 8 ;
uint32_t max_iterations = 8 ;

__m256 x_min ;
__m256 y_min ;
__m256 x_delta ;
__m256 y_delta ;
__m256 bailout = _mm256_set1_ps( 4.f ) ;
__m256 one = _mm256_set1_ps( 1.f ) ;
__m256 two = _mm256_set1_ps( 2.f ) ;
__m256 minus_one = _mm256_set1_ps( -1.f ) ;
__m256 offset = _mm256_setr_ps( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ) ;

void Julia_FMA_AVX2( uint8_t *buffer , float creal , float cimag )
{
    // Set c
    __m256 c_real = _mm256_set1_ps( creal ) ;
    __m256 c_imag = _mm256_set1_ps( cimag ) ;

#pragma omp parallel for schedule ( dynamic )
    for ( int16_t y = 0 ; y < render_height ; y++ )
    {
        for ( int16_t x = 0 ; x < render_width ; x += 8 )
        {
            // Grab 8 grouped pixels in a row
            __m256 pixel_x = _mm256_add_ps( _mm256_set1_ps( x ) , offset ) ;
            __m256 pixel_y = _mm256_set1_ps( y ) ;
            // Scale pixel to coordinate system
            __m256 z_real = _mm256_fmadd_ps( pixel_x , x_delta , x_min ) ;
            __m256 z_imag = _mm256_fmadd_ps( pixel_y , y_delta , y_min ) ;

            uint32_t loop = 0 ;
            __m256 iterations = _mm256_set1_ps( 0 ) ; // No blendv with mask for 32bit integers, so using floats

            do
            {
                // zn = z * z + c
                __m256 temp = _mm256_mul_ps( z_real , z_imag ) ; // z real * z imag
                z_real = _mm256_sub_ps( _mm256_fmadd_ps( z_real , z_real , c_real ) , _mm256_mul_ps( z_imag , z_imag ) ) ; // zn real = ( z real * z real + c real ) - ( z imag * z imag )
                z_imag = _mm256_fmadd_ps( temp , two , c_imag ) ; // zn imag = ( z real * z imag ) * 2 + c imag

                // Update iterations depending on magnitude
                __m256 magnitude = _mm256_add_ps( _mm256_mul_ps( z_real , z_real ) , _mm256_mul_ps( z_imag , z_imag ) ) ; // Not squared, waste of cycles
                __m256 mask = _mm256_cmp_ps( magnitude , bailout , _CMP_LT_OS ) ; // Generate mask from compare magnitude < threshold. CMP_LT = compare less than
                iterations = _mm256_add_ps( _mm256_and_ps( mask , one ) , iterations ) ; // iterations += mask (bitwise)AND 1, so add 0 or 1

                // All iterations done?
                if ( _mm256_testz_ps( mask , minus_one ) ) break ; // Test zero, if all zero -> all magnitudes > threshold -> done
            }
            while ( ++loop < max_iterations ) ;

            __m256i row = _mm256_cvtps_epi32( iterations ) ; // Convert floats to 32bit integers
            uint32_t *row_pointer = (uint32_t *)&row ;
            uint8_t *buffer_pointer = buffer + x + y * render_width ;
            for ( uint8_t i = 0 ; i < 8 ; i++ )
            {
                // If on or inside fractal write 0, else 127/255 to buffer
                buffer_pointer[ i ] = row_pointer[ i ] >= max_iterations ? 0 : ( row_pointer[ i ] % 2 ) * 128 + 127 ;
            }
        }
    }
}

int main( int argc , char **argv )
{
    render_width = 1000 ; // Must be multiples of 8
    render_height = 1000 ;
    max_iterations = 10000 ;

    float zoom = 0.25f ; // Higher positive values to zoom in
    float x_centre = 0.0f ;
    float y_centre = 0.0f ;
    float y_span = std::exp( -zoom ) * 4.f ;
    float x_span = y_span * ( (float)render_width / (float)render_height ) ;

    x_min = _mm256_set1_ps( x_centre - x_span * 0.5f ) ;
    y_min = _mm256_set1_ps( y_centre + y_span * 0.5f ) ; // Sign change
    x_delta = _mm256_set1_ps( x_span / (float)render_width ) ;
    y_delta = _mm256_set1_ps( -y_span / (float)render_height ) ; // Sign change

    uint8_t *buffer = nullptr ;
    buffer = new ( std::nothrow ) uint8_t[ render_width * render_height ] ;
    if ( buffer == nullptr )
    {
        std::cout << "Failure: Could not assign data for buffer with size: " << render_width + 0 << " by " << render_height + 0 << std::endl ;
        return EXIT_FAILURE ;
    }

    Julia_FMA_AVX2( buffer , 0.3 , 0.5 ) ;

    // Save image

    return EXIT_SUCCESS ;
}