Today I found out that modern intel CPUs can do 256 bit wide SIMD. This is possible with the Advanced Vector Extensions, or AVX. Operating on 8 floats at the same time is even better than what the PS3 Cell processor was capable of. So I wrote this neat little function that can solve 8 quadratic equations in a single go, without branching. It should come in handy when implementing a Fast Marching Method to implement Continuum Crowds. For SIGGRAPH 2008, some people at ATI implemented this on the GPU for their amazing Froblins Demo.
/* * solve aX^2 + bX + c = 0 * solves 8 instances at the same time, using AVX SIMD without any branching to avoid stalls. * returns two solutions per equation in root0 and root1. * returns FLT_UNDF if there is no solution due to discriminant being negative. */ static const __m256 zero8 = _mm256_set_ps( 0, 0, 0, 0, 0, 0, 0, 0 ); static const __m256 undf8 = _mm256_set_ps( FLT_UNDF, FLT_UNDF, FLT_UNDF, FLT_UNDF, FLT_UNDF, FLT_UNDF, FLT_UNDF, FLT_UNDF ); static const __m256 four8 = _mm256_set_ps( 4, 4, 4, 4, 4, 4, 4, 4 ); inline void evaluate_quadratic8( __m256 a, __m256 b, __m256 c, __m256& root0, __m256& root1 ) { __m256 minb = _mm256_sub_ps( zero8, b ); // -b __m256 bb = _mm256_mul_ps( b, b ); // b*b __m256 foura = _mm256_mul_ps( four8, a ); // 4*a __m256 fourac = _mm256_mul_ps( foura, c ); // 4*a*c __m256 det = _mm256_sub_ps( bb, fourac ); // b*b - 4*a*c __m256 twoa = _mm256_add_ps( a, a ); // 2*a __m256 dvalid = _mm256_cmp_ps( fourac, bb, _CMP_LE_OS ); __m256 sr = _mm256_sqrt_ps( det ); root0 = _mm256_add_ps( minb, sr ); root1 = _mm256_sub_ps( minb, sr ); root0 = _mm256_div_ps( root0, twoa ); root1 = _mm256_div_ps( root1, twoa ); root0 = _mm256_blendv_ps( undf8, root0, dvalid ); root1 = _mm256_blendv_ps( undf8, root1, dvalid ); }