Streaming SIMD Extensions Intrinsics Implementation

Regular Streaming SIMD Extensions (SSE) intrinsics work on 4 32-bit single precision values. On IA-64 architecture-based systems, basic operations like add and compare require two SIMD instructions. All can be executed in the same cycle so the throughput is one basic SSE operation per cycle or 4 32-bit single precision operations per cycle.

Key to the table entries

Intrinsic
Name

MMX(TM)
Technology

SSE
SSE2

IA-64
Architecture

_mm_add_ss

N/A

B

B

_mm_add_ps

N/A

A

A

_mm_sub_ss

N/A

B

B

_mm_sub_ps

N/A

A

A

_mm_mul_ss

N/A

B

B

_mm_mul_ps

N/A

A

A

_mm_div_ss

N/A

B

B

_mm_div_ps

N/A

A

A

_mm_sqrt_ss

N/A

B

B

_mm_sqrt_ps

N/A

A

A

_mm_rcp_ss

N/A

B

B

_mm_rcp_ps

N/A

A

A

_mm_rsqrt_ss

N/A

B

B

_mm_rsqrt_ps

N/A

A

A

_mm_min_ss

N/A

B

B

_mm_min_ps

N/A

A

A

_mm_max_ss

N/A

B

B

_mm_max_ps

N/A

A

A

_mm_and_ps

N/A

A

A

_mm_andnot_ps

N/A

A

A

_mm_or_ps

N/A

A

A

_mm_xor_ps

N/A

A

A

_mm_cmpeq_ss

N/A

B

B

_mm_cmpeq_ps

N/A

A

A

_mm_cmplt_ss

N/A

B

B

_mm_cmplt_ps

N/A

A

A

_mm_cmple_ss

N/A

B

B

_mm_cmple_ps

N/A

A

A

_mm_cmpgt_ss

N/A

B

B

_mm_cmpgt_ps

N/A

A

A

_mm_cmpge_ss

N/A

B

B

_mm_cmpge_ps

N/A

A

A

_mm_cmpneq_ss

N/A

B

B

_mm_cmpneq_ps

N/A

A

A

_mm_cmpnlt_ss

N/A

B

B

_mm_cmpnlt_ps

N/A

A

A

_mm_cmpnle_ss

N/A

B

B

_mm_cmpnle_ps

N/A

A

A

_mm_cmpngt_ss

N/A

B

B

_mm_cmpngt_ps

N/A

A

A

_mm_cmpnge_ss

N/A

B

B

_mm_cmpnge_ps

N/A

A

A

_mm_cmpord_ss

N/A

B

B

_mm_cmpord_ps

N/A

A

A

_mm_cmpunord_ss

N/A

B

B

_mm_cmpunord_ps

N/A

A

A

_mm_comieq_ss

N/A

B

B

_mm_comilt_ss

N/A

B

B

_mm_comile_ss

N/A

B

B

_mm_comigt_ss

N/A

B

B

_mm_comige_ss

N/A

B

B

_mm_comineq_ss

N/A

B

B

_mm_ucomieq_ss

N/A

B

B

_mm_ucomilt_ss

N/A

B

B

_mm_ucomile_ss

N/A

B

B

_mm_ucomigt_ss

N/A

B

B

_mm_ucomige_ss

N/A

B

B

_mm_ucomineq_ss

N/A

B

B

_mm_cvtss_si32

N/A

A

B

_mm_cvtps_pi32

N/A

A

A

_mm_cvttss_si32

N/A

A

B

_mm_cvttps_pi32

N/A

A

A

_mm_cvtsi32_ss

N/A

A

B

_mm_cvtpi32_ps

N/A

A

C

_mm_cvtpi16_ps

N/A

A

C

_mm_cvtpu16_ps

N/A

A

C

_mm_cvtpi8_ps

N/A

A

C

_mm_cvtpu8_ps

N/A

A

C

_mm_cvtpi32x2_ps

N/A

A

C

_mm_cvtps_pi16

N/A

A

C

_mm_cvtps_pi8

N/A

A

C

_mm_move_ss

N/A

A

A

_mm_shuffle_ps

N/A

A

A

_mm_unpackhi_ps

N/A

A

A

_mm_unpacklo_ps

N/A

A

A

_mm_movehl_ps

N/A

A

A

_mm_movelh_ps

N/A

A

A

_mm_movemask_ps

N/A

A

C

_mm_getcsr

N/A

A

A

_mm_setcsr

N/A

A

A

_mm_loadh_pi

N/A

A

A

_mm_loadl_pi

N/A

A

A

_mm_load_ss

N/A

A

B

_mm_load1_ps

N/A

A

A

_mm_load_ps

N/A

A

A

_mm_loadu_ps

N/A

A

A

_mm_loadr_ps

N/A

A

A

_mm_storeh_pi

N/A

A

A

_mm_storel_pi

N/A

A

A

_mm_store_ss

N/A

A

A

_mm_store_ps

N/A

A

A

_mm_store1_ps

N/A

A

A

_mm_storeu_ps

N/A

A

A

_mm_storer_ps

N/A

A

A

_mm_set_ss

N/A

A

A

_mm_set1_ps

N/A

A

A

_mm_set_ps

N/A

A

A

_mm_setr_ps

N/A

A

A

_mm_setzero_ps

N/A

A

A

_mm_prefetch

N/A

A

A

_mm_stream_pi

N/A

A

A

_mm_stream_ps

N/A

A

A

_mm_sfence

N/A

A

A

_mm_extract_pi16

N/A

A

A

_mm_insert_pi16

N/A

A

A

_mm_max_pi16

N/A

A

A

_mm_max_pu8

N/A

A

A

_mm_min_pi16

N/A

A

A

_mm_min_pu8

N/A

A

A

_mm_movemask_pi8

N/A

A

C

_mm_mulhi_pu16

N/A

A

A

_mm_shuffle_pi16

N/A

A

A

_mm_maskmove_si64

N/A

A

C

_mm_avg_pu8

N/A

A

A

_mm_avg_pu16

N/A

A

A

_mm_sad_pu8

N/A

A

A