v/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v

// SSE Instruction Set
// SSE4.1: introduced with later Core 2 processors; adds the following instructions:
// MPSADBW, PHMINPOSUW, PMULLD, PMULDQ, DPPS, DPPD, BLENDPS, BLENDPD, BLENDVPS, BLENDVPD,
// PBLENDVB, PBLENDW, PMINSB, PMAXSB, PMINUW, PMAXUW, PMINUD, PMAXUD, PMINSD, PMAXSD, ROUNDPS,
// ROUNDSS, ROUNDPD, ROUNDSD, INSERTPS, PINSRB, PINSRD, PINSRQ, EXTRACTPS, PEXTRB, PEXTRW,
// PEXTRD, PEXTRQ, PMOVSXBW, PMOVZXBW, PMOVSXBD, PMOVZXBD, PMOVSXBQ, PMOVZXBQ, PMOVSXWD,
// PMOVZXWD, PMOVSXWQ, PMOVZXWQ, PMOVSXDQ, PMOVZXDQ, PTEST, PCMPEQQ, PACKUSDW, MOVNTDQA

// round_floats_sse4_1 rounds 4 consecutive f32 values starting at `a` with the
// SSE4.1 `roundps` instruction and writes the 4 rounded values starting at `result`.
//
// Both pointers must reference at least 4 readable (`a`) / writable (`result`)
// f32 elements; no alignment is required because unaligned moves (`movups`) are used.
// The immediate 0 selects round-to-nearest, with ties going to the even integer
// (so 2.5 becomes 2.0).
//
// The function only exists when compiling for amd64 with a C compiler other than
// tinyc or msvc (see the `@[if ...]` attribute below).
//
// Clobbers: `xmm0` is used as scratch, and `memory` is declared because the block
// reads and writes memory through `a` and `result`, which are passed only as
// register operands; without it the C compiler may reorder or cache the
// surrounding loads and stores.
@[if amd64 && !tinyc && !msvc]
fn round_floats_sse4_1(a &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into xmm0
			roundps xmm0, xmm0, 0 // Round each lane to the nearest integer (ties to even)
			movups [result], xmm0 // Store the 4 rounded floats into the result array
			; ; r (a)
			  r (result)
			; xmm0
			  memory
		}
	}
}

// main rounds a fixed 4-element sample with round_floats_sse4_1, prints the
// result and checks it against the expected values.
fn main() {
	// round_floats_sse4_1 is compiled only when `amd64 && !tinyc && !msvc` holds.
	// On any other target the call would be skipped and `result` would stay
	// zero-filled, making the assertion fail, so the demo is guarded by the
	// same condition.
	$if amd64 && !tinyc && !msvc {
		a := [f32(1.2), 2.5, 3.8, 4.4]
		result := []f32{len: 4}
		// Rounding mode 0 corresponds to rounding to the nearest integer
		// (ties go to the even integer, hence 2.5 -> 2.0).
		round_floats_sse4_1(&a[0], &result[0])
		println(result)
		// The expected rounded result should be [1.0, 2.0, 4.0, 4.0]
		assert result == [f32(1.0), 2.0, 4.0, 4.0]
	} $else {
		println('SSE4.1 example skipped: requires amd64 and a C compiler other than tinyc/msvc')
	}
}