// SSE Instruction Set
// SSE2: Added with Pentium 4
// Floating-point Instructions:
// ADDPD, ADDSD, ANDNPD, ANDPD, CMPPD, CMPSD*, COMISD, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI,
// CVTPD2PS, CVTPI2PD, CVTPS2DQ, CVTPS2PD, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSS2SD, CVTTPD2DQ,
// CVTTPD2PI, CVTTPS2DQ, CVTTSD2SI, DIVPD, DIVSD, MAXPD, MAXSD, MINPD, MINSD, MOVAPD, MOVHPD,
// MOVLPD, MOVMSKPD, MOVSD*, MOVUPD, MULPD, MULSD, ORPD, SHUFPD, SQRTPD, SQRTSD, SUBPD, SUBSD,
// UCOMISD, UNPCKHPD, UNPCKLPD, XORPD
// * CMPSD and MOVSD share their names with the string instruction mnemonics CMPSD (CMPS) and
// MOVSD (MOVS); however, the former operate on scalar double-precision floating-point values,
// whereas the latter operate on doubleword strings.
// Integer Instructions:
// MOVDQ2Q, MOVDQA, MOVDQU, MOVQ2DQ, PADDQ, PSUBQ, PMULUDQ, PSHUFHW, PSHUFLW, PSHUFD, PSLLDQ, PSRLDQ, PUNPCKHQDQ, PUNPCKLQDQ
// The example below uses the SSE2 MULPD instruction, which multiplies two packed vectors of
// doubles element-wise (two f64 lanes per 128-bit XMM register).

// multiply_vectors_sse2 multiplies two pairs of f64 values element-wise with the
// SSE2 MULPD instruction, i.e. result[0] = a[0] * b[0] and result[1] = a[1] * b[1].
// `a` and `b` must each point to at least two readable f64 values, and `result`
// must point to at least two writable f64 values. The unaligned move MOVUPD is
// used for all memory accesses, so the pointers need no 16-byte alignment.
// The function is only compiled when the attribute condition holds
// (amd64, and neither tinyc nor msvc as the C compiler).
@[if amd64 && !tinyc && !msvc]
fn multiply_vectors_sse2(a &f64, b &f64, result &f64) {
	unsafe {
		// The asm block has no outputs; `a`, `b` and `result` are passed as
		// register inputs holding addresses. Because the instructions read and
		// write memory through those addresses, `memory` is listed as a clobber
		// next to the two scratch registers, so the compiler does not cache or
		// reorder accesses to the pointed-to data across this block.
		asm volatile amd64 {
			movupd xmm0, [a] // Load 2 doubles from array a into SSE2 register xmm0
			movupd xmm1, [b] // Load 2 doubles from array b into SSE2 register xmm1
			mulpd xmm0, xmm1 // xmm0 = xmm0 * xmm1, lane by lane
			movupd [result], xmm0 // Store both products back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}

// main runs the SSE2 multiplication example on [1.5, 2.5] and [3.5, 4.5],
// prints the product vector and checks it against the expected values.
// multiply_vectors_sse2 is conditionally compiled, and calls to it are dropped
// when its attribute condition is false; the same condition is therefore
// checked here, so that unsupported targets report a skip instead of failing
// the assertion on an untouched (all-zero) result.
fn main() {
	$if amd64 && !tinyc && !msvc {
		a := [f64(1.5), 2.5]
		b := [f64(3.5), 4.5]
		// Destination buffer for the two products; written by the asm routine.
		result := []f64{len: 2}
		multiply_vectors_sse2(&a[0], &b[0], &result[0])
		println(result)
		// 5.25 = 1.5 * 3.5
		// 11.25 = 2.5 * 4.5
		assert result == [f64(5.25), 11.25]
	} $else {
		eprintln('SSE2 inline assembly example skipped: requires amd64 and a C compiler other than tcc/msvc')
	}
}