[40.0] Backport Cranelift: x64: fix incorrect load-sinking in copysign operator. (#12437)

cfallin · web-flow · commit 728fa07184f8 · 2026-01-26T13:22:37.000-08:00
* Cranelift: x64: do not incorrectly widen loads sunk into `fcopysign`.

The implementation of the `fcopysign` operator uses vector bitwise AND
instructions on the floating-point/vector registers containing the
inputs to the operator. This is a reasonable implementation as the
instruction set does not have scalar (single-lane) bitwise
operators. However, when load-sinking automatically kicks in for an
operand to an `andps`, it can turn a 64-bit load (`f64.load`) into a
128-bit load incorrectly.

This load-widening can cause out-of-bounds accesses where they were
not expected. When dynamic bounds checks are enabled, we compile
assuming the correct load-operator width is codegen'd; a too-wide load
could read beyond the checked bound, either into unmapped
memory (crashing the process) or, worse, valid data outside the
sandbox. In the case of `fcopysign` the result of that read is not
directly available, because it will go into the high (unused)
lane, but the out-of-bounds read itself is a problem.

Thanks to louismerlin for reporting!

* Re-bless Cranelift filetests.

* Update release notes.
diff --git a/RELEASES.md b/RELEASES.md
@@ -1,3 +1,16 @@
+## 40.0.3
+
+Released 2026-01-26.
+
+### Fixed
+
+* Fixed a bug in the x86-64 lowering of `f64.copysign`: when an operand came
+  from an `f64.load`, the resulting machine code could read 16 bytes rather
+  than 8 bytes. This could result in a segfault when Wasmtime is configured
+  without signals-based traps.
+
+--------------------------------------------------------------------------------
+
 ## 40.0.2
 
 Released 2026-01-14.
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
@@ -4276,13 +4276,17 @@
 ;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fcopysign a @ (value_type $F32) b)))
-      (let ((sign_bit Xmm (imm $F32 0x80000000)))
+      (let ((sign_bit Xmm (imm $F32 0x80000000))
+            (a Xmm a) ;; force into a reg: a load sunk into the ops below would be widened to 128 bits.
+            (b Xmm b)) ;; same for `b`.
         (x64_orps
           (x64_andnps sign_bit a)
           (x64_andps sign_bit b))))
 
 (rule (lower (has_type $F64 (fcopysign a @ (value_type $F64) b)))
-      (let ((sign_bit Xmm (imm $F64 0x8000000000000000)))
+      (let ((sign_bit Xmm (imm $F64 0x8000000000000000))
+            (a Xmm a) ;; force into a reg: a load sunk into the ops below would be widened to 128 bits.
+            (b Xmm b)) ;; same for `b`.
         (x64_orpd
           (x64_andnpd sign_bit a)
           (x64_andpd sign_bit b))))
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif
@@ -39,11 +39,14 @@ block0(v0: i64):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block0:
-;   movl $0x80000000, %eax
-;   vmovd %eax, %xmm4
-;   vandnps (%rip), %xmm4, %xmm6
-;   vandps (%rdi), %xmm4, %xmm0
-;   vorps %xmm0, %xmm6, %xmm0
+;   uninit  %xmm0
+;   vxorps %xmm0, %xmm0, %xmm2
+;   vmovss (%rdi), %xmm1
+;   movl $0x80000000, %r8d
+;   vmovd %r8d, %xmm7
+;   vandnps %xmm2, %xmm7, %xmm2
+;   vandps %xmm1, %xmm7, %xmm3
+;   vorps %xmm3, %xmm2, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -53,29 +56,16 @@ block0(v0: i64):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movl $0x80000000, %eax
-;   vmovd %eax, %xmm4
-;   vandnps 0x1b(%rip), %xmm4, %xmm6
-;   vandps (%rdi), %xmm4, %xmm0
-;   vorps %xmm0, %xmm6, %xmm0
-;   movq %rbp, %rsp
-;   popq %rbp
-;   retq
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
+;   vxorps %xmm0, %xmm0, %xmm2
+;   vmovss (%rdi), %xmm1
+;   movl $0x80000000, %r8d
+;   vmovd %r8d, %xmm7
+;   vandnps %xmm2, %xmm7, %xmm2
+;   vandps %xmm1, %xmm7, %xmm3
+;   vorps %xmm3, %xmm2, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
 
 function %bor_f32x4(f32x4, f32x4) -> f32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/tests/disas/f64-copysign.wat b/tests/disas/f64-copysign.wat
@@ -0,0 +1,45 @@
+;;! target = "x86_64"
+;;! test = "compile"
+;;! flags = "-Ccranelift-has-avx"
+
+;; This would previously segfault or trap on x86-64 because the
+;; `f64.copysign` sunk the `f64.load` and widened it to 128 bits
+;; incorrectly.
+
+(module
+  ;; Define a linear memory with 1 page (64KiB)
+  (memory 1)
+  (export "f" (func 0))
+  (func (result i32)
+    ;; Push i32 constant 0 (destination address for the store)
+    i32.const 0
+    ;; Push f64 constant 0.0 (first operand: magnitude source for copysign)
+    f64.const 0
+    ;; Push i32 constant 32 (base address for the load)
+    i32.const 32
+    ;; Load f64 at address 32 + 65491 = 65523 (align=1): 8 bytes fit in the 64KiB page, 16 would not
+    f64.load offset=65491 align=1
+    ;; Apply copysign: take magnitude from 0.0 and sign from the loaded f64
+    f64.copysign
+    ;; Store f64 to memory at address 0, with align=1
+    f64.store align=1
+    ;; Return 0.
+    i32.const 0
+  )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    0x38(%rdi), %rcx
+;;       vmovsd  0xfff3(%rcx), %xmm4
+;;       vxorpd  %xmm3, %xmm3, %xmm5
+;;       movabsq $9223372036854775808, %r11
+;;       vmovq   %r11, %xmm2
+;;       vandnpd %xmm5, %xmm2, %xmm5
+;;       vandpd  %xmm4, %xmm2, %xmm6
+;;       vorpd   %xmm6, %xmm5, %xmm0
+;;       vmovsd  %xmm0, (%rcx)
+;;       xorl    %eax, %eax
+;;       movq    %rbp, %rsp
+;;       popq    %rbp
+;;       retq
diff --git a/tests/misc_testsuite/f64-copysign.wast b/tests/misc_testsuite/f64-copysign.wast
@@ -0,0 +1,27 @@
+;; This would previously segfault or trap on x86-64 because the
+;; `f64.copysign` sunk the `f64.load` and widened it to 128 bits
+;; incorrectly.
+
+(module
+  ;; Define a linear memory with 1 page (64KiB)
+  (memory 1)
+  (export "f" (func 0))
+  (func (result i32)
+    ;; Push i32 constant 0 (destination address for the store)
+    i32.const 0
+    ;; Push f64 constant 0.0 (first operand: magnitude source for copysign)
+    f64.const 0
+    ;; Push i32 constant 32 (base address for the load)
+    i32.const 32
+    ;; Load f64 at address 32 + 65491 = 65523 (align=1): 8 bytes fit in the 64KiB page, 16 would not
+    f64.load offset=65491 align=1
+    ;; Apply copysign: take magnitude from 0.0 and sign from the loaded f64
+    f64.copysign
+    ;; Store f64 to memory at address 0, with align=1
+    f64.store align=1
+    ;; Return 0.
+    i32.const 0
+  )
+)
+
+(assert_return (invoke "f") (i32.const 0))