tracel-ai · ArthurBrussee · Jan 19, 2025 · Jan 19, 2025 · Jan 19, 2025 · Jan 19, 2025
diff --git a/crates/cubecl-core/src/compute/launcher.rs b/crates/cubecl-core/src/compute/launcher.rs
@@ -133,7 +133,10 @@ impl<R: Runtime> KernelLauncher<R> {
     ///
     /// # Safety
     ///
-    /// Out-of-bounds reads and writes can happen.
+    /// The kernel must not:
+    /// - Contain any out-of-bounds reads or writes. Doing so is immediate UB.
+    /// - Contain any loops that never terminate. These may be optimized away entirely or cause
+    ///   other unpredictable behaviour.
     pub unsafe fn launch_unchecked<K: Kernel>(
         self,
         cube_count: CubeCount,

diff --git a/crates/cubecl-runtime/src/base.rs b/crates/cubecl-runtime/src/base.rs
@@ -14,7 +14,8 @@ pub enum ExecutionMode {
     /// Checked kernels are safe.
     #[default]
     Checked,
-    /// Unchecked kernels are unsafe.
+    /// Unchecked kernels are unsafe - the user must guarantee that the kernel performs no
+    /// out-of-bounds reads or writes and contains no loops that never terminate.
     Unchecked,
 }
 

diff --git a/crates/cubecl-wgpu/Cargo.toml b/crates/cubecl-wgpu/Cargo.toml
@@ -46,11 +46,14 @@ cfg-if = { workspace = true }
 
 # wgpu dependency for platforms other than macOS
 [target.'cfg(not(target_os = "macos"))'.dependencies]
-wgpu = { version = "23.0.0", features = ["fragile-send-sync-non-atomic-wasm"] }
+wgpu = { version = "24.0.0", features = ["fragile-send-sync-non-atomic-wasm"] }
 # On macOS, the `vulkan-portability` feature is required due to the MoltenVK translation layer.
 # To install MoltenVK, install the VulkanSDK: https://vulkan.lunarg.com/sdk/home#mac
 [target.'cfg(target_os = "macos")'.dependencies]
-wgpu = { version = "23.0.0", features = ["vulkan-portability", "fragile-send-sync-non-atomic-wasm"] }
+wgpu = { version = "24.0.0", features = [
+    "vulkan-portability",
+    "fragile-send-sync-non-atomic-wasm",
+] }
 
 [dev-dependencies]
 cubecl-core = { path = "../cubecl-core", version = "0.5.0", features = [

diff --git a/crates/cubecl-wgpu/src/compiler/base.rs b/crates/cubecl-wgpu/src/compiler/base.rs
@@ -18,6 +18,7 @@ pub trait WgpuCompiler: Compiler {
     fn create_pipeline(
         server: &mut WgpuServer<Self>,
         kernel: CompiledKernel<Self>,
+        mode: ExecutionMode,
     ) -> Arc<ComputePipeline>;
 
     #[allow(async_fn_in_trait)]

diff --git a/crates/cubecl-wgpu/src/compiler/spirv.rs b/crates/cubecl-wgpu/src/compiler/spirv.rs
@@ -52,6 +52,7 @@ impl WgpuCompiler for SpirvCompiler<GLCompute> {
     fn create_pipeline(
         server: &mut WgpuServer<Self>,
         kernel: CompiledKernel<Self>,
+        mode: ExecutionMode,
     ) -> Arc<ComputePipeline> {
         let (module, layout) = kernel
             .repr
@@ -107,21 +108,28 @@ impl WgpuCompiler for SpirvCompiler<GLCompute> {
             })
             .unwrap_or_else(|| {
                 let source = &kernel.source;
-                // Cube always in principle uses unchecked modules. Certain operations like
-                // indexing are instead checked by cube. The WebGPU specification only makes
-                // incredibly loose guarantees that Cube can't rely on. Additionally, kernels
-                // can opt in/out per operation whether checks should be performed which can be faster.
-                //
+
+                let checks = wgpu::ShaderRuntimeChecks {
+                    // Cube does not need wgpu bounds checks - OOB behaviour is instead
+                    // checked by cube (if enabled).
+                    // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on.
+                    bounds_checks: false,
+                    // Loops are only forcibly bounded (guaranteed to terminate) in checked mode.
+                    force_loop_bounding: mode == ExecutionMode::Checked,
+                };
+
                 // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode
                 // is only available through the use of unsafe code.
                 let module = unsafe {
-                    server
-                        .device
-                        .create_shader_module_unchecked(wgpu::ShaderModuleDescriptor {
-                            label: Some(&kernel.entrypoint_name),
+                    server.device.create_shader_module_trusted(
+                        wgpu::ShaderModuleDescriptor {
+                            label: None,
                             source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)),
-                        })
+                        },
+                        checks,
+                    )
                 };
+
                 (module, None)
             });
 
@@ -413,9 +421,7 @@ fn is_robust(device: &wgpu::Device) -> bool {
             .contains(&EXT_ROBUSTNESS2_NAME)
     }
     unsafe {
-        device
-            .as_hal::<hal::api::Vulkan, _, _>(|device| device.map(is_robust).unwrap_or(false))
-            .unwrap_or(false)
+        device.as_hal::<hal::api::Vulkan, _, _>(|device| device.map(is_robust).unwrap_or(false))
     }
 }
 

diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs
@@ -233,7 +233,7 @@ impl Elem {
     }
 
     pub fn is_atomic(&self) -> bool {
-        matches!(self, Self::AtomicI32 | Self::AtomicU32)
+        matches!(self, Self::AtomicI32 | Self::AtomicU32 | Self::AtomicF32)
     }
 }
 

diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs
@@ -87,22 +87,29 @@ impl WgpuCompiler for WgslCompiler {
     fn create_pipeline(
         server: &mut WgpuServer<Self>,
         kernel: CompiledKernel<Self>,
+        mode: ExecutionMode,
     ) -> Arc<ComputePipeline> {
         let source = &kernel.source;
-        // Cube always in principle uses unchecked modules. Certain operations like
-        // indexing are instead checked by cube. The WebGPU specification only makes
-        // incredibly loose guarantees that Cube can't rely on. Additionally, kernels
-        // can opt in/out per operation whether checks should be performed which can be faster.
-        //
+
+        let checks = wgpu::ShaderRuntimeChecks {
+            // Cube does not need wgpu bounds checks - OOB behaviour is instead
+            // checked by cube (if enabled).
+            // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on.
+            bounds_checks: false,
+            // Loops are only forcibly bounded (guaranteed to terminate) in checked mode.
+            force_loop_bounding: mode == ExecutionMode::Checked,
+        };
+
         // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode
         // is only available through the use of unsafe code.
         let module = unsafe {
-            server
-                .device
-                .create_shader_module_unchecked(ShaderModuleDescriptor {
+            server.device.create_shader_module_trusted(
+                ShaderModuleDescriptor {
                     label: None,
                     source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)),
-                })
+                },
+                checks,
+            )
         };
 
         let layout = kernel.repr.map(|repr| {

diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs
@@ -880,10 +880,7 @@ for (var {i}: {i_ty} = {start}; {i} {cmp} {end}; {increment}) {{
             }
             Instruction::AtomicSub { lhs, rhs, out } => {
                 let out = out.fmt_left();
-                match rhs.elem() {
-                    Elem::F32 => write!(f, "{out} = atomicAdd({lhs}, -{rhs});"),
-                    _ => write!(f, "{out} = atomicSub({lhs}, {rhs});"),
-                }
+                write!(f, "{out} = atomicSub({lhs}, {rhs});")
             }
             Instruction::AtomicMax { lhs, rhs, out } => {
                 let out = out.fmt_left();

diff --git a/crates/cubecl-wgpu/src/compute/server.rs b/crates/cubecl-wgpu/src/compute/server.rs
@@ -92,7 +92,7 @@ impl<C: WgpuCompiler> WgpuServer<C> {
         }
 
         let compile = self.logger.debug(compile);
-        let pipeline = C::create_pipeline(self, compile);
+        let pipeline = C::create_pipeline(self, compile, mode);
 
         self.pipelines.insert(kernel_id.clone(), pipeline.clone());
 

diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs
@@ -7,7 +7,10 @@ use crate::{
 };
 use alloc::sync::Arc;
 use cubecl_common::future;
-use cubecl_core::{Feature, Runtime};
+use cubecl_core::{
+    ir::{Elem, FloatKind},
+    AtomicFeature, Feature, Runtime,
+};
 pub use cubecl_runtime::memory_management::MemoryConfiguration;
 use cubecl_runtime::{
     channel::MutexComputeChannel,
@@ -212,6 +215,13 @@ pub(crate) fn create_client_on_setup<C: WgpuCompiler>(
     );
     let channel = MutexComputeChannel::new(server);
 
+    if features.contains(wgpu::Features::SHADER_FLOAT32_ATOMIC) {
+        device_props.register_feature(Feature::Type(Elem::AtomicFloat(FloatKind::F32)));
+
+        device_props.register_feature(Feature::AtomicFloat(AtomicFeature::LoadStore));
+        device_props.register_feature(Feature::AtomicFloat(AtomicFeature::Add));
+    }
+
     ComputeClient::new(channel, device_props)
 }
 
@@ -244,7 +254,7 @@ async fn request_adapter<G: GraphicsApi>(device: &WgpuDevice) -> (wgpu::Instance
         (_, false) => InstanceFlags::default(),
     };
     log::debug!("{instance_flags:?}");
-    let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
+    let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor {
         backends: G::backend().into(),
         flags: instance_flags,
         ..Default::default()
-Original file line number
+Diff line change
@@ Expand Up / @@ -233,7 +233,7 @@ impl Elem { @@
         }
         pub fn is_atomic(&self) -> bool {
-            matches!(self, Self::AtomicI32 | Self::AtomicU32)
+            matches!(self, Self::AtomicI32 | Self::AtomicU32 | Self::AtomicF32)
         }
     }
@@ Expand Down @@