diff --git a/crates/cubecl-core/src/compute/launcher.rs b/crates/cubecl-core/src/compute/launcher.rs index 7dba1357c..4b739d8b5 100644 --- a/crates/cubecl-core/src/compute/launcher.rs +++ b/crates/cubecl-core/src/compute/launcher.rs @@ -133,7 +133,10 @@ impl KernelLauncher { /// /// # Safety /// - /// Out-of-bounds reads and writes can happen. + /// The kernel must not: + /// - Contain any out of bounds reads or writes. Doing so is immediate UB. + /// - Contain any loops that never terminate. These may be optimized away entirely or cause + /// other unpredictable behaviour. pub unsafe fn launch_unchecked( self, cube_count: CubeCount, diff --git a/crates/cubecl-runtime/src/base.rs b/crates/cubecl-runtime/src/base.rs index 918896ca9..70460f457 100644 --- a/crates/cubecl-runtime/src/base.rs +++ b/crates/cubecl-runtime/src/base.rs @@ -14,7 +14,8 @@ pub enum ExecutionMode { /// Checked kernels are safe. #[default] Checked, - /// Unchecked kernels are unsafe. + /// Unchecked kernels are unsafe - it's up to the user to uphold indexing & infinite loop invariants + /// in their kernel. Unchecked, } diff --git a/crates/cubecl-wgpu/Cargo.toml b/crates/cubecl-wgpu/Cargo.toml index 35fc18e74..70b584c3f 100644 --- a/crates/cubecl-wgpu/Cargo.toml +++ b/crates/cubecl-wgpu/Cargo.toml @@ -46,11 +46,14 @@ cfg-if = { workspace = true } # wgpu dependency for platforms other than macOS [target.'cfg(not(target_os = "macos"))'.dependencies] -wgpu = { version = "23.0.0", features = ["fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "24.0.0", features = ["fragile-send-sync-non-atomic-wasm"] } # On macOS, the `vulkan-portability` feature is required due to the MoltenVK translation layer. # To install MoltenVK, install the VulkanSDK: https://vulkan.lunarg.com/sdk/home#mac [target.'cfg(target_os = "macos")'.dependencies] -wgpu = { version = "23.0.0", features = ["vulkan-portability", "fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "24.0.0", features = [ + "vulkan-portability", + "fragile-send-sync-non-atomic-wasm", +] } [dev-dependencies] cubecl-core = { path = "../cubecl-core", version = "0.5.0", features = [ diff --git a/crates/cubecl-wgpu/src/compiler/base.rs b/crates/cubecl-wgpu/src/compiler/base.rs index 732d59c40..52eadb8b6 100644 --- a/crates/cubecl-wgpu/src/compiler/base.rs +++ b/crates/cubecl-wgpu/src/compiler/base.rs @@ -18,6 +18,7 @@ pub trait WgpuCompiler: Compiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc; #[allow(async_fn_in_trait)] diff --git a/crates/cubecl-wgpu/src/compiler/spirv.rs b/crates/cubecl-wgpu/src/compiler/spirv.rs index b662f5dfb..eac420945 100644 --- a/crates/cubecl-wgpu/src/compiler/spirv.rs +++ b/crates/cubecl-wgpu/src/compiler/spirv.rs @@ -52,6 +52,7 @@ impl WgpuCompiler for SpirvCompiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc { let (module, layout) = kernel .repr @@ -107,21 +108,28 @@ impl WgpuCompiler for SpirvCompiler { }) .unwrap_or_else(|| { let source = &kernel.source; - // Cube always in principle uses unchecked modules. Certain operations like - // indexing are instead checked by cube. The WebGPU specification only makes - // incredibly loose guarantees that Cube can't rely on. Additionally, kernels - // can opt in/out per operation whether checks should be performed which can be faster. - // + + let checks = wgpu::ShaderRuntimeChecks { + // Cube does not need wgpu bounds checks - OOB behaviour is instead + // checked by cube (if enabled). + // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on. + bounds_checks: false, + // Loop bounds are only checked in checked mode. + force_loop_bounding: mode == ExecutionMode::Checked, + }; + // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode // is only available through the use of unsafe code. let module = unsafe { - server - .device - .create_shader_module_unchecked(wgpu::ShaderModuleDescriptor { - label: Some(&kernel.entrypoint_name), + server.device.create_shader_module_trusted( + wgpu::ShaderModuleDescriptor { + label: None, source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)), - }) + }, + checks, + ) }; + (module, None) }); @@ -413,9 +421,7 @@ fn is_robust(device: &wgpu::Device) -> bool { .contains(&EXT_ROBUSTNESS2_NAME) } unsafe { - device - .as_hal::(|device| device.map(is_robust).unwrap_or(false)) - .unwrap_or(false) + device.as_hal::(|device| device.map(is_robust).unwrap_or(false)) } } diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs index b595ade3e..48b297bcc 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs @@ -233,7 +233,7 @@ impl Elem { } pub fn is_atomic(&self) -> bool { - matches!(self, Self::AtomicI32 | Self::AtomicU32) + matches!(self, Self::AtomicI32 | Self::AtomicU32 | Self::AtomicF32) } } diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs index a3404da94..e3acf048c 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs @@ -87,22 +87,29 @@ impl WgpuCompiler for WgslCompiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc { let source = &kernel.source; - // Cube always in principle uses unchecked modules. Certain operations like - // indexing are instead checked by cube. The WebGPU specification only makes - // incredibly loose guarantees that Cube can't rely on. Additionally, kernels - // can opt in/out per operation whether checks should be performed which can be faster. - // + + let checks = wgpu::ShaderRuntimeChecks { + // Cube does not need wgpu bounds checks - OOB behaviour is instead + // checked by cube (if enabled). + // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on. + bounds_checks: false, + // Loop bounds are only checked in checked mode. + force_loop_bounding: mode == ExecutionMode::Checked, + }; + // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode // is only available through the use of unsafe code. let module = unsafe { - server - .device - .create_shader_module_unchecked(ShaderModuleDescriptor { + server.device.create_shader_module_trusted( + ShaderModuleDescriptor { label: None, source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)), - }) + }, + checks, + ) }; let layout = kernel.repr.map(|repr| { diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs index 5f4737aef..b3194b94d 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs @@ -880,10 +880,7 @@ for (var {i}: {i_ty} = {start}; {i} {cmp} {end}; {increment}) {{ } Instruction::AtomicSub { lhs, rhs, out } => { let out = out.fmt_left(); - match rhs.elem() { - Elem::F32 => write!(f, "{out} = atomicAdd({lhs}, -{rhs});"), - _ => write!(f, "{out} = atomicSub({lhs}, {rhs});"), - } + write!(f, "{out} = atomicSub({lhs}, {rhs});") } Instruction::AtomicMax { lhs, rhs, out } => { let out = out.fmt_left(); diff --git a/crates/cubecl-wgpu/src/compute/server.rs b/crates/cubecl-wgpu/src/compute/server.rs index 40c2b9653..644eb5eba 100644 --- a/crates/cubecl-wgpu/src/compute/server.rs +++ b/crates/cubecl-wgpu/src/compute/server.rs @@ -92,7 +92,7 @@ impl WgpuServer { } let compile = self.logger.debug(compile); - let pipeline = C::create_pipeline(self, compile); + let pipeline = C::create_pipeline(self, compile, mode); self.pipelines.insert(kernel_id.clone(), pipeline.clone()); diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index c8e6d63f8..20911872f 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -7,7 +7,10 @@ use crate::{ }; use alloc::sync::Arc; use cubecl_common::future; -use cubecl_core::{Feature, Runtime}; +use cubecl_core::{ + ir::{Elem, FloatKind}, + AtomicFeature, Feature, Runtime, +}; pub use cubecl_runtime::memory_management::MemoryConfiguration; use cubecl_runtime::{ channel::MutexComputeChannel, @@ -212,6 +215,13 @@ pub(crate) fn create_client_on_setup( ); let channel = MutexComputeChannel::new(server); + if features.contains(wgpu::Features::SHADER_FLOAT32_ATOMIC) { + device_props.register_feature(Feature::Type(Elem::AtomicFloat(FloatKind::F32))); + + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::LoadStore)); + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::Add)); + } + ComputeClient::new(channel, device_props) } @@ -244,7 +254,7 @@ async fn request_adapter(device: &WgpuDevice) -> (wgpu::Instance (_, false) => InstanceFlags::default(), }; log::debug!("{instance_flags:?}"); - let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { + let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor { backends: G::backend().into(), flags: instance_flags, ..Default::default()