From e633651cd678d6e98e8ee989a6d8fc9f0f362afe Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 19 Jan 2025 16:54:17 +0000 Subject: [PATCH 1/7] Upgrade to wgpu 24 --- crates/cubecl-wgpu/Cargo.toml | 7 +++++-- crates/cubecl-wgpu/src/compiler/base.rs | 1 + crates/cubecl-wgpu/src/compiler/spirv.rs | 20 +++++++++++++------ .../cubecl-wgpu/src/compiler/wgsl/compiler.rs | 19 ++++++++++++------ crates/cubecl-wgpu/src/compute/server.rs | 2 +- crates/cubecl-wgpu/src/runtime.rs | 14 +++++++++++-- 6 files changed, 46 insertions(+), 17 deletions(-) diff --git a/crates/cubecl-wgpu/Cargo.toml b/crates/cubecl-wgpu/Cargo.toml index 35fc18e74..70b584c3f 100644 --- a/crates/cubecl-wgpu/Cargo.toml +++ b/crates/cubecl-wgpu/Cargo.toml @@ -46,11 +46,14 @@ cfg-if = { workspace = true } # wgpu dependency for platforms other than macOS [target.'cfg(not(target_os = "macos"))'.dependencies] -wgpu = { version = "23.0.0", features = ["fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "24.0.0", features = ["fragile-send-sync-non-atomic-wasm"] } # On macOS, the `vulkan-portability` feature is required due to the MoltenVK translation layer. # To install MoltenVK, install the VulkanSDK: https://vulkan.lunarg.com/sdk/home#mac [target.'cfg(target_os = "macos")'.dependencies] -wgpu = { version = "23.0.0", features = ["vulkan-portability", "fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "24.0.0", features = [ + "vulkan-portability", + "fragile-send-sync-non-atomic-wasm", +] } [dev-dependencies] cubecl-core = { path = "../cubecl-core", version = "0.5.0", features = [ diff --git a/crates/cubecl-wgpu/src/compiler/base.rs b/crates/cubecl-wgpu/src/compiler/base.rs index 732d59c40..52eadb8b6 100644 --- a/crates/cubecl-wgpu/src/compiler/base.rs +++ b/crates/cubecl-wgpu/src/compiler/base.rs @@ -18,6 +18,7 @@ pub trait WgpuCompiler: Compiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc; #[allow(async_fn_in_trait)] diff --git a/crates/cubecl-wgpu/src/compiler/spirv.rs b/crates/cubecl-wgpu/src/compiler/spirv.rs index b662f5dfb..b255e1a5a 100644 --- a/crates/cubecl-wgpu/src/compiler/spirv.rs +++ b/crates/cubecl-wgpu/src/compiler/spirv.rs @@ -52,6 +52,7 @@ impl WgpuCompiler for SpirvCompiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc { let (module, layout) = kernel .repr @@ -111,17 +112,24 @@ impl WgpuCompiler for SpirvCompiler { // indexing are instead checked by cube. The WebGPU specification only makes // incredibly loose guarantees that Cube can't rely on. Additionally, kernels // can opt in/out per operation whether checks should be performed which can be faster. - // + let checks = ShaderRuntimeChecks { + bounds_checks: false, + // Loop bounds are only checked in checked mode. + force_loop_bounding: mode == ExecutionMode::Checked, + }; + // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode // is only available through the use of unsafe code. let module = unsafe { - server - .device - .create_shader_module_unchecked(wgpu::ShaderModuleDescriptor { - label: Some(&kernel.entrypoint_name), + server.device.create_shader_module_trusted( + ShaderModuleDescriptor { + label: None, source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)), - }) + }, + checks, + ) }; + (module, None) }); diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs index a3404da94..a420d3849 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs @@ -19,7 +19,7 @@ use cubecl_runtime::{DeviceProperties, ExecutionMode}; use wgpu::{ BindGroupLayoutDescriptor, BindGroupLayoutEntry, BindingType, BufferBindingType, ComputePipeline, DeviceDescriptor, PipelineLayoutDescriptor, ShaderModuleDescriptor, - ShaderStages, + ShaderRuntimeChecks, ShaderStages, }; #[derive(Clone, Debug, Default)] @@ -87,22 +87,29 @@ impl WgpuCompiler for WgslCompiler { fn create_pipeline( server: &mut WgpuServer, kernel: CompiledKernel, + mode: ExecutionMode, ) -> Arc { let source = &kernel.source; // Cube always in principle uses unchecked modules. Certain operations like // indexing are instead checked by cube. The WebGPU specification only makes // incredibly loose guarantees that Cube can't rely on. Additionally, kernels // can opt in/out per operation whether checks should be performed which can be faster. - // + let checks = ShaderRuntimeChecks { + bounds_checks: false, + // Loop bounds are only checked in checked mode. + force_loop_bounding: mode == ExecutionMode::Checked, + }; + // SAFETY: Cube guarantees OOB safety when launching in checked mode. Launching in unchecked mode // is only available through the use of unsafe code. let module = unsafe { - server - .device - .create_shader_module_unchecked(ShaderModuleDescriptor { + server.device.create_shader_module_trusted( + ShaderModuleDescriptor { label: None, source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)), - }) + }, + checks, + ) }; let layout = kernel.repr.map(|repr| { diff --git a/crates/cubecl-wgpu/src/compute/server.rs b/crates/cubecl-wgpu/src/compute/server.rs index 40c2b9653..644eb5eba 100644 --- a/crates/cubecl-wgpu/src/compute/server.rs +++ b/crates/cubecl-wgpu/src/compute/server.rs @@ -92,7 +92,7 @@ impl WgpuServer { } let compile = self.logger.debug(compile); - let pipeline = C::create_pipeline(self, compile); + let pipeline = C::create_pipeline(self, compile, mode); self.pipelines.insert(kernel_id.clone(), pipeline.clone()); diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index c8e6d63f8..0314e5442 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -7,7 +7,7 @@ use crate::{ }; use alloc::sync::Arc; use cubecl_common::future; -use cubecl_core::{Feature, Runtime}; +use cubecl_core::{AtomicFeature, Feature, Runtime}; pub use cubecl_runtime::memory_management::MemoryConfiguration; use cubecl_runtime::{ channel::MutexComputeChannel, @@ -212,6 +212,16 @@ pub(crate) fn create_client_on_setup( ); let channel = MutexComputeChannel::new(server); + + // The wgpu float32-atomic feature guarantees both add and min/max. It does only guarantee float32 support, + // but since f16 and others aren't supported anyway there is not much to it. + if features.contains(wgpu::Features::SHADER_FLOAT32_ATOMIC) { + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::LoadStore)); + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::Add)); + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::MinMax)); + } + + C::register_features(&setup.adapter, &setup.device, &mut device_props); ComputeClient::new(channel, device_props) } @@ -244,7 +254,7 @@ async fn request_adapter(device: &WgpuDevice) -> (wgpu::Instance (_, false) => InstanceFlags::default(), }; log::debug!("{instance_flags:?}"); - let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { + let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor { backends: G::backend().into(), flags: instance_flags, ..Default::default() From 6a33cd338b5820079c016ed516c0230f8e94d0bd Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 19 Jan 2025 17:10:08 +0000 Subject: [PATCH 2/7] More updates --- crates/cubecl-core/src/compute/launcher.rs | 5 ++++- crates/cubecl-runtime/src/base.rs | 3 ++- crates/cubecl-wgpu/src/runtime.rs | 2 -- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/cubecl-core/src/compute/launcher.rs b/crates/cubecl-core/src/compute/launcher.rs index 7dba1357c..4b739d8b5 100644 --- a/crates/cubecl-core/src/compute/launcher.rs +++ b/crates/cubecl-core/src/compute/launcher.rs @@ -133,7 +133,10 @@ impl KernelLauncher { /// /// # Safety /// - /// Out-of-bounds reads and writes can happen. + /// The kernel must not: + /// - Contain any out of bounds reads or writes. Doing so is immediate UB. + /// - Contain any loops that never terminate. These may be optimized away entirely or cause + /// other unpredictable behaviour. pub unsafe fn launch_unchecked( self, cube_count: CubeCount, diff --git a/crates/cubecl-runtime/src/base.rs b/crates/cubecl-runtime/src/base.rs index 918896ca9..70460f457 100644 --- a/crates/cubecl-runtime/src/base.rs +++ b/crates/cubecl-runtime/src/base.rs @@ -14,7 +14,8 @@ pub enum ExecutionMode { /// Checked kernels are safe. #[default] Checked, - /// Unchecked kernels are unsafe. + /// Unchecked kernels are unsafe - it's up to the user to uphold indexing & infinite loop invariants + /// in their kernel. Unchecked, } diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index 0314e5442..1898db112 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -212,7 +212,6 @@ pub(crate) fn create_client_on_setup( ); let channel = MutexComputeChannel::new(server); - // The wgpu float32-atomic feature guarantees both add and min/max. It does only guarantee float32 support, // but since f16 and others aren't supported anyway there is not much to it. if features.contains(wgpu::Features::SHADER_FLOAT32_ATOMIC) { @@ -221,7 +220,6 @@ pub(crate) fn create_client_on_setup( device_props.register_feature(Feature::AtomicFloat(AtomicFeature::MinMax)); } - C::register_features(&setup.adapter, &setup.device, &mut device_props); ComputeClient::new(channel, device_props) } From 3c4ddae1e09150da50c577feaf382da0269dcdab Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 19 Jan 2025 17:28:13 +0000 Subject: [PATCH 3/7] Fix spirv --- crates/cubecl-wgpu/src/compiler/spirv.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/cubecl-wgpu/src/compiler/spirv.rs b/crates/cubecl-wgpu/src/compiler/spirv.rs index b255e1a5a..b27a35a74 100644 --- a/crates/cubecl-wgpu/src/compiler/spirv.rs +++ b/crates/cubecl-wgpu/src/compiler/spirv.rs @@ -112,7 +112,7 @@ impl WgpuCompiler for SpirvCompiler { // indexing are instead checked by cube. The WebGPU specification only makes // incredibly loose guarantees that Cube can't rely on. Additionally, kernels // can opt in/out per operation whether checks should be performed which can be faster. - let checks = ShaderRuntimeChecks { + let checks = wgpu::ShaderRuntimeChecks { bounds_checks: false, // Loop bounds are only checked in checked mode. force_loop_bounding: mode == ExecutionMode::Checked, @@ -122,7 +122,7 @@ impl WgpuCompiler for SpirvCompiler { // is only available through the use of unsafe code. let module = unsafe { server.device.create_shader_module_trusted( - ShaderModuleDescriptor { + wgpu::ShaderModuleDescriptor { label: None, source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source)), }, @@ -421,9 +421,7 @@ fn is_robust(device: &wgpu::Device) -> bool { .contains(&EXT_ROBUSTNESS2_NAME) } unsafe { - device - .as_hal::(|device| device.map(is_robust).unwrap_or(false)) - .unwrap_or(false) + device.as_hal::(|device| device.map(is_robust).unwrap_or(false)) } } From a83e89ada0d53e657f55aa921bea23ec47107b8e Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 19 Jan 2025 23:28:53 +0000 Subject: [PATCH 4/7] Fix float atomic register --- crates/cubecl-wgpu/src/runtime.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index 1898db112..963f42824 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -212,12 +212,11 @@ pub(crate) fn create_client_on_setup( ); let channel = MutexComputeChannel::new(server); - // The wgpu float32-atomic feature guarantees both add and min/max. It does only guarantee float32 support, - // but since f16 and others aren't supported anyway there is not much to it. if features.contains(wgpu::Features::SHADER_FLOAT32_ATOMIC) { + device_props.register_feature(Feature::Type(Elem::AtomicFloat(FloatKind::F32))); + device_props.register_feature(Feature::AtomicFloat(AtomicFeature::LoadStore)); device_props.register_feature(Feature::AtomicFloat(AtomicFeature::Add)); - device_props.register_feature(Feature::AtomicFloat(AtomicFeature::MinMax)); } ComputeClient::new(channel, device_props) From 5b0ba03356e82726107abbd627ae591d3919127d Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 19 Jan 2025 23:41:06 +0000 Subject: [PATCH 5/7] Imports --- crates/cubecl-wgpu/src/runtime.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index 963f42824..20911872f 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -7,7 +7,10 @@ use crate::{ }; use alloc::sync::Arc; use cubecl_common::future; -use cubecl_core::{AtomicFeature, Feature, Runtime}; +use cubecl_core::{ + ir::{Elem, FloatKind}, + AtomicFeature, Feature, Runtime, +}; pub use cubecl_runtime::memory_management::MemoryConfiguration; use cubecl_runtime::{ channel::MutexComputeChannel, From 32d35fce2d7ee3bbf64c9d66effc083887de4be8 Mon Sep 17 00:00:00 2001 From: Arthur Brussee Date: Mon, 20 Jan 2025 11:08:11 +0000 Subject: [PATCH 6/7] Fixup atomics --- crates/cubecl-wgpu/src/compiler/wgsl/base.rs | 2 +- crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs index b595ade3e..48b297bcc 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs @@ -233,7 +233,7 @@ impl Elem { } pub fn is_atomic(&self) -> bool { - matches!(self, Self::AtomicI32 | Self::AtomicU32) + matches!(self, Self::AtomicI32 | Self::AtomicU32 | Self::AtomicF32) } } diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs index 5f4737aef..b3194b94d 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs @@ -880,10 +880,7 @@ for (var {i}: {i_ty} = {start}; {i} {cmp} {end}; {increment}) {{ } Instruction::AtomicSub { lhs, rhs, out } => { let out = out.fmt_left(); - match rhs.elem() { - Elem::F32 => write!(f, "{out} = atomicAdd({lhs}, -{rhs});"), - _ => write!(f, "{out} = atomicSub({lhs}, {rhs});"), - } + write!(f, "{out} = atomicSub({lhs}, {rhs});") } Instruction::AtomicMax { lhs, rhs, out } => { let out = out.fmt_left(); From bff1c78fb50e88a440cae3e403c7cd601620bf86 Mon Sep 17 00:00:00 2001 From: Arthur Brussee Date: Mon, 20 Jan 2025 14:33:54 +0000 Subject: [PATCH 7/7] Clarify comment --- crates/cubecl-wgpu/src/compiler/spirv.rs | 8 ++++---- crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/cubecl-wgpu/src/compiler/spirv.rs b/crates/cubecl-wgpu/src/compiler/spirv.rs index b27a35a74..eac420945 100644 --- a/crates/cubecl-wgpu/src/compiler/spirv.rs +++ b/crates/cubecl-wgpu/src/compiler/spirv.rs @@ -108,11 +108,11 @@ impl WgpuCompiler for SpirvCompiler { }) .unwrap_or_else(|| { let source = &kernel.source; - // Cube always in principle uses unchecked modules. Certain operations like - // indexing are instead checked by cube. The WebGPU specification only makes - // incredibly loose guarantees that Cube can't rely on. Additionally, kernels - // can opt in/out per operation whether checks should be performed which can be faster. + let checks = wgpu::ShaderRuntimeChecks { + // Cube does not need wgpu bounds checks - OOB behaviour is instead + // checked by cube (if enabled). + // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on. bounds_checks: false, // Loop bounds are only checked in checked mode. force_loop_bounding: mode == ExecutionMode::Checked, diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs index a420d3849..e3acf048c 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs @@ -19,7 +19,7 @@ use cubecl_runtime::{DeviceProperties, ExecutionMode}; use wgpu::{ BindGroupLayoutDescriptor, BindGroupLayoutEntry, BindingType, BufferBindingType, ComputePipeline, DeviceDescriptor, PipelineLayoutDescriptor, ShaderModuleDescriptor, - ShaderRuntimeChecks, ShaderStages, + ShaderStages, }; #[derive(Clone, Debug, Default)] @@ -90,11 +90,11 @@ impl WgpuCompiler for WgslCompiler { mode: ExecutionMode, ) -> Arc { let source = &kernel.source; - // Cube always in principle uses unchecked modules. Certain operations like - // indexing are instead checked by cube. The WebGPU specification only makes - // incredibly loose guarantees that Cube can't rely on. Additionally, kernels - // can opt in/out per operation whether checks should be performed which can be faster. - let checks = ShaderRuntimeChecks { + + let checks = wgpu::ShaderRuntimeChecks { + // Cube does not need wgpu bounds checks - OOB behaviour is instead + // checked by cube (if enabled). + // This is because the WebGPU specification only makes loose guarantees that Cube can't rely on. bounds_checks: false, // Loop bounds are only checked in checked mode. force_loop_bounding: mode == ExecutionMode::Checked,