1 package net.bmahe.genetics4j.gpu.opencl.model; 2 3 import java.util.Set; 4 5 import org.immutables.value.Value; 6 import org.jocl.cl_device_id; 7 8 /** 9 * Represents an OpenCL compute device with its capabilities and characteristics for GPU-accelerated evolutionary algorithms. 10 * 11 * <p>Device encapsulates the properties and capabilities of an OpenCL compute device (GPU, CPU, or accelerator) 12 * that can be used for fitness evaluation in evolutionary algorithms. This information is essential for 13 * device selection, kernel optimization, and workload configuration to achieve optimal performance. 14 * 15 * <p>Key device characteristics include: 16 * <ul> 17 * <li><strong>Device identification</strong>: Name, vendor, and version information</li> 18 * <li><strong>Compute capabilities</strong>: Number of compute units and maximum work group sizes</li> 19 * <li><strong>Memory hierarchy</strong>: Global, local, and constant memory sizes and characteristics</li> 20 * <li><strong>Processing features</strong>: Vector width preferences, image support, and built-in kernels</li> 21 * <li><strong>Performance metrics</strong>: Clock frequency and execution capabilities</li> 22 * </ul> 23 * 24 * <p>Device selection considerations for evolutionary algorithms: 25 * <ul> 26 * <li><strong>Device type</strong>: GPU devices typically provide highest parallelism for large populations</li> 27 * <li><strong>Compute units</strong>: More compute units allow better utilization of large populations</li> 28 * <li><strong>Work group sizes</strong>: Must accommodate the parallelism patterns of fitness kernels</li> 29 * <li><strong>Memory capacity</strong>: Must be sufficient for population data and intermediate results</li> 30 * <li><strong>Vector operations</strong>: Vector width preferences can optimize numerical computations</li> 31 * </ul> 32 * 33 * <p>Common device filtering patterns: 34 * <pre>{@code 35 * // Select GPU devices with sufficient parallel processing capability 36 * Predicate<Device> gpuFilter = device -> 37 * device.deviceType().contains(DeviceType.GPU) && 38 * device.maxComputeUnits() >= 8; 39 * 40 * // Select devices with large work group support for population processing 41 * Predicate<Device> workGroupFilter = device -> 42 * device.maxWorkGroupSize() >= 256; 43 * 44 * // Select devices with high clock frequency for compute-intensive fitness 45 * Predicate<Device> performanceFilter = device -> 46 * device.maxClockFrequency() >= 1000; // MHz 47 * 48 * // Select devices that support floating-point vector operations 49 * Predicate<Device> vectorFilter = device -> 50 * device.preferredVectorWidthFloat() >= 4; 51 * 52 * // Comprehensive filter for evolutionary algorithm suitability 53 * Predicate<Device> eaOptimizedFilter = device -> 54 * device.deviceType().contains(DeviceType.GPU) && 55 * device.maxComputeUnits() >= 4 && 56 * device.maxWorkGroupSize() >= 128 && 57 * device.preferredVectorWidthFloat() >= 2; 58 * }</pre> 59 * 60 * <p>Performance optimization using device information: 61 * <ul> 62 * <li><strong>Work group sizing</strong>: Configure kernel work groups based on {@link #maxWorkGroupSize()}</li> 63 * <li><strong>Parallel dispatch</strong>: Scale parallelism based on {@link #maxComputeUnits()}</li> 64 * <li><strong>Vector operations</strong>: Optimize data layouts for {@link #preferredVectorWidthFloat()}</li> 65 * <li><strong>Memory access patterns</strong>: Design kernels considering memory hierarchy characteristics</li> 66 * </ul> 67 * 68 * <p>Device capability assessment workflow: 69 * <ol> 70 * <li><strong>Device discovery</strong>: Enumerate devices from selected platforms</li> 71 * <li><strong>Capability query</strong>: Read device properties from OpenCL runtime</li> 72 * <li><strong>Model creation</strong>: Create device objects with discovered capabilities</li> 73 * <li><strong>Filtering</strong>: Apply user-defined predicates to select suitable devices</li> 74 * <li><strong>Context creation</strong>: Create OpenCL contexts for selected devices</li> 75 * </ol> 76 * 77 * <p>Common device types in evolutionary computation: 78 * <ul> 79 * <li><strong>GPU devices</strong>: Provide massive parallelism for large population fitness evaluation</li> 80 * <li><strong>CPU devices</strong>: Offer good sequential performance and large memory capacity</li> 81 * <li><strong>Accelerator devices</strong>: Specialized hardware for specific computational patterns</li> 82 * <li><strong>Custom devices</strong>: FPGA or other specialized compute devices</li> 83 * </ul> 84 * 85 * <p>Error handling and compatibility: 86 * <ul> 87 * <li><strong>Device availability</strong>: Devices may become unavailable during execution</li> 88 * <li><strong>Capability validation</strong>: Ensure device supports required kernel features</li> 89 * <li><strong>Memory constraints</strong>: Validate device memory is sufficient for population size</li> 90 * <li><strong>Work group limits</strong>: Ensure kernels respect device work group size limits</li> 91 * </ul> 92 * 93 * @see Platform 94 * @see DeviceType 95 * @see net.bmahe.genetics4j.gpu.spec.GPUEAExecutionContext#deviceFilters() 96 * @see net.bmahe.genetics4j.gpu.opencl.DeviceUtils 97 */ 98 @Value.Immutable 99 public interface Device { 100 101 /** 102 * Returns the native OpenCL device identifier. 103 * 104 * @return the OpenCL device ID for low-level operations 105 */ 106 cl_device_id deviceId(); 107 108 /** 109 * Returns the device name provided by the vendor. 110 * 111 * @return the human-readable device name (e.g., "GeForce RTX 3080", "Intel Core i7") 112 */ 113 String name(); 114 115 /** 116 * Returns the device vendor name. 117 * 118 * @return the vendor name (e.g., "NVIDIA Corporation", "Intel", "AMD") 119 */ 120 String vendor(); 121 122 /** 123 * Returns the OpenCL version supported by this device. 124 * 125 * @return the device OpenCL version string (e.g., "OpenCL 2.1") 126 */ 127 String deviceVersion(); 128 129 /** 130 * Returns the device driver version. 131 * 132 * @return the driver version string provided by the vendor 133 */ 134 String driverVersion(); 135 136 /** 137 * Returns the maximum configured clock frequency of the device compute units in MHz. 138 * 139 * @return the maximum clock frequency in megahertz 140 */ 141 int maxClockFrequency(); 142 143 /** 144 * Returns the set of device types that classify this device. 145 * 146 * @return set of device types (e.g., GPU, CPU, ACCELERATOR) 147 */ 148 Set<DeviceType> deviceType(); 149 150 /** 151 * Returns the set of built-in kernel names available on this device. 152 * 153 * @return set of built-in kernel names provided by the device 154 */ 155 Set<String> builtInKernels(); 156 157 /** 158 * Returns the number of parallel compute units on the device. 159 * 160 * <p>Compute units represent the primary parallel processing elements and directly 161 * impact the device's ability to execute work groups concurrently. 162 * 163 * @return the number of parallel compute units available 164 */ 165 int maxComputeUnits(); 166 167 /** 168 * Returns the maximum number of work-item dimensions supported by the device. 169 * 170 * @return the maximum number of dimensions for work-item indexing 171 */ 172 int maxWorkItemDimensions(); 173 174 /** 175 * Returns the maximum number of work-items in a work group for kernel execution. 176 * 177 * <p>This limit constrains the local work group size that can be used when 178 * launching kernels on this device. Larger work groups can improve memory 179 * locality and reduce synchronization overhead. 180 * 181 * @return the maximum work group size for kernel execution 182 */ 183 long maxWorkGroupSize(); 184 185 /** 186 * Returns the maximum number of work-items in each dimension of a work group. 187 * 188 * <p>The array contains the maximum work-item count for each dimension, 189 * providing more granular control over work group configuration than 190 * the overall {@link #maxWorkGroupSize()} limit. 191 * 192 * @return array of maximum work-item counts per dimension 193 */ 194 long[] maxWorkItemSizes(); 195 196 /** 197 * Returns whether the device supports image objects in kernels. 198 * 199 * @return true if the device supports image processing operations 200 */ 201 boolean imageSupport(); 202 203 /** 204 * Returns the preferred vector width for float operations. 205 * 206 * <p>This indicates the optimal vector width for floating-point operations 207 * on this device, which can be used to optimize numerical computations 208 * in fitness evaluation kernels. 209 * 210 * @return the preferred vector width for float operations 211 */ 212 int preferredVectorWidthFloat(); 213 214 /** 215 * Creates a new builder for constructing Device instances. 216 * 217 * @return a new builder for creating device objects 218 */ 219 static ImmutableDevice.Builder builder() { 220 return ImmutableDevice.builder(); 221 } 222 }