View Javadoc
1   package net.bmahe.genetics4j.gpu.opencl.model;
2   
3   import org.immutables.value.Value;
4   
5   /**
6    * Represents kernel-specific execution characteristics and resource requirements for an OpenCL kernel on a specific
7    * device.
8    * 
9    * <p>KernelInfo encapsulates the device-specific compilation and execution characteristics of an OpenCL kernel,
10   * providing essential information for optimal work group configuration and resource allocation in GPU-accelerated
11   * evolutionary algorithms. This information is determined at kernel compilation time and varies by device.
12   * 
13   * <p>Key kernel characteristics include:
14   * <ul>
15   * <li><strong>Work group constraints</strong>: Maximum and preferred work group sizes for efficient execution</li>
16   * <li><strong>Memory usage</strong>: Local and private memory requirements per work-item</li>
17   * <li><strong>Performance optimization</strong>: Preferred work group size multiples for optimal resource
18   * utilization</li>
19   * <li><strong>Resource validation</strong>: Constraints for validating kernel launch parameters</li>
20   * </ul>
21   * 
22   * <p>Kernel optimization considerations for evolutionary algorithms:
23   * <ul>
24   * <li><strong>Work group sizing</strong>: Configure launch parameters within device-specific limits</li>
25   * <li><strong>Memory allocation</strong>: Ensure sufficient local memory for parallel fitness evaluation</li>
26   * <li><strong>Performance tuning</strong>: Align work group sizes with preferred multiples</li>
27   * <li><strong>Resource planning</strong>: Account for per-work-item memory requirements</li>
28   * </ul>
29   * 
30   * <p>Common usage patterns for kernel configuration:
31   * 
32   * <pre>{@code
33   * // Query kernel information after compilation
34   * KernelInfo kernelInfo = kernelInfoReader.read(deviceId, kernel, "fitness_evaluation");
35   * 
36   * // Configure work group size within device limits
37   * long maxWorkGroupSize = Math.min(kernelInfo.workGroupSize(), device.maxWorkGroupSize());
38   * 
39   * // Optimize for preferred work group size multiple
40   * long preferredMultiple = kernelInfo.preferredWorkGroupSizeMultiple();
41   * long optimalWorkGroupSize = (maxWorkGroupSize / preferredMultiple) * preferredMultiple;
42   * 
43   * // Validate memory requirements for population size
44   * long populationSize = 1000;
45   * long totalLocalMem = kernelInfo.localMemSize() * optimalWorkGroupSize;
46   * long totalPrivateMem = kernelInfo.privateMemSize() * populationSize;
47   * 
48   * // Configure kernel execution with validated parameters
49   * clEnqueueNDRangeKernel(
50   * 		commandQueue,
51   * 			kernel,
52   * 			1,
53   * 			null,
54   * 			new long[] { populationSize },
55   * 			new long[] { optimalWorkGroupSize },
56   * 			0,
57   * 			null,
58   * 			null);
59   * }</pre>
60   * 
61   * <p>Performance optimization workflow:
62   * <ol>
63   * <li><strong>Kernel compilation</strong>: Compile kernel for target device</li>
64   * <li><strong>Information query</strong>: Read kernel-specific execution characteristics</li>
65   * <li><strong>Work group optimization</strong>: Calculate optimal work group size based on preferences</li>
66   * <li><strong>Memory validation</strong>: Ensure memory requirements fit within device limits</li>
67   * <li><strong>Launch configuration</strong>: Configure kernel execution with optimized parameters</li>
68   * </ol>
69   * 
70   * <p>Memory management considerations:
71   * <ul>
72   * <li><strong>Local memory</strong>: Shared among work-items in the same work group</li>
73   * <li><strong>Private memory</strong>: Individual memory per work-item</li>
74   * <li><strong>Total allocation</strong>: Sum of all work-items' memory requirements</li>
75   * <li><strong>Device limits</strong>: Validate against device memory constraints</li>
76   * </ul>
77   * 
78   * <p>Error handling and validation:
79   * <ul>
80   * <li><strong>Work group limits</strong>: Ensure launch parameters don't exceed kernel limits</li>
81   * <li><strong>Memory constraints</strong>: Validate total memory usage against device capabilities</li>
82   * <li><strong>Performance degradation</strong>: Monitor for suboptimal work group configurations</li>
83   * <li><strong>Resource conflicts</strong>: Handle multiple kernels competing for device resources</li>
84   * </ul>
85   * 
86   * @see Device
87   * @see net.bmahe.genetics4j.gpu.opencl.KernelInfoReader
88   * @see net.bmahe.genetics4j.gpu.opencl.KernelInfoUtils
89   */
90  @Value.Immutable
91  public interface KernelInfo {
92  
93  	/**
94  	 * Returns the name of the kernel function.
95  	 * 
96  	 * @return the kernel function name as specified in the OpenCL program
97  	 */
98  	String name();
99  
100 	/**
101 	 * Returns the maximum work group size that can be used when executing this kernel on the device.
102 	 * 
103 	 * <p>This value represents the maximum number of work-items that can be in a work group when executing this specific
104 	 * kernel on the target device. It may be smaller than the device's general maximum work group size due to
105 	 * kernel-specific resource requirements.
106 	 * 
107 	 * @return the maximum work group size for this kernel
108 	 */
109 	long workGroupSize();
110 
111 	/**
112 	 * Returns the preferred work group size multiple for optimal kernel execution performance.
113 	 * 
114 	 * <p>For optimal performance, the work group size should be a multiple of this value. This represents the native
115 	 * vector width or wavefront size of the device and helps achieve better resource utilization and memory coalescing.
116 	 * 
117 	 * @return the preferred work group size multiple for performance optimization
118 	 */
119 	long preferredWorkGroupSizeMultiple();
120 
121 	/**
122 	 * Returns the amount of local memory in bytes used by this kernel.
123 	 * 
124 	 * <p>Local memory is shared among all work-items in a work group and includes both statically allocated local
125 	 * variables and dynamically allocated local memory passed as kernel arguments. This value is used to validate that
126 	 * the total local memory usage doesn't exceed the device's local memory capacity.
127 	 * 
128 	 * @return the local memory usage in bytes per work group
129 	 */
130 	long localMemSize();
131 
132 	/**
133 	 * Returns the minimum amount of private memory in bytes used by each work-item.
134 	 * 
135 	 * <p>Private memory is individual to each work-item and includes local variables, function call stacks, and other
136 	 * per-work-item data. This value helps estimate the total memory footprint when launching kernels with large work
137 	 * group sizes.
138 	 * 
139 	 * @return the private memory usage in bytes per work-item
140 	 */
141 	long privateMemSize();
142 
143 	/**
144 	 * Creates a new builder for constructing KernelInfo instances.
145 	 * 
146 	 * @return a new builder for creating kernel information objects
147 	 */
148 	static ImmutableKernelInfo.Builder builder() {
149 		return ImmutableKernelInfo.builder();
150 	}
151 }