// GPUFitnessEvaluator.java
package net.bmahe.genetics4j.gpu;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jocl.CL;
import org.jocl.cl_command_queue;
import org.jocl.cl_context;
import org.jocl.cl_context_properties;
import org.jocl.cl_device_id;
import org.jocl.cl_kernel;
import org.jocl.cl_platform_id;
import org.jocl.cl_program;
import org.jocl.cl_queue_properties;
import net.bmahe.genetics4j.core.Genotype;
import net.bmahe.genetics4j.core.evaluation.FitnessEvaluator;
import net.bmahe.genetics4j.gpu.opencl.DeviceReader;
import net.bmahe.genetics4j.gpu.opencl.DeviceUtils;
import net.bmahe.genetics4j.gpu.opencl.KernelInfoReader;
import net.bmahe.genetics4j.gpu.opencl.OpenCLExecutionContext;
import net.bmahe.genetics4j.gpu.opencl.PlatformReader;
import net.bmahe.genetics4j.gpu.opencl.PlatformUtils;
import net.bmahe.genetics4j.gpu.opencl.model.Device;
import net.bmahe.genetics4j.gpu.opencl.model.KernelInfo;
import net.bmahe.genetics4j.gpu.opencl.model.Platform;
import net.bmahe.genetics4j.gpu.spec.GPUEAConfiguration;
import net.bmahe.genetics4j.gpu.spec.GPUEAExecutionContext;
import net.bmahe.genetics4j.gpu.spec.Program;
/**
* GPU-accelerated fitness evaluator that leverages OpenCL for high-performance evolutionary algorithm execution.
*
* <p>GPUFitnessEvaluator implements the core {@link FitnessEvaluator} interface to provide GPU acceleration
* for fitness computation in evolutionary algorithms. This evaluator manages the complete OpenCL lifecycle,
* from device discovery and kernel compilation to memory management and resource cleanup.
*
* <p>Key responsibilities include:
* <ul>
* <li><strong>OpenCL initialization</strong>: Platform and device discovery, context creation, and kernel compilation</li>
* <li><strong>Resource management</strong>: Managing OpenCL contexts, command queues, programs, and kernels</li>
* <li><strong>Population partitioning</strong>: Distributing work across multiple OpenCL devices</li>
* <li><strong>Asynchronous execution</strong>: Coordinating concurrent GPU operations with CPU-side logic</li>
* <li><strong>Memory lifecycle</strong>: Ensuring proper cleanup of GPU resources</li>
* </ul>
*
* <p>Architecture overview:
* <ol>
* <li><strong>Initialization ({@link #preEvaluation})</strong>: Discover platforms/devices, compile kernels, create contexts</li>
* <li><strong>Evaluation ({@link #evaluate})</strong>: Partition population, execute fitness computation on GPU</li>
* <li><strong>Cleanup ({@link #postEvaluation})</strong>: Release all OpenCL resources and contexts</li>
* </ol>
*
* <p>Multi-device support:
* <ul>
* <li><strong>Device filtering</strong>: Selects devices based on user-defined criteria (type, capabilities)</li>
* <li><strong>Load balancing</strong>: Automatically distributes population across available devices</li>
* <li><strong>Parallel execution</strong>: Concurrent fitness evaluation on multiple GPUs or devices</li>
* <li><strong>Asynchronous coordination</strong>: Non-blocking execution with CompletableFuture-based results</li>
* </ul>
*
* <p>Resource management patterns:
* <ul>
* <li><strong>Lazy initialization</strong>: OpenCL resources created only when needed</li>
* <li><strong>Automatic cleanup</strong>: Guaranteed resource release through lifecycle methods</li>
* <li><strong>Error recovery</strong>: Robust handling of OpenCL errors and device failures</li>
* <li><strong>Memory optimization</strong>: Efficient GPU memory usage and transfer patterns</li>
* </ul>
*
* <p>Example usage in GPU EA system:
* <pre>{@code
* // GPU configuration with OpenCL kernel
* Program fitnessProgram = Program.ofResource("/kernels/optimization.cl");
* GPUEAConfiguration<Double> config = GPUEAConfigurationBuilder.<Double>builder()
* .program(fitnessProgram)
* .fitness(new MyGPUFitness())
* // ... other EA configuration
* .build();
*
* // Execution context with device preferences
* GPUEAExecutionContext<Double> context = GPUEAExecutionContextBuilder.<Double>builder()
* .populationSize(2000)
* .deviceFilter(device -> device.type() == DeviceType.GPU)
* .platformFilter(platform -> platform.profile() == PlatformProfile.FULL_PROFILE)
* .build();
*
* // Evaluator handles all OpenCL lifecycle automatically
* GPUFitnessEvaluator<Double> evaluator = new GPUFitnessEvaluator<>(context, config, executorService);
*
* // Used by EA system - lifecycle managed automatically
* EASystem<Double> system = EASystemFactory.from(config, context, executorService, evaluator);
* }</pre>
*
* <p>Performance characteristics:
* <ul>
* <li><strong>Initialization overhead</strong>: One-time setup cost for OpenCL compilation and context creation</li>
* <li><strong>Scalability</strong>: Performance scales with population size and problem complexity</li>
* <li><strong>Data transfer</strong>: Most effective when computational intensity is high enough to amortize host-device transfer costs</li>
* <li><strong>Concurrency</strong>: Supports concurrent evaluation across multiple devices</li>
* </ul>
*
* <p>Error handling:
* <ul>
* <li><strong>Device failures</strong>: Graceful degradation when devices become unavailable</li>
* <li><strong>Memory errors</strong>: Proper cleanup and error reporting for GPU memory issues</li>
* <li><strong>Compilation errors</strong>: Clear error messages for kernel compilation failures</li>
* <li><strong>Resource leaks</strong>: Guaranteed cleanup even in exceptional circumstances</li>
* </ul>
*
* @param <T> the type of fitness values produced, must be comparable for selection operations
* @see FitnessEvaluator
* @see GPUEAConfiguration
* @see GPUEAExecutionContext
* @see OpenCLExecutionContext
* @see net.bmahe.genetics4j.gpu.fitness.OpenCLFitness
*/
public class GPUFitnessEvaluator<T extends Comparable<T>> implements FitnessEvaluator<T> {
public static final Logger logger = LogManager.getLogger(GPUFitnessEvaluator.class);
private final GPUEAExecutionContext<T> gpuEAExecutionContext;
private final GPUEAConfiguration<T> gpuEAConfiguration;
private final ExecutorService executorService;
private List<Pair<Platform, Device>> selectedPlatformToDevice;
final List<cl_context> clContexts = new ArrayList<>();
final List<cl_command_queue> clCommandQueues = new ArrayList<>();
final List<cl_program> clPrograms = new ArrayList<>();
final List<Map<String, cl_kernel>> clKernels = new ArrayList<>();
final List<OpenCLExecutionContext> clExecutionContexts = new ArrayList<>();
/**
* Constructs a GPU fitness evaluator with the specified configuration and execution context.
*
* <p>Initializes the evaluator with GPU-specific configuration and execution parameters.
* The evaluator will use the provided executor service for coordinating asynchronous
* operations between CPU and GPU components.
*
* <p>The constructor performs minimal initialization - the actual OpenCL setup occurs
* during {@link #preEvaluation()} to follow the fitness evaluator lifecycle pattern.
*
* @param _gpuEAExecutionContext the GPU execution context with device filters and population settings
* @param _gpuEAConfiguration the GPU EA configuration with OpenCL program and fitness function
* @param _executorService the executor service for managing asynchronous operations
* @throws IllegalArgumentException if any parameter is null
*/
public GPUFitnessEvaluator(final GPUEAExecutionContext<T> _gpuEAExecutionContext,
final GPUEAConfiguration<T> _gpuEAConfiguration, final ExecutorService _executorService) {
Validate.notNull(_gpuEAExecutionContext);
Validate.notNull(_gpuEAConfiguration);
Validate.notNull(_executorService);
this.gpuEAExecutionContext = _gpuEAExecutionContext;
this.gpuEAConfiguration = _gpuEAConfiguration;
this.executorService = _executorService;
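// Fail fast on OpenCL errors: with exceptions enabled, JOCL throws a CLException instead of
// returning an error code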
CL.setExceptionsEnabled(true);
}
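/**
* Loads an OpenCL source file from the classpath and returns its content as a UTF-8 string.
*/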
private String loadResource(final String filename) {
Validate.notBlank(filename);
try {
return IOUtils.resourceToString(filename, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new IllegalStateException("Unable to load resource " + filename, e);
}
}
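/**
* Collects every OpenCL source for the configured program: inline content declared on the program
* spec, followed by the content of any classpath resources it references.
*/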
private List<String> grabProgramSources() {
final Program programSpec = gpuEAConfiguration.program();
logger.info("Load program source: {}", programSpec);
final List<String> sources = new ArrayList<>();
sources.addAll(programSpec.content());
programSpec.resources()
.stream()
.map(this::loadResource)
.forEach(sources::add);
return sources;
}
/**
* Initializes OpenCL resources and prepares GPU devices for fitness evaluation.
*
* <p>This method performs the complete OpenCL initialization sequence:
* <ol>
* <li><strong>Platform discovery</strong>: Enumerates available OpenCL platforms</li>
* <li><strong>Device filtering</strong>: Selects devices based on configured filters</li>
* <li><strong>Context creation</strong>: Creates OpenCL contexts for selected devices</li>
* <li><strong>Queue setup</strong>: Creates command queues with profiling and out-of-order execution</li>
* <li><strong>Program compilation</strong>: Compiles OpenCL kernels from source code</li>
* <li><strong>Kernel preparation</strong>: Creates kernel objects and queries execution info</li>
* <li><strong>Fitness initialization</strong>: Calls lifecycle hooks on the fitness function</li>
* </ol>
*
* <p>Device selection process:
* <ul>
* <li>Applies platform filters to discovered OpenCL platforms</li>
* <li>Enumerates devices for each qualifying platform</li>
* <li>Applies device filters to select appropriate devices</li>
* <li>Validates that at least one device is available</li>
* </ul>
*
* <p>The method creates separate OpenCL contexts for each selected device to enable
* concurrent execution and optimal resource utilization. Each context includes
* compiled programs and kernel objects ready for fitness evaluation.
*
* @throws IllegalArgumentException if no device passes the configured platform and device filters
* @throws RuntimeException if OpenCL initialization, program compilation, or kernel creation fails
*/
@Override
public void preEvaluation() {
logger.trace("Init...");
FitnessEvaluator.super.preEvaluation();
final var platformReader = new PlatformReader();
final var deviceReader = new DeviceReader();
final var kernelInfoReader = new KernelInfoReader();
final int numPlatforms = PlatformUtils.numPlatforms();
logger.info("Found {} platforms", numPlatforms);
final List<cl_platform_id> platformIds = PlatformUtils.platformIds(numPlatforms);
logger.info("Selecting platform and devices");
final var platformFilters = gpuEAExecutionContext.platformFilters();
final var deviceFilters = gpuEAExecutionContext.deviceFilters();
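// Read each platform, keep those accepted by the platform filters, then enumerate the devices of the
// remaining platforms and keep the (platform, device) pairs accepted by the device filters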
selectedPlatformToDevice = platformIds.stream()
.map(platformReader::read)
.filter(platformFilters)
.flatMap(platform -> {
final var platformId = platform.platformId();
final int numDevices = DeviceUtils.numDevices(platformId);
logger.trace("\tPlatform {}: {} devices", platform.name(), numDevices);
final var deviceIds = DeviceUtils.getDeviceIds(platformId, numDevices);
return deviceIds.stream()
.map(deviceId -> Pair.of(platform, deviceId));
})
.map(platformToDeviceId -> {
final var platform = platformToDeviceId.getLeft();
final var platformId = platform.platformId();
final var deviceID = platformToDeviceId.getRight();
return Pair.of(platform, deviceReader.read(platformId, deviceID));
})
.filter(platformToDevice -> deviceFilters.test(platformToDevice.getRight()))
.toList();
if (logger.isTraceEnabled()) {
logger.trace("============================");
logger.trace("Selected devices:");
selectedPlatformToDevice.forEach(pd -> {
logger.trace("{}", pd.getLeft());
logger.trace("\t{}", pd.getRight());
});
logger.trace("============================");
}
Validate.isTrue(selectedPlatformToDevice.size() > 0,
"No OpenCL device matched the configured platform and device filters");
final List<String> programs = grabProgramSources();
final String[] programsArr = programs.toArray(new String[programs.size()]);
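// For each selected device: create a dedicated context and command queue, build the program from the
// collected sources, and create one kernel object (and its kernel info) per configured kernel name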
for (final var platformAndDevice : selectedPlatformToDevice) {
final var platform = platformAndDevice.getLeft();
final var device = platformAndDevice.getRight();
logger.info("Processing platform [{}] / device [{}]", platform.name(), device.name());
logger.info("\tCreating context");
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL.CL_CONTEXT_PLATFORM, platform.platformId());
final cl_context context = CL
.clCreateContext(contextProperties, 1, new cl_device_id[] { device.deviceId() }, null, null, null);
logger.info("\tCreating command queue");
final cl_queue_properties queueProperties = new cl_queue_properties();
queueProperties.addProperty(CL.CL_QUEUE_PROPERTIES,
CL.CL_QUEUE_PROFILING_ENABLE | CL.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
final cl_command_queue commandQueue = CL
.clCreateCommandQueueWithProperties(context, device.deviceId(), queueProperties, null);
logger.info("\tCreate program");
final cl_program program = CL.clCreateProgramWithSource(context, programsArr.length, programsArr, null, null);
final var programSpec = gpuEAConfiguration.program();
final var buildOptions = programSpec.buildOptions()
.orElse(null);
logger.info("\tBuilding program with options: {}", buildOptions);
CL.clBuildProgram(program, 0, null, buildOptions, null, null);
final Set<String> kernelNames = gpuEAConfiguration.program()
.kernelNames();
final Map<String, cl_kernel> kernels = new HashMap<>();
final Map<String, KernelInfo> kernelInfos = new HashMap<>();
for (final String kernelName : kernelNames) {
logger.info("\tCreate kernel {}", kernelName);
final cl_kernel kernel = CL.clCreateKernel(program, kernelName, null);
Validate.notNull(kernel);
kernels.put(kernelName, kernel);
final var kernelInfo = kernelInfoReader.read(device.deviceId(), kernel, kernelName);
logger.trace("\t{}", kernelInfo);
kernelInfos.put(kernelName, kernelInfo);
}
clContexts.add(context);
clCommandQueues.add(commandQueue);
clKernels.add(kernels);
clPrograms.add(program);
final var openCLExecutionContext = OpenCLExecutionContext.builder()
.platform(platform)
.device(device)
.clContext(context)
.clCommandQueue(commandQueue)
.kernels(kernels)
.kernelInfos(kernelInfos)
.clProgram(program)
.build();
clExecutionContexts.add(openCLExecutionContext);
}
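// All device contexts are ready: invoke the fitness function's global setup hook, then its
// per-context setup hooks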
final var fitness = gpuEAConfiguration.fitness();
fitness.beforeAllEvaluations();
for (final OpenCLExecutionContext clExecutionContext : clExecutionContexts) {
fitness.beforeAllEvaluations(clExecutionContext, executorService);
}
}
/**
* Evaluates fitness for a population of genotypes using GPU acceleration.
*
* <p>This method implements the core fitness evaluation logic by distributing the population
* across available OpenCL devices and executing fitness computation concurrently. The
* evaluation process follows these steps:
*
* <ol>
* <li><strong>Population partitioning</strong>: Divides genotypes across available devices</li>
* <li><strong>Parallel dispatch</strong>: Submits evaluation tasks to each device asynchronously</li>
* <li><strong>GPU execution</strong>: Executes OpenCL kernels for fitness computation</li>
* <li><strong>Result collection</strong>: Gathers fitness values from all devices</li>
* <li><strong>Result aggregation</strong>: Combines results preserving original order</li>
* </ol>
*
* <p>Load balancing strategy:
* <ul>
* <li>Automatically calculates the partition size from the population size and device count (see the example below)</li>
* <li>Round-robin assignment of partitions to devices for balanced workload</li>
* <li>Asynchronous execution allows devices to work at their optimal pace</li>
* </ul>
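*
* <p>For example (illustrative numbers only): with 2000 genotypes and 3 OpenCL devices, the
* partition size is {@code ceil(2000 / 3) = 667}, yielding partitions of 667, 667 and 666
* genotypes that are dispatched round-robin to devices 0, 1 and 2.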
*
* <p>The method coordinates with the configured fitness function through lifecycle hooks:
* <ul>
* <li>{@code beforeEvaluation()}: Called before each device partition evaluation</li>
* <li>{@code compute()}: Executes the actual GPU fitness computation</li>
* <li>{@code afterEvaluation()}: Called after each device partition completes</li>
* </ul>
*
* <p>Concurrency and performance:
* <ul>
* <li>Multiple devices execute evaluation partitions concurrently</li>
* <li>CompletableFuture-based coordination for non-blocking execution</li>
* <li>Automatic workload distribution across available GPU resources</li>
* </ul>
*
* @param generation the current generation number for context and logging
* @param genotypes the population of genotypes to evaluate
* @return fitness values corresponding to each genotype in the same order
* @throws IllegalArgumentException if genotypes is null or empty
* @throws RuntimeException if GPU evaluation fails or OpenCL errors occur
*/
@Override
public List<T> evaluate(final long generation, final List<Genotype> genotypes) {
final var fitness = gpuEAConfiguration.fitness();
// TODO make the partition size configurable from the execution context
final int partitionSize = (int) (Math.ceil((double) genotypes.size() / clExecutionContexts.size()));
final var subGenotypes = ListUtils.partition(genotypes, partitionSize);
logger.debug("Genotype decomposed in {} partition(s)", subGenotypes.size());
if (logger.isTraceEnabled()) {
for (int i = 0; i < subGenotypes.size(); i++) {
final List<Genotype> subGenotype = subGenotypes.get(i);
logger.trace("\tPartition {} with {} elements", i, subGenotype.size());
}
}
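// Dispatch each partition to a device, assigned round-robin, and chain the per-partition
// after-evaluation hooks onto the future returned by the fitness computation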
final List<CompletableFuture<List<T>>> subResultsCF = new ArrayList<>();
for (int i = 0; i < subGenotypes.size(); i++) {
final var openCLExecutionContext = clExecutionContexts.get(i % clExecutionContexts.size());
final var subGenotype = subGenotypes.get(i);
fitness.beforeEvaluation(generation, subGenotype);
fitness.beforeEvaluation(openCLExecutionContext, executorService, generation, subGenotype);
final var resultsCF = fitness.compute(openCLExecutionContext, executorService, generation, subGenotype)
.thenApply((results) -> {
fitness.afterEvaluation(openCLExecutionContext, executorService, generation, subGenotype);
fitness.afterEvaluation(generation, subGenotype);
return results;
});
subResultsCF.add(resultsCF);
}
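// Join the partition futures in submission order so that fitness values line up with the
// original genotype order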
final List<T> resultsEvaluation = new ArrayList<>(genotypes.size());
for (final CompletableFuture<List<T>> subResultCF : subResultsCF) {
final var fitnessResults = subResultCF.join();
resultsEvaluation.addAll(fitnessResults);
}
return resultsEvaluation;
}
/**
* Cleans up OpenCL resources and releases GPU memory after evaluation completion.
*
* <p>This method performs comprehensive cleanup of all OpenCL resources in the proper order
* to prevent memory leaks and ensure clean shutdown. The cleanup sequence follows OpenCL
* best practices for resource deallocation:
*
* <ol>
* <li><strong>Fitness cleanup</strong>: Calls lifecycle hooks on the fitness function</li>
* <li><strong>Kernel release</strong>: Releases all compiled kernel objects</li>
* <li><strong>Program release</strong>: Releases compiled OpenCL programs</li>
* <li><strong>Queue release</strong>: Releases command queues and pending operations</li>
* <li><strong>Context release</strong>: Releases OpenCL contexts and associated memory</li>
* <li><strong>Reference cleanup</strong>: Clears internal data structures and references</li>
* </ol>
*
* <p>Resource management guarantees:
* <ul>
* <li>All GPU memory allocations are properly released</li>
* <li>OpenCL objects are released in dependency order to avoid errors</li>
* <li>No resource leaks occur even if individual cleanup operations fail</li>
* <li>Evaluator returns to a clean state ready for potential reinitialization</li>
* </ul>
*
* <p>The method coordinates with the configured fitness function to ensure any
* fitness-specific resources (buffers, textures, etc.) are also properly cleaned up
* through the {@code afterAllEvaluations()} lifecycle hooks.
*
* @throws RuntimeException if releasing OpenCL kernels, programs, command queues, or contexts fails
*/
@Override
public void postEvaluation() {
final var fitness = gpuEAConfiguration.fitness();
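// Run the fitness function's per-context teardown hooks first, then its global teardown hook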
for (final OpenCLExecutionContext clExecutionContext : clExecutionContexts) {
fitness.afterAllEvaluations(clExecutionContext, executorService);
}
fitness.afterAllEvaluations();
logger.debug("Releasing kernels");
for (final Map<String, cl_kernel> kernels : clKernels) {
for (final cl_kernel clKernel : kernels.values()) {
CL.clReleaseKernel(clKernel);
}
}
clKernels.clear();
logger.debug("Releasing programs");
for (final cl_program clProgram : clPrograms) {
CL.clReleaseProgram(clProgram);
}
clPrograms.clear();
logger.debug("Releasing command queues");
for (final cl_command_queue clCommandQueue : clCommandQueues) {
CL.clReleaseCommandQueue(clCommandQueue);
}
clCommandQueues.clear();
logger.debug("Releasing contexts");
for (final cl_context clContext : clContexts) {
CL.clReleaseContext(clContext);
}
clContexts.clear();
clExecutionContexts.clear();
selectedPlatformToDevice = null;
FitnessEvaluator.super.postEvaluation();
}
}