// OpenCLFitness.java

package net.bmahe.genetics4j.gpu.spec.fitness;

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import net.bmahe.genetics4j.core.Genotype;
import net.bmahe.genetics4j.gpu.opencl.OpenCLExecutionContext;

/**
 * Abstract base class for implementing OpenCL-based fitness evaluation in GPU-accelerated evolutionary algorithms.
 * 
 * <p>OpenCLFitness provides the framework for evaluating population fitness using OpenCL kernels executed
 * on GPU devices. This class defines the lifecycle and coordination patterns needed for efficient GPU-based
 * fitness computation, including resource management, data transfer, and kernel execution orchestration.
 * 
 * <p>The fitness evaluation lifecycle consists of several phases:
 * <ol>
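 * <li><strong>Global initialization</strong>: One-time setup before any evaluations
 * ({@link #beforeAllEvaluations()})</li>
 * <li><strong>Per-device initialization</strong>: Setup for each OpenCL device context
 * ({@link #beforeAllEvaluations(OpenCLExecutionContext, ExecutorService)})</li>
 * <li><strong>Generation setup</strong>: Preparation before each generation evaluation
 * ({@link #beforeEvaluation(long, List)} globally and
 * {@link #beforeEvaluation(OpenCLExecutionContext, ExecutorService, long, List)} per device)</li>
 * <li><strong>Computation</strong>: Actual fitness evaluation using OpenCL kernels
 * ({@link #compute(OpenCLExecutionContext, ExecutorService, long, List)})</li>
 * <li><strong>Generation cleanup</strong>: Cleanup after each generation evaluation
 * ({@link #afterEvaluation(OpenCLExecutionContext, ExecutorService, long, List)} per device and
 * {@link #afterEvaluation(long, List)} globally)</li>
 * <li><strong>Per-device cleanup</strong>: Cleanup for each OpenCL device context
 * ({@link #afterAllEvaluations(OpenCLExecutionContext, ExecutorService)})</li>
 * <li><strong>Global cleanup</strong>: Final cleanup after all evaluations
 * ({@link #afterAllEvaluations()})</li>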
 * </ol>
 * 
 * <p>Key responsibilities for implementations:
 * <ul>
 * <li><strong>Data preparation</strong>: Convert genotypes to GPU-compatible data formats</li>
 * <li><strong>Memory management</strong>: Allocate and manage GPU memory buffers</li>
 * <li><strong>Kernel execution</strong>: Configure and execute OpenCL kernels with appropriate parameters</li>
 * <li><strong>Result extraction</strong>: Retrieve and convert fitness values from GPU memory</li>
 * <li><strong>Resource cleanup</strong>: Ensure proper cleanup of GPU resources</li>
 * </ul>
 * 
 * <p>Common implementation patterns:
 * <pre>{@code
 * public class MyGPUFitness extends OpenCLFitness<Double> {
 *     
 *     private CLData inputBuffer;
 *     private CLData outputBuffer;
 *     
 *     @Override
 *     public void beforeAllEvaluations(OpenCLExecutionContext context, ExecutorService executor) {
 *         // Allocate GPU memory buffers that persist across generations
 *         int maxPopulationSize = getMaxPopulationSize();
 *         inputBuffer = CLData.allocateFloat(context, maxPopulationSize * chromosomeSize);
 *         outputBuffer = CLData.allocateFloat(context, maxPopulationSize);
 *     }
 *     
 *     @Override
 *     public CompletableFuture<List<Double>> compute(OpenCLExecutionContext context, 
 *             ExecutorService executor, long generation, List<Genotype> genotypes) {
 *         
 *         return CompletableFuture.supplyAsync(() -> {
 *             // Transfer genotype data to GPU
 *             transferGenotypesToGPU(context, genotypes, inputBuffer);
 *             
 *             // Execute fitness evaluation kernel
 *             executeKernel(context, "fitness_kernel", genotypes.size());
 *             
 *             // Retrieve results from GPU
 *             return extractFitnessValues(context, outputBuffer, genotypes.size());
 *         }, executor);
 *     }
 *     
 *     @Override
 *     public void afterAllEvaluations(OpenCLExecutionContext context, ExecutorService executor) {
 *         // Clean up GPU memory
 *         inputBuffer.release();
 *         outputBuffer.release();
 *     }
 * }
 * }</pre>
 * 
 * <p>Performance optimization strategies:
 * <ul>
 * <li><strong>Memory reuse</strong>: Allocate buffers once in the per-device
 * {@link #beforeAllEvaluations(OpenCLExecutionContext, ExecutorService)} hook and reuse them across generations</li>
 * <li><strong>Asynchronous execution</strong>: Use CompletableFuture for non-blocking GPU operations</li>
 * <li><strong>Batch processing</strong>: Process entire populations in single kernel launches</li>
 * <li><strong>Memory coalescing</strong>: Organize data layouts for optimal GPU memory access patterns</li>
 * <li><strong>Kernel optimization</strong>: Design kernels to maximize GPU utilization and minimize divergence</li>
 * </ul>
 * 
 * <p>Error handling and robustness:
 * <ul>
 * <li><strong>GPU errors</strong>: Handle OpenCL errors gracefully and provide meaningful error messages</li>
 * <li><strong>Memory management</strong>: Ensure proper cleanup even in exceptional circumstances</li>
 * <li><strong>Device failures</strong>: Support graceful degradation when GPU devices fail</li>
 * <li><strong>Timeout handling</strong>: Implement appropriate timeouts for long-running kernels</li>
 * </ul>
 * 
 * <p>Multi-device considerations:
 * <ul>
 * <li><strong>Device-specific setup</strong>: Separate contexts and buffers for each device</li>
 * <li><strong>Load balancing</strong>: Coordinate with the framework's automatic population partitioning</li>
 * <li><strong>Resource isolation</strong>: Ensure proper isolation of resources between devices</li>
 * <li><strong>Synchronization</strong>: Coordinate results from multiple devices</li>
 * </ul>
 * 
 * @param <T> the type of fitness values produced, must be comparable for selection operations
 * @see net.bmahe.genetics4j.gpu.GPUFitnessEvaluator
 * @see OpenCLExecutionContext
 * @see net.bmahe.genetics4j.gpu.opencl.model.CLData
 */
public abstract class OpenCLFitness<T extends Comparable<T>> {

	/** Logger obtained from log4j for {@code OpenCLFitness.class}. */
	public static final Logger logger = LogManager.getLogger(OpenCLFitness.class);

	/**
	 * Lifecycle hook: global start-up, run a single time ahead of every fitness evaluation.
	 *
	 * <p>It is invoked at the very start of the evolutionary algorithm execution, prior to the creation of
	 * any OpenCL context and prior to any evaluation. Work placed here should apply to all devices and all
	 * generations, for instance:
	 * <ul>
	 * <li>setting up problem-specific constants or parameters</li>
	 * <li>loading configuration or reference data</li>
	 * <li>preparing logging or monitoring infrastructure</li>
	 * <li>validating the constraints or requirements of the problem</li>
	 * </ul>
	 *
	 * <p>The call is made on the main thread, before concurrent operations begin. The default implementation
	 * does nothing.
	 *
	 * @see #beforeAllEvaluations(OpenCLExecutionContext, ExecutorService)
	 * @see #afterAllEvaluations()
	 */
	public void beforeAllEvaluations() {
		// No-op by default; subclasses override when they need global set-up.
	}

	/**
	 * Lifecycle hook: device start-up, run once for every OpenCL execution context.
	 *
	 * <p>Each OpenCL device taking part in fitness evaluation triggers exactly one call, which makes this the
	 * place for device-bound preparation such as:
	 * <ul>
	 * <li>allocating GPU memory buffers meant to live across generations</li>
	 * <li>uploading static data to GPU memory ahead of time</li>
	 * <li>building device-specific data structures</li>
	 * <li>configuring device-specific kernels</li>
	 * </ul>
	 *
	 * <p>Whatever is allocated here should normally be released by the matching
	 * {@link #afterAllEvaluations(OpenCLExecutionContext, ExecutorService)} call. The default implementation
	 * does nothing.
	 *
	 * @param openCLExecutionContext the OpenCL execution context of the device being initialized
	 * @param executorService        the executor service available for asynchronous operations
	 * @see #afterAllEvaluations(OpenCLExecutionContext, ExecutorService)
	 */
	public void beforeAllEvaluations(
			final OpenCLExecutionContext openCLExecutionContext,
			final ExecutorService executorService) {
		// No-op by default; subclasses override when they need per-device set-up.
	}

	/**
	 * Lifecycle hook: global preparation, run ahead of the evaluation of each generation.
	 *
	 * <p>The hook is handed the generation number together with the whole population, and covers
	 * preparation that is common to all devices, for instance:
	 * <ul>
	 * <li>refreshing generation-specific parameters or configuration</li>
	 * <li>logging the start of a generation or population statistics</li>
	 * <li>getting global data structures ready for the upcoming evaluation</li>
	 * <li>adapting behavior according to the generation number</li>
	 * </ul>
	 *
	 * <p>The default implementation does nothing.
	 *
	 * @param generation the current generation number (0-based)
	 * @param genotypes  the complete population about to be evaluated
	 * @see #beforeEvaluation(OpenCLExecutionContext, ExecutorService, long, List)
	 * @see #afterEvaluation(long, List)
	 */
	public void beforeEvaluation(
			final long generation,
			final List<Genotype> genotypes) {
		// No-op by default; subclasses override when they need per-generation set-up.
	}

	/**
	 * Lifecycle hook: device preparation, run ahead of the evaluation of each device partition.
	 *
	 * <p>Every device receives a call before its share of the population is evaluated. The hook gets the
	 * device context along with the genotypes assigned to that device, which suits tasks such as:
	 * <ul>
	 * <li>copying genotype data into device memory</li>
	 * <li>refreshing device-specific parameters for the generation</li>
	 * <li>filling input buffers with population data</li>
	 * <li>setting kernel arguments that change from one generation to the next</li>
	 * </ul>
	 *
	 * <p>The default implementation does nothing.
	 *
	 * @param openCLExecutionContext the OpenCL execution context of the device
	 * @param executorService        the executor service available for asynchronous operations
	 * @param generation             the current generation number (0-based)
	 * @param genotypes              the partition of genotypes this device is about to evaluate
	 * @see #afterEvaluation(OpenCLExecutionContext, ExecutorService, long, List)
	 */
	public void beforeEvaluation(
			final OpenCLExecutionContext openCLExecutionContext,
			final ExecutorService executorService,
			final long generation,
			final List<Genotype> genotypes) {
		// No-op by default; subclasses override when they need per-device, per-generation set-up.
	}

	/**
	 * Evaluates the fitness of a population partition with OpenCL kernels on the given device.
	 *
	 * <p>This is the one method every subclass has to provide. It is given a partition of the population and
	 * must produce the matching fitness values by running OpenCL kernels on the supplied device.
	 *
	 * <p>Contract for implementations:
	 * <ul>
	 * <li><strong>Ordering</strong>: the i-th fitness value belongs to the i-th genotype</li>
	 * <li><strong>Cardinality</strong>: exactly one fitness value is returned for each input genotype</li>
	 * <li><strong>Asynchrony</strong>: GPU work goes through the executor service so callers are not blocked</li>
	 * <li><strong>Failures</strong>: GPU errors are handled gracefully and surfaced as meaningful exceptions</li>
	 * </ul>
	 *
	 * <p>A typical implementation proceeds as follows:
	 * <ol>
	 * <li>copy the genotype data to GPU memory</li>
	 * <li>configure kernel arguments and work group parameters</li>
	 * <li>launch the OpenCL kernels that compute the fitness</li>
	 * <li>read the fitness values back from GPU memory</li>
	 * <li>convert the raw GPU results to the fitness type {@code T}</li>
	 * </ol>
	 *
	 * @param openCLExecutionContext the OpenCL execution context giving access to the device
	 * @param executorService        the executor service available for asynchronous operations
	 * @param generation             the current generation number, provided for context
	 * @param genotypes              the genotypes to evaluate on this device
	 * @return a {@link CompletableFuture} completing with one fitness value per genotype, in input order
	 * @throws RuntimeException if the GPU evaluation fails or a setup error occurs
	 */
	public abstract CompletableFuture<List<T>> compute(
			final OpenCLExecutionContext openCLExecutionContext,
			final ExecutorService executorService,
			final long generation,
			final List<Genotype> genotypes);

	/**
	 * Lifecycle hook: device cleanup, run after the evaluation of each device partition.
	 *
	 * <p>Every device receives a call once the evaluation of its partition has completed. Typical uses:
	 * <ul>
	 * <li>freeing temporary GPU memory allocations</li>
	 * <li>logging device-specific performance metrics</li>
	 * <li>updating device-specific statistics or state</li>
	 * <li>running device-specific validation or debugging</li>
	 * </ul>
	 *
	 * <p>The default implementation does nothing.
	 *
	 * @param openCLExecutionContext the OpenCL execution context of the device
	 * @param executorService        the executor service available for asynchronous operations
	 * @param generation             the current generation number (0-based)
	 * @param genotypes              the partition of genotypes this device has evaluated
	 * @see #beforeEvaluation(OpenCLExecutionContext, ExecutorService, long, List)
	 */
	public void afterEvaluation(
			final OpenCLExecutionContext openCLExecutionContext,
			final ExecutorService executorService,
			final long generation,
			final List<Genotype> genotypes) {
		// No-op by default; subclasses override when they need per-device, per-generation cleanup.
	}

	/**
	 * Lifecycle hook: global cleanup, run after the evaluation of each generation.
	 *
	 * <p>The call happens once the fitness evaluation of a generation has completed across all devices, and
	 * covers cleanup and statistics gathering that concern the population as a whole, for instance:
	 * <ul>
	 * <li>logging the end of a generation and its performance metrics</li>
	 * <li>updating global statistics or progress tracking</li>
	 * <li>running global validation or debugging</li>
	 * <li>disposing of global resources tied to the generation</li>
	 * </ul>
	 *
	 * <p>The default implementation does nothing.
	 *
	 * @param generation the current generation number (0-based)
	 * @param genotypes  the complete population that has been evaluated
	 * @see #beforeEvaluation(long, List)
	 */
	public void afterEvaluation(
			final long generation,
			final List<Genotype> genotypes) {
		// No-op by default; subclasses override when they need per-generation cleanup.
	}

	/**
	 * Lifecycle hook: device shutdown, run once for every OpenCL execution context at the end.
	 *
	 * <p>Each OpenCL device triggers one call when fitness evaluation is complete, so that resources set up
	 * in {@link #beforeAllEvaluations(OpenCLExecutionContext, ExecutorService)} can be disposed of.
	 * Typical uses:
	 * <ul>
	 * <li>releasing GPU memory buffers and other resources</li>
	 * <li>tearing down device-specific data structures</li>
	 * <li>logging device-specific performance summaries</li>
	 * <li>making sure no GPU memory is leaked</li>
	 * </ul>
	 *
	 * <p>Implementations should clean up properly even when exceptions occurred during evaluation, since
	 * this may be the only opportunity to prevent resource leaks. The default implementation does nothing.
	 *
	 * @param openCLExecutionContext the OpenCL execution context of the device being cleaned up
	 * @param executorService        the executor service available for asynchronous operations
	 * @see #beforeAllEvaluations(OpenCLExecutionContext, ExecutorService)
	 */
	public void afterAllEvaluations(
			final OpenCLExecutionContext openCLExecutionContext,
			final ExecutorService executorService) {
		// No-op by default; subclasses override when they need per-device teardown.
	}

	/**
	 * Lifecycle hook: global shutdown, run a single time after every fitness evaluation has completed.
	 *
	 * <p>It is invoked at the end of the evolutionary algorithm execution, once all OpenCL contexts have been
	 * cleaned up and all evaluations are complete. Typical uses:
	 * <ul>
	 * <li>disposing of global resources and data structures</li>
	 * <li>logging final performance summaries and statistics</li>
	 * <li>saving results or generating reports</li>
	 * <li>running final validation or cleanup</li>
	 * </ul>
	 *
	 * <p>The call is made on the main thread, after all concurrent operations complete. The default
	 * implementation does nothing.
	 *
	 * @see #beforeAllEvaluations()
	 */
	public void afterAllEvaluations() {
		// No-op by default; subclasses override when they need global teardown.
	}
}